# proyekt

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split,KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [3]:
df=pd.read_csv('top_expensive_leagues.csv')

In [4]:
df.head()

Unnamed: 0,League ID,League Name,Country,Sport,Revenue (USD),Average Player Salary (USD),Top Team,Total Teams,Founded Year,Viewership
0,L001,Premier League,England,Football,5275330000.0,12080000.0,Liverpool,16,1886.0,485.9
1,L002,Serie A,India,American Football,7088640000.0,6730000.0,Inter Milan,13,1977.0,135.01
2,L003,IPL,England,American Football,7930520000.0,14360000.0,Chennai Super Kings,13,1979.0,157.49
3,L004,La Liga,Spain,American Football,4972890000.0,13720000.0,Atletico Madrid,22,1999.0,245.71
4,L005,Serie A,Italy,Basketball,6265740000.0,7450000.0,Juventus,16,1963.0,52.83


In [5]:
df.drop(columns=['League Name'], inplace=True)

In [6]:
df.head()

Unnamed: 0,League ID,Country,Sport,Revenue (USD),Average Player Salary (USD),Top Team,Total Teams,Founded Year,Viewership
0,L001,England,Football,5275330000.0,12080000.0,Liverpool,16,1886.0,485.9
1,L002,India,American Football,7088640000.0,6730000.0,Inter Milan,13,1977.0,135.01
2,L003,England,American Football,7930520000.0,14360000.0,Chennai Super Kings,13,1979.0,157.49
3,L004,Spain,American Football,4972890000.0,13720000.0,Atletico Madrid,22,1999.0,245.71
4,L005,Italy,Basketball,6265740000.0,7450000.0,Juventus,16,1963.0,52.83


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Data columns (total 9 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   League ID                    700 non-null    object 
 1   Country                      700 non-null    object 
 2   Sport                        700 non-null    object 
 3   Revenue (USD)                700 non-null    float64
 4   Average Player Salary (USD)  700 non-null    object 
 5   Top Team                     700 non-null    object 
 6   Total Teams                  700 non-null    int64  
 7   Founded Year                 700 non-null    float64
 8   Viewership                   697 non-null    float64
dtypes: float64(3), int64(1), object(5)
memory usage: 49.3+ KB


# Data preprocessing

In [8]:
# missing values
df.isnull().sum()

League ID                      0
Country                        0
Sport                          0
Revenue (USD)                  0
Average Player Salary (USD)    0
Top Team                       0
Total Teams                    0
Founded Year                   0
Viewership                     3
dtype: int64

In [9]:
df['Viewership'].fillna(df['Viewership'].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Viewership'].fillna(df['Viewership'].mean(), inplace=True)


In [10]:
df.isnull().sum()

League ID                      0
Country                        0
Sport                          0
Revenue (USD)                  0
Average Player Salary (USD)    0
Top Team                       0
Total Teams                    0
Founded Year                   0
Viewership                     0
dtype: int64

In [11]:
for col in df.columns:
    if df[col].dtype=='object':
        df[col].fillna(df[col].mode()[0],inplace=True)
    else:
        df[col].fillna(df[col].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0],inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(), inplace=True)


In [15]:
# encoding
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Data columns (total 9 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   League ID                    700 non-null    object 
 1   Country                      700 non-null    object 
 2   Sport                        700 non-null    object 
 3   Revenue (USD)                700 non-null    float64
 4   Average Player Salary (USD)  700 non-null    object 
 5   Top Team                     700 non-null    object 
 6   Total Teams                  700 non-null    int64  
 7   Founded Year                 700 non-null    float64
 8   Viewership                   700 non-null    float64
dtypes: float64(3), int64(1), object(5)
memory usage: 49.3+ KB


In [12]:
df['League ID'].value_counts()

League ID
L700    1
L001    1
L002    1
L003    1
L004    1
       ..
L017    1
L016    1
L015    1
L014    1
L013    1
Name: count, Length: 700, dtype: int64

In [13]:
cardinality=df.nunique()

In [14]:
cardinality

League ID                      700
Country                          7
Sport                            4
Revenue (USD)                  699
Average Player Salary (USD)    541
Top Team                        27
Total Teams                     26
Founded Year                   146
Viewership                     691
dtype: int64

In [16]:
df.head()

Unnamed: 0,League ID,Country,Sport,Revenue (USD),Average Player Salary (USD),Top Team,Total Teams,Founded Year,Viewership
0,L001,England,Football,5275330000.0,12080000.0,Liverpool,16,1886.0,485.9
1,L002,India,American Football,7088640000.0,6730000.0,Inter Milan,13,1977.0,135.01
2,L003,England,American Football,7930520000.0,14360000.0,Chennai Super Kings,13,1979.0,157.49
3,L004,Spain,American Football,4972890000.0,13720000.0,Atletico Madrid,22,1999.0,245.71
4,L005,Italy,Basketball,6265740000.0,7450000.0,Juventus,16,1963.0,52.83


In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Data columns (total 9 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   League ID                    700 non-null    object 
 1   Country                      700 non-null    object 
 2   Sport                        700 non-null    object 
 3   Revenue (USD)                700 non-null    float64
 4   Average Player Salary (USD)  700 non-null    object 
 5   Top Team                     700 non-null    object 
 6   Total Teams                  700 non-null    int64  
 7   Founded Year                 700 non-null    float64
 8   Viewership                   700 non-null    float64
dtypes: float64(3), int64(1), object(5)
memory usage: 49.3+ KB


In [18]:
for col in df.columns:
    if df[col].dtype=='object':
        if cardinality[col]<=5:
            df=pd.get_dummies(df,columns=[col], dtype=int, drop_first=True)
        else:
            le=LabelEncoder()
            df[col]=le.fit_transform(df[col])

In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Data columns (total 11 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   League ID                    700 non-null    int64  
 1   Country                      700 non-null    int64  
 2   Revenue (USD)                700 non-null    float64
 3   Average Player Salary (USD)  700 non-null    int64  
 4   Top Team                     700 non-null    int64  
 5   Total Teams                  700 non-null    int64  
 6   Founded Year                 700 non-null    float64
 7   Viewership                   700 non-null    float64
 8   Sport_Basketball             700 non-null    int64  
 9   Sport_Cricket                700 non-null    int64  
 10  Sport_Football               700 non-null    int64  
dtypes: float64(3), int64(8)
memory usage: 60.3 KB


In [20]:
num_col=df.select_dtypes(include=['int32','float64']).columns

In [21]:
num_col

Index(['Revenue (USD)', 'Founded Year', 'Viewership'], dtype='object')

In [22]:
scaler=StandardScaler()
df[num_col]=scaler.fit_transform(df[num_col])

In [23]:
df.head()

Unnamed: 0,League ID,Country,Revenue (USD),Average Player Salary (USD),Top Team,Total Teams,Founded Year,Viewership,Sport_Basketball,Sport_Cricket,Sport_Football
0,0,0,0.461642,74,15,16,-0.512943,1.644673,0,0,1
1,1,3,1.298311,418,12,13,0.265422,-0.863742,0,0,0
2,2,0,1.686759,179,8,13,0.282529,-0.703038,0,0,0
3,3,5,0.322094,146,4,22,0.453598,-0.072378,0,0,0
4,4,4,0.918621,439,13,16,0.145674,-1.451224,1,0,0


In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Data columns (total 11 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   League ID                    700 non-null    int64  
 1   Country                      700 non-null    int64  
 2   Revenue (USD)                700 non-null    float64
 3   Average Player Salary (USD)  700 non-null    int64  
 4   Top Team                     700 non-null    int64  
 5   Total Teams                  700 non-null    int64  
 6   Founded Year                 700 non-null    float64
 7   Viewership                   700 non-null    float64
 8   Sport_Basketball             700 non-null    int64  
 9   Sport_Cricket                700 non-null    int64  
 10  Sport_Football               700 non-null    int64  
dtypes: float64(3), int64(8)
memory usage: 60.3 KB


In [None]:
x=df.drop(columns=['Viewership'])       # inputlarni tanlash

In [30]:
y=df['Viewership']   # output

In [None]:
x_train,x_temp,y_train,y_temp=train_test_split(x,y,test_size=0.2,random_state=42)          # Datani qismlarga ajratish
x_test,x_val,y_test,y_val=train_test_split(x_temp,y_temp,test_size=0.5,random_state=42)

In [32]:
model=LinearRegression()
linear_model=model.fit(x_train,y_train)
y_pred=linear_model.predict(x_test)
mse=mean_squared_error(y_test,y_pred)
r2=r2_score(y_test,y_pred)

In [33]:
mse

0.8852240151971663

In [34]:
r2

0.03143084086698855

In [37]:
kf = KFold(n_splits=10, shuffle=True, random_state=42)


cv_scores = cross_val_score(linear_model, x, y, cv=kf, scoring='r2')
cv_scores = np.sqrt(-cv_scores)

print("Cross-Validation Scores:", cv_scores)
print("Mean CV Score:", np.mean(cv_scores))

Cross-Validation Scores: [       nan        nan        nan 0.03232748 0.12649424 0.27274938
 0.14522832        nan 0.07299388 0.30227435]
Mean CV Score: nan


  cv_scores = np.sqrt(-cv_scores)


In [38]:
kf = KFold(n_splits=10, shuffle=True, random_state=42)


cv_scores = cross_val_score(linear_model, x, y, cv=kf, scoring='neg_mean_squared_error')
cv_scores = np.sqrt(-cv_scores)

print("Cross-Validation Scores:", cv_scores)
print("Mean CV Score:", np.mean(cv_scores))

Cross-Validation Scores: [0.95419961 1.07782918 0.98313733 0.97711968 0.96375153 0.93750241
 1.03574159 0.97666896 1.05235329 1.06821077]
Mean CV Score: 1.0026514359400145


In [39]:
print(np.mean(cv_scores)-mse)

0.11742742074284818
