In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import MinMaxScaler

In [2]:
df=pd.read_csv('Top Expensive Leagues.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Data columns (total 10 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   League ID                    700 non-null    object 
 1   League Name                  700 non-null    object 
 2   Country                      700 non-null    object 
 3   Sport                        700 non-null    object 
 4   Revenue (USD)                700 non-null    float64
 5   Average Player Salary (USD)  700 non-null    object 
 6   Top Team                     700 non-null    object 
 7   Total Teams                  700 non-null    int64  
 8   Founded Year                 700 non-null    float64
 9   Viewership                   697 non-null    float64
dtypes: float64(3), int64(1), object(6)
memory usage: 54.8+ KB


In [4]:
df.isnull().sum()

League ID                      0
League Name                    0
Country                        0
Sport                          0
Revenue (USD)                  0
Average Player Salary (USD)    0
Top Team                       0
Total Teams                    0
Founded Year                   0
Viewership                     3
dtype: int64

In [5]:
missing_col=df.isnull().sum()>0
for col in df.columns[missing_col]:
    if df[col].dtype=='object':
        df[col].fillna(df[col].mode()[0], inplace=True)
    else:
        df[col].fillna(df[col].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(), inplace=True)


In [6]:
df.isnull().sum()

League ID                      0
League Name                    0
Country                        0
Sport                          0
Revenue (USD)                  0
Average Player Salary (USD)    0
Top Team                       0
Total Teams                    0
Founded Year                   0
Viewership                     0
dtype: int64

ENCODING

In [7]:
categorical_col=df.select_dtypes(include=['object', 'category']).columns.to_list()

LABEL_ENCODING

In [8]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

In [9]:
df[categorical_col] = df[categorical_col].apply(label_encoder.fit_transform)

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Data columns (total 10 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   League ID                    700 non-null    int64  
 1   League Name                  700 non-null    int64  
 2   Country                      700 non-null    int64  
 3   Sport                        700 non-null    int64  
 4   Revenue (USD)                700 non-null    float64
 5   Average Player Salary (USD)  700 non-null    int64  
 6   Top Team                     700 non-null    int64  
 7   Total Teams                  700 non-null    int64  
 8   Founded Year                 700 non-null    float64
 9   Viewership                   700 non-null    float64
dtypes: float64(3), int64(7)
memory usage: 54.8 KB


SCALLING

In [11]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler, RobustScaler

MIN-MAX SCALING

In [13]:
df.head()

Unnamed: 0,League ID,League Name,Country,Sport,Revenue (USD),Average Player Salary (USD),Top Team,Total Teams,Founded Year,Viewership
0,0,6,0,3,5275330000.0,74,15,16,1886.0,485.9
1,1,7,3,0,7088640000.0,418,12,13,1977.0,135.01
2,2,1,0,0,7930520000.0,179,8,13,1979.0,157.49
3,3,2,5,0,4972890000.0,146,4,22,1999.0,245.71
4,4,7,4,1,6265740000.0,439,13,16,1963.0,52.83


In [14]:
min_max_scaler = MinMaxScaler()
df_scaled = pd.DataFrame(min_max_scaler.fit_transform(df), columns=df.columns)

print(df_scaled.head())

   League ID  League Name   Country     Sport  Revenue (USD)  \
0   0.000000     0.857143  0.000000  1.000000       0.659089   
1   0.001431     1.000000  0.500000  0.000000       0.886221   
2   0.002861     0.142857  0.000000  0.000000       0.991673   
3   0.004292     0.285714  0.833333  0.000000       0.621206   
4   0.005722     1.000000  0.666667  0.333333       0.783146   

   Average Player Salary (USD)  Top Team  Total Teams  Founded Year  \
0                     0.137037  0.576923     0.002981      0.927561   
1                     0.774074  0.461538     0.001490      0.975677   
2                     0.331481  0.307692     0.001490      0.976735   
3                     0.270370  0.153846     0.005961      0.987310   
4                     0.812963  0.500000     0.002981      0.968275   

   Viewership  
0    0.971768  
1    0.266703  
2    0.311873  
3    0.489139  
4    0.101573  


In [16]:
df_scaled.head()

Unnamed: 0,League ID,League Name,Country,Sport,Revenue (USD),Average Player Salary (USD),Top Team,Total Teams,Founded Year,Viewership
0,0.0,0.857143,0.0,1.0,0.659089,0.137037,0.576923,0.002981,0.927561,0.971768
1,0.001431,1.0,0.5,0.0,0.886221,0.774074,0.461538,0.00149,0.975677,0.266703
2,0.002861,0.142857,0.0,0.0,0.991673,0.331481,0.307692,0.00149,0.976735,0.311873
3,0.004292,0.285714,0.833333,0.0,0.621206,0.27037,0.153846,0.005961,0.98731,0.489139
4,0.005722,1.0,0.666667,0.333333,0.783146,0.812963,0.5,0.002981,0.968275,0.101573


STANDARD SCALING    #Restart+Encoding+Cleaning

In [13]:
s_scaler = StandardScaler()
df_scaled = pd.DataFrame(s_scaler.fit_transform(df), columns=df.columns)

print("Standard Scaled DataFrame:")
print(df_scaled.head())

Standard Scaled DataFrame:
   League ID  League Name   Country     Sport  Revenue (USD)  \
0  -1.729578     1.035002 -1.507441  1.293781       0.461642   
1  -1.724629     1.464464  0.013763 -1.383007       1.298311   
2  -1.719681    -1.112305 -1.507441 -1.383007       1.686759   
3  -1.714732    -0.682844  1.027900 -1.383007       0.322094   
4  -1.709783     1.464464  0.520831 -0.490745       0.918621   

   Average Player Salary (USD)  Top Team  Total Teams  Founded Year  \
0                    -1.270306  0.038856    -0.106994     -0.512943   
1                     0.940247 -0.399843    -0.130928      0.265422   
2                    -0.595573 -0.984775    -0.130928      0.282529   
3                    -0.807632 -1.569708    -0.059127      0.453598   
4                     1.075194 -0.253610    -0.106994      0.145674   

   Viewership  
0    1.644673  
1   -0.863742  
2   -0.703038  
3   -0.072378  
4   -1.451224  


In [14]:
df_scaled.head()

Unnamed: 0,League ID,League Name,Country,Sport,Revenue (USD),Average Player Salary (USD),Top Team,Total Teams,Founded Year,Viewership
0,-1.729578,1.035002,-1.507441,1.293781,0.461642,-1.270306,0.038856,-0.106994,-0.512943,1.644673
1,-1.724629,1.464464,0.013763,-1.383007,1.298311,0.940247,-0.399843,-0.130928,0.265422,-0.863742
2,-1.719681,-1.112305,-1.507441,-1.383007,1.686759,-0.595573,-0.984775,-0.130928,0.282529,-0.703038
3,-1.714732,-0.682844,1.0279,-1.383007,0.322094,-0.807632,-1.569708,-0.059127,0.453598,-0.072378
4,-1.709783,1.464464,0.520831,-0.490745,0.918621,1.075194,-0.25361,-0.106994,0.145674,-1.451224


ROBUST SCALING      #Restart+Encoding+Cleaning

In [12]:
r_scaler = RobustScaler()
df_scaled = pd.DataFrame(r_scaler.fit_transform(df), columns=df.columns)

print("Robust Scaled DataFrame:")
print(df_scaled.head())

Robust Scaled DataFrame:
   League ID  League Name  Country  Sport  Revenue (USD)  \
0  -1.000000         0.50    -0.75    0.5       0.269249   
1  -0.997139         0.75     0.00   -1.0       0.773401   
2  -0.994278        -0.75    -0.75   -1.0       1.007467   
3  -0.991416        -0.50     0.50   -1.0       0.185163   
4  -0.988555         0.75     0.25   -0.5       0.544611   

   Average Player Salary (USD)  Top Team  Total Teams  Founded Year  \
0                    -0.725436  0.000000    -0.416667     -0.930556   
1                     0.538108 -0.272727    -0.666667      0.333333   
2                    -0.339761 -0.636364    -0.666667      0.361111   
3                    -0.460973 -1.000000     0.083333      0.638889   
4                     0.615243 -0.181818    -0.416667      0.138889   

   Viewership  
0    0.970175  
1   -0.489935  
2   -0.396392  
3   -0.029295  
4   -0.831899  


In [13]:
df_scaled.head()

Unnamed: 0,League ID,League Name,Country,Sport,Revenue (USD),Average Player Salary (USD),Top Team,Total Teams,Founded Year,Viewership
0,-1.0,0.5,-0.75,0.5,0.269249,-0.725436,0.0,-0.416667,-0.930556,0.970175
1,-0.997139,0.75,0.0,-1.0,0.773401,0.538108,-0.272727,-0.666667,0.333333,-0.489935
2,-0.994278,-0.75,-0.75,-1.0,1.007467,-0.339761,-0.636364,-0.666667,0.361111,-0.396392
3,-0.991416,-0.5,0.5,-1.0,0.185163,-0.460973,-1.0,0.083333,0.638889,-0.029295
4,-0.988555,0.75,0.25,-0.5,0.544611,0.615243,-0.181818,-0.416667,0.138889,-0.831899
