:***In this project, you are tasked to build a model/models that predict a player's overall rating given the player's profile.***

In [2]:
#Mounting
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [49]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
import pickle

In [54]:
import sklearn
sklearn.__version__

'1.2.2'

***Data Preprocessing***

In [None]:
#The name of my dataframe is 'fifa_df'.
fifa_df = pd.read_csv('/content/drive/My Drive/Introduction_to_AI_Mid_Semester_Project/players_21.csv')
fifa_df.info()
fifa_df.describe()
fifa_df.head()

In [None]:
#This code will generate a table displaying the column names, their non-null counts, and data types
non_null_counts = fifa_df.count()
data_types = fifa_df.dtypes

data_info = pd.DataFrame({'Column': non_null_counts.index, 'Non-Null Count': non_null_counts.values, 'Dtype': data_types.values})
data_info = data_info.reset_index(drop=True)

data_info_str = data_info.to_string()
print(data_info_str)


In [None]:
#This code will give a list of the column names in the DataFrame fifa_df that have the 'object' data type.
print("object columns")
object_columns = fifa_df.select_dtypes(include=['object']).columns
print(object_columns)

#This code will give a list of the column names in the DataFrame fifa_df that have the 'int' data type.
print("int columns")
int_columns = fifa_df.select_dtypes(include=['int64']).columns
print(int_columns)

#This code will give a list of the column names in the DataFrame fifa_df that have the 'float' data type.
print("float columns")
float_columns = fifa_df.select_dtypes(include=['float64']).columns
print(float_columns)

In [7]:
#Dropping features that cannot predicate a player's overall rating
#Store them  first
useless_features = [
       'player_url', 'short_name', 'long_name', 'player_positions', 'dob', 'league_name', 'club_position', 'club_loaned_from',
       'club_joined', 'nationality_name', 'nation_position', 'real_face', 'player_tags','player_face_url', 'club_logo_url',
       'club_flag_url', 'nation_logo_url', 'nation_flag_url', 'sofifa_id',  'nationality_id','club_team_id','club_jersey_number',
       'club_contract_valid_until', 'nation_team_id',
       'nation_jersey_number', 'release_clause_eur']

useless_features_dataframe = fifa_df[useless_features].copy()

fifa_df = fifa_df.drop(['player_url', 'short_name', 'long_name', 'player_positions', 'dob', 'league_name', 'club_position', 'club_loaned_from',
       'club_joined', 'nationality_name', 'nation_position', 'real_face', 'player_tags','player_face_url', 'club_logo_url',
       'club_flag_url', 'nation_logo_url', 'nation_flag_url', 'sofifa_id',  'nationality_id','club_team_id','club_jersey_number',
       'club_contract_valid_until', 'nation_team_id',
       'nation_jersey_number', 'release_clause_eur'], axis=1)


In [None]:
#Finding the percentage of missing values in each column
percentage_of_missing_values_per_column= fifa_df.isnull().sum()/len(fifa_df)
data_info = pd.DataFrame({'Column': percentage_of_missing_values_per_column.index, 'Null Percentage': percentage_of_missing_values_per_column.values})
data_info = data_info.reset_index(drop=True)
data_info_str = data_info.to_string()
print(data_info_str)


In [9]:
#Display the columns  with more than 30 percent missing values in a list
columns_with_over_30_percent_missing_percentage = percentage_of_missing_values_per_column[percentage_of_missing_values_per_column > 0.3]
columns_with_over_30_percent_missing_percentage = columns_with_over_30_percent_missing_percentage.index.tolist()
print(columns_with_over_30_percent_missing_percentage)

#Source: https://datascienceparichay.com/article/pandas-percentage-of-missing-values-in-each-column/


['player_traits', 'goalkeeping_speed']


In [None]:
#Dropping features with more than 30 percent missing values
fifa_df = fifa_df.drop(['player_traits','goalkeeping_speed'], axis=1)
fifa_df

In [11]:
#This code will give  a list of the column names in the DataFrame fifa_df that have the 'object' data type.
print("object columns")
object_columns = fifa_df.select_dtypes(include=['object']).columns
print(object_columns)

object columns
Index(['club_name', 'preferred_foot', 'work_rate', 'body_type', 'ls', 'st',
       'rs', 'lw', 'lf', 'cf', 'rf', 'rw', 'lam', 'cam', 'ram', 'lm', 'lcm',
       'cm', 'rcm', 'rm', 'lwb', 'ldm', 'cdm', 'rdm', 'rwb', 'lb', 'lcb', 'cb',
       'rcb', 'rb', 'gk'],
      dtype='object')


In [12]:
#Storing the non-numeric features in a dictionary (in case of anything)
object_columns = fifa_df.select_dtypes(include=['object']).columns
print(object_columns)

#Let the object columns be stored in a new dataframe
fifa_df_object = fifa_df[object_columns]

Index(['club_name', 'preferred_foot', 'work_rate', 'body_type', 'ls', 'st',
       'rs', 'lw', 'lf', 'cf', 'rf', 'rw', 'lam', 'cam', 'ram', 'lm', 'lcm',
       'cm', 'rcm', 'rm', 'lwb', 'ldm', 'cdm', 'rdm', 'rwb', 'lb', 'lcb', 'cb',
       'rcb', 'rb', 'gk'],
      dtype='object')


In [13]:
#Dropping the "unencoded" non-numeric features from the DataFrame
fifa_df = fifa_df.drop(columns=object_columns)
fifa_df

Unnamed: 0,overall,potential,value_eur,wage_eur,age,height_cm,weight_kg,league_level,weak_foot,skill_moves,...,mentality_penalties,mentality_composure,defending_marking_awareness,defending_standing_tackle,defending_sliding_tackle,goalkeeping_diving,goalkeeping_handling,goalkeeping_kicking,goalkeeping_positioning,goalkeeping_reflexes
0,93,93,103500000.0,560000.0,33,170,72,1.0,4,4,...,75,96,32,35,24,6,11,15,14,8
1,92,92,63000000.0,220000.0,35,187,83,1.0,4,5,...,84,95,28,32,24,7,11,15,14,11
2,91,91,111000000.0,240000.0,31,184,80,1.0,4,4,...,88,88,35,42,19,15,6,12,8,10
3,91,91,132000000.0,270000.0,28,175,68,1.0,5,5,...,92,93,35,30,29,9,9,15,15,11
4,91,91,129000000.0,370000.0,29,181,70,1.0,5,4,...,84,91,68,65,53,15,13,5,10,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18939,47,52,70000.0,1000.0,21,177,70,1.0,2,2,...,35,40,45,56,47,12,13,8,14,6
18940,47,53,70000.0,1000.0,21,174,68,1.0,2,2,...,35,35,43,42,53,8,8,13,14,10
18941,47,47,45000.0,2000.0,28,185,79,1.0,2,2,...,36,35,38,43,45,8,5,11,5,7
18942,47,67,130000.0,500.0,17,171,58,4.0,2,2,...,50,45,18,11,13,11,13,9,9,6


In [14]:
#Encoding- Converting non-numeric features into numeric features
encoded_features = fifa_df_object.apply(lambda x: pd.factorize(x)[0])

#Source: https://www.statology.org/pandas-factorize/#:~:text=The%20pandas%20factorize%20%28%29%20function%20can%20be%20used,%3D%20pd.factorize%28df%20%5B%27col%27%5D%29%20Method%202%3A%20Factorize%20Specific%20Columns


In [15]:
#Concatenating the encoded features with the original DataFrame
fifa_df_encoded = pd.concat([fifa_df, encoded_features], axis=1)


In [None]:
fifa_df_encoded

In [17]:
#Imputting the data- replacing missing values
imputer=SimpleImputer(strategy='mean')
imputed = imputer.fit_transform(fifa_df_encoded)
fifa_df_imputed = pd.DataFrame(imputed, columns=fifa_df_encoded.columns)
fifa_df_imputed


Unnamed: 0,overall,potential,value_eur,wage_eur,age,height_cm,weight_kg,league_level,weak_foot,skill_moves,...,ldm,cdm,rdm,rwb,lb,lcb,cb,rcb,rb,gk
0,93.0,93.0,103500000.0,560000.0,33.0,170.0,72.0,1.0,4.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,92.0,92.0,63000000.0,220000.0,35.0,187.0,83.0,1.0,4.0,5.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,91.0,91.0,111000000.0,240000.0,31.0,184.0,80.0,1.0,4.0,4.0,...,0.0,0.0,0.0,2.0,1.0,2.0,2.0,2.0,1.0,0.0
3,91.0,91.0,132000000.0,270000.0,28.0,175.0,68.0,1.0,5.0,5.0,...,2.0,2.0,2.0,3.0,0.0,3.0,3.0,3.0,0.0,1.0
4,91.0,91.0,129000000.0,370000.0,29.0,181.0,70.0,1.0,5.0,4.0,...,3.0,3.0,3.0,4.0,2.0,4.0,4.0,4.0,2.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18939,47.0,52.0,70000.0,1000.0,21.0,177.0,70.0,1.0,2.0,2.0,...,98.0,98.0,98.0,145.0,111.0,101.0,101.0,101.0,111.0,40.0
18940,47.0,53.0,70000.0,1000.0,21.0,174.0,68.0,1.0,2.0,2.0,...,103.0,103.0,103.0,115.0,102.0,102.0,102.0,102.0,102.0,40.0
18941,47.0,47.0,45000.0,2000.0,28.0,185.0,79.0,1.0,2.0,2.0,...,256.0,256.0,256.0,219.0,213.0,258.0,258.0,258.0,213.0,55.0
18942,47.0,67.0,130000.0,500.0,17.0,171.0,58.0,4.0,2.0,2.0,...,71.0,71.0,71.0,138.0,86.0,85.0,85.0,85.0,86.0,32.0


***Feature Engineering***

In [18]:
#Storing the target feature in y
y=fifa_df_imputed['overall']

In [19]:
#Drop the target variable from the fifa_df_imputed dataframe
fifa_df_imputed=fifa_df_imputed.drop(columns=['overall'])

In [20]:
#Determine how much each independent variable corelates with the target variable, overall using correlation analysis
correlation = fifa_df_imputed.corrwith(y).abs()
sorted_correlation = correlation.sort_values(ascending=False)
sorted_correlation

movement_reactions      0.867234
mentality_composure     0.705252
passing                 0.662090
potential               0.636366
dribbling               0.596558
                          ...   
body_type               0.004861
goalkeeping_reflexes    0.004052
goalkeeping_handling    0.003454
goalkeeping_diving      0.003128
goalkeeping_kicking     0.000512
Length: 81, dtype: float64

In [21]:
#Display the columns with their correlation coefficient
correlation_df = sorted_correlation.reset_index()
correlation_df.columns = ['Feature', 'Correlation']
correlation_df

Unnamed: 0,Feature,Correlation
0,movement_reactions,0.867234
1,mentality_composure,0.705252
2,passing,0.662090
3,potential,0.636366
4,dribbling,0.596558
...,...,...
76,body_type,0.004861
77,goalkeeping_reflexes,0.004052
78,goalkeeping_handling,0.003454
79,goalkeeping_diving,0.003128


In [22]:
#Coefficients close to zero mean that there is no linear correlation. The decided threshold that indicates no linear correlation is 0.20.
#Therefore, any coefficient less than 0.20 is considered to have no linear correlation with the target variable.

correlation_threshold = 0.2
columns_with_no_linear_correlation = correlation_df[correlation_df['Correlation'] <= correlation_threshold]
columns_with_no_linear_correlation

#Display the columns in a list
features_with_no_linear_correlation = columns_with_no_linear_correlation['Feature'].tolist()
features_with_no_linear_correlation

['pace',
 'weight_kg',
 'lcb',
 'cb',
 'rcb',
 'movement_balance',
 'work_rate',
 'preferred_foot',
 'height_cm',
 'goalkeeping_positioning',
 'body_type',
 'goalkeeping_reflexes',
 'goalkeeping_handling',
 'goalkeeping_diving',
 'goalkeeping_kicking']

In [23]:
#Drop the columns with no linear correlation
fifa_df_better_correlation = fifa_df_imputed.drop([ 'pace','weight_kg', 'lcb','cb','rcb','movement_balance','work_rate','preferred_foot','height_cm','goalkeeping_positioning',
                                                    'body_type','goalkeeping_reflexes','goalkeeping_handling','goalkeeping_diving','goalkeeping_kicking'], axis=1)
fifa_df_better_correlation

Unnamed: 0,potential,value_eur,wage_eur,age,league_level,weak_foot,skill_moves,international_reputation,shooting,passing,...,rcm,rm,lwb,ldm,cdm,rdm,rwb,lb,rb,gk
0,93.0,103500000.0,560000.0,33.0,1.0,4.0,4.0,5.0,92.0,91.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,92.0,63000000.0,220000.0,35.0,1.0,4.0,5.0,5.0,93.0,81.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
2,91.0,111000000.0,240000.0,31.0,1.0,4.0,4.0,4.0,91.0,78.0,...,2.0,2.0,2.0,0.0,0.0,0.0,2.0,1.0,1.0,0.0
3,91.0,132000000.0,270000.0,28.0,1.0,5.0,5.0,5.0,85.0,86.0,...,3.0,3.0,3.0,2.0,2.0,2.0,3.0,0.0,0.0,1.0
4,91.0,129000000.0,370000.0,29.0,1.0,5.0,4.0,4.0,86.0,93.0,...,4.0,4.0,4.0,3.0,3.0,3.0,4.0,2.0,2.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18939,52.0,70000.0,1000.0,21.0,1.0,2.0,2.0,1.0,23.0,26.0,...,39.0,42.0,145.0,98.0,98.0,98.0,145.0,111.0,111.0,40.0
18940,53.0,70000.0,1000.0,21.0,1.0,2.0,2.0,1.0,32.0,49.0,...,17.0,145.0,115.0,103.0,103.0,103.0,115.0,102.0,102.0,40.0
18941,47.0,45000.0,2000.0,28.0,1.0,2.0,2.0,1.0,37.0,49.0,...,217.0,249.0,219.0,256.0,256.0,256.0,219.0,213.0,213.0,55.0
18942,67.0,130000.0,500.0,17.0,4.0,2.0,2.0,1.0,46.0,40.0,...,163.0,141.0,138.0,71.0,71.0,71.0,138.0,86.0,86.0,32.0




In [57]:

# Adjust Pandas display options
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.max_colwidth', -1)  # This is to display long text or URLs in full


  pd.set_option('display.max_colwidth', -1)  # This is to display long text or URLs in full


In [58]:
print(fifa_df_better_correlation)

       potential    value_eur  wage_eur   age  league_level  weak_foot  skill_moves  international_reputation  shooting  passing  dribbling  defending  physic  attacking_crossing  attacking_finishing  attacking_heading_accuracy  attacking_short_passing  attacking_volleys  skill_dribbling  skill_curve  skill_fk_accuracy  skill_long_passing  skill_ball_control  movement_acceleration  movement_sprint_speed  movement_agility  movement_reactions  power_shot_power  power_jumping  power_stamina  power_strength  power_long_shots  mentality_aggression  mentality_interceptions  mentality_positioning  mentality_vision  mentality_penalties  mentality_composure  defending_marking_awareness  defending_standing_tackle  defending_sliding_tackle  club_name     ls     st     rs    lw    lf    cf    rf    rw    lam    cam    ram     lm    lcm     cm    rcm     rm    lwb    ldm    cdm    rdm    rwb     lb     rb    gk
0      93.0       103500000.0  560000.0  33.0  1.0           4.0        4.0          5.0

In [53]:
print("All Columns")
all_columns = fifa_df_better_correlation.columns
print(all_columns)

All Columns
Index(['potential', 'value_eur', 'wage_eur', 'age', 'league_level',
       'weak_foot', 'skill_moves', 'international_reputation', 'shooting',
       'passing', 'dribbling', 'defending', 'physic', 'attacking_crossing',
       'attacking_finishing', 'attacking_heading_accuracy',
       'attacking_short_passing', 'attacking_volleys', 'skill_dribbling',
       'skill_curve', 'skill_fk_accuracy', 'skill_long_passing',
       'skill_ball_control', 'movement_acceleration', 'movement_sprint_speed',
       'movement_agility', 'movement_reactions', 'power_shot_power',
       'power_jumping', 'power_stamina', 'power_strength', 'power_long_shots',
       'mentality_aggression', 'mentality_interceptions',
       'mentality_positioning', 'mentality_vision', 'mentality_penalties',
       'mentality_composure', 'defending_marking_awareness',
       'defending_standing_tackle', 'defending_sliding_tackle', 'club_name',
       'ls', 'st', 'rs', 'lw', 'lf', 'cf', 'rf', 'rw', 'lam', 'cam', 'ra

In [24]:
#Scaling the data
fifa_df_scaled=pd.DataFrame(StandardScaler().fit_transform(fifa_df_better_correlation))
fifa_df_scaled

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,56,57,58,59,60,61,62,63,64,65
0,3.586563,13.073165,27.857178,1.655055,-0.483897,1.594026,2.135802,10.801035,3.009606,3.493625,...,-2.755950,-2.400053,-2.554962,-2.507989,-2.507989,-2.507989,-2.554962,-2.701839,-2.701839,-1.774860
1,3.422893,7.809992,10.662997,2.080838,-0.483897,1.594026,3.440521,10.801035,3.085367,2.461857,...,-2.728283,-2.376233,-2.524670,-2.482132,-2.482132,-2.482132,-2.524670,-2.668455,-2.668455,-1.722766
2,3.259222,14.047827,11.674420,1.229273,-0.483897,1.594026,2.135802,8.037314,2.933845,2.152326,...,-2.700617,-2.352413,-2.494377,-2.507989,-2.507989,-2.507989,-2.494377,-2.668455,-2.668455,-1.774860
3,3.259222,16.776879,13.191553,0.590598,-0.483897,3.093020,3.440521,10.801035,2.479280,2.977741,...,-2.672950,-2.328593,-2.464085,-2.456274,-2.456274,-2.456274,-2.464085,-2.701839,-2.701839,-1.722766
4,3.259222,16.387015,18.248665,0.803490,-0.483897,3.093020,2.135802,8.037314,2.555040,3.699979,...,-2.645284,-2.304773,-2.433793,-2.430416,-2.430416,-2.430416,-2.433793,-2.635071,-2.635071,-1.670673
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18939,-3.123941,-0.368070,-0.412078,-0.899642,-0.483897,-1.403962,-0.473636,-0.253847,-2.217897,-3.212867,...,-1.676956,-1.399614,1.837389,0.026073,0.026073,0.026073,1.837389,1.003794,1.003794,0.308872
18940,-2.960270,-0.368070,-0.412078,-0.899642,-0.483897,-1.403962,-0.473636,-0.253847,-1.536049,-0.839801,...,-2.285619,1.053842,0.928627,0.155362,0.155362,0.155362,0.928627,0.703337,0.703337,0.308872
18941,-3.942295,-0.371319,-0.361507,0.590598,-0.483897,-1.403962,-0.473636,-0.253847,-1.157245,-0.839801,...,3.247685,3.531119,4.079003,4.111603,4.111603,4.111603,4.079003,4.408970,4.408970,1.090272
18942,-0.668878,-0.360272,-0.437363,-1.751207,3.599992,-1.403962,-0.473636,-0.253847,-0.475396,-1.768392,...,1.753693,0.958563,1.625345,-0.672087,-0.672087,-0.672087,1.625345,0.169192,0.169192,-0.107874


***Traininig Models and Evaluation***

In [25]:
Xtrain,Xtest,Ytrain,Ytest=train_test_split(fifa_df_scaled,y,test_size=0.2,random_state=42)

In [26]:
#Random Forest Regressor
random_forest_model = RandomForestRegressor(n_estimators=100, random_state=42)
random_forest_scores = cross_val_score(random_forest_model, Xtrain, Ytrain, cv = 5, scoring = 'neg_mean_squared_error')     #Gives the negative mean squared error
rmse = -random_forest_scores.mean()
print(rmse**0.5)

random_forest_model.fit(Xtrain, Ytrain)
score = random_forest_model.score(Xtest, Ytest)        #Gives the accurcay of the model
print(score)



0.5591628693714102
0.9935996823460502


In [27]:
#XGBoost Regressor
xgboost_model = XGBRegressor(objective ='reg:squarederror', random_state=42)         #The objective is trying to minimise the sqaured error
xgb_scores = cross_val_score(xgboost_model, Xtrain, Ytrain, cv = 5, scoring = 'neg_mean_squared_error')     #Gives the negative mean squared error
rmse = -xgb_scores.mean()
print(rmse**0.5)

xgboost_model.fit(Xtrain, Ytrain)
score = xgboost_model.score(Xtest, Ytest)        #Gives the accurcay of the model
print(score)

0.5667642170231051
0.9938348856213097


In [28]:
#Gradient Regressor
gb_model = GradientBoostingRegressor(n_estimators= 500, random_state=42, max_depth= 5, min_samples_split =2, learning_rate =0.01)      #Increasing the number of estimators, adding max dpeth, min samples split and learning rate optimized my model
gb_scores = cross_val_score(gb_model, Xtrain, Ytrain, cv = 5, scoring = 'neg_mean_squared_error')     #Gives the negative mean squared error
rmse = -gb_scores.mean()
print(rmse**0.5)

gb_model.fit(Xtrain, Ytrain)
score = gb_model.score(Xtest, Ytest)        #Gives the accurcay of the model
print(score)

0.9918137793307423


***Ensemble Learning***

In [29]:
#Soft-Voting
#Voting regressor is my model
voting_regressor = VotingRegressor(estimators=[
    ('random forest', random_forest_model),
    ('xg boost', xgboost_model),
    ('gradient boost', gb_model)])

In [30]:

for model in (random_forest_model, xgboost_model, gb_model,voting_regressor):
  model.fit(Xtrain,Ytrain)
  y_pred=model.predict(Xtest)
  score = model.score(Xtest, Ytest)
  mse = mean_squared_error(Ytest, y_pred)


In [31]:
print(score)
print(mse)

0.9942367036491405
0.27165282442866184


***Test with new data set***

***Cleaning the new data set***

In [32]:
#The name of my dataframe is 'fifa_22_df'.
fifa_22_df = pd.read_csv('/content/drive/My Drive/Introduction_to_AI_Mid_Semester_Project/players_22.csv')
fifa_22_df.info()
fifa_22_df.describe()
fifa_22_df.head()

  fifa_22_df = pd.read_csv('/content/drive/My Drive/Introduction_to_AI_Mid_Semester_Project/players_22.csv')


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19239 entries, 0 to 19238
Columns: 110 entries, sofifa_id to nation_flag_url
dtypes: float64(16), int64(44), object(50)
memory usage: 16.1+ MB


Unnamed: 0,sofifa_id,player_url,short_name,long_name,player_positions,overall,potential,value_eur,wage_eur,age,...,lcb,cb,rcb,rb,gk,player_face_url,club_logo_url,club_flag_url,nation_logo_url,nation_flag_url
0,158023,https://sofifa.com/player/158023/lionel-messi/...,L. Messi,Lionel Andrés Messi Cuccittini,"RW, ST, CF",93,93,78000000.0,320000.0,34,...,50+3,50+3,50+3,61+3,19+3,https://cdn.sofifa.net/players/158/023/22_120.png,https://cdn.sofifa.net/teams/73/60.png,https://cdn.sofifa.net/flags/fr.png,https://cdn.sofifa.net/teams/1369/60.png,https://cdn.sofifa.net/flags/ar.png
1,188545,https://sofifa.com/player/188545/robert-lewand...,R. Lewandowski,Robert Lewandowski,ST,92,92,119500000.0,270000.0,32,...,60+3,60+3,60+3,61+3,19+3,https://cdn.sofifa.net/players/188/545/22_120.png,https://cdn.sofifa.net/teams/21/60.png,https://cdn.sofifa.net/flags/de.png,https://cdn.sofifa.net/teams/1353/60.png,https://cdn.sofifa.net/flags/pl.png
2,20801,https://sofifa.com/player/20801/c-ronaldo-dos-...,Cristiano Ronaldo,Cristiano Ronaldo dos Santos Aveiro,"ST, LW",91,91,45000000.0,270000.0,36,...,53+3,53+3,53+3,60+3,20+3,https://cdn.sofifa.net/players/020/801/22_120.png,https://cdn.sofifa.net/teams/11/60.png,https://cdn.sofifa.net/flags/gb-eng.png,https://cdn.sofifa.net/teams/1354/60.png,https://cdn.sofifa.net/flags/pt.png
3,190871,https://sofifa.com/player/190871/neymar-da-sil...,Neymar Jr,Neymar da Silva Santos Júnior,"LW, CAM",91,91,129000000.0,270000.0,29,...,50+3,50+3,50+3,62+3,20+3,https://cdn.sofifa.net/players/190/871/22_120.png,https://cdn.sofifa.net/teams/73/60.png,https://cdn.sofifa.net/flags/fr.png,,https://cdn.sofifa.net/flags/br.png
4,192985,https://sofifa.com/player/192985/kevin-de-bruy...,K. De Bruyne,Kevin De Bruyne,"CM, CAM",91,91,125500000.0,350000.0,30,...,69+3,69+3,69+3,75+3,21+3,https://cdn.sofifa.net/players/192/985/22_120.png,https://cdn.sofifa.net/teams/10/60.png,https://cdn.sofifa.net/flags/gb-eng.png,https://cdn.sofifa.net/teams/1325/60.png,https://cdn.sofifa.net/flags/be.png


In [33]:
#Dropping features that cannot predicate a player's overall rating
#Store them in a variable first
useless_features_22 = [
       'player_url', 'short_name', 'long_name', 'player_positions', 'dob', 'league_name', 'club_position', 'club_loaned_from',
       'club_joined', 'nationality_name', 'nation_position', 'real_face', 'player_tags','player_face_url', 'club_logo_url',
       'club_flag_url', 'nation_logo_url', 'nation_flag_url', 'sofifa_id',  'nationality_id','club_team_id','club_jersey_number',
       'club_contract_valid_until', 'nation_team_id',
       'nation_jersey_number', 'release_clause_eur']

useless_features_dataframe_22 = fifa_22_df[useless_features_22].copy()

fifa_22_df = fifa_22_df.drop(['player_url', 'short_name', 'long_name', 'player_positions', 'dob', 'league_name', 'club_position', 'club_loaned_from',
       'club_joined', 'nationality_name', 'nation_position', 'real_face', 'player_tags','player_face_url', 'club_logo_url',
       'club_flag_url', 'nation_logo_url', 'nation_flag_url', 'sofifa_id',  'nationality_id','club_team_id','club_jersey_number',
       'club_contract_valid_until', 'nation_team_id',
       'nation_jersey_number', 'release_clause_eur'], axis=1)


In [34]:
#Finding the percentage of missing values in each column
percentage_of_missing_values_per_column_22= fifa_22_df.isnull().sum()/len(fifa_22_df)
data_info_22 = pd.DataFrame({'Column': percentage_of_missing_values_per_column_22.index, 'Null Percentage': percentage_of_missing_values_per_column.values})
data_info_22 = data_info_22.reset_index(drop=True)
data_info_str = data_info_22.to_string()
print(data_info_str)


                         Column  Null Percentage
0                       overall         0.000000
1                     potential         0.000000
2                     value_eur         0.012511
3                      wage_eur         0.011877
4                           age         0.000000
5                     height_cm         0.000000
6                     weight_kg         0.000000
7                     club_name         0.011877
8                  league_level         0.011877
9                preferred_foot         0.000000
10                    weak_foot         0.000000
11                  skill_moves         0.000000
12     international_reputation         0.000000
13                    work_rate         0.000000
14                    body_type         0.000000
15                player_traits         0.561075
16                         pace         0.109956
17                     shooting         0.109956
18                      passing         0.109956
19                  

In [35]:
#Display the columns  with more than 30 percent missing values in a list
columns_with_over_30_percent_missing_percentage_22 = percentage_of_missing_values_per_column_22[percentage_of_missing_values_per_column_22 > 0.3]
columns_with_over_30_percent_missing_percentage_22 = columns_with_over_30_percent_missing_percentage_22.index.tolist()
print(columns_with_over_30_percent_missing_percentage_22)

#Source: https://datascienceparichay.com/article/pandas-percentage-of-missing-values-in-each-column/

['player_traits', 'goalkeeping_speed']


In [36]:
#Dropping features with more than 30 percent missing values
fifa_22_df = fifa_22_df.drop(['player_traits','goalkeeping_speed'], axis=1)
fifa_22_df

Unnamed: 0,overall,potential,value_eur,wage_eur,age,height_cm,weight_kg,club_name,league_level,preferred_foot,...,ldm,cdm,rdm,rwb,lb,lcb,cb,rcb,rb,gk
0,93,93,78000000.0,320000.0,34,170,72,Paris Saint-Germain,1.0,Left,...,64+3,64+3,64+3,66+3,61+3,50+3,50+3,50+3,61+3,19+3
1,92,92,119500000.0,270000.0,32,185,81,FC Bayern München,1.0,Right,...,66+3,66+3,66+3,64+3,61+3,60+3,60+3,60+3,61+3,19+3
2,91,91,45000000.0,270000.0,36,187,83,Manchester United,1.0,Right,...,59+3,59+3,59+3,63+3,60+3,53+3,53+3,53+3,60+3,20+3
3,91,91,129000000.0,270000.0,29,175,68,Paris Saint-Germain,1.0,Right,...,63+3,63+3,63+3,67+3,62+3,50+3,50+3,50+3,62+3,20+3
4,91,91,125500000.0,350000.0,30,181,70,Manchester City,1.0,Right,...,80+3,80+3,80+3,79+3,75+3,69+3,69+3,69+3,75+3,21+3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19234,47,52,70000.0,1000.0,22,180,64,Wuhan FC,1.0,Right,...,46+2,46+2,46+2,48+2,48+2,46+2,46+2,46+2,48+2,15+2
19235,47,59,110000.0,500.0,19,175,70,Derry City,1.0,Right,...,46+2,46+2,46+2,49+2,48+2,44+2,44+2,44+2,48+2,14+2
19236,47,55,100000.0,500.0,21,178,72,Finn Harps,1.0,Right,...,46+2,46+2,46+2,47+2,47+2,45+2,45+2,45+2,47+2,12+2
19237,47,60,110000.0,500.0,19,173,66,Finn Harps,1.0,Right,...,29+2,29+2,29+2,33+2,32+2,26+2,26+2,26+2,32+2,15+2


In [37]:
#This code will give a list of the column names in DataFrame fifa_df_22 that have the 'object' data type.
print("object columns 22")
object_columns_22 = fifa_22_df.select_dtypes(include=['object']).columns
print(object_columns_22)

object columns 22
Index(['club_name', 'preferred_foot', 'work_rate', 'body_type', 'ls', 'st',
       'rs', 'lw', 'lf', 'cf', 'rf', 'rw', 'lam', 'cam', 'ram', 'lm', 'lcm',
       'cm', 'rcm', 'rm', 'lwb', 'ldm', 'cdm', 'rdm', 'rwb', 'lb', 'lcb', 'cb',
       'rcb', 'rb', 'gk'],
      dtype='object')


In [38]:
#Storing the non-numeric features in a dictionary (in case of anything)
object_columns_22 = fifa_22_df.select_dtypes(include=['object']).columns
print(object_columns_22)

#Let the object columns be stored in a new dataframe
fifa_df_object_22 = fifa_22_df[object_columns_22]

Index(['club_name', 'preferred_foot', 'work_rate', 'body_type', 'ls', 'st',
       'rs', 'lw', 'lf', 'cf', 'rf', 'rw', 'lam', 'cam', 'ram', 'lm', 'lcm',
       'cm', 'rcm', 'rm', 'lwb', 'ldm', 'cdm', 'rdm', 'rwb', 'lb', 'lcb', 'cb',
       'rcb', 'rb', 'gk'],
      dtype='object')


In [39]:
#Dropping the "unencoded" non-numeric features from the DataFrame
fifa_22_df = fifa_22_df.drop(columns=object_columns_22)
fifa_22_df

Unnamed: 0,overall,potential,value_eur,wage_eur,age,height_cm,weight_kg,league_level,weak_foot,skill_moves,...,mentality_penalties,mentality_composure,defending_marking_awareness,defending_standing_tackle,defending_sliding_tackle,goalkeeping_diving,goalkeeping_handling,goalkeeping_kicking,goalkeeping_positioning,goalkeeping_reflexes
0,93,93,78000000.0,320000.0,34,170,72,1.0,4,4,...,75,96,20,35,24,6,11,15,14,8
1,92,92,119500000.0,270000.0,32,185,81,1.0,4,4,...,90,88,35,42,19,15,6,12,8,10
2,91,91,45000000.0,270000.0,36,187,83,1.0,4,5,...,88,95,24,32,24,7,11,15,14,11
3,91,91,129000000.0,270000.0,29,175,68,1.0,5,5,...,93,93,35,32,29,9,9,15,15,11
4,91,91,125500000.0,350000.0,30,181,70,1.0,5,4,...,83,89,68,65,53,15,13,5,10,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19234,47,52,70000.0,1000.0,22,180,64,1.0,3,2,...,42,37,38,43,48,6,10,5,15,13
19235,47,59,110000.0,500.0,19,175,70,1.0,3,2,...,44,47,37,44,47,11,12,6,8,10
19236,47,55,100000.0,500.0,21,178,72,1.0,3,2,...,39,36,38,44,48,8,6,7,10,6
19237,47,60,110000.0,500.0,19,173,66,1.0,3,2,...,49,47,10,14,11,7,10,7,14,15


In [40]:
#Encoding- Converting non-numeric features into numeric features
encoded_features_22 = fifa_df_object_22.apply(lambda x: pd.factorize(x)[0])

#Source: https://www.statology.org/pandas-factorize/#:~:text=The%20pandas%20factorize%20%28%29%20function%20can%20be%20used,%3D%20pd.factorize%28df%20%5B%27col%27%5D%29%20Method%202%3A%20Factorize%20Specific%20Columns


In [41]:
#Concatenating the encoded features with the original DataFrame
fifa_df_encoded_22 = pd.concat([fifa_22_df, encoded_features_22], axis=1)

In [42]:
#Imputting the data- replacing missing values
imputer=SimpleImputer(strategy='mean')
imputed = imputer.fit_transform(fifa_df_encoded_22)
fifa_df_imputed_22 = pd.DataFrame(imputed, columns=fifa_df_encoded_22.columns)
fifa_df_imputed_22


Unnamed: 0,overall,potential,value_eur,wage_eur,age,height_cm,weight_kg,league_level,weak_foot,skill_moves,...,ldm,cdm,rdm,rwb,lb,lcb,cb,rcb,rb,gk
0,93.0,93.0,78000000.0,320000.0,34.0,170.0,72.0,1.0,4.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,92.0,92.0,119500000.0,270000.0,32.0,185.0,81.0,1.0,4.0,4.0,...,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0
2,91.0,91.0,45000000.0,270000.0,36.0,187.0,83.0,1.0,4.0,5.0,...,2.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0,1.0,1.0
3,91.0,91.0,129000000.0,270000.0,29.0,175.0,68.0,1.0,5.0,5.0,...,3.0,3.0,3.0,3.0,2.0,0.0,0.0,0.0,2.0,1.0
4,91.0,91.0,125500000.0,350000.0,30.0,181.0,70.0,1.0,5.0,4.0,...,4.0,4.0,4.0,4.0,3.0,3.0,3.0,3.0,3.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19234,47.0,52.0,70000.0,1000.0,22.0,180.0,64.0,1.0,3.0,2.0,...,136.0,136.0,136.0,104.0,87.0,131.0,131.0,131.0,87.0,45.0
19235,47.0,59.0,110000.0,500.0,19.0,175.0,70.0,1.0,3.0,2.0,...,136.0,136.0,136.0,99.0,87.0,121.0,121.0,121.0,87.0,26.0
19236,47.0,55.0,100000.0,500.0,21.0,178.0,72.0,1.0,3.0,2.0,...,136.0,136.0,136.0,115.0,98.0,87.0,87.0,87.0,98.0,44.0
19237,47.0,60.0,110000.0,500.0,19.0,173.0,66.0,1.0,3.0,2.0,...,84.0,84.0,84.0,95.0,79.0,105.0,105.0,105.0,79.0,45.0


In [43]:
#Storing the target feature in y
y_22=fifa_df_imputed_22['overall']

In [44]:
#Drop the target variable from the fifa_df_imputed dataframe
fifa_df_imputed_22=fifa_df_imputed_22.drop(columns=['overall'])

In [45]:
fifa_df_imputed_22

Unnamed: 0,potential,value_eur,wage_eur,age,height_cm,weight_kg,league_level,weak_foot,skill_moves,international_reputation,...,ldm,cdm,rdm,rwb,lb,lcb,cb,rcb,rb,gk
0,93.0,78000000.0,320000.0,34.0,170.0,72.0,1.0,4.0,4.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,92.0,119500000.0,270000.0,32.0,185.0,81.0,1.0,4.0,4.0,5.0,...,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0
2,91.0,45000000.0,270000.0,36.0,187.0,83.0,1.0,4.0,5.0,5.0,...,2.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0,1.0,1.0
3,91.0,129000000.0,270000.0,29.0,175.0,68.0,1.0,5.0,5.0,5.0,...,3.0,3.0,3.0,3.0,2.0,0.0,0.0,0.0,2.0,1.0
4,91.0,125500000.0,350000.0,30.0,181.0,70.0,1.0,5.0,4.0,4.0,...,4.0,4.0,4.0,4.0,3.0,3.0,3.0,3.0,3.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19234,52.0,70000.0,1000.0,22.0,180.0,64.0,1.0,3.0,2.0,1.0,...,136.0,136.0,136.0,104.0,87.0,131.0,131.0,131.0,87.0,45.0
19235,59.0,110000.0,500.0,19.0,175.0,70.0,1.0,3.0,2.0,1.0,...,136.0,136.0,136.0,99.0,87.0,121.0,121.0,121.0,87.0,26.0
19236,55.0,100000.0,500.0,21.0,178.0,72.0,1.0,3.0,2.0,1.0,...,136.0,136.0,136.0,115.0,98.0,87.0,87.0,87.0,98.0,44.0
19237,60.0,110000.0,500.0,19.0,173.0,66.0,1.0,3.0,2.0,1.0,...,84.0,84.0,84.0,95.0,79.0,105.0,105.0,105.0,79.0,45.0


In [46]:
#Dropped the columns that were dropped correlation analysis
fifa_df_imputed_22 = fifa_df_imputed_22.drop([ 'pace','weight_kg', 'lcb','cb','rcb','movement_balance','work_rate','preferred_foot','height_cm','goalkeeping_positioning',
                                                    'body_type','goalkeeping_reflexes','goalkeeping_handling','goalkeeping_diving','goalkeeping_kicking'], axis=1)
fifa_df_imputed_22

Unnamed: 0,potential,value_eur,wage_eur,age,league_level,weak_foot,skill_moves,international_reputation,shooting,passing,...,rcm,rm,lwb,ldm,cdm,rdm,rwb,lb,rb,gk
0,93.0,78000000.0,320000.0,34.0,1.0,4.0,4.0,5.0,92.0,91.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,92.0,119500000.0,270000.0,32.0,1.0,4.0,4.0,5.0,92.0,79.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0
2,91.0,45000000.0,270000.0,36.0,1.0,4.0,5.0,5.0,94.0,80.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0,1.0,1.0
3,91.0,129000000.0,270000.0,29.0,1.0,5.0,5.0,5.0,83.0,86.0,...,3.0,3.0,3.0,3.0,3.0,3.0,3.0,2.0,2.0,1.0
4,91.0,125500000.0,350000.0,30.0,1.0,5.0,4.0,4.0,86.0,93.0,...,4.0,3.0,4.0,4.0,4.0,4.0,4.0,3.0,3.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19234,52.0,70000.0,1000.0,22.0,1.0,3.0,2.0,1.0,35.0,46.0,...,166.0,123.0,104.0,136.0,136.0,136.0,104.0,87.0,87.0,45.0
19235,59.0,110000.0,500.0,19.0,1.0,3.0,2.0,1.0,39.0,50.0,...,156.0,125.0,99.0,136.0,136.0,136.0,99.0,87.0,87.0,26.0
19236,55.0,100000.0,500.0,21.0,1.0,3.0,2.0,1.0,37.0,45.0,...,156.0,135.0,115.0,136.0,136.0,136.0,115.0,98.0,98.0,44.0
19237,60.0,110000.0,500.0,19.0,1.0,3.0,2.0,1.0,46.0,36.0,...,34.0,127.0,95.0,84.0,84.0,84.0,95.0,79.0,79.0,45.0


In [47]:
#Scaling the data
fifa_df_scaled_22=pd.DataFrame(StandardScaler().fit_transform(fifa_df_imputed_22))
fifa_df_scaled_22

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,56,57,58,59,60,61,62,63,64,65
0,3.601780,9.889601,15.998022,1.851089,-0.4746,1.569295,2.146241,10.525295,2.992852,3.548135,...,-2.968024,-2.604697,-2.656820,-2.443051,-2.443051,-2.443051,-2.656820,-2.609270,-2.609270,-1.747436
1,3.437470,15.350958,13.425844,1.429869,-0.4746,1.569295,2.146241,10.525295,2.992852,2.284233,...,-2.939734,-2.578313,-2.624143,-2.417457,-2.417457,-2.417457,-2.624143,-2.609270,-2.609270,-1.747436
2,3.273160,5.546836,13.425844,2.272309,-0.4746,1.569295,3.448937,10.525295,3.143798,2.389558,...,-2.911445,-2.551929,-2.591466,-2.391863,-2.391863,-2.391863,-2.591466,-2.575344,-2.575344,-1.696917
3,3.273160,16.601147,13.425844,0.798039,-0.4746,3.058403,3.448937,10.525295,2.313597,3.021509,...,-2.883155,-2.525545,-2.558790,-2.366269,-2.366269,-2.366269,-2.558790,-2.541418,-2.541418,-1.696917
4,3.273160,16.140551,17.541329,1.008649,-0.4746,3.058403,2.146241,7.830520,2.540015,3.758786,...,-2.854865,-2.525545,-2.526113,-2.340675,-2.340675,-2.340675,-2.526113,-2.507492,-2.507492,-1.646398
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19234,-3.134932,-0.365905,-0.412474,-0.676231,-0.4746,0.080187,-0.459150,-0.253804,-1.309098,-1.191498,...,1.728079,0.640566,0.741555,1.037709,1.037709,1.037709,0.741555,0.342287,0.342287,0.525915
19235,-1.984762,-0.360641,-0.438196,-1.308061,-0.4746,0.080187,-0.459150,-0.253804,-1.007207,-0.770197,...,1.445181,0.693334,0.578171,1.037709,1.037709,1.037709,0.578171,0.342287,0.342287,-0.433944
19236,-2.642002,-0.361957,-0.438196,-0.886841,-0.4746,0.080187,-0.459150,-0.253804,-1.158153,-1.296823,...,1.445181,0.957177,1.100998,1.037709,1.037709,1.037709,1.100998,0.715473,0.715473,0.475396
19237,-1.820452,-0.360641,-0.438196,-1.308061,-0.4746,0.080187,-0.459150,-0.253804,-0.478897,-2.244749,...,-2.006172,0.746103,0.447464,-0.293170,-0.293170,-0.293170,0.447464,0.070880,0.070880,0.525915


In [48]:
#Training the ensembled model
#We are giving the model new data and comparing the actual data to the predicted data
y_pred_22=voting_regressor.predict(fifa_df_scaled_22)
score = voting_regressor.score(fifa_df_scaled_22, y_22)
mse_22 = mean_squared_error(y_22, y_pred_22)
print(score)
print(mse_22)


0.993128217216947
0.32527669760414524


In [52]:
# pickling the model
pickle_out = open('/content/drive/My Drive/Introduction_to_AI_Mid_Semester_Project/voting_regressor.pkl', 'wb')
pickle.dump(voting_regressor, pickle_out)
pickle_out.close()