In [1]:
import pandas
import numpy
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, GradientBoostingRegressor, RandomForestRegressor, AdaBoostRegressor, HistGradientBoostingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import mean_squared_error, make_scorer, mean_absolute_error, r2_score
import pickle as pkl
import joblib

# Regression Problem: FIFA

In [2]:
training_data = pandas.read_csv("players_22.csv")
testing_data  = pandas.read_csv("male_players (legacy).csv")

training_data = training_data[['player_url','short_name','long_name','player_positions','potential','value_eur','wage_eur','age','dob','height_cm','weight_kg','club_team_id','club_name','league_name','league_level','club_position','club_jersey_number','club_loaned_from','nationality_id','nationality_name','nation_team_id','nation_position','nation_jersey_number','preferred_foot','weak_foot','skill_moves','international_reputation','work_rate','body_type','real_face','release_clause_eur','player_tags','player_traits','pace','shooting','passing','dribbling','defending','physic','attacking_crossing','attacking_finishing','attacking_heading_accuracy','attacking_short_passing','attacking_volleys','skill_dribbling','skill_curve','skill_fk_accuracy','skill_long_passing','skill_ball_control','movement_acceleration','movement_sprint_speed','movement_agility','movement_reactions','movement_balance','power_shot_power','power_jumping','power_stamina','power_strength','power_long_shots','mentality_aggression','mentality_interceptions','mentality_positioning','mentality_vision','mentality_penalties','mentality_composure','defending_marking_awareness','defending_standing_tackle','defending_sliding_tackle','goalkeeping_diving','goalkeeping_handling','goalkeeping_kicking','goalkeeping_positioning','goalkeeping_reflexes','goalkeeping_speed','ls','st','rs','lw','lf','cf','rf','rw','lam','cam','ram','lm','lcm','cm','rcm','rm','lwb','ldm','cdm','rdm','rwb','lb','lcb','cb','rcb','rb','gk','player_face_url','overall']]
testing_data  = testing_data[ ['player_url','short_name','long_name','player_positions','potential','value_eur','wage_eur','age','dob','height_cm','weight_kg','club_team_id','club_name','league_name','league_level','club_position','club_jersey_number','club_loaned_from','nationality_id','nationality_name','nation_team_id','nation_position','nation_jersey_number','preferred_foot','weak_foot','skill_moves','international_reputation','work_rate','body_type','real_face','release_clause_eur','player_tags','player_traits','pace','shooting','passing','dribbling','defending','physic','attacking_crossing','attacking_finishing','attacking_heading_accuracy','attacking_short_passing','attacking_volleys','skill_dribbling','skill_curve','skill_fk_accuracy','skill_long_passing','skill_ball_control','movement_acceleration','movement_sprint_speed','movement_agility','movement_reactions','movement_balance','power_shot_power','power_jumping','power_stamina','power_strength','power_long_shots','mentality_aggression','mentality_interceptions','mentality_positioning','mentality_vision','mentality_penalties','mentality_composure','defending_marking_awareness','defending_standing_tackle','defending_sliding_tackle','goalkeeping_diving','goalkeeping_handling','goalkeeping_kicking','goalkeeping_positioning','goalkeeping_reflexes','goalkeeping_speed','ls','st','rs','lw','lf','cf','rf','rw','lam','cam','ram','lm','lcm','cm','rcm','rm','lwb','ldm','cdm','rdm','rwb','lb','lcb','cb','rcb','rb','gk','player_face_url','overall']]

  training_data = pandas.read_csv("players_22.csv")
  testing_data  = pandas.read_csv("male_players (legacy).csv")


## Data Preprocessing

### cleaning the training data

manually dropping columns with irrelevant data because they, intuitively, bare no correlation with a player's rating.

In [3]:
training_data.drop(columns=['player_url','short_name','long_name','dob','club_team_id','club_name','league_name','league_level','club_position','club_jersey_number','club_loaned_from','nationality_id','nationality_name','nation_team_id','nation_position','nation_jersey_number','real_face','release_clause_eur','player_tags','player_traits','player_face_url'], inplace=True)

drop columns that have over 30% null values

In [4]:
percentage = (training_data.isnull().sum()/len(training_data))
drop = percentage[percentage > 0.3].index
training_data.drop(columns=drop, inplace=True)

In [5]:
filtered = training_data.filter(items=['ls', 'st', 'rs', 'lw', 'lf', 'cf', 'rf', 'rw', 'lam', 'cam', 'ram',
                                    'lm', 'lcm', 'cm', 'rcm', 'rm', 'lwb', 'ldm', 'cdm', 'rdm', 'rwb',
                                    'lb', 'lcb', 'cb', 'rcb', 'rb', 'gk'])

def convert_to_number(entry):
    entry = str(entry)  # convert entry to string
    if '+' in entry:
        parts = entry.split('+')
        return int(parts[0]) + int(parts[1])
    elif '-' in entry:
        parts = entry.split('-')
        return int(parts[0]) - int(parts[1])
    else:
        return int(entry)

filtered = filtered.applymap(convert_to_number)

  filtered = filtered.applymap(convert_to_number)


In [6]:
training_data.update(filtered)
int_columns = ['ls', 'st', 'rs', 'lw', 'lf', 'cf', 'rf', 'rw', 'lam', 'cam', 'ram',
                'lm', 'lcm', 'cm', 'rcm', 'rm', 'lwb', 'ldm', 'cdm', 'rdm', 'rwb',
                'lb', 'lcb', 'cb', 'rcb', 'rb', 'gk']

training_data[int_columns] = training_data[int_columns].astype(int)

In [7]:
numeric      = training_data.select_dtypes(include=numpy.number)
non_numeric  = training_data.select_dtypes(include=['object'])

imputing the missing numerical values with the **mean**

In [8]:
numeric_with_NaN = numeric.columns[numeric.isnull().any().tolist()]

for column in numeric_with_NaN:
   numeric[column].fillna(numeric[column].mean(), inplace=True)

replacing NaN `object` values with the **mode**

In [9]:
non_numeric_with_NaN = non_numeric.columns[non_numeric.isnull().any().tolist()]

for column in non_numeric_with_NaN:
  non_numeric[column].fillna(non_numeric[column].mode()[0], inplace=True)

**encoding** for non-numeric data

In [10]:
label_encoder = LabelEncoder()

for column in non_numeric:
  non_numeric[column] = label_encoder.fit_transform(non_numeric[column])

concatenating to form the final dataset

In [11]:
training_data = pandas.concat([non_numeric, numeric], axis=1)
training_data.head()

Unnamed: 0,player_positions,preferred_foot,work_rate,body_type,potential,value_eur,wage_eur,age,height_cm,weight_kg,...,cdm,rdm,rwb,lb,lcb,cb,rcb,rb,gk,overall
0,604,0,7,9,93,78000000.0,320000.0,34,170,72,...,67,67,69,64,53,53,53,64,22,93
1,635,1,2,9,92,119500000.0,270000.0,32,185,81,...,69,69,67,64,63,63,63,64,22,92
2,658,1,1,9,91,45000000.0,270000.0,36,187,83,...,62,62,66,63,56,56,56,63,23,91
3,372,1,2,9,91,129000000.0,270000.0,29,175,68,...,66,66,70,65,53,53,53,65,23,91
4,168,1,0,9,91,125500000.0,350000.0,30,181,70,...,83,83,82,78,72,72,72,78,24,91


### function for cleaning the data

In [12]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

def clean_player_data(data):
    """
    The `clean_player_data` function takes a DataFrame as a parameter and returns a DataFrame with the clean data (no missing values and non-numeric data is encoded). It only works if the column data is similar, so the following columns need to be present: ['player_url','short_name','long_name','player_positions','potential','value_eur','wage_eur','age','dob','height_cm','weight_kg','club_team_id','club_name','league_name','league_level','club_position','club_jersey_number','club_loaned_from','nationality_id','nationality_name','nation_team_id','nation_position','nation_jersey_number','preferred_foot','weak_foot','skill_moves','international_reputation','work_rate','body_type','real_face','release_clause_eur','player_tags','player_traits','pace','shooting','passing','dribbling','defending','physic','attacking_crossing','attacking_finishing','attacking_heading_accuracy','attacking_short_passing','attacking_volleys','skill_dribbling','skill_curve','skill_fk_accuracy','skill_long_passing','skill_ball_control','movement_acceleration','movement_sprint_speed','movement_agility','movement_reactions','movement_balance','power_shot_power','power_jumping','power_stamina','power_strength','power_long_shots','mentality_aggression','mentality_interceptions','mentality_positioning','mentality_vision','mentality_penalties','mentality_composure','defending_marking_awareness','defending_standing_tackle','defending_sliding_tackle','goalkeeping_diving','goalkeeping_handling','goalkeeping_kicking','goalkeeping_positioning','goalkeeping_reflexes','goalkeeping_speed','ls','st','rs','lw','lf','cf','rf','rw','lam','cam','ram','lm','lcm','cm','rcm','rm','lwb','ldm','cdm','rdm','rwb','lb','lcb','cb','rcb','rb','gk','player_face_url','overall']
    data: the DataFrame with similar column headings to the `players_22.csv` dataset
    """
    data.drop(columns=['player_url', 'short_name', 'long_name', 'dob', 'club_team_id', 'club_name', 'league_name', 
                       'league_level', 'club_position', 'club_jersey_number', 'club_loaned_from', 'nationality_id', 
                       'nationality_name', 'nation_team_id', 'nation_position', 'nation_jersey_number', 'real_face', 
                       'release_clause_eur', 'player_tags', 'player_traits', 'player_face_url'], inplace=True)

    # drop columns with more than 30% missing values
    percentage = (data.isnull().sum() / len(data))
    drop = percentage[percentage > 0.3].index
    data.drop(columns=drop, inplace=True)

    int_columns = ['ls', 'st', 'rs', 'lw', 'lf', 'cf', 'rf', 'rw', 'lam', 'cam', 'ram',
                   'lm', 'lcm', 'cm', 'rcm', 'rm', 'lwb', 'ldm', 'cdm', 'rdm', 'rwb',
                   'lb', 'lcb', 'cb', 'rcb', 'rb', 'gk']
    filtered_columns = data.filter(items=int_columns)

    def convert_to_int(entry):
        entry = str(entry)  # Convert entry to string
        if '+' in entry:
            parts = entry.split('+')
            return int(parts[0]) + int(parts[1])
        elif '-' in entry:
            parts = entry.split('-')
            return int(parts[0]) - int(parts[1])
        else:
            return int(entry)

    filtered_columns = filtered_columns.applymap(convert_to_int)

    # update data with converted columns
    data.update(filtered_columns)
    data[int_columns] = data[int_columns].astype(int)

    numeric_data = data.select_dtypes(include=np.number)
    non_numeric_data = data.select_dtypes(include=['object'])

    # fill missing values in numeric columns with mean
    numeric_NaN = numeric_data.columns[numeric_data.isnull().any()].tolist()
    for column in numeric_NaN:
        numeric_data[column].fillna(numeric_data[column].mean(), inplace=True)

    # fill missing values in non-numeric columns with mode
    non_numeric_NaN = non_numeric_data.columns[non_numeric_data.isnull().any()].tolist()
    for column in non_numeric_NaN:
        non_numeric_data[column].fillna(non_numeric_data[column].mode()[0], inplace=True)

    label_encoder = LabelEncoder()
    for column in non_numeric_data:
        non_numeric_data[column] = label_encoder.fit_transform(non_numeric_data[column])

    return pd.concat([non_numeric_data, numeric_data], axis=1)

cleaning the testing data using the `clean_player_data` function

In [13]:
testing_data = clean_player_data(testing_data)

  filtered_columns = filtered_columns.applymap(convert_to_int)


# Feature Engineering
Using the RandomForest classifier to decide which features are the most important, as opposed to the correlation matrix, which may be too simple

In [14]:
Xtrain, Ytrain = training_data.drop('overall',axis=1), training_data['overall']
Xtest,  Ytest  = testing_data.drop('overall',axis=1),  testing_data['overall']

In [15]:
rforest = RandomForestRegressor(n_estimators=110, random_state=45, max_depth=15, criterion='absolute_error')
rforest.fit(Xtrain, Ytrain)

feature_importances = rforest.feature_importances_

In [16]:
feature_importance_df = pd.DataFrame({'Feature': Xtrain.columns, 'Importance': feature_importances}).sort_values(by='Importance', ascending=False)

feature_importance_df.iloc[0:10]

Unnamed: 0,Feature,Importance
5,value_eur,0.593809
7,age,0.14009
4,potential,0.116538
32,movement_reactions,0.094952
79,gk,0.006534
6,wage_eur,0.003026
78,rb,0.001988
74,lb,0.001945
75,lcb,0.001551
76,cb,0.001525


comparing with correlation matrix

In [17]:
corr = pd.DataFrame(training_data.corr())
corr['overall'].sort_values(ascending = False)

overall                 1.000000
movement_reactions      0.871823
mentality_composure     0.708867
passing                 0.663519
potential               0.644275
                          ...   
goalkeeping_diving     -0.010990
goalkeeping_handling   -0.011080
goalkeeping_kicking    -0.012986
preferred_foot         -0.048961
work_rate              -0.227014
Name: overall, Length: 81, dtype: float64

In [18]:
selected = ['movement_reactions', 'mentality_composure', 'power_shot_power', 'cm', 'mentality_vision',
                     'value_eur', 'age', 'potential', 'gk', 'wage_eur', 'overall']

Xtrain = training_data[selected]
Xtest  = testing_data[selected]

# Scaling

In [19]:
scaler = StandardScaler()
Xtrain_scaled = scaler.fit_transform(Xtrain)
Xtest_scaled  = scaler.fit_transform(Xtest)

# Training the ensemble models
Picking three ensemble models, then tuning the hyper parameters to get the best predictions

In [20]:
kf = KFold(n_splits=5, shuffle=True, random_state=42)

## 1. Gradient Boosting Regressor

In [21]:
gbr = GradientBoostingRegressor()

gbr.fit(Xtrain, Ytrain)
gbr_initial = gbr.predict(Xtest)

print(f"""MAE:  {mean_absolute_error(gbr_initial, Ytest)}
MSE:  {mean_squared_error(gbr_initial, Ytest)}
RMSE: {numpy.sqrt(mean_squared_error(gbr_initial, Ytest))}
R2 =  {r2_score(gbr_initial, Ytest)}""")

MAE:  0.002037215700197425
MSE:  0.004801986163161281
RMSE: 0.06929636471822516
R2 =  0.9999029823402199


### Hyperparameter Tuning

In [22]:
gbr_parameters = {
    'n_estimators': [100, 200],
    'learning_rate': [0.1, 0.05]
}

In [23]:
grid_search_gb = GridSearchCV(estimator=gbr, param_grid=gbr_parameters, cv=kf, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search_gb.fit(Xtrain, Ytrain)

gbr_best_parameters = grid_search_gb.best_params_

# training with best parameters
best_gbr = GradientBoostingRegressor(**gbr_best_parameters, random_state=42)
best_gbr.fit(Xtrain, Ytrain)

# prediction and evaluation
y_pred_gbr = best_gbr.predict(Xtest)
print(f"""MAE:  {mean_absolute_error(y_pred_gbr, Ytest)}
MSE:  {mean_squared_error(y_pred_gbr, Ytest)}
RMSE: {numpy.sqrt(mean_squared_error(y_pred_gbr, Ytest))}
R2 =  {r2_score(y_pred_gbr, Ytest)}""")

gbr_evaluator = numpy.sqrt(mean_squared_error(y_pred_gbr, Ytest))

MAE:  0.00176237136889556
MSE:  0.004825140312538382
RMSE: 0.06946322993165796
R2 =  0.9999025254427534


## 2. Adaptive Boosting Regressor

In [24]:
ada = AdaBoostRegressor()

ada.fit(Xtrain, Ytrain)
ada_initial = ada.predict(Xtest)

print(f"""MAE:  {mean_absolute_error(ada_initial, Ytest)}
MSE:  {mean_squared_error(ada_initial, Ytest)}
RMSE: {numpy.sqrt(mean_squared_error(ada_initial, Ytest))}
R2 =  {r2_score(ada_initial, Ytest)}""")

MAE:  0.5828337239652456
MSE:  0.6133103858804542
RMSE: 0.7831413575341646
R2 =  0.9872677719086972


### Hyperparameter Tuning

In [25]:
ada_parameters = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [1.0, 0.5, 0.1]
}

In [26]:
grid_search_ada = GridSearchCV(estimator=ada, param_grid=ada_parameters, cv=kf, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search_ada.fit(Xtrain, Ytrain)

ada_best_parameters = grid_search_ada.best_params_

# training with best parameters
best_ada = GradientBoostingRegressor(**ada_best_parameters, random_state=42)
best_ada.fit(Xtrain, Ytrain)

# prediction and evaluation
y_pred_ada = best_ada.predict(Xtest)
print(f"""MAE:  {mean_absolute_error(y_pred_ada, Ytest)}
MSE:  {mean_squared_error(y_pred_ada, Ytest)}
RMSE: {numpy.sqrt(mean_squared_error(y_pred_ada, Ytest))}
R2 =  {r2_score(y_pred_ada, Ytest)}""")

ada_evaluator = numpy.sqrt(mean_squared_error(y_pred_ada, Ytest))

MAE:  0.008717051641867304
MSE:  0.004970000013279849
RMSE: 0.07049822702224397
R2 =  0.9998996027444854


## 3. Histogram Gradient Boosting Regressor

In [27]:
hgb = HistGradientBoostingRegressor()

hgb.fit(Xtrain, Ytrain)
hgb_initial = hgb.predict(Xtest)

print(f"""MAE:  {mean_absolute_error(hgb_initial, Ytest)}
MSE:  {mean_squared_error(hgb_initial, Ytest)}
RMSE: {numpy.sqrt(mean_squared_error(hgb_initial, Ytest))}
R2 =  {r2_score(hgb_initial, Ytest)}""")

MAE:  0.006967419409874487
MSE:  0.011065834020021549
RMSE: 0.10519426799983708
R2 =  0.9997763353086572


### Hyperparameter Tuning

In [28]:
hgb_parameters = {
    'learning_rate': [1.0, 0.5, 0.1]
}

In [29]:
grid_search_hgb = GridSearchCV(estimator=hgb, param_grid=hgb_parameters, cv=kf, scoring='neg_mean_squared_error', n_jobs=-1)
grid_search_hgb.fit(Xtrain, Ytrain)

hgb_best_parameters = grid_search_hgb.best_params_

# training with best parameters
best_hgb = GradientBoostingRegressor(**hgb_best_parameters, random_state=42)
best_hgb.fit(Xtrain, Ytrain)

# prediction and evaluation
y_pred_hgb = best_hgb.predict(Xtest)
print(f"""MAE:  {mean_absolute_error(y_pred_hgb, Ytest)}
MSE:  {mean_squared_error(y_pred_hgb, Ytest)}
RMSE: {numpy.sqrt(mean_squared_error(y_pred_hgb, Ytest))}
R2 =  {r2_score(y_pred_hgb, Ytest)}""")

hgb_evaluator = numpy.sqrt(mean_squared_error(y_pred_hgb, Ytest))

MAE:  0.002089769783337002
MSE:  0.004829647227291388
RMSE: 0.06949566337039591
R2 =  0.999902435993156


## Picking the best model

In [30]:
evaluators = [gbr_evaluator, ada_evaluator, hgb_evaluator]
minimum_rmse = min(evaluators)

if minimum_rmse == gbr_evaluator:
    best_model  = best_gbr
    best_params = gbr_best_parameters
elif minimum_rmse == ada_evaluator:
    best_model  = best_ada
    best_params = ada_best_parameters
else:
    best_model  = best_hgb
    best_params = hgb_best_parameters

y_pred_test = best_model.predict(Xtest)
mse_test = mean_squared_error(Ytest, y_pred_test)

print(f"""Best Model:           {best_model}
Best Hyperparameters: {best_params}""")

Best Model:           GradientBoostingRegressor(n_estimators=200, random_state=42)
Best Hyperparameters: {'learning_rate': 0.1, 'n_estimators': 200}


# Saving the model (`.pkl` file)

In [31]:
pkl.dump(best_model, open(best_model.__class__.__name__ + '.pkl', 'wb'))
joblib.dump(scaler, 'scaler.pkl')

['scaler.pkl']