[Link to Video Recording of the Model Deployment](https://drive.google.com/file/d/1clck6MK1oqZgrNB8LGTvPI-iKgFvLFgJ/view?usp=sharing)

[Link to Streamlit Application Codes](https://drive.google.com/drive/folders/1y6v0EnfyblqaVtA1EGL9Bm3U4WDAlfxV?usp=sharing)

In [None]:
# Attach Google Drive to the Colab runtime; the CSV and model paths used in
# this notebook live under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import  mean_absolute_error, mean_squared_error, r2_score, accuracy_score
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
import seaborn as sb
import xgboost as xgb


import matplotlib.pyplot as plt

# Read the Season 22 player table in a single pass (low_memory=False) and key
# every row by the player's sofifa_id.
PLAYERS_22_CSV = '/content/drive/MyDrive/Fifa/players_22.csv'
df = pd.read_csv(PLAYERS_22_CSV, low_memory=False).set_index('sofifa_id')



**Data Preparation**


*   Drop all empty rows and columns
*   Drop columns that may not be necessary
*   Replace all missing values in the DataFrame with np.nan
*   Combine Columns with Related Information
*   Encode categorical features (label encoding of `work_rate`)
*   Scale the Data









In [None]:
# df.info()

# Discard rows, then columns, that hold no data at all.
df = df.dropna(axis=0, how='all')
df = df.dropna(axis=1, how='all')

# Columns treated as irrelevant for modelling (names, URLs, club/nation
# metadata, free-text tags and similar descriptive fields).
irrelevant_columns = [
    'long_name', 'short_name', 'player_url', 'player_face_url',
    'club_logo_url', 'club_flag_url', 'nation_logo_url', 'nation_flag_url',
    'club_loaned_from', 'dob', 'club_team_id', 'club_contract_valid_until',
    'nationality_id', 'nationality_name', 'nation_team_id', 'nation_position',
    'nation_jersey_number', 'real_face', 'release_clause_eur',
    'club_jersey_number', 'club_joined', 'player_tags', 'club_position',
    'player_traits', 'preferred_foot', 'body_type', 'club_name', 'league_name',
]
df = df.drop(columns=irrelevant_columns)

# Represent every remaining missing value as np.nan.
df = df.fillna(np.nan)




In [None]:
# Collapse each family of related attribute columns into one per-player
# average (rounded to one decimal place) and drop the source columns.
# Keys are the new column names; values are the columns averaged into them.
attribute_groups = {
    'Attacking': ['attacking_crossing', 'attacking_finishing', 'attacking_heading_accuracy',
                  'attacking_short_passing', 'attacking_volleys'],
    'Skill': ['skill_dribbling', 'skill_curve', 'skill_fk_accuracy',
              'skill_long_passing', 'skill_ball_control'],
    'Movement': ['movement_acceleration', 'movement_sprint_speed', 'movement_agility',
                 'movement_reactions', 'movement_balance'],
    'Power': ['power_shot_power', 'power_jumping', 'power_stamina',
              'power_strength', 'power_long_shots'],
    'Mentality': ['mentality_aggression', 'mentality_interceptions', 'mentality_positioning',
                  'mentality_vision', 'mentality_penalties', 'mentality_composure'],
    # The aggregate 'defending' column is averaged together with the three
    # defending_* attributes.
    'Defending': ['defending_marking_awareness', 'defending_standing_tackle',
                  'defending_sliding_tackle', 'defending'],
    'GoalKeeping': ['goalkeeping_diving', 'goalkeeping_handling', 'goalkeeping_kicking',
                    'goalkeeping_positioning', 'goalkeeping_reflexes', 'goalkeeping_speed'],
}

for group_name, member_columns in attribute_groups.items():
    # Row-wise mean of the group's members, then remove the members.
    df[group_name] = df[member_columns].mean(axis=1).round(1)
    df = df.drop(columns=member_columns)

# 'overall' is the modelling target; expose it as 'overall_rating'.
df = df.rename(columns={'overall': 'overall_rating'})
df = df.drop(columns=['player_positions'])

# Mark every row as belonging to the current season (Season 22).
df['Season'] = 22


In [None]:
# Fill remaining gaps: propagate the previous row's value forward, then
# back-fill whatever is still missing at the top of the frame.
df = df.ffill().bfill()

# pd.set_option("display.max_columns", None)
# df.head(10)

In [None]:
# Work on an independent deep copy so the cleaned `df` is left as it is.
sportsdf = df.copy(deep=True)

# Integer-encode the 'work_rate' labels into a new column, then remove the
# original text column.
le = LabelEncoder()
sportsdf['work_rate_encoded'] = le.fit_transform(sportsdf['work_rate'])
sportsdf = sportsdf.drop(columns=['work_rate'])

# Position-rating columns expected to need string-to-integer conversion.
object_cols = [
    'ls', 'st', 'rs', 'lw', 'lf', 'cf', 'rf', 'rw',
    'lam', 'cam', 'ram', 'lm', 'lcm', 'cm', 'rcm', 'rm',
    'lwb', 'ldm', 'cdm', 'rdm', 'rwb',
    'lb', 'lcb', 'cb', 'rcb', 'rb', 'gk',
]
# Convert one position-rating cell (e.g. '89+3', '64-2' or '70') to an integer.
def convert_cols(string):
    """Return the integer value of a position-rating cell.

    Parameters
    ----------
    string : str or number
        Either a plain integer literal ('70'), a base rating with a modifier
        ('89+3' -> 92, '64-2' -> 62), or a value that is already numeric.

    Returns
    -------
    int
        The base rating with the modifier applied.

    Raises
    ------
    ValueError
        If a part of the text is not a valid integer literal.
    """
    # Cells that are already numeric need no parsing; the membership tests
    # below would raise TypeError on them.
    if not isinstance(string, str):
        return int(string)

    # '-' is checked before '+', and only the first two parts are used.
    for sign, direction in (('-', -1), ('+', 1)):
        if sign in string:
            parts = string.split(sign)
            return int(parts[0]) + direction * int(parts[1])

    return int(string)

# Every column still stored as text is converted to integers, one column at
# a time, using convert_cols.
text_columns = sportsdf.select_dtypes(include='object').columns
for column in text_columns:
    sportsdf[column] = sportsdf[column].apply(convert_cols)

# Fold the 'gk' rating into 'GoalKeeping' by adding the two, and remove 'gk'.
gk_rating = sportsdf.pop('gk')
sportsdf['GoalKeeping'] = sportsdf['GoalKeeping'] + gk_rating

sportsdf.head(10)

Unnamed: 0_level_0,overall_rating,potential,value_eur,wage_eur,age,height_cm,weight_kg,league_level,weak_foot,skill_moves,...,rb,Attacking,Skill,Movement,Power,Mentality,Defending,GoalKeeping,Season,work_rate_encoded
sofifa_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
158023,93,93,78000000.0,320000.0,34,170,72,1.0,4,4,...,64,85.8,94.0,90.2,77.8,73.8,28.2,32.8,22,7
188545,92,92,119500000.0,270000.0,32,185,81,1.0,4,4,...,64,86.0,81.4,81.6,84.8,80.7,35.0,32.2,22,2
20801,91,91,45000000.0,270000.0,36,187,83,1.0,4,5,...,63,87.6,83.6,85.4,87.2,74.3,28.5,34.6,22,1
190871,91,91,129000000.0,270000.0,29,175,68,1.0,5,5,...,65,80.6,89.2,90.2,71.8,77.0,33.2,34.8,22,2
192985,91,91,125500000.0,350000.0,30,181,70,1.0,5,4,...,78,81.4,88.0,80.0,81.6,82.7,62.5,35.2,22,0
200389,91,93,112000000.0,130000.0,28,188,87,1.0,3,1,...,35,19.0,21.8,61.4,53.6,34.7,19.0,173.2,22,8
231747,91,95,194000000.0,230000.0,22,182,73,1.0,4,5,...,66,82.2,80.8,92.4,82.2,73.5,32.0,29.4,22,1
167495,90,90,13500000.0,86000.0,35,193,93,1.0,4,1,...,38,24.8,33.8,57.4,56.8,43.0,12.7,173.3,22,8
192448,90,92,99000000.0,250000.0,29,187,85,1.0,4,1,...,34,23.6,28.8,51.2,53.6,40.2,16.0,171.3,22,8
202126,90,90,129500000.0,240000.0,27,188,89,1.0,5,3,...,67,86.6,80.4,74.4,84.8,81.2,42.8,33.8,22,0


**Feature Subsets**

In [None]:
# Description of features in the dataframe
# sportsdf.describe()


In [None]:
# Features of interest for the correlation study.
# Dependent variable -> overall_rating (first entry).
subset_cols = [
    'overall_rating', 'potential', 'value_eur', 'wage_eur',
    'age', 'height_cm', 'weight_kg', 'league_level',
    'weak_foot', 'skill_moves', 'international_reputation',
    'pace', 'shooting', 'passing', 'dribbling', 'physic',
    'ls', 'st', 'rs', 'lw', 'lf', 'cf', 'rf', 'rw',
    'lam', 'cam', 'ram', 'lm', 'lcm', 'cm', 'rcm', 'rm',
    'lwb', 'ldm', 'cdm', 'rdm', 'rwb',
    'lb', 'lcb', 'cb', 'rcb', 'rb',
    'Attacking', 'Skill', 'Movement', 'Power', 'Mentality',
    'Defending', 'GoalKeeping', 'work_rate_encoded',
]
subset_df = sportsdf.loc[:, subset_cols]

# Pairwise correlation table, rounded to two decimals; displayed as the
# cell's output.
corr_mat = subset_df.corr()
corr_mat = corr_mat.round(2)
corr_mat


Unnamed: 0,overall_rating,potential,value_eur,wage_eur,age,height_cm,weight_kg,league_level,weak_foot,skill_moves,...,rcb,rb,Attacking,Skill,Movement,Power,Mentality,Defending,GoalKeeping,work_rate_encoded
overall_rating,1.0,0.64,0.55,0.6,0.46,0.04,0.15,-0.18,0.22,0.38,...,0.4,0.43,0.46,0.46,0.35,0.58,0.55,0.28,0.02,-0.23
potential,0.64,1.0,0.53,0.5,-0.26,0.0,-0.02,-0.15,0.16,0.28,...,0.23,0.27,0.29,0.32,0.3,0.28,0.32,0.16,-0.03,-0.15
value_eur,0.55,0.53,1.0,0.82,0.04,0.01,0.04,-0.12,0.15,0.26,...,0.2,0.23,0.26,0.27,0.24,0.29,0.3,0.13,-0.0,-0.14
wage_eur,0.6,0.5,0.82,1.0,0.16,0.03,0.07,-0.13,0.16,0.27,...,0.23,0.26,0.29,0.3,0.23,0.32,0.34,0.15,-0.0,-0.13
age,0.46,-0.26,0.04,0.16,1.0,0.08,0.24,-0.05,0.08,0.07,...,0.18,0.14,0.15,0.14,-0.02,0.31,0.26,0.13,0.13,-0.08
height_cm,0.04,0.0,0.01,0.03,0.08,1.0,0.77,0.07,-0.16,-0.41,...,-0.08,-0.26,-0.36,-0.45,-0.62,-0.14,-0.31,-0.09,0.37,0.21
weight_kg,0.15,-0.02,0.04,0.07,0.24,0.77,1.0,0.02,-0.12,-0.34,...,-0.04,-0.21,-0.28,-0.36,-0.52,-0.02,-0.21,-0.08,0.35,0.15
league_level,-0.18,-0.15,-0.12,-0.13,-0.05,0.07,0.02,1.0,-0.02,-0.06,...,-0.05,-0.05,-0.07,-0.07,-0.02,-0.04,-0.08,-0.03,-0.02,0.02
weak_foot,0.22,0.16,0.15,0.16,0.08,-0.16,-0.12,-0.02,1.0,0.34,...,0.12,0.19,0.36,0.37,0.31,0.32,0.32,0.05,-0.21,-0.16
skill_moves,0.38,0.28,0.26,0.27,0.07,-0.41,-0.34,-0.06,0.34,1.0,...,0.34,0.51,0.78,0.8,0.71,0.62,0.7,0.22,-0.6,-0.35



**Features highly correlated with *overall_rating*:**
*   Passing - 0.72
*   Dribbling - 0.67
*   Potential - 0.64
*   wage_eur - 0.6
*   power - 0.58
*   physic - 0.53
*   value_eur - 0.55
*   mentality - 0.55
*   lcm  - 0.55
*   cm   - 0.55
*   rcm  - 0.55

















In [None]:
# Feature selection.
# Keep the features chosen from the correlation table as predictors of the
# dependent variable, overall_rating.
selected_features = [
    'potential', 'value_eur', 'wage_eur',
    'passing', 'physic', 'dribbling',
    'Mentality', 'lcm', 'cm', 'rcm',
]
# The target comes first, followed by the predictors.
subset_cols = ['overall_rating'] + selected_features
modelData_df = sportsdf[subset_cols]


**Model Training**

**XGBoost Regressor with no Tuning**

In [None]:
# Baseline model: XGBoost regressor with default hyperparameters.
from sklearn.model_selection import cross_val_score

# Work on a copy of the selected modelling data.
modelData_df5 = modelData_df.copy(deep=True)

# Separate the predictors from the target.
features = modelData_df5.drop('overall_rating', axis=1)
y = modelData_df5['overall_rating']

# Split BEFORE scaling so the held-out rows never contribute to the scaler's
# mean/variance (fitting the scaler on all rows leaks test information).
features_train, features_test, y_train, y_test = train_test_split(
    features, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only, then apply it to both splits.
sc = StandardScaler()
X_train = pd.DataFrame(sc.fit_transform(features_train), columns=features.columns)
X_test = pd.DataFrame(sc.transform(features_test), columns=features.columns)

# Scaled copy of every row, kept under the name `X` as before.
X = pd.DataFrame(sc.transform(features), columns=features.columns)

# XGBoost regression model
model = xgb.XGBRegressor()

# Train
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# RMSE is taken as sqrt(MSE): the `squared=False` argument of
# mean_squared_error is deprecated and absent from recent scikit-learn.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)

print('Root Mean Square Error:', rmse)
print('Mean Absolute Error:', mae)
print('Mean Square Error:', mse)
print()

Root Mean Square Error: 1.2071388739572284
Mean Absolute Error: 0.8305576179726456
Mean Square Error: 1.4571842610187251



**XGBoostRegressor Optimized**

In [None]:
from sklearn.model_selection import GridSearchCV


# Define the XGBoost regression model
model = xgb.XGBRegressor()

# Hyperparameter grid: 3 * 3 * 3 * 4 * 4 = 432 candidate combinations.
params = {
    'learning_rate': [0.05, 0.1, 0.15],
    'max_depth': [3, 4, 5],
    'n_estimators': [50, 100, 150],
    'reg_alpha': [0, 0.1, 0.5, 1],
    'reg_lambda': [0, 0.1, 0.5, 1],
}

# Use GridSearchCV (5-fold, scored by negative MSE) to find the best hyperparameters
grid = GridSearchCV(model, param_grid=params, scoring='neg_mean_squared_error', cv=5)
grid.fit(X_train, y_train)

# Print the best hyperparameters found
print(grid.best_params_)

# GridSearchCV refits the winning configuration on all of X_train by default
# (refit=True), so the fitted estimator is reused instead of training again.
best_model = grid.best_estimator_

# Predictions on the training set
y_train_pred = best_model.predict(X_train)

# Make predictions on the test set
y_pred = best_model.predict(X_test)

# Model's performance.
# RMSE is taken as sqrt(MSE): the `squared=False` argument of
# mean_squared_error is deprecated and absent from recent scikit-learn.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)


print('Root Mean Square Error:', rmse)
print('Mean Absolute Error:', mae)
print('Mean Square Error:', mse)
print()


{'learning_rate': 0.15, 'max_depth': 5, 'n_estimators': 150, 'reg_alpha': 1, 'reg_lambda': 0}
Root Mean Square Error: 1.2100500363924542
Meam Absolute Error: 0.8290837172906761
Mean Square Error: 1.46422109057338



**GradientBoostingRegressor**

In [None]:

# GradientBoosting model with default hyperparameters; seeded so the metrics
# printed below are reproducible across runs.
model = GradientBoostingRegressor(random_state=42)

# Train
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# RMSE is taken as sqrt(MSE): the `squared=False` argument of
# mean_squared_error is deprecated and absent from recent scikit-learn.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
# For a regressor, `score` returns the R^2 coefficient, not a classification
# accuracy; the variable name is kept as it was.
accuracy = model.score(X_test, y_test)

print('Root Mean Square Error:', rmse)
print('Mean Absolute Error:', mae)
print('Mean Square Error:', mse)
print()


Root Mean Square Error: 1.5684911165441457
Mean Absolute Error: 1.1630450120298061
Mean Square Error: 2.4601643826779007



**Gradient Boosting Regressor Optimized**

In [None]:

# Parameter grid: 3 values for each of 5 hyperparameters = 243 candidates.
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [2, 3, 4],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'learning_rate': [0.05, 0.1, 0.2]
}

# Create a GradientBoostingRegressor object; seeded so every candidate and the
# final refit are reproducible.
gb = GradientBoostingRegressor(random_state=42)

# Instantiate the GridSearchCV object (3-fold, all cores). No `scoring` is
# given, so candidates are ranked by the estimator's own score (R^2).
grid_search = GridSearchCV(estimator = gb, param_grid = param_grid,
                          cv = 3, n_jobs = -1, verbose = 2)

# Fit the GridSearchCV object to the data
grid_search.fit(X_train, y_train)

# Get the best parameters
print('Best parameters:', grid_search.best_params_)

# Make predictions on the test set using the best (already refitted) model
gb_best_model = grid_search.best_estimator_
y_pred = gb_best_model.predict(X_test)

# Calculate and print the evaluation metrics.
# RMSE is taken as sqrt(MSE): the `squared=False` argument of
# mean_squared_error is deprecated and absent from recent scikit-learn.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)

print('Root Mean Square Error:', rmse)
print('Mean Absolute Error:', mae)
print('Mean Square Error:', mse)
print()


Fitting 3 folds for each of 243 candidates, totalling 729 fits
Best parameters: {'learning_rate': 0.2, 'max_depth': 4, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 300}
Root Mean Square Error: 1.2496883827403822
Mean Absolute Error: 0.8588339460087498
Mean Square Error: 1.561721053956272



**Use Model to test with Another Season**

Link to the Season 20 dataset-cleaning notebook: [players_2020.csv](https://colab.research.google.com/drive/1QAD01p1y62M-0tKUul_wSviposwRjPuq?usp=sharing)

In [None]:
# Read the already-cleaned Season 20 table (players_20.csv) and key it by
# sofifa_id, matching the Season 22 frame.
PLAYERS_20_CSV = '/content/drive/MyDrive/Fifa/players_20.csv'
df1 = pd.read_csv(PLAYERS_20_CSV).set_index('sofifa_id')


**Prediction for the new dataset/ Season 20.**

In [None]:
import pickle

# Location of the serialized model on Drive.
MODEL_PATH = '/content/drive/MyDrive/TrainedModels/fifamodeltrained.pkl'

# Persist the tuned XGBoost regressor (`best_model`) ...
with open(MODEL_PATH, 'wb') as file:
    pickle.dump(best_model, file)

# ... and read it straight back, so predictions come from the saved artifact.
with open(MODEL_PATH, 'rb') as file:
    loaded_model = pickle.load(file)

# Split the Season 20 frame into target and predictors.
# NOTE(review): the training features were standardised with `sc`; confirm
# that players_20.csv was scaled the same way before it reaches the model.
y_New = df1['overall_rating']
X_New = df1.drop(columns='overall_rating')
y_newpred = loaded_model.predict(X_New)

# Actual vs. predicted ratings for Season 20, indexed like `y_New`.
comparison_columns = {
    "Actual Overall_rating_S20": y_New,
    "Predicted_Rating_S20": y_newpred,
}
dfNew = pd.DataFrame(comparison_columns)



In [None]:
# Remove pandas' cap on displayed rows, then preview the first four
# actual-vs-predicted pairs.
pd.options.display.max_rows = None
dfNew.head(4)

Unnamed: 0_level_0,Actual Overall_rating_S20,Predicted_Rating_S20
sofifa_id,Unnamed: 1_level_1,Unnamed: 2_level_1
158023,94,88.677368
20801,93,88.566437
190871,92,89.16153
183277,91,88.393097


**Deployment**

[Link to Video Recording of the Model Deployment](https://drive.google.com/file/d/1clck6MK1oqZgrNB8LGTvPI-iKgFvLFgJ/view?usp=sharing)


[Link to Streamlit Application Codes](https://drive.google.com/drive/folders/1y6v0EnfyblqaVtA1EGL9Bm3U4WDAlfxV)