In [37]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
import pickle
# Colab-specific: mount Google Drive; the CSV paths read below live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the players_22 export from Drive; this frame feeds the correlation ranking below.
# NOTE(review): the modelling frame is later re-read from players_21.csv, so the
# feature ranking and the training data come from different files - confirm this is intended.
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/players_22-1 (1).csv')


In [None]:
# Summarise the freshly loaded frame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Rank the candidate attributes by their correlation with the player's overall rating.
selected_features = [
    # target and basic profile
    'overall', 'potential', 'age', 'height_cm', 'weight_kg',
    # headline ratings
    'pace', 'shooting', 'passing', 'dribbling', 'defending', 'physic',
    # attacking
    'attacking_crossing', 'attacking_finishing', 'attacking_heading_accuracy',
    'attacking_short_passing', 'attacking_volleys',
    # skill
    'skill_dribbling', 'skill_curve', 'skill_fk_accuracy', 'skill_long_passing',
    'skill_ball_control',
    # movement
    'movement_acceleration', 'movement_sprint_speed', 'movement_agility',
    'movement_reactions', 'movement_balance',
    # power
    'power_shot_power', 'power_jumping', 'power_stamina', 'power_strength',
    'power_long_shots',
    # mentality
    'mentality_aggression', 'mentality_interceptions', 'mentality_positioning',
    'mentality_vision', 'mentality_penalties', 'mentality_composure',
    # defending
    'defending_marking_awareness', 'defending_standing_tackle',
    'defending_sliding_tackle',
    # goalkeeping
    'goalkeeping_diving', 'goalkeeping_handling', 'goalkeeping_kicking',
    'goalkeeping_positioning', 'goalkeeping_reflexes', 'goalkeeping_speed',
]

# Pairwise correlations between the candidates; only the 'overall' column is displayed.
correlation_matrix = df.loc[:, selected_features].corr()
correlation_matrix.loc[:, 'overall'].sort_values(ascending=False)

In [None]:
#selection of best correlated features
selected_columns = ['skill_long_passing', 'skill_ball_control', 'skill_curve', 'skill_fk_accuracy', 'skill_dribbling',
                    'goalkeeping_handling', 'attacking_crossing', 'movement_reactions', 'age',
                    'mentality_aggression', 'mentality_vision', 'mentality_interceptions', 'goalkeeping_kicking',
                    'goalkeeping_speed', 'shooting', 'dribbling', 'power_shot_power','power_long_shots', 'defending', 'goalkeeping_reflexes',
                    'potential', 'mentality_composure', 'mentality_positioning', 'mentality_penalties', 'passing',
                    'attacking_short_passing', 'physic', 'overall', 'goalkeeping_positioning', 'goalkeeping_diving',
                    'attacking_volleys', 'attacking_finishing']


In [None]:
# Re-bind df to the players_21 export, reading only the selected columns.
# This replaces the players_22 frame loaded earlier; everything below works on players_21.
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/players_21.csv', usecols=selected_columns)

In [None]:
# Preview the first rows of the modelling frame.
df.head()

In [None]:
# Zero-fill every missing value in the frame.
# NOTE(review): because nothing is missing after this line, the SimpleImputer calls in
# the later aggregation cells have nothing left to impute - confirm which strategy is intended.
df = df.fillna(0)

# Fold the rating 93 into 92.
# NOTE(review): presumably done so the stratified split further down has at least two
# rows for every rating value - confirm.
df['overall'] = df['overall'].replace(93, 92)

In [None]:
# Collapse the shooting-related ratings into one row-wise average and drop the originals.
shooting_attributes = [
    'shooting',
    'power_shot_power',
    'power_long_shots',
    'attacking_volleys',
    'attacking_finishing',
]

df = (
    df.assign(shooting_skills=df[shooting_attributes].mean(axis=1))
      .drop(columns=shooting_attributes)
)

df.info()

In [None]:
# Collapse the mentality ratings into one row-wise average and drop the originals.
mentality_attributes = ['mentality_aggression', 'mentality_interceptions', 'mentality_positioning',
                        'mentality_vision', 'mentality_penalties', 'mentality_composure']

# No imputation step here: missing values were already replaced with 0 right after the
# frame was loaded, so a mean-imputer at this point would find nothing to fill and would
# only suggest a strategy that is not actually applied.
df['mentality'] = df[mentality_attributes].mean(axis=1)

df = df.drop(columns=mentality_attributes)

df.info()

In [None]:
# Collapse the skill ratings into one row-wise average and drop the originals.
skill_attributes = [
    'skill_long_passing',
    'skill_ball_control',
    'skill_curve',
    'skill_fk_accuracy',
    'skill_dribbling',
]

df = (
    df.assign(skills=df[skill_attributes].mean(axis=1))
      .drop(columns=skill_attributes)
)

df.info()

In [None]:
# Cast every column to int. The averaged columns created in the earlier cells are floats,
# so this drops their fractional part (truncation, not rounding).
df = df.astype(int)

goalkeeping_attributes = ['goalkeeping_diving', 'goalkeeping_handling', 'goalkeeping_kicking',
                          'goalkeeping_positioning', 'goalkeeping_reflexes', 'goalkeeping_speed']

# No imputation step here: astype(int) above raises if any NaN is present, so by this
# point the goalkeeping columns cannot contain missing values and a SimpleImputer would
# be a no-op.
df['goalkeeping_skills'] = df[goalkeeping_attributes].mean(axis=1)
df = df.drop(columns=goalkeeping_attributes)
df.info()



In [None]:
# Second integer cast: goalkeeping_skills was added as a float mean after the previous
# cast, and astype(int) drops its fractional part so every column is integer again.
df= df.astype(int)
df.info()

In [None]:
# Separate the predictors from the regression target ('overall').
X = df.drop(columns='overall')
y = df['overall']

In [None]:
# Scaler used to standardise the predictor matrix in the next cell.
sc=StandardScaler()

In [None]:
# Standardise the predictors and wrap the array back into a DataFrame with the same
# column names (the index is rebuilt as a default RangeIndex).
# NOTE(review): the scaler is fitted on every row before the train/test split, so the
# held-out rows contribute to the mean/std - consider fitting on the training split only.
scaled = sc.fit(X).transform(X)
X = pd.DataFrame(data=scaled, columns=X.columns)

In [None]:
# Preview the first target values.
y.head()

In [None]:
# Hold out 10% of the rows for evaluation, stratified on the rating value, with a fixed seed.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
    stratify=y,
)
Xtrain.shape


In [None]:
# Baseline random forest on all engineered features.
# The seed is pinned (same value as the later forests and the split) so the baseline MAE
# and the feature importances read from this model are reproducible across runs.
rf = RandomForestRegressor(random_state=42)
rf.fit(Xtrain, Ytrain)

In [None]:
# Score the baseline forest on the held-out rows.
y_pred = rf.predict(Xtest)
mae = mean_absolute_error(y_true=Ytest, y_pred=y_pred)
print(f'Mean Absolute Error: {mae}')

In [None]:
# Pair every predictor name with the importance the fitted forest assigned to it.
feature_importances = rf.feature_importances_

feature_importances_df = pd.DataFrame(
    dict(Feature=X.columns, Importance=feature_importances)
)


In [None]:
# Most important features first.
feature_importances_df = feature_importances_df.sort_values('Importance', ascending=False)
feature_importances_df

In [None]:
# Add two pairwise interaction columns (element-wise products of existing ratings).
df = df.assign(
    potential_x_movement_reactions=df['potential'].mul(df['movement_reactions']),
    dribbling_x_defending=df['dribbling'].mul(df['defending']),
)


In [None]:
# Defining features and target
y = df['overall']
X = df[['movement_reactions', 'potential', 'dribbling', 'defending', 'goalkeeping_skills',
        'age', 'attacking_crossing', 'physic', 'shooting_skills', 'mentality',
        'attacking_short_passing', 'passing', 'skills', 'potential_x_movement_reactions',
        'dribbling_x_defending']]

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Min-max rescale three columns of df, overwriting them in place.
# NOTE(review): X was already sliced out of df in the previous cell, and the interaction
# columns were computed from the unscaled values, so this rescaling does not appear to
# reach the model inputs - it only changes df (and would change the interaction features
# if that cell were re-run afterwards). Confirm whether it should run before the
# feature-engineering cells or be dropped; X is standardised in the next cell anyway.
scaler = MinMaxScaler()
df[['age', 'potential', 'movement_reactions']] = scaler.fit_transform(df[['age', 'potential', 'movement_reactions']])


In [None]:
# Fit a fresh standard scaler on the reduced predictor matrix and rebuild X as a
# DataFrame with the same column names.
# NOTE(review): fitted on all rows before the split below - consider fitting on the
# training split only.
sc = StandardScaler().fit(X)
scaled = sc.transform(X)
X = pd.DataFrame(data=scaled, columns=X.columns)

In [None]:
# Same 90/10 stratified split and seed as the baseline round, now on the reduced feature set.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
    stratify=y,
)


In [None]:
# Seeded random forest with default hyper-parameters on the reduced feature set.
rf = RandomForestRegressor(random_state=42)
rf.fit(X=Xtrain, y=Ytrain)

In [None]:
# Held-out error of the forest trained on the reduced feature set.
y_pred = rf.predict(Xtest)
mae = mean_absolute_error(y_true=Ytest, y_pred=y_pred)
print(f'Mean Absolute Error: {mae}')

In [None]:
from xgboost import XGBRegressor

# Gradient-boosted comparison model with library defaults; the seed is pinned to the
# same value used for the split and the random forests.
xgb_model = XGBRegressor(random_state=42)
xgb_model.fit(Xtrain, Ytrain)

y_pred_xgb = xgb_model.predict(Xtest)

# mean_absolute_error expects (y_true, y_pred). MAE is symmetric so the value is the
# same either way, but the order now matches every other metric call in the notebook.
mae_xgb = mean_absolute_error(Ytest, y_pred_xgb)
mae_xgb

In [None]:
from sklearn.ensemble import AdaBoostRegressor

# AdaBoost comparison model with library defaults. Its fitting procedure is stochastic,
# so the seed is pinned to keep the reported MAE reproducible across runs.
ada_model = AdaBoostRegressor(random_state=42)
ada_model.fit(Xtrain, Ytrain)

y_pred_ada = ada_model.predict(Xtest)

# mean_absolute_error expects (y_true, y_pred). MAE is symmetric so the value is the
# same either way, but the order now matches every other metric call in the notebook.
mae_ada = mean_absolute_error(Ytest, y_pred_ada)
mae_ada

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor

# Defining the parameter grid
# max_features: 1.0 means "use every feature", which is what the legacy 'auto' value
# meant for a regressor. 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3,
# where every candidate that samples it fails to fit, so the explicit 1.0 is used instead.
param_grid_rf = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_features': [1.0, 'sqrt', 'log2'],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Creating the RandomForestRegressor model
rf = RandomForestRegressor(random_state=42)

# Setting up RandomizedSearchCV with 5-fold cross-validation.
# NOTE(review): no `scoring` is passed, so candidates are ranked by the estimator's
# default score rather than by the MAE reported below - confirm this is intended.
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=param_grid_rf, n_iter=100,
                               cv=5, verbose=2, random_state=42, n_jobs=-1)

# Fit RandomizedSearchCV (100 sampled candidates x 5 folds)
rf_random.fit(Xtrain, Ytrain)

# obtain the best parameters
print("Best parameters found for RandomForestRegressor:")
print(rf_random.best_params_)

# Evaluate the best model (refit on the full training split) on the held-out rows
best_rf = rf_random.best_estimator_
y_pred_rf = best_rf.predict(Xtest)
mae_rf = mean_absolute_error(Ytest, y_pred_rf)
print(f'Mean Absolute Error (RandomForestRegressor): {mae_rf}')

Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [35]:
# Re-print the tuned forest's held-out MAE (same value the search cell prints at its end).
print(f'Mean Absolute Error (RandomForestRegressor): {mae_rf}')

Mean Absolute Error (RandomForestRegressor): 0.7258896114203055


In [40]:
model_filename = 'best_random_forest_model.pkl'
scaler_filename = 'feature_scaler.pkl'

# Persist the tuned forest.
with open(model_filename, 'wb') as file:
    pickle.dump(best_rf, file)

# The forest was trained on features standardised by `sc`, so its split thresholds live
# in scaled space. Persist the fitted scaler next to the model; without it the saved
# model cannot be applied to raw feature values in another session.
with open(scaler_filename, 'wb') as file:
    pickle.dump(sc, file)
print(f'Model saved to {model_filename}')

# Loading the model to verify the round-trip: the reloaded estimator must reproduce the
# same held-out MAE. (Only unpickle files you wrote yourself - pickle can execute code.)
with open(model_filename, 'rb') as file:
    loaded_model = pickle.load(file)
loaded_y_pred = loaded_model.predict(Xtest)
loaded_mae = mean_absolute_error(Ytest, loaded_y_pred)
print(f'Mean Absolute Error (Loaded Model): {loaded_mae}')

Model saved to best_random_forest_model.pkl
Mean Absolute Error (Loaded Model): 0.7258896114203055
