# Read data from csv that contains combined defenders statistics and their respective seasonal ratings for past three seasons.

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("finalized_players.csv")

In [3]:
print(df.shape)

(3652, 35)


# Now, we want to see if there are any entries of players without seasonal ratings, if so, exclude them from the data.

In [4]:
print(df[df.isnull().any(axis=1)])

                Player Name    Pos           Squad          League   90s  Tkl  \
17         Anel Ahmedhodžić     DF   Sheffield Utd  Premier League  29.4   64   
19          Rayan Aït-Nouri  DF,MF          Wolves  Premier League  25.9   71   
44     Michael Amir Murillo     DF       Marseille         Ligue 1  11.1   23   
50       Miguel Ángel Rubio     DF         Granada         La Liga  17.3   20   
56         Oliver Arblaster     MF   Sheffield Utd  Premier League  10.5   23   
...                     ...    ...             ...             ...   ...  ...   
3579  José Luis García Vayá     MF         Levante         La Liga  22.4   46   
3590         Freddie Veseli     DF     Salernitana         Serie A   6.6    7   
3609     Kyle Walker-Peters     DF     Southampton  Premier League  29.3   57   
3610      Aaron Wan-Bissaka     DF  Manchester Utd  Premier League  19.9   49   
3612      James Ward-Prowse     MF     Southampton  Premier League  35.7   41   

      TklW  Def 3rd  Mid 3r

# We see we have 131 players who do not have ratings, so we are going to exclude them to produce a "cleaned" DataFrame

In [5]:
df_clean = df.dropna()

In [6]:
# Ensure that the number of rows of players with NaN values summed with the rows of the clean df equal the original
# number of rows. Also that our cleaned df contains no NaN values
print(df_clean.shape)
print(df_clean[df_clean.isnull().any(axis=1)])

(3521, 35)
Empty DataFrame
Columns: [Player Name, Pos, Squad, League, 90s, Tkl, TklW, Def 3rd, Mid 3rd, Att 3rd, Chl-Tkl, Att, Tkl%, Chl-Lost, Blocks, Sh, Pass, Int, Tkl+Int, Clr, Err, CrdY, CrdR, 2CrdY, Fls, Off, Crs, PKcon, OG, Recov, Won, Lost, Won%, Season, Rating]
Index: []

[0 rows x 35 columns]


In [7]:
print(df_clean)

            Player Name    Pos          Squad          League   90s  Tkl  \
0            Max Aarons     DF    Bournemouth  Premier League  13.7   29   
1      Yunis Abdelhamid     DF          Reims         Ligue 1  30.9   64   
2     Salis Abdul Samed     MF           Lens         Ligue 1  16.9   21   
3       Laurent Abergel     MF        Lorient         Ligue 1  31.8   85   
4                 Abner     DF          Betis         La Liga  15.6   25   
...                 ...    ...            ...             ...   ...  ...   
3647       Nadir Zortea  DF,MF    Salernitana         Serie A  15.7   21   
3648         Kurt Zouma     DF       West Ham  Premier League  23.1   11   
3649      Igor Zubeldia     DF  Real Sociedad         La Liga  18.3   20   
3650   Martín Zubimendi     MF  Real Sociedad         La Liga  28.8   52   
3651   Szymon Żurkowski     MF         Empoli         Serie A  25.6   58   

      TklW  Def 3rd  Mid 3rd  Att 3rd  ...  Off  Crs  PKcon  OG  Recov  Won  \
0       

In [8]:
print(df_clean.dtypes)

Player Name     object
Pos             object
Squad           object
League          object
90s            float64
Tkl              int64
TklW             int64
Def 3rd          int64
Mid 3rd          int64
Att 3rd          int64
Chl-Tkl          int64
Att              int64
Tkl%           float64
Chl-Lost         int64
Blocks           int64
Sh               int64
Pass             int64
Int              int64
Tkl+Int          int64
Clr              int64
Err              int64
CrdY             int64
CrdR             int64
2CrdY            int64
Fls              int64
Off              int64
Crs              int64
PKcon            int64
OG               int64
Recov            int64
Won              int64
Lost             int64
Won%           float64
Season          object
Rating         float64
dtype: object


# Also, need to drop columns such as Name, Squad, and League.

In [9]:
df_processed = df_clean.drop(columns=["Player Name", "Squad", "League", "Season"])

In [10]:
# Ensure we removed the correct number of columns
print(df_processed.shape)

(3521, 31)


In [11]:
# Now, we want to transform the players' positions using One-Hot Encoding, but first we need to keep consistency
# in positions. We need to modify the players who are MF, DF to be DF, MF. This ensure we only have 3 different positions
df_processed['Pos'] = df_processed['Pos'].replace('MF,DF', 'DF,MF')

In [12]:
# Ensure we only have three positions to properly apply ONE-HOT Encoding.
df_processed['Pos'].unique()

array(['DF', 'MF', 'DF,MF'], dtype=object)

In [13]:
df_encoded = pd.get_dummies(df_processed, columns=["Pos"], drop_first=True)
df_encoded = df_encoded.astype({'Pos_DF,MF': 'int', 'Pos_MF': 'int'})


In [14]:
df_encoded

Unnamed: 0,90s,Tkl,TklW,Def 3rd,Mid 3rd,Att 3rd,Chl-Tkl,Att,Tkl%,Chl-Lost,...,Crs,PKcon,OG,Recov,Won,Lost,Won%,Rating,"Pos_DF,MF",Pos_MF
0,13.7,29,19,20,7,2,20,34,58.8,14,...,13,1,0,75,5,11,31.3,6.25,0,0
1,30.9,64,35,36,23,5,26,45,57.8,19,...,3,0,1,149,61,37,62.2,6.77,0,0
2,16.9,21,14,8,10,3,8,18,44.4,10,...,3,3,0,89,2,7,22.2,6.22,0,1
3,31.8,85,52,43,34,8,38,96,39.6,58,...,34,0,0,226,15,14,51.7,6.91,0,1
4,15.6,25,19,15,9,1,17,34,50.0,17,...,26,1,0,79,14,10,58.3,6.41,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3647,15.7,21,15,9,7,5,11,19,57.9,8,...,85,0,0,72,8,17,32.0,6.36,1,0
3648,23.1,11,6,8,3,0,4,7,57.1,3,...,1,0,1,92,50,31,61.7,6.74,0,0
3649,18.3,20,10,12,8,0,12,18,66.7,6,...,4,1,0,90,41,34,54.7,6.41,0,0
3650,28.8,52,25,19,30,3,22,44,50.0,22,...,2,0,0,139,63,24,72.4,6.76,0,1


In [15]:
from sklearn.model_selection import train_test_split

X = df_encoded.drop(columns=['Rating'])
y = df_encoded['Rating']

In [16]:
# Split into 70/15/15 for train/test/val
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42)

# Next, split that 30% equally into 15% val and 15% test
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

print(f"Training set samples: {X_train.shape[0]}")
print(f"Validation set samples: {X_val.shape[0]}")
print(f"Test set samples: {X_test.shape[0]}")

Training set samples: 2464
Validation set samples: 528
Test set samples: 529


In [17]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

first_model = LinearRegression()
first_model.fit(X_train, y_train)

y_pred = first_model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Linear Regression MSE: {mse:.5f}, R^2: {r2:.5f}")

Linear Regression MSE: 0.03013, R^2: 0.52654


In [18]:
from sklearn.linear_model import Ridge

ridge = Ridge(alpha=1.0)

ridge.fit(X_train, y_train)

y_pred_ridge = ridge.predict(X_test)

mse_ridge = mean_squared_error(y_test, y_pred_ridge)
r2_ridge = r2_score(y_test, y_pred_ridge)
print(f"Ridge Regression MSE: {mse_ridge:.5f}, R^2: {r2_ridge:.5f}")

Ridge Regression MSE: 0.03013, R^2: 0.52652


In [19]:
ridge_param_grid = {
    "alpha": [0.001, 0.01, 0.1, 1.0, 10, 100]
}

ridge_grid = GridSearchCV(Ridge(), ridge_param_grid, cv=5, scoring='r2', n_jobs=1)
ridge_grid.fit(X_train, y_train)

lasso.fit(X_train, y_train)

y_pred_lasso = lasso.predict(X_test)

mse_lasso = mean_squared_error(y_test, y_pred_lasso)
r2_lasso = r2_score(y_test, y_pred_lasso)
print(f"Lasso Regression MSE: {mse_lasso:.5f}, R^2: {r2_lasso:.5f}")

NameError: name 'GridSearchCV' is not defined

In [None]:
from sklearn.tree import DecisionTreeRegressor

tree_model = DecisionTreeRegressor(random_state=42)

tree_model.fit(X_train, y_train)

y_pred_tree = tree_model.predict(X_test)

mse_tree = mean_squared_error(y_test, y_pred_tree)
r2_tree = r2_score(y_test, y_pred_tree)
print(f"Decision Tree Regression MSE: {mse_tree:.5f}, R^2: {r2_tree:.5f}")

Decision Tree Regression MSE: 0.07543, R^2: -0.18539


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Define the parameter distributions to explore.
param_distributions = {
    'n_estimators': [100, 200, 300, 400],
    'max_depth': [None, 10, 20, 30],
    'max_features': [None, 'sqrt', 'log2'],  # New parameter to tune
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# Initialize the Random Forest regressor.
rf = RandomForestRegressor(random_state=42)

# Set up RandomizedSearchCV with a fixed number of iterations (e.g., 50).
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_distributions,
    n_iter=50,              # Number of parameter settings that are sampled.
    cv=5,                   # 5-fold cross-validation.
    n_jobs=-1,              # Use all available cores.
    verbose=2,
    scoring='neg_mean_squared_error',
    random_state=42
)

# Fit the randomized search on the training data.
random_search.fit(X_train, y_train)

# Print the best parameters found.
print("Best parameters found:", random_search.best_params_)

# Retrieve the best estimator.
best_rf = random_search.best_estimator_

# Predict on the test set using the best model.
y_pred_rf = best_rf.predict(X_test)

# Evaluate the performance.
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)
print(f"Random Forest MSE: {mse_rf:.5f}, R^2: {r2_rf:.5f}")



Fitting 5 folds for each of 50 candidates, totalling 250 fits
Best parameters found: {'n_estimators': 400, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': None, 'max_depth': None, 'bootstrap': True}
Random Forest MSE: 0.03177, R^2: 0.50069


In [None]:
import pandas as pd

# Create a dictionary with the actual and predicted ratings.
results_dict = {
    'Actual Rating': y_test,
    'Predicted Rating': y_pred_rf
}

# Include Player_ID if available.
if 'Player_ID' in X_test.columns:
    results_dict['Player_ID'] = X_test['Player_ID']

# Optionally, include the player name if available.
if 'Player' in X_test.columns:
    results_dict['Player'] = X_test['Player']

# Create the DataFrame from the dictionary.
results_df = pd.DataFrame(results_dict)

# Reorder columns if necessary: put Player_ID and Player first if they exist.
ordered_columns = []
if 'Player_ID' in results_df.columns:
    ordered_columns.append('Player_ID')
if 'Player' in results_df.columns:
    ordered_columns.append('Player')
ordered_columns.extend(['Actual Rating', 'Predicted Rating'])
results_df = results_df[ordered_columns]

# Insert an Index column at the beginning.
results_df.insert(0, "Index", range(1, len(results_df) + 1))

# Export the DataFrame to a CSV file.
results_df.to_csv('predicted_vs_actual.csv', index=False)
print("Exported predicted vs actual ratings to 'predicted_vs_actual.csv'")


Exported predicted vs actual ratings to 'predicted_vs_actual.csv'


In [None]:
from sklearn.ensemble import GradientBoostingRegressor

gb_model = GradientBoostingRegressor(n_estimators=100, random_state=42)

gb_model.fit(X_train, y_train)

y_pred_gb = gb_model.predict(X_test)

mse_gb = mean_squared_error(y_test, y_pred_gb)
r2_gb = r2_score(y_test, y_pred_gb)
print(f"Gradient Boosting MSE: {mse_gb:.5f}, R^2: {r2_gb:.5f}")

Gradient Boosting MSE: 0.03072, R^2: 0.51719


In [None]:
import matplotlib.pyplot as plt

def plot_actual_vs_predicted(y_test, y_pred, model_name):
    plt.figure(figsize=(8, 6))
    plt.scatter(y_test, y_pred, color='blue', alpha=0.6, label=f'{model_name} Predictions')
    plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], color='red', linestyle='--', label="Perfect Prediction")
    plt.xlabel('Actual Values')
    plt.ylabel('Predicted Values')
    plt.title(f'{model_name}: Actual vs Predicted')
    plt.legend()
    plt.show()

plot_actual_vs_predicted(y_test, y_pred, "Linear Regression")
plot_actual_vs_predicted(y_test, y_pred_ridge, "Ridge Regression")
plot_actual_vs_predicted(y_test, y_pred_lasso, "Lasso Regression")
plot_actual_vs_predicted(y_test, y_pred_tree, "Decision Tree")
plot_actual_vs_predicted(y_test, y_pred_rf, "Random Forest")
plot_actual_vs_predicted(y_test, y_pred_gb, "Gradient Boosting")

ModuleNotFoundError: No module named 'matplotlib'

In [None]:
# # Create a comparison bar plot for MSE values of each model
# models = ['Linear', 'Ridge', 'Decision Tree']
#         #    'Lasso', 'Decision Tree', 'Random Forest', 'Gradient Boosting']
# mse_values = [mse, mse_ridge, mse_tree]
#             #   mse_lasso, mse_tree, mse_rf, mse_gb]

# plt.figure(figsize=(10, 6))
# plt.bar(models, mse_values, color='skyblue')
# plt.xlabel('Models')
# plt.ylabel('MSE (Mean Squared Error)')
# plt.title('MSE Comparison of Models')
# plt.show()

In [None]:
# def plot_residuals(y_test, y_pred, model_name):
#     residuals = y_test - y_pred
#     plt.figure(figsize=(8, 6))
#     plt.scatter(y_pred, residuals, color='blue', alpha=0.6)
#     plt.hlines(y=0, xmin=y_pred.min(), xmax=y_pred.max(), colors='red', linestyle='--', label="Zero Residuals")
#     plt.xlabel('Predicted Values')
#     plt.ylabel('Residuals')
#     plt.title(f'{model_name}: Residuals Plot')
#     plt.legend()
#     plt.show()

# # Residual plot for each model
# plot_residuals(y_test, y_pred, "Linear Regression")
# plot_residuals(y_test, y_pred_ridge, "Ridge Regression")
# # plot_residuals(y_test, y_pred_lasso, "Lasso Regression")
# plot_residuals(y_test, y_pred_tree, "Decision Tree")
# # plot_residuals(y_test, y_pred_rf, "Random Forest")
# # plot_residuals(y_test, y_pred_gb, "Gradient Boosting")

In [None]:
ridge_export = pd.concat([y_test_series, y_test_pred_series], axis=1)


In [None]:
ridge_export.columns = ['Actual Rating', 'Predicted Rating']

In [None]:
ridge_export

Unnamed: 0,Actual Rating,Predicted Rating
0,6.74,6.748369
1,6.65,6.656592
2,6.68,6.698447
3,6.51,6.498548
4,6.61,6.643987
...,...,...
524,6.58,6.648340
525,6.29,6.604636
526,7.05,6.914139
527,6.42,6.451077


In [None]:
y_test_pred_tree_series = pd.Series(y_test_pred_tree)

In [None]:
tree_export = pd.concat([y_test_series, y_test_pred_tree_series], axis=1)

In [None]:
tree_export

Unnamed: 0,Rating,0
0,6.74,6.773300
1,6.65,6.611806
2,6.68,6.611806
3,6.51,6.477849
4,6.61,6.611806
...,...,...
524,6.58,6.774759
525,6.29,6.477849
526,7.05,6.774759
527,6.42,6.611806


In [None]:
tree_export.columns = ['Actual Rating', 'Predicted Rating']

In [None]:
tree_export

Unnamed: 0,Actual Rating,Predicted Rating
0,6.74,6.773300
1,6.65,6.611806
2,6.68,6.611806
3,6.51,6.477849
4,6.61,6.611806
...,...,...
524,6.58,6.774759
525,6.29,6.477849
526,7.05,6.774759
527,6.42,6.611806


In [None]:
ridge_export.to_csv('predicted_vs_actual_ridge.csv', index=True)

In [None]:
tree_export.to_csv('predicted_vs_actual_tree.csv', index=True)