In [1]:
import pandas as pd
import numpy as np
import optuna
import xgboost as xgb
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split, cross_validate

In [2]:
def add_team_features(df):
    start_year = 24

    # Create an empty list to hold DataFrames
    all_stats = []
    
    for i in range(3):
        year = start_year - i
        season_str = f"20{year-1}-20{year}"
        url_df = f'https://fbref.com/en/comps/Big5/{season_str}/{season_str}-Big-5-European-Leagues-Stats'
    
        try:
            # Read all tables from the page
            tables = pd.read_html(url_df)
    
            # Usually the first table contains the overall squad stats
            temp = tables[0][['Squad', 'xG', 'xGA']]
            temp['Season'] = season_str
    
            # Append to the list
            all_stats.append(temp)
    
            #print(f"Fetched {season_str} successfully.")
        except Exception as e:
            print(f"Failed to fetch {season_str}: {e}")
    
    # combine
    stats = pd.concat(all_stats, ignore_index=True)
    df = df.merge(stats, on=['Squad', 'Season'], how='left')
    return df

def setup(df):
    df = df.dropna(subset=['Rating'])
    
    df['Defensive_Efficiency'] = (df['Tkl'] + df['Blocks'] + df['Int']) / df['90s']
    df['Offensive_Contribution'] = (df['Att 3rd'] + df['Crs'] + df['Sh']) / df['90s']
    df['Tactical_Contribution'] = df['TklW'] + (df['Tkl%'] * df['Tkl'])
    df['Penalty_Risk'] = (df['CrdY'] + df['CrdR'] + df['PKcon']) / df['90s']
    df['Defensive_Interaction'] = df['Tkl'] + df['Blocks'] + df['Int']
    #df['Seasonal_Trend'] = df.groupby('Season')['Rating'].transform(lambda x: x.diff()).fillna(0)
    df['Win_Ratio'] = df['Won'] / (df['Won'] + df['Lost'])
    
    # Initialize 'Adjusted Rating' as a copy of 'Rating' to prevent NaNs for non-defenders
    df['Adjusted Rating'] = df['Rating']
    
    # Calculate mean and standard deviation for defenders only
    mean_rating = df[df['Pos'] == 'DF']['Rating'].mean()
    std_rating = df[df['Pos'] == 'DF']['Rating'].std()
    
    # Calculate z-scores for defenders only
    z_ratings = (df[df['Pos'] == 'DF']['Rating'] - mean_rating) / std_rating
    
    # Increase variance by scaling z-scores (factor > 1)
    scaling_factor = 1.3  # Adjust the factor based on how much you want to increase the variance
    scaled_z_ratings = z_ratings * scaling_factor
    
    # Revert to the original scale with increased variance for defenders only
    df.loc[df['Pos'] == 'DF', 'Adjusted Rating'] = mean_rating + scaled_z_ratings * std_rating

    orig_df = df.copy()
    
    #df = df[df['Pos'].str.contains("DF", na=False)]
    df = df.drop(columns=['Player Name'], errors="ignore")  # Remove identifier column
    
    df = df.drop(columns=['Squad'])
    
    # One-hot encode categorical columns
    categorical_cols = ['League', 'Season', 'Pos']
    df = pd.get_dummies(df, columns=categorical_cols)
    
    # Fill missing values
    df = df.fillna(df.median())
    
    #df = pd.get_dummies(df, columns=['Pos'], drop_first=True)
    
    # Drop Season columns if needed
    #df = df.drop(df.filter(like="Season").columns, axis=1)
    #df = df.drop(df.filter(like="Squad").columns, axis=1)
    #df = df.drop(df.filter(like="League").columns, axis=1)
    
    # Split data into features (X) and target (y)
    return df, orig_df, mean_rating, std_rating

# Define Optuna objective function
def objective(trial):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000, step=100),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0, log=True),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10)
    }

    # Train model
    model = xgb.XGBRegressor(**params, random_state=42)
    model.fit(X_train, y_train)

    # Validate performance
    y_pred = model.predict(X_val)
    return r2_score(y_val, y_pred)

# Run Optuna optimization
def run_study(X_train, y_train, X_test, y_test, num_trials = 20):
    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=num_trials)
    
    # Print best parameters
    best_params = study.best_params
    print("Best Hyperparameters:", best_params)
    
    # Train final model with best parameters
    best_model = xgb.XGBRegressor(**best_params, random_state=42)
    best_model.fit(X_train, y_train)
    
    # Evaluate performance on test set
    y_pred_test = best_model.predict(X_test)
    final_r2 = r2_score(y_test, y_pred_test)
    final_mae = mean_absolute_error(y_test, y_pred_test)
    
    print(f"Final R² Score: {final_r2:.4f}")
    print(f"Final MAE: {final_mae:.4f}")

    return best_params

# Cross Validation
def cross_val(best_params, X_train, y_train, X_test, y_test):
    # model with best hyperparameters, found from Optuna
    best_model = xgb.XGBRegressor(**best_params, random_state=42)

    best_model.fit(X_train, y_train)
    
    # cross-validation (function imported from sklearn.model_selection)
    cv_results = cross_validate(best_model, X_test, y_test, cv=5, scoring=('r2', 'neg_mean_absolute_error'))
    
    # Print cross-validation results
    print("Cross-validation R² scores:", cv_results['test_r2'])
    print("Cross-validation MAE scores:", -cv_results['test_neg_mean_absolute_error'])  # MAE is negative due to how scoring works
    
    # Calculate the average R² score and MAE across folds
    avg_r2 = cv_results['test_r2'].mean()
    avg_mae = -cv_results['test_neg_mean_absolute_error'].mean()  # Negative to positive conversion for MAE
    
    print(f"Average Cross-Validation R² Score: {avg_r2:.4f}")
    print(f"Average Cross-Validation MAE: {avg_mae:.4f}")

def undo_adjusted_rating(y_adjusted, mean_rating, std_rating, scaling_factor=1.3):
    unscaled_z = ((y_adjusted - mean_rating) / std_rating) / scaling_factor
    return mean_rating + unscaled_z * std_rating



In [3]:
best_params = {
    "objective": "reg:squarederror",
    "max_depth": 3,
    "learning_rate": 0.0605487138909333,
    "subsample": 0.6177495307302218,
    "colsample_bytree": 0.6524095821589513,
    "reg_alpha": 0.6304827381123799,
    "reg_lambda": 0.023966067481592194,
    "min_child_weight": 8,
    "seed": 42
}

In [4]:
df = pd.read_csv("finalized_players.csv")
df = add_team_features(df)
df, orig_df, mean_def, std_def = setup(df)

#X = df.drop(columns=['Rating', 'Adjusted Rating'], errors="ignore")
#y = df['Adjusted Rating']
#X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)
#X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

df_2021_2023 = df[df['Season_2021-2022'] | df['Season_2022-2023']].drop(columns=['Season_2021-2022', 'Season_2022-2023', 'Season_2023-2024'])
df_2024 = df[df['Season_2023-2024']].drop(columns=['Season_2021-2022', 'Season_2022-2023', 'Season_2023-2024'])
X_train = df_2021_2023.drop(columns=['Rating', 'Adjusted Rating'], errors="ignore")
y_train = df_2021_2023['Rating']
X_test = df_2024.drop(columns=['Rating', 'Adjusted Rating'], errors="ignore")
y_test = df_2024['Rating']

#best_params = run_study(X_train, y_train, X_test, y_test, 5)

cross_val(best_params, X_train, y_train, X_test, y_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp['Season'] = season_str
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp['Season'] = season_str
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp['Season'] = season_str
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instea

Cross-validation R² scores: [0.59158718 0.65086582 0.54119213 0.63298963 0.56311021]
Cross-validation MAE scores: [0.11222479 0.11678867 0.11081823 0.11104977 0.12213882]
Average Cross-Validation R² Score: 0.5959
Average Cross-Validation MAE: 0.1146


Why is this Rsq so high? I think there's overfitting (as this accuracy should be *worse* than descriptive, given there's no stats measuring change with time and yet the Rsq is 0.5949). I won't report this number in the presentation (it was naive of me to do that before every time I saw a higher Rsq without checking if it was validly found). Note that the descriptive Rsq was only 0.51. These are also the exact same parameters that came from the descriptive model, I just refit the model in the cross validation function on exclusively the training data (the 2 seasons from 2021-2023), and did cross validation on exclusively the test data (the single 2023-2024 season). I will mention briefly to Tad if I have time.

Also note that Rsq when predicting adjusted rating (increased variance to match offensive) and actual rating (no increased variance in the assumed normal distribution) was almost the same (0.5949 for Adjusted, 0.5959 for non-Adjusted), even though in other testing, adjusted rating tended to be easier to predict.

In [5]:
model = xgb.XGBRegressor(**best_params, random_state=42)
model.fit(X_train, y_train)
y_test_pred = model.predict(X_test)

#df['Predicted Adjusted Rating'] = y_pred
#df = df[[col for col in df.columns if col != 'Rating'] + ['Rating']]
#df = df[[col for col in df.columns if col != 'Adjusted Rating'] + ['Adjusted Rating']]

In [6]:
from sklearn.metrics import r2_score
r2 = r2_score(y_test, y_test_pred)
r2

0.6024609225335877

In [7]:
orig_2024 = orig_df[orig_df['Season'] == '2023-2024'].copy()

In [8]:
orig_2024['Predicted Rating'] = y_test_pred

In [9]:
orig_2024.to_csv('pred_2024_no_bias.csv')

In [10]:
df.columns

Index(['90s', 'Tkl', 'TklW', 'Def 3rd', 'Mid 3rd', 'Att 3rd', 'Chl-Tkl', 'Att',
       'Tkl%', 'Chl-Lost', 'Blocks', 'Sh', 'Pass', 'Int', 'Tkl+Int', 'Clr',
       'Err', 'CrdY', 'CrdR', '2CrdY', 'Fls', 'Off', 'Crs', 'PKcon', 'OG',
       'Recov', 'Won', 'Lost', 'Won%', 'Rating', 'xG', 'xGA',
       'Defensive_Efficiency', 'Offensive_Contribution',
       'Tactical_Contribution', 'Penalty_Risk', 'Defensive_Interaction',
       'Win_Ratio', 'Adjusted Rating', 'League_Bundesliga', 'League_La Liga',
       'League_Ligue 1', 'League_Premier League', 'League_Serie A',
       'Season_2021-2022', 'Season_2022-2023', 'Season_2023-2024', 'Pos_DF',
       'Pos_DF,MF', 'Pos_MF', 'Pos_MF,DF'],
      dtype='object')

In [11]:
params = model.get_params()
print(params)

{'objective': 'reg:squarederror', 'base_score': None, 'booster': None, 'callbacks': None, 'colsample_bylevel': None, 'colsample_bynode': None, 'colsample_bytree': 0.6524095821589513, 'device': None, 'early_stopping_rounds': None, 'enable_categorical': False, 'eval_metric': None, 'feature_types': None, 'gamma': None, 'grow_policy': None, 'importance_type': None, 'interaction_constraints': None, 'learning_rate': 0.0605487138909333, 'max_bin': None, 'max_cat_threshold': None, 'max_cat_to_onehot': None, 'max_delta_step': None, 'max_depth': 3, 'max_leaves': None, 'min_child_weight': 8, 'missing': nan, 'monotone_constraints': None, 'multi_strategy': None, 'n_estimators': None, 'n_jobs': None, 'num_parallel_tree': None, 'random_state': 42, 'reg_alpha': 0.6304827381123799, 'reg_lambda': 0.023966067481592194, 'sampling_method': None, 'scale_pos_weight': None, 'subsample': 0.6177495307302218, 'tree_method': None, 'validate_parameters': None, 'verbosity': None, 'seed': 42}


In [12]:
# extra work only used for descriptive
"""
scaled_mask = df['Adjusted Rating'] != df['Rating']

# Create new column: Predicted Actual Rating
#df['Predicted Rating'] = df['Predicted Adjusted Rating']  # Default for non-defenders

# Undo scaling only for defenders
#df.loc[scaled_mask, 'Predicted Rating'] = undo_adjusted_rating(
#    df.loc[scaled_mask, 'Predicted Adjusted Rating'],
#    mean_rating=mean_def,
#    std_rating=std_def,
#    scaling_factor=1.3  # or whatever factor you used
#)

# Undo One-Hot Encoding
df['Player Name'] = orig_df['Player Name']
pred_df = df

df['Squad'] = orig_df['Squad']
df['League'] = orig_df['League']

df.to_csv('messy.csv', index=False)

# Restore League
league_cols = [col for col in df.columns if col.startswith('League_')]
df['League'] = df[league_cols].idxmax(axis=1).str.replace('League_', '', regex=False)

# Restore Season
season_cols = [col for col in df.columns if col.startswith('Season_')]
df['Season'] = df[season_cols].idxmax(axis=1).str.replace('Season_', '', regex=False)

# Restore Position
pos_cols = [col for col in df.columns if col.startswith('Pos_')]
df['Pos'] = df[pos_cols].idxmax(axis=1).str.replace('Pos_', '', regex=False)

# Drop one-hot encoded columns
df = df.drop(columns=league_cols + season_cols + pos_cols)

# Round rating columns
rating_cols = ['Adjusted Rating', 'Rating', 'Predicted Rating']
df[rating_cols] = df[rating_cols].round(2)

# Reorder columns: start with key identifiers, end with ratings
key_cols = ['Player Name', 'Pos', 'Squad', 'League', 'Season']
other_cols = [col for col in df.columns if col not in key_cols + rating_cols]
df = df[key_cols + other_cols + rating_cols]
"""

"\nscaled_mask = df['Adjusted Rating'] != df['Rating']\n\n# Create new column: Predicted Actual Rating\n#df['Predicted Rating'] = df['Predicted Adjusted Rating']  # Default for non-defenders\n\n# Undo scaling only for defenders\n#df.loc[scaled_mask, 'Predicted Rating'] = undo_adjusted_rating(\n#    df.loc[scaled_mask, 'Predicted Adjusted Rating'],\n#    mean_rating=mean_def,\n#    std_rating=std_def,\n#    scaling_factor=1.3  # or whatever factor you used\n#)\n\n# Undo One-Hot Encoding\ndf['Player Name'] = orig_df['Player Name']\npred_df = df\n\ndf['Squad'] = orig_df['Squad']\ndf['League'] = orig_df['League']\n\ndf.to_csv('messy.csv', index=False)\n\n# Restore League\nleague_cols = [col for col in df.columns if col.startswith('League_')]\ndf['League'] = df[league_cols].idxmax(axis=1).str.replace('League_', '', regex=False)\n\n# Restore Season\nseason_cols = [col for col in df.columns if col.startswith('Season_')]\ndf['Season'] = df[season_cols].idxmax(axis=1).str.replace('Season_', '',

In [13]:
#pred_df.columns

In [14]:
# experimental "rating difference" usage that didn't work out well/help at all
"""
train = pred_df.dropna(subset=['Rating Diff'])
test = pred_df[pred_df['Rating Diff'].isnull()]
X_train = train.drop(columns=['Rating Diff', 'Player Name', 'Next_Rating', 'Next_Season', 'Pos', 'Squad', 'League', 'Season'])
y_train = train['Rating Diff']

model.fit(X_train, y_train)
y_pred = model.predict(X_test)
"""

"\ntrain = pred_df.dropna(subset=['Rating Diff'])\ntest = pred_df[pred_df['Rating Diff'].isnull()]\nX_train = train.drop(columns=['Rating Diff', 'Player Name', 'Next_Rating', 'Next_Season', 'Pos', 'Squad', 'League', 'Season'])\ny_train = train['Rating Diff']\n\nmodel.fit(X_train, y_train)\ny_pred = model.predict(X_test)\n"

In [15]:
#X_train.columns

In [16]:
#X_test = X_test.drop(columns=['Season_2021-2022', 'Season_2022-2023', 'Season_2023-2024', 'Pos_DF', 'Pos_DF,MF', 'Pos_MF', 'Pos_MF,DF', 'League_Bundesliga', 'League_La Liga', 'League_Ligue 1', 'League_Premier League', 'League_Serie A'], errors="ignore")

In [17]:
#X_test.columns

In [18]:
#pd.read_html('https://fbref.com/en/comps/Big5/2023-2024/2023-2024-Big-5-European-Leagues-Stats')

In [19]:
# from descriptive model
best_params = {
    "objective": "reg:squarederror",
    "max_depth": 3,
    "learning_rate": 0.0605487138909333,
    "subsample": 0.6177495307302218,
    "colsample_bytree": 0.6524095821589513,
    "reg_alpha": 0.6304827381123799,
    "reg_lambda": 0.023966067481592194,
    "min_child_weight": 8,
    "seed": 42
}

In [20]:
#df.columns