In [1]:
import pandas as pd
import xgboost as xgb
from sklearn.metrics import mean_absolute_error
from scipy.stats import spearmanr
from sklearn.preprocessing import StandardScaler

In [2]:
data = pd.read_csv("final_combined_with_labels.csv")

# fix result column
result_split = data['Result'].str.split('-', expand=True).astype(float)
data['Wins'] = result_split[0]
data['Draws'] = result_split[1]
data['Losses'] = result_split[2]

data = data.drop(columns=['Result'])

# fix player column

data['Player_ID'] = data['Player'].factorize()[0]
id_to_player = dict(zip(data['Player_ID'], data['Player']))

numeric_cols = data.columns.difference(['Player']).tolist()
data[numeric_cols] = data[numeric_cols].apply(pd.to_numeric, errors='coerce').fillna(0)

data.to_csv("final_ready_for_modeling_with_id.csv", index=False)

#data split
data = pd.read_csv("final_ready_for_modeling_with_id.csv")

train_data = data[data['Year'] < 2024]
predict_data = data[data['Year'] == 2024]

X_train = train_data.drop(columns=['Rank', 'Year', 'Player'])
y_train = train_data['Rank']

X_predict = predict_data.drop(columns=['Rank', 'Year', 'Player'])

X_train.to_csv("X_train.csv", index=False)
y_train.to_csv("y_train.csv", index=False)
X_predict.to_csv("X_predict_2024.csv", index=False)

In [3]:
X_train = pd.read_csv("X_train.csv")
y_train = pd.read_csv("y_train.csv").squeeze()
X_predict = pd.read_csv("X_predict_2024.csv")

# standardize
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_predict = scaler.transform(X_predict)

# init xgb
model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)
model.fit(X_train, y_train)

# eval
y_train_pred = model.predict(X_train)
mae = mean_absolute_error(y_train, y_train_pred)
spearman_corr, _ = spearmanr(y_train, y_train_pred)

print(f"MAE: {mae}")
print(f"rank correlation: {spearman_corr}")

# predict!
predicted_ranks_2024 = model.predict(X_predict)

# get player ids back
X_predict_2024 = pd.read_csv("X_predict_2024.csv")
X_predict_2024['Predicted_Rank'] = predicted_ranks_2024

# rank players
X_predict_2024['Player'] = X_predict_2024['Player_ID'].map(id_to_player)
predicted_ranks_2024_sorted = X_predict_2024[['Player', 'Predicted_Rank']].sort_values(by='Predicted_Rank')
predicted_ranks_2024_sorted.to_csv("predicted_2024_ranks_with_names.csv", index=False)

MAE: 0.014224567560739415
rank correlation: 0.9976813926542865


In [5]:
predicted_data = pd.read_csv("predicted_2024_ranks_with_names.csv")

# sort by predicted rank
predicted_data_sorted = predicted_data.sort_values(by="Predicted_Rank").reset_index(drop=True)

# add integer rankings
predicted_data_sorted["Rank"] = predicted_data_sorted.index + 1  # +1 to start ranking from 1

# save final input
predicted_data_sorted = predicted_data_sorted[['Player', 'Rank']]
predicted_data_sorted.to_csv("final_predicted_2024_rankings.csv", index=False)
print(predicted_data_sorted)

              Player  Rank
0         Harry Kane     1
1     Erling Haaland     2
2      Kylian Mbappé     3
3         Toni Kroos     4
4         Phil Foden     5
5    Jude Bellingham     6
6      Florian Wirtz     7
7   Lautaro Martínez     8
8          Dani Olmo     9
9    Vinícius Júnior    10
10     Nico Williams    11
11      Lamine Yamal    12
12             Rodri    13
13   Ademola Lookman    14
14     Dani Carvajal    15
