In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [1]:
from Data_Enrichment import get_features

RAW_DIR = "data/raw"
df_feats, feature_cols = get_features(RAW_DIR)
df_feats = df_feats[ (df_feats['season_end_year'] != 2026)]



  df["ballon_dor_winner"] = df["ballon_dor_winner"].fillna(False).astype(bool)


In [2]:
def corr_var(df_feats):
    df_feats.drop(columns=['clean_sheets', 'nb_in_group', 'matches_played', 'g_per90', 'ga_per90'], inplace=True)
    return df_feats
df = corr_var(df_feats)

In [19]:
# Filtro limpio: deja las variables temporales relevantes + identificación + target
df_model = df.filter(
    regex=r'(player_id|player_name|lag1|delta|_w$|height|main_position|position|season_end_year|ballon_dor_winner)'
).copy()


In [20]:
# Entrenamiento: todas las temporadas hasta 2024
train = df_model[df_model['season_end_year'] < 2025]

# Predicción: temporada 2025
test = df_model[df_model['season_end_year'] == 2025]


In [21]:
# Features y target
X_train = train.drop(columns=['ballon_dor_winner'])
y_train = train['ballon_dor_winner']

X_test = test.drop(columns=['ballon_dor_winner'])
y_test = test['ballon_dor_winner']


In [22]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Identificar tipos
num_cols = X_train.select_dtypes(include='number').columns.drop(['season_end_year'], errors='ignore')
cat_cols = ['position', 'main_position']

# Preprocesador
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), num_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore'), cat_cols)
], remainder='drop')


In [15]:
from sklearn.ensemble import RandomForestClassifier

rf = Pipeline([
    ('prep', preprocessor),
    ('model', RandomForestClassifier(
        n_estimators=300,
        max_depth=None,
        random_state=42,
        class_weight='balanced_subsample'
    ))
])

rf.fit(X_train, y_train)


0,1,2
,steps,"[('prep', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,n_estimators,300
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [16]:
from sklearn.metrics import classification_report

y_pred = rf.predict(X_test)
y_proba = rf.predict_proba(X_test)[:, 1]

print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

       False       1.00      1.00      1.00      3503

    accuracy                           1.00      3503
   macro avg       1.00      1.00      1.00      3503
weighted avg       1.00      1.00      1.00      3503



In [17]:
test = test.copy()
test['pred_proba'] = y_proba

top_candidates = (
    test[['player_name', 'season_end_year', 'pred_proba']]
    .sort_values('pred_proba', ascending=False)
)

top_candidates.head(10)


Unnamed: 0,player_name,season_end_year,pred_proba
56680,Julián Alvarez (576024),2025,0.036667
40075,Robert Lewandowski (38253),2025,0.033333
43160,Lautaro Martínez (406625),2025,0.03
43756,Bryan Mbeumo (413039),2025,0.03
34628,Kylian Mbappé (342229),2025,0.02
38500,Federico Valverde (369081),2025,0.02
44223,Erling Haaland (418560),2025,0.016667
6136,Harry Kane (132098),2025,0.016667
43576,Raphinha (411295),2025,0.016667
50207,Evann Guessand (500689),2025,0.013333


In [27]:
# Extraer el modelo RandomForest ya entrenado
model = rf.named_steps['model']

# Obtener las columnas transformadas del preprocesador
# (OneHotEncoder genera columnas nuevas, así que hay que recuperarlas)
encoder = rf.named_steps['prep'].named_transformers_['cat']
encoded_cols = encoder.get_feature_names_out(['position', 'main_position'])

num_cols_scaled = rf.named_steps['prep'].named_transformers_['num'].feature_names_in_

# Combinar nombres de todas las features finales del pipeline
all_features = np.concatenate([num_cols_scaled, encoded_cols])

# Importancias del modelo
importances = model.feature_importances_

# Crear DataFrame ordenado
feat_imp = pd.DataFrame({
    'feature': all_features,
    'importance': importances
}).sort_values('importance', ascending=False)

feat_imp.head(20)


Unnamed: 0,feature,importance
17,matches_played_z_lag1,0.143851
5,a_per90_z_lag1,0.121202
3,g_per90_z_lag1,0.099438
1,ga_per90_z_lag1,0.091393
13,pen_share_z_lag1,0.077281
4,g_per90_z_delta,0.057318
21,a_per90_w,0.052728
2,ga_per90_z_delta,0.047858
20,g_per90_w,0.04017
6,a_per90_z_delta,0.036999


In [31]:
a = feat_imp["importance"].sum()
print(r"the 20 most important features explain ", round((feat_imp.head(20)["importance"].sum()/a)*100,2), "% of the model")

the 20 most important features explain  94.95 % of the model


In [26]:
# Top 20 important feature names
top_features = feat_imp.head(20)['feature'].values
top_features



array(['matches_played_z_lag1', 'a_per90_z_lag1', 'g_per90_z_lag1',
       'ga_per90_z_lag1', 'pen_share_z_lag1', 'g_per90_z_delta',
       'a_per90_w', 'ga_per90_z_delta', 'g_per90_w', 'a_per90_z_delta',
       'ga_per90_w', 'main_position_Attack', 'pen_share_w',
       'minutes_played_z_lag1', 'minutes_played_z_delta',
       'position_Attack - Right Winger', 'clean_sheet_rate_z_lag1',
       'height', 'pen_share_z_delta', 'matches_played_z_delta'],
      dtype=object)