In [20]:
from Data_Enrichment import get_features

RAW_DIR = "data/raw"
df_feats, feature_cols = get_features(RAW_DIR)
df_feats = df_feats[(df_feats['season_end_year'] != 2026)]

  df["ballon_dor_winner"] = df["ballon_dor_winner"].fillna(False).astype(bool)


In [21]:
# 🔹 Filtro limpio manteniendo identificador y variables relevantes + age y age_penalty
df_model = df_feats.filter(
    regex=r'(player_id|player_name|lag1|delta|_w$|height|main_position|position|season_end_year|ballon_dor_winner|age|age_penalty)'
).copy()



In [22]:
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

In [23]:
# Target
y = df_model['ballon_dor_winner'].values

# Features (excluyendo columnas no numéricas o irrelevantes)
X = df_model.drop(columns=['ballon_dor_winner', 'player_name']).values


In [24]:
# Seleccionar solo columnas numéricas
X_numeric = df_model.select_dtypes(include=['int64', 'float64',"bool"])


In [6]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_numeric)


In [7]:
np.random.seed(30) #we fix the seed so we can reproduce the data any time
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_numeric, X_numeric["ballon_dor_winner"], test_size=0.3, random_state=42)

In [8]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Fit Lasso regression model
lasso = Lasso(alpha=0.1)
lasso.fit(X_train, y_train)

0,1,2
,alpha,0.1
,fit_intercept,True
,precompute,False
,copy_X,True
,max_iter,1000
,tol,0.0001
,warm_start,False
,positive,False
,random_state,
,selection,'cyclic'


In [9]:
y_pred = lasso.predict(X_test)
print(y_pred.mean())

0.00011516093741003046


In [10]:
# Model Score
print("Model Score: ", lasso.score(X_test, y_test))

Model Score:  -8.776524989739798e-05


chat help me this part

In [11]:
import pandas as pd
import numpy as np

# Target
y = df_model['ballon_dor_winner'].values  # 1 si ganó, 0 si no

# Features
# Excluimos target y player_name
X = df_model.drop(columns=['ballon_dor_winner', 'player_name']).copy()

# Detectar columnas no numéricas
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
print("Categorical columns:", categorical_cols)

# Codificar variables categóricas
X_encoded = pd.get_dummies(X, columns=categorical_cols, drop_first=True)
print("Shape after encoding:", X_encoded.shape)

# Convertimos a numpy array
X_values = X_encoded.values


Categorical columns: ['position', 'main_position']
Shape after encoding: (74430, 50)


In [12]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_values, y, test_size=0.2, random_state=42, stratify=y
)


In [13]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [14]:
from sklearn.linear_model import LogisticRegression

log_lasso = LogisticRegression(
    penalty='l1',      # Lasso
    solver='saga',     # necesario para L1
    class_weight='balanced',  # ayuda con clases desbalanceadas
    max_iter=10000,
    random_state=42
)

# Entrenamos
log_lasso.fit(X_train_scaled, y_train)


0,1,2
,penalty,'l1'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,42
,solver,'saga'
,max_iter,10000


In [15]:
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score

# Predicciones de probabilidad
y_pred_prob = log_lasso.predict_proba(X_test_scaled)[:,1]

# Predicciones binarias con threshold 0.5
y_pred = (y_pred_prob >= 0.5).astype(int)

# Métricas
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_pred_prob))


Accuracy: 0.9985892785167271
Precision: 0.08695652173913043
Recall: 1.0
ROC AUC: 0.9999328137597421


In [16]:
# Coeficientes del modelo
coef = log_lasso.coef_[0]
feature_names = X_encoded.columns

# Ordenar por importancia absoluta
feature_importance = pd.DataFrame({
    'feature': feature_names,
    'coef': coef
}).sort_values(by='coef', key=abs, ascending=False)

print(feature_importance.head(30))


                                 feature      coef
5                            age_penalty -5.041675
2                                    age -3.490038
16                discipline_rate_z_lag1 -2.551511
20                 minutes_played_z_lag1  2.240530
4                               age_norm  2.119805
21                minutes_played_z_delta  1.889588
14               clean_sheet_rate_z_lag1 -1.282808
17               discipline_rate_z_delta -1.261965
10                        a_per90_z_lag1  1.188719
22                 matches_played_z_lag1  1.139992
11                       a_per90_z_delta  1.055513
43  position_Midfield - Central Midfield  0.998525
3                        season_end_year  0.770680
9                        g_per90_z_delta  0.751166
33        position_Attack - Right Winger  0.625495
7                       ga_per90_z_delta -0.552206
6                        ga_per90_z_lag1  0.481290
40                   position_Goalkeeper -0.437109
48              main_position_G

In [17]:
# Asegurarse de que df_feats y y tengan el mismo tamaño
min_len = min(len(df_feats), len(y))
df_feats_aligned = df_feats.iloc[:min_len].copy()
y_aligned = y[:min_len]

# Ahora sí split
_, X_test_idx, _, _ = train_test_split(
    np.arange(len(df_feats_aligned)),
    y_aligned,
    test_size=0.2,
    random_state=42,
    stratify=y_aligned
)

# DataFrame de predicciones
df_preds = pd.DataFrame({
    "player_name": df_feats_aligned.iloc[X_test_idx]["player_name"].values,
    "pred_prob": y_pred_prob
})

# Top 30 jugadores
top_players = df_preds.sort_values(by='pred_prob', ascending=False).head(30)
print(top_players)


                             player_name  pred_prob
1355                Lionel Messi (28003)   0.999836
12779               Lionel Messi (28003)   0.999498
7958                Lionel Messi (28003)   0.992303
2063                Borja Valero (40372)   0.986305
10775                Luka Modrić (27992)   0.971675
5287           Domenico Berardi (177843)   0.971110
1578            Cristiano Ronaldo (8198)   0.875723
7337                Marco Parolo (84545)   0.848869
10576              João Moutinho (29364)   0.833911
10212             Kylian Mbappé (342229)   0.813694
13453                Luis Suárez (44352)   0.802989
14069             Ciro Immobile (105521)   0.774361
5464           Mathieu Coutadeur (39914)   0.706748
106           Robert Lewandowski (38253)   0.700690
6978            Cristiano Ronaldo (8198)   0.696871
4681                 Luka Modrić (27992)   0.682553
10175            Álvaro González (55957)   0.667494
10131              Santi Cazorla (15799)   0.639380
1595   Pierr

In [18]:
from Data_Enrichment import get_features
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# --- 1️⃣ Cargar datos y features ---
RAW_DIR = "data/raw"
df_feats, feature_cols = get_features(RAW_DIR)

# --- 2️⃣ Entrenamos solo con datos hasta 2024 ---
df_train = df_feats[df_feats["season_end_year"] < 2025].copy()

# Extraemos y usamos solo jugadores que **tienen datos de 2025**
df_2025 = df_feats[df_feats["season_end_year"] == 2025].copy()

# --- 3️⃣ Preparar X_train y y_train ---
X_train = df_train[feature_cols].fillna(0)
y_train = df_train["ballon_dor_winner"]  # tu variable objetivo histórica

# Escalar features (recomendado para Lasso)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# --- 4️⃣ Entrenar modelo Lasso (LogisticRegression L1) ---
lasso_clf = LogisticRegression(
    penalty="l1",
    solver="liblinear",
    class_weight="balanced",
    random_state=42,
    max_iter=500
)
lasso_clf.fit(X_train_scaled, y_train)

# --- 5️⃣ Preparar X_2025 ---
X_2025 = df_2025[feature_cols].fillna(0)
X_2025_scaled = scaler.transform(X_2025)

# --- 6️⃣ Predicción para 2025 ---
df_2025["pred_prob"] = lasso_clf.predict_proba(X_2025_scaled)[:, 1]

# --- 7️⃣ Top jugadores 2025 (solo los que existen) ---
top_2025 = df_2025[["player_name","team_name","pred_prob"]].sort_values(by="pred_prob", ascending=False).head(30)
print(top_2025)


  df["ballon_dor_winner"] = df["ballon_dor_winner"].fillna(False).astype(bool)


                          player_name            team_name  pred_prob
66955                Savinho (743591)      Manchester City   0.876977
40075      Robert Lewandowski (38253)         FC Barcelona   0.142439
71200     Warren Zaïre-Emery (810092)  Paris Saint-Germain   0.054702
74892           Lamine Yamal (937958)         FC Barcelona   0.036178
52736          İlkay Gündoğan (53622)      Manchester City   0.005221
65213        Bradley Barcola (708265)  Paris Saint-Germain   0.003395
54571      Henrikh Mkhitaryan (55735)          Inter Milan   0.002951
57793           Thomas Müller (58358)        Bayern Munich   0.002851
62568     Cristhian Mosquera (646750)          Valencia CF   0.001325
51730       Miguel Gutiérrez (525299)            Girona FC   0.000564
46456         Angelo Stiller (443710)        VfB Stuttgart   0.000559
27189        Ousmane Dembélé (288230)  Paris Saint-Germain   0.000454
50360  Khvicha Kvaratskhelia (502670)           SSC Napoli   0.000445
62449     Keane Lewi

In [19]:
import pandas as pd
import numpy as np

# --- coeficientes del Lasso ---
coef = lasso_clf.coef_[0]  # array con coeficientes para cada feature
features = feature_cols     # mismas columnas usadas en X_train_scaled

# Crear DataFrame para visualizar
df_importance = pd.DataFrame({
    "feature": features,
    "coefficient": coef
})

# Calcular importancia absoluta y ordenar
df_importance["abs_coef"] = df_importance["coefficient"].abs()
df_importance = df_importance.sort_values(by="abs_coef", ascending=False)

print(df_importance.head(20))  # top 20 factores predictivos


                   feature  coefficient  abs_coef
42             age_penalty    -7.898795  7.898795
15                     age    -4.747883  4.747883
11             nb_in_group     3.688843  3.688843
46         g_per90_z_delta     3.248781  3.248781
53  discipline_rate_z_lag1    -3.160270  3.160270
44        ga_per90_z_delta    -2.967738  2.967738
32        minutes_played_z     2.816427  2.816427
7           minutes_played     2.653266  2.653266
6         direct_red_cards    -2.579832  2.579832
18                 a_per90     2.344697  2.344697
16          matches_played    -2.099157  2.099157
34                 goals_z     1.804765  1.804765
38          yellow_cards_z    -1.665633  1.665633
10             nb_on_pitch    -1.426254  1.426254
9             clean_sheets    -1.297433  1.297433
57   minutes_played_z_lag1     1.217977  1.217977
63               a_per90_w    -1.217812  1.217812
60  matches_played_z_delta    -1.156930  1.156930
13               subed_out     1.134789  1.134789
