In [5]:
from Data_Enrichment import get_features

RAW_DIR = "data/raw"
df_feats, feature_cols = get_features(RAW_DIR)
df_feats = df_feats[(df_feats['season_end_year'] != 2026)]

  df["ballon_dor_winner"] = df["ballon_dor_winner"].fillna(False).astype(bool)


In [6]:
# 🔹 Filtro limpio manteniendo identificador y variables relevantes + age y age_penalty
df_model = df_feats.filter(
    regex=r'(player_id|player_name|lag1|delta|_w$|height|main_position|position|season_end_year|ballon_dor_winner|age|age_penalty)'
).copy()



In [7]:
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

In [8]:
# Target
y = df_model['ballon_dor_winner'].values

# Features (excluyendo columnas no numéricas o irrelevantes)
X = df_model.drop(columns=['ballon_dor_winner', 'player_name']).values


In [9]:
# Seleccionar solo columnas numéricas
X_numeric = df_model.select_dtypes(include=['int64', 'float64',"bool"])


In [10]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_numeric)


In [11]:
np.random.seed(30) #we fix the seed so we can reproduce the data any time
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_numeric, X_numeric["ballon_dor_winner"], test_size=0.3, random_state=42)

In [12]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Fit Lasso regression model
lasso = Lasso(alpha=0.1)
lasso.fit(X_train, y_train)

0,1,2
,alpha,0.1
,fit_intercept,True
,precompute,False
,copy_X,True
,max_iter,1000
,tol,0.0001
,warm_start,False
,positive,False
,random_state,
,selection,'cyclic'


In [13]:
y_pred = lasso.predict(X_test)
print(y_pred.mean())

0.0002354825036499788


In [14]:
# Model Score
print("Model Score: ", lasso.score(X_test, y_test))

Model Score:  -9.479042004900684e-05


chat help me this part

In [15]:
import pandas as pd
import numpy as np

# Target
y = df_model['ballon_dor_winner'].values  # 1 si ganó, 0 si no

# Features
# Excluimos target y player_name
X = df_model.drop(columns=['ballon_dor_winner', 'player_name']).copy()

# Detectar columnas no numéricas
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
print("Categorical columns:", categorical_cols)

# Codificar variables categóricas
X_encoded = pd.get_dummies(X, columns=categorical_cols, drop_first=True)
print("Shape after encoding:", X_encoded.shape)

# Convertimos a numpy array
X_values = X_encoded.values


Categorical columns: ['position', 'main_position']
Shape after encoding: (30333, 47)


In [16]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_values, y, test_size=0.2, random_state=42, stratify=y
)


In [17]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [18]:
from sklearn.linear_model import LogisticRegression

log_lasso = LogisticRegression(
    penalty='l1',      # Lasso
    solver='saga',     # necesario para L1
    class_weight='balanced',  # ayuda con clases desbalanceadas
    max_iter=10000,
    random_state=42
)

# Entrenamos
log_lasso.fit(X_train_scaled, y_train)


0,1,2
,penalty,'l1'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,42
,solver,'saga'
,max_iter,10000


In [19]:
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score

# Predicciones de probabilidad
y_pred_prob = log_lasso.predict_proba(X_test_scaled)[:,1]

# Predicciones binarias con threshold 0.5
y_pred = (y_pred_prob >= 0.5).astype(int)

# Métricas
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_pred_prob))


Accuracy: 0.9953848689632437
Precision: 0.06666666666666667
Recall: 1.0
ROC AUC: 0.9996702390766694


In [25]:
# Coeficientes del modelo
coef = log_lasso.coef_[0]
feature_names = X_encoded.columns

# Ordenar por importancia absoluta
feature_importance = pd.DataFrame({
    'feature': feature_names,
    'coef': coef
}).sort_values(by='coef', key=abs, ascending=False)

print(feature_importance.head(30))


                                   feature      coef
2                                      age  3.470757
15                  discipline_rate_z_lag1 -2.123673
19                   minutes_played_z_lag1  1.997682
20                  minutes_played_z_delta  1.640536
13                 clean_sheet_rate_z_lag1 -1.190149
4                              age_penalty  1.157556
32          position_Attack - Right Winger  1.025573
30        position_Attack - Centre-Forward -1.025035
40    position_Midfield - Central Midfield  0.995306
16                 discipline_rate_z_delta -0.912661
7                           g_per90_z_lag1  0.867736
8                          g_per90_z_delta  0.780189
11                         gc_per90_z_lag1 -0.631226
1                                   height -0.598671
0                                player_id  0.556864
39  position_Midfield - Attacking Midfield -0.526554
29                             pen_share_w -0.442973
9                           a_per90_z_lag1  0.

In [26]:
# Asegurarse de que df_feats y y tengan el mismo tamaño
min_len = min(len(df_feats), len(y))
df_feats_aligned = df_feats.iloc[:min_len].copy()
y_aligned = y[:min_len]

# Ahora sí split
_, X_test_idx, _, _ = train_test_split(
    np.arange(len(df_feats_aligned)),
    y_aligned,
    test_size=0.2,
    random_state=42,
    stratify=y_aligned
)

# DataFrame de predicciones
df_preds = pd.DataFrame({
    "player_name": df_feats_aligned.iloc[X_test_idx]["player_name"].values,
    "pred_prob": y_pred_prob
})

# Top 30 jugadores
top_players = df_preds.sort_values(by='pred_prob', ascending=False).head(30)
print(top_players)


                     player_name  pred_prob
2167        Lionel Messi (28003)   0.999687
1922        Lionel Messi (28003)   0.998585
863       Mohamed Salah (148455)   0.993012
2045     Andrés Guardado (20506)   0.992008
1765     Erling Haaland (418560)   0.978716
4702        Lionel Messi (28003)   0.974222
2057     Andrés Guardado (20506)   0.972529
4683         Luka Modrić (27992)   0.969759
3438       João Moutinho (29364)   0.963223
4901       Santi Cazorla (15799)   0.955721
3329       João Moutinho (29364)   0.911650
1692        Lionel Messi (28003)   0.907085
565         Thiago Silva (29241)   0.833543
2563               Pedro (65278)   0.816417
4534       José Callejón (61253)   0.791049
2616       Robert Tesche (41458)   0.771288
1849         Luka Modrić (27992)   0.764255
4218      Kylian Mbappé (342229)   0.755609
4770      Thomas Mangani (18934)   0.743010
1052      İlkay Gündoğan (53622)   0.733234
4657        Thiago Silva (29241)   0.683695
5385      Mohamed Salah (148455)

In [33]:
from Data_Enrichment import get_features
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

# --- 1️⃣ Cargar datos y features ---
RAW_DIR = "data/raw"
df_feats, feature_cols = get_features(RAW_DIR)

# --- 2️⃣ Entrenamos solo con datos hasta 2024 ---
df_train = df_feats[df_feats["season_end_year"] < 2025].copy()

# Extraemos y usamos solo jugadores que **tienen datos de 2025**
df_2025 = df_feats[df_feats["season_end_year"] == 2025].copy()

# --- 3️⃣ Preparar X_train y y_train ---
X_train = df_train[feature_cols].fillna(0)
y_train = df_train["ballon_dor_winner"]  # tu variable objetivo histórica

# Escalar features (recomendado para Lasso)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# --- 4️⃣ Entrenar modelo Lasso (LogisticRegression L1) ---
lasso_clf = LogisticRegression(
    penalty="l1",
    solver="liblinear",
    class_weight="balanced",
    random_state=42,
    max_iter=500
)
lasso_clf.fit(X_train_scaled, y_train)

# --- 5️⃣ Preparar X_2025 ---
X_2025 = df_2025[feature_cols].fillna(0)
X_2025_scaled = scaler.transform(X_2025)

# --- 6️⃣ Predicción para 2025 ---
df_2025["pred_prob"] = lasso_clf.predict_proba(X_2025_scaled)[:, 1]

# --- 7️⃣ Top jugadores 2025 (solo los que existen) ---
top_2025 = df_2025[["player_name","team_name","pred_prob"]].sort_values(by="pred_prob", ascending=False).head(30)
print(top_2025)


  df["ballon_dor_winner"] = df["ballon_dor_winner"].fillna(False).astype(bool)


                        player_name           team_name  pred_prob
16714    Robert Lewandowski (38253)        FC Barcelona   0.727490
21955        İlkay Gündoğan (53622)     Manchester City   0.413135
23524         Thomas Müller (58358)       Bayern Munich   0.229501
22619    Henrikh Mkhitaryan (55735)         Inter Milan   0.097941
3936         Mohamed Salah (148455)        Liverpool FC   0.033521
8319         Lucas Vázquez (221316)         Real Madrid   0.009321
1089                  Neto (111819)     AFC Bournemouth   0.004241
19269          Rui Patrício (45026)         Atalanta BC   0.002539
18443           Yann Sommer (42205)         Inter Milan   0.002350
2858           David López (129444)           Girona FC   0.001724
7913       Florian Tardieu (212804)    AS Saint-Étienne   0.001648
1074         Florent Hanin (111811)          Angers SCO   0.000536
11411           Luka Modrić (27992)         Real Madrid   0.000495
9270         Steve Mandanda (23951)    Stade Rennais FC   0.00

In [34]:
import pandas as pd
import numpy as np

# --- coeficientes del Lasso ---
coef = lasso_clf.coef_[0]  # array con coeficientes para cada feature
features = feature_cols     # mismas columnas usadas en X_train_scaled

# Crear DataFrame para visualizar
df_importance = pd.DataFrame({
    "feature": features,
    "coefficient": coef
})

# Calcular importancia absoluta y ordenar
df_importance["abs_coef"] = df_importance["coefficient"].abs()
df_importance = df_importance.sort_values(by="abs_coef", ascending=False)

print(df_importance.head(20))  # top 20 factores predictivos


                   feature  coefficient  abs_coef
11             nb_in_group     4.796242  4.796242
15                     age     4.502292  4.502292
7           minutes_played     2.478231  2.478231
38          yellow_cards_z    -1.868070  1.868070
16          matches_played    -1.647052  1.647052
18                 a_per90     1.433397  1.433397
52  discipline_rate_z_lag1    -1.256349  1.256349
32        minutes_played_z     1.145997  1.145997
66             pen_share_w    -1.120941  1.120941
35               assists_z     1.091506  1.091506
9             clean_sheets    -1.063449  1.063449
62               a_per90_w    -1.047098  1.047098
44          g_per90_z_lag1    -1.021510  1.021510
8           goals_conceded    -0.845021  0.845021
34                 goals_z    -0.769580  0.769580
56   minutes_played_z_lag1     0.721923  0.721923
14                  height    -0.715859  0.715859
47         a_per90_z_delta    -0.615614  0.615614
36         penalty_goals_z     0.574789  0.574789
