In [None]:
from Data_Enrichment import get_features

RAW_DIR = "data/raw"
df_feats, feature_cols = get_features(RAW_DIR)
df_feats = df_feats[(df_feats['season_end_year'] != 2026)]

  df["ballon_dor_winner"] = df["ballon_dor_winner"].fillna(False).astype(bool)


In [None]:
# 🔹 Filtro limpio manteniendo identificador y variables relevantes
df_model = df_feats.filter(
    regex=r'(player_id|player_name|lag1|delta|_w$|height|main_position|position|season_end_year|ballon_dor_winner)'
).copy()


In [4]:
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

In [7]:
# Target
y = df_model['ballon_dor_winner'].values

# Features (excluyendo columnas no numéricas o irrelevantes)
X = df_model.drop(columns=['ballon_dor_winner', 'player_name']).values


In [8]:
# Seleccionar solo columnas numéricas
X_numeric = df_model.select_dtypes(include=['int64', 'float64',"bool"])


In [9]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_numeric)


In [10]:
np.random.seed(30) #we fix the seed so we can reproduce the data any time
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_numeric, X_numeric["ballon_dor_winner"], test_size=0.3, random_state=42)

In [11]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Fit Lasso regression model
lasso = Lasso(alpha=0.1)
lasso.fit(X_train, y_train)

0,1,2
,alpha,0.1
,fit_intercept,True
,precompute,False
,copy_X,True
,max_iter,1000
,tol,0.0001
,warm_start,False
,positive,False
,random_state,
,selection,'cyclic'


In [12]:
y_pred = lasso.predict(X_test)
print(y_pred.mean())

0.00018837713101629464


In [13]:
# Model Score
print("Model Score: ", lasso.score(X_test, y_test))

Model Score:  -0.00023735783127243337


chat help me this part

In [15]:
import pandas as pd
import numpy as np

# Target
y = df_model['ballon_dor_winner'].values  # 1 si ganó, 0 si no

# Features
# Excluimos target y player_name
X = df_model.drop(columns=['ballon_dor_winner', 'player_name']).copy()

# Detectar columnas no numéricas
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
print("Categorical columns:", categorical_cols)

# Codificar variables categóricas
X_encoded = pd.get_dummies(X, columns=categorical_cols, drop_first=True)
print("Shape after encoding:", X_encoded.shape)

# Convertimos a numpy array
X_values = X_encoded.values


Categorical columns: ['position', 'main_position']
Shape after encoding: (30335, 45)


In [16]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_values, y, test_size=0.2, random_state=42, stratify=y
)


In [17]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [18]:
from sklearn.linear_model import LogisticRegression

log_lasso = LogisticRegression(
    penalty='l1',      # Lasso
    solver='saga',     # necesario para L1
    class_weight='balanced',  # ayuda con clases desbalanceadas
    max_iter=10000,
    random_state=42
)

# Entrenamos
log_lasso.fit(X_train_scaled, y_train)


0,1,2
,penalty,'l1'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,42
,solver,'saga'
,max_iter,10000


In [19]:
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score

# Predicciones de probabilidad
y_pred_prob = log_lasso.predict_proba(X_test_scaled)[:,1]

# Predicciones binarias con threshold 0.5
y_pred = (y_pred_prob >= 0.5).astype(int)

# Métricas
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_pred_prob))


Accuracy: 0.9962089995055217
Precision: 0.08
Recall: 1.0
ROC AUC: 0.999752679307502


In [23]:
# Obtener índices del test set
_, X_test_idx, _, _ = train_test_split(
    np.arange(len(df_feats)), y, test_size=0.2, random_state=42, stratify=y
)

# DataFrame con nombres y probabilidades
df_preds = df_feats.iloc[X_test_idx][['player_name']].copy()
df_preds['pred_prob'] = y_pred_prob

# Top 10 jugadores según modelo
top_players = df_preds.sort_values(by='pred_prob', ascending=False).head(30)
print(top_players)


                        player_name  pred_prob
11422          Lionel Messi (28003)   0.999746
11425          Lionel Messi (28003)   0.998363
3930         Mohamed Salah (148455)   0.994841
7574        Andrés Guardado (20506)   0.987059
11430          Lionel Messi (28003)   0.976325
7571        Andrés Guardado (20506)   0.976063
12504         João Moutinho (29364)   0.957420
11400           Luka Modrić (27992)   0.955709
21955        İlkay Gündoğan (53622)   0.938755
12502         João Moutinho (29364)   0.925021
18193         Robert Tesche (41458)   0.884397
4259          Santi Cazorla (15799)   0.847553
16709    Robert Lewandowski (38253)   0.840938
17306     Maxence Caqueret (395237)   0.814383
13746          Ivan Rakitic (32467)   0.809168
11429          Lionel Messi (28003)   0.808394
21884           Kevin Kampl (53418)   0.786943
5137   Giovanni Di Lorenzo (169880)   0.753413
17423          Mats Hummels (39728)   0.740031
3932         Mohamed Salah (148455)   0.669156
3928         

In [22]:
# Coeficientes del modelo
coef = log_lasso.coef_[0]
feature_names = X_encoded.columns

# Ordenar por importancia absoluta
feature_importance = pd.DataFrame({
    'feature': feature_names,
    'coef': coef
}).sort_values(by='coef', key=abs, ascending=False)

print(feature_importance.head(30))


                                   feature      coef
17                   minutes_played_z_lag1  2.911996
13                  discipline_rate_z_lag1 -2.799792
18                  minutes_played_z_delta  2.204616
0                                player_id -1.736228
11                 clean_sheet_rate_z_lag1 -1.566491
38    position_Midfield - Central Midfield  1.291414
14                 discipline_rate_z_delta -1.277440
30          position_Attack - Right Winger  1.037381
9                          gc_per90_z_lag1 -0.918535
28        position_Attack - Centre-Forward -0.888304
5                           g_per90_z_lag1  0.877814
1                                   height -0.848371
6                          g_per90_z_delta  0.826517
2                          season_end_year -0.607835
27                             pen_share_w -0.605253
7                           a_per90_z_lag1  0.468218
12                clean_sheet_rate_z_delta -0.461029
44                  main_position_Midfield  0.