In [1]:
from Data_Enrichment import get_features

RAW_DIR = "data/raw"
df_feats, feature_cols = get_features(RAW_DIR)
df_feats = df_feats[(df_feats['season_end_year'] != 2026)]

  df["ballon_dor_winner"] = df["ballon_dor_winner"].fillna(False).astype(bool)


In [2]:
# 🔹 Filtro limpio manteniendo identificador y variables relevantes + age y age_penalty
df_model = df_feats.filter(
    regex=r'(player_id|player_name|lag1|delta|_w$|height|main_position|position|season_end_year|ballon_dor_winner|age|age_penalty)'
).copy()


In [3]:
import pandas as pd
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

In [4]:
# Target
y = df_model['ballon_dor_winner'].values

# Features (excluyendo columnas no numéricas o irrelevantes)
X = df_model.drop(columns=['ballon_dor_winner', 'player_name']).values


In [5]:
# Seleccionar solo columnas numéricas
X_numeric = df_model.select_dtypes(include=['int64', 'float64',"bool"])


In [6]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X_numeric)


In [11]:
np.random.seed(30) #we fix the seed so we can reproduce the data any time
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_numeric, X_numeric["ballon_dor_winner"], test_size=0.3, random_state=42)

In [12]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Fit Lasso regression model
lasso = Lasso(alpha=0.1)
lasso.fit(X_train, y_train)

ValueError: Input X contains NaN.
Lasso does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:
y_pred = lasso.predict(X_test)
print(y_pred.mean())

In [None]:
# Model Score
print("Model Score: ", lasso.score(X_test, y_test))

chat help me this part

In [7]:
import pandas as pd
import numpy as np

# Target
y = df_model['ballon_dor_winner'].values  # 1 si ganó, 0 si no

# Features
# Excluimos target y player_name
X = df_model.drop(columns=['ballon_dor_winner', 'player_name']).copy()

# Detectar columnas no numéricas
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
print("Categorical columns:", categorical_cols)

# Codificar variables categóricas
X_encoded = pd.get_dummies(X, columns=categorical_cols, drop_first=True)
print("Shape after encoding:", X_encoded.shape)

# Convertimos a numpy array
X_values = X_encoded.values


Categorical columns: ['position', 'main_position']
Shape after encoding: (30335, 47)


In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_values, y, test_size=0.2, random_state=42, stratify=y
)


In [9]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [10]:
from sklearn.linear_model import LogisticRegression

log_lasso = LogisticRegression(
    penalty='l1',      # Lasso
    solver='saga',     # necesario para L1
    class_weight='balanced',  # ayuda con clases desbalanceadas
    max_iter=10000,
    random_state=42
)

# Entrenamos
log_lasso.fit(X_train_scaled, y_train)


ValueError: Input X contains NaN.
LogisticRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score

# Predicciones de probabilidad
y_pred_prob = log_lasso.predict_proba(X_test_scaled)[:,1]

# Predicciones binarias con threshold 0.5
y_pred = (y_pred_prob >= 0.5).astype(int)

# Métricas
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("ROC AUC:", roc_auc_score(y_test, y_pred_prob))


In [None]:
# Obtener índices del test set
_, X_test_idx, _, _ = train_test_split(
    np.arange(len(df_feats)), y, test_size=0.2, random_state=42, stratify=y
)

# DataFrame con nombres y probabilidades
df_preds = df_feats.iloc[X_test_idx][['player_name']].copy()
df_preds['pred_prob'] = y_pred_prob

# Top 10 jugadores según modelo
top_players = df_preds.sort_values(by='pred_prob', ascending=False).head(30)
print(top_players)


In [None]:
# Coeficientes del modelo
coef = log_lasso.coef_[0]
feature_names = X_encoded.columns

# Ordenar por importancia absoluta
feature_importance = pd.DataFrame({
    'feature': feature_names,
    'coef': coef
}).sort_values(by='coef', key=abs, ascending=False)

print(feature_importance.head(30))
