# Import

In [17]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

# Load the data

In [18]:
data = pd.read_csv('merged_data_clean.csv')
data.dropna(subset=["market_value"], inplace=True)

In [19]:
# Afficher les colonnes qui sont entièrement remplies de NaN dans le DataFrame data
all_nan_cols = data.columns[data.isna().all()]
print("Colonnes entièrement NaN :", all_nan_cols.tolist())

Colonnes entièrement NaN : []


# Clean the data


In [20]:
data.drop(columns=['Nation'],inplace=True)

In [21]:
# Afficher les colonnes qui sont entièrement remplies de NaN dans le DataFrame data
all_nan_cols = data.columns[data.isna().all()]
print("Colonnes entièrement NaN :", all_nan_cols.tolist())

Colonnes entièrement NaN : []


In [22]:
data = data[data['Pos'] != 'GK']
# Afficher les lignes contenant au moins une valeur NaN dans le DataFrame data
# Supprimer toutes les colonnes contenant "stats_keeper", "stats_keeper_adv" ou "(GK)"
cols_to_drop = [col for col in data.columns if "stats_keeper" in col or "GK" in col or "(GK)" in col]
data.drop(columns=cols_to_drop, inplace=True)
gk_cols = [
    "PSxG+/-", "CS", "Stp%", "Launch%", "Save%", "Saves", "CS%", "AvgDist",
    "GA90", "GA", "Thr", "D", "PSxG", "SoTA", "AvgLen", "Stp", "/90",
    "PSxG/SoT", "#OPA", "PKm", "PKsv", "#OPA/90", "PKA", "Opp","W","L"
]

data.drop(columns=gk_cols, inplace=True)




In [23]:
# Afficher les colonnes qui sont entièrement remplies de NaN dans le DataFrame data
all_nan_cols = data.columns[data.isna().all()]
print("Colonnes entièrement NaN :", all_nan_cols.tolist())

Colonnes entièrement NaN : []


In [24]:
data["preferred_foot"] = data["preferred_foot"].map({"Right": 1, "Left": 0})

In [25]:
data.drop(["normalized_player", "normalized_full_name", "full_name", "dob", "Age", "birth_year_y", "Player", "Squad", "Comp", "last_evaluation"], axis=1, inplace=True)

In [26]:
columns_to_drop = [col for col in data.columns if "Pos" in col and col != "Pos"]
data.drop(columns=columns_to_drop, inplace=True)

In [27]:
columns_to_drop = [col for col in data.columns if "Nation" in col and col != "Nation"]
data.drop(columns=columns_to_drop, inplace=True)

In [28]:
columns_to_drop = [col for col in data.columns if "Comp" in col and col != "Comp"]
data.drop(columns=columns_to_drop, inplace=True)

In [29]:
data['club_contract_valid_until'] = pd.to_datetime(data['club_contract_valid_until'], errors='coerce')
data['club_contract_valid_until'] = data['club_contract_valid_until'].dt.year

In [30]:
data["years_left"] = data["club_contract_valid_until"].astype(float) - 2025
data.drop(columns=["club_contract_valid_until"], inplace=True)
data = pd.get_dummies(data, columns=["Pos"], prefix="Pos", drop_first=True)


In [31]:
columns_to_drop = [col for col in data.columns if "Age" in col]
data.drop(columns=columns_to_drop, inplace=True)

In [40]:
cols_to_drop_90 = [
    col for col in data.columns
    if '/90' in col or '90s_' in col or col.endswith('90')
]
data.drop(columns=cols_to_drop_90, inplace=True)




In [48]:
n_samples, n_features = data.shape
print(f"{n_samples} échantillons, {n_features} features")
print(f"Ratio samples/features : {n_samples / n_features:.2f}")



1810 échantillons, 179 features
Ratio samples/features : 10.11


| Catégorie                     | Exemples de colonnes conservées                                                | Description / Rôle dans la modélisation                                               |
|------------------------------|----------------------------------------------------------------------------------|----------------------------------------------------------------------------------------|
| Variable cible                | `market_value`                                                                 | Valeur marchande à prédire (variable de régression).                                  |
| Identité / Caractéristiques  | `height_cm`, `weight_kg`, `preferred_foot`, `weak_foot`, `birth_year_x`        | Données démographiques et morphologiques du joueur.                                   |
| Contrat / Mise à jour        | `years_left`, `value_updated`                                                  | Informations contractuelles : durée restante, date de mise à jour.                   |
| Temps de jeu                 | `Min`, `MP`, `Starts`, `Subs`, `Mn/MP`, `Mn/Start`, `Mn/Sub`, `Min%`, `unSub`   | Quantité et répartition du temps de jeu.                                               |
| Position (one-hot)           | `Pos_DF,MF`, `Pos_FW`, `Pos_MF`, etc.                                           | Postes joués, encodés en variables binaires.                                           |
| Productivité offensive       | `Gls`, `G/Sh`, `G/SoT`, `G-xG`, `xG`, `xA`, `xAG`, `npxG`, `npxG+xAG`, `xG+xAG` | Capacité à marquer et transformer les occasions.                                      |
| Création d’occasions         | `Ast`, `SCA`, `GCA`, `KP`, `A-xAG`                                              | Création de buts et d’actions dangereuses pour l’équipe.                              |
| Jeu de passes                | `Pass`, `Cmp`, `Cmp%`, `PrgP`, `PassLive`, `PassDead`, `Cmp_stats_passing_types`| Qualité de passe, volume et précision, passes progressives.                           |
| Conduite / progression       | `Carries`, `Succ`, `Succ%`, `Dist`, `TotDist`, `PrgC`, `PrgR`, `PrgDist`, `Rec`, `Recov`, `Touches` | Conduites de balle, distance parcourue, récupération.                         |
| Défense / Duels              | `Tkl`, `Tkl%`, `Tkl+Int`, `TklW`, `Tkld`, `Tkld%`, `Won`, `Won%`, `Blocks`, `Int`, `Clr`, `Dis` | Engagement défensif, tacles, interceptions, duels.                           |
| Zones du terrain             | `Att`, `Mid 3rd`, `Def 3rd`, `Att Pen`, `Def Pen`, `1/3`, `PPA`                 | Zones d’intervention préférentielles sur le terrain.                                 |
| Phases / Types d’actions     | `Live`, `Dead`, `CK`, `CPA`, `FK`, `PK`, `PKatt`, `PKwon`, `PKcon`, `TI`, `TO`, `TB`, `Sw` | Types d’actions (jeux arrêtés, penalties, remises en jeu).                      |
| Discipline / Événements      | `CrdY`, `CrdR`, `Fls`, `Fld`, `OG`, `Off`, `On-Off`, `Lost`, `Mis`, `Err`       | Discipline, fautes, erreurs, impact global sur l’équipe.                              |
| Statistiques avancées        | `G+A`, `G+A-PK`, `G-PK`, `np:G-xG`, `xG+/-`, `+/-`                               | Indicateurs combinés et différentiels de performance.                                 |
| Références techniques        | `Rk`, `Born`, `MP_stats_playing_time`, `Min_stats_playing_time`, `Rk_stats_*`   | Colonnes liées à l’origine ou des statistiques de référence internes.                |


# Baseline

In [43]:
nan_ratio = data.isna().mean().sort_values(ascending=False)
print(nan_ratio[nan_ratio > 0])  # Colonnes concernées

G/SoT         0.218232
years_left    0.146961
Mn/Start      0.092818
Mn/Sub        0.088950
npxG/Sh       0.083425
Dist          0.083425
SoT%          0.083425
G/Sh          0.083425
Succ%         0.074033
Tkld%         0.074033
Tkl%          0.060773
Won%          0.040884
Cmp%          0.003867
On-Off        0.002210
dtype: float64


In [45]:
data.shape

(1810, 179)

In [44]:
# Créer une copie du DataFrame sans les lignes contenant des NaN
data_no_nan = data.dropna()
nb_lignes_supprimees = len(data) - len(data_no_nan)
print(f"Nombre de lignes supprimées : {nb_lignes_supprimees}")
print(f"Nombre de lignes restantes : {len(data_no_nan)}")

Nombre de lignes supprimées : 734
Nombre de lignes restantes : 1076


In [49]:
n_samples, n_features = data_no_nan.shape
print(f"{n_samples} échantillons, {n_features} features")
print(f"Ratio samples/features : {n_samples / n_features:.2f}")



1076 échantillons, 179 features
Ratio samples/features : 6.01


In [47]:
X = data_no_nan.drop(columns=['market_value'])
y = data_no_nan['market_value']
X_train, X_test, y_train, y_test = train_test_split(X, y,random_state=42, test_size=0.1)

In [50]:
from sklearn.linear_model import Lasso

# Entraîner un modèle Lasso sur X_train et y_train
lasso = Lasso(alpha=1.0, max_iter=10000)
lasso.fit(X_train, y_train)

# Prédire et évaluer
y_pred_lasso = lasso.predict(X_test)
mae_lasso = mean_absolute_error(y_test, y_pred_lasso)
rmse_lasso = mean_squared_error(y_test, y_pred_lasso)
r2_lasso = r2_score(y_test, y_pred_lasso)

print(f"Lasso MAE: {mae_lasso:.2f}")
print(f"Lasso RMSE: {rmse_lasso:.2f}")
print(f"Lasso R²: {r2_lasso:.3f}")

Lasso MAE: 8767429.53
Lasso RMSE: 148396442050629.44
Lasso R²: 0.533


  model = cd_fast.enet_coordinate_descent(


## C'est de la merde apres


In [86]:
from sklearn.feature_selection import f_regression

f_vals, p_vals = f_regression(data.drop(columns=['market_value']), data['market_value'])


significant = p_vals < 0.05
print(f"{significant.sum()} features significatives sur {len(p_vals)}")


ValueError: Input X contains NaN.

In [11]:
X = data.drop(columns=['market_value'])
y = data['market_value']
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [12]:
# Fit the RandomForestRegressor
rf = RandomForestRegressor()
rf.fit(X_train, y_train)

# Get feature importances
importances = pd.Series(rf.feature_importances_, index=X_train.columns)
print(importances.sort_values(ascending=False))

xG+/-            0.327806
SCA              0.053197
+/-              0.043592
PassLive         0.034231
id               0.034015
                   ...   
Nation_pe PER    0.000000
Nation_pa PAN    0.000000
Nation_nz NZL    0.000000
Nation_cf CTA    0.000000
Nation_il ISR    0.000000
Length: 332, dtype: float64


In [31]:
importances.sort_values(ascending=False).to_csv('feature_importances.csv')

In [23]:
selected_columns = importances.sort_values(ascending=False).index[:30].tolist()
selected_columns

['xG+/-',
 '+/-',
 'Fld_stats_misc',
 'id',
 'Succ',
 'SCA',
 'Att 3rd_stats_possession',
 'onG',
 'Att Pen',
 'TotDist_stats_possession',
 'SCA90',
 'PassLive',
 'PPM',
 'KP',
 'Cmp%',
 '1/3_stats_possession',
 'PrgC_stats_possession',
 'PrgDist_stats_possession',
 'Born',
 'Born_stats_misc',
 'Born_stats_passing',
 'Born_stats_shooting',
 'Born_stats_gca',
 'xG+/-90',
 'Tkld',
 'Born_stats_defense',
 'Born_stats_passing_types',
 'npxG+xAG',
 'Mn/Sub',
 'On-Off']

In [32]:
X_train = X_train[selected_columns]  # Sélection des 10 meilleures features

In [36]:
X_train.shape

(1336, 30)

In [None]:


# Train Ridge regression on the same data
ridge = Ridge()
ridge.fit(X_train, y_train)

# Predict and evaluate
y_pred_ridge = ridge.predict(X_test)
mae_ridge = mean_absolute_error(y_test, y_pred_ridge)
rmse_ridge = mean_squared_error(y_test, y_pred_ridge)
r2_ridge = r2_score(y_test, y_pred_ridge)

print(f"Ridge MAE: {mae_ridge:.2f}")
print(f"Ridge RMSE: {rmse_ridge:.2f}")
print(f"Ridge R²: {r2_ridge:.3f}")

Ridge MAE: 8740978.32
Ridge RMSE: 143013386305343.59
Ridge R²: 0.466


In [38]:
# Train a RandomForestRegressor on the same data
rf = LinearRegression()
rf.fit(X_train, y_train)

# Predict and evaluate
y_pred_rf = rf.predict(X_test)
mae_rf = mean_absolute_error(y_test, y_pred_rf)
rmse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

print(f"RandomForest MAE: {mae_rf:.2f}")
print(f"RandomForest RMSE: {rmse_rf:.2f}")
print(f"RandomForest R²: {r2_rf:.3f}")

RandomForest MAE: 8741295.05
RandomForest RMSE: 143027446229128.31
RandomForest R²: 0.465


In [34]:
# Use already defined data and selected_columns
df = data[selected_columns].dropna()

# Define features and target
X = df
y = data['market_value'].loc[df.index]

# Split train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model = LinearRegression()
model.fit(X_train, y_train)

# Predictions and evaluation
y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MAE : {mae:.2f}")
print(f"RMSE : {rmse:.2f}")
print(f"R² : {r2:.3f}")
# If you want to use the same X_train and y_train as used for the RandomForestRegressor,
# simply reuse those variables directly (they are already defined and split above).
# You do NOT need to redefine or resplit X_train and y_train here.

# Train model on the same train set as RandomForestRegressor
model = LinearRegression()
model.fit(X_train, y_train)

# Predictions and evaluation
y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MAE : {mae:.2f}")
print(f"RMSE : {rmse:.2f}")
print(f"R² : {r2:.3f}")
print(f"Nombre de variables (num + dummies) : {X.shape[1]}")


MAE : 8741295.05
RMSE : 143027446229128.31
R² : 0.465
MAE : 8741295.05
RMSE : 143027446229128.31
R² : 0.465
Nombre de variables (num + dummies) : 30


In [83]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Use already defined data and selected_columns
df = data[selected_columns].dropna()

# Define features and target
X = df
y = data['market_value'].loc[df.index]

# Split train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train model
model = LinearRegression()
model.fit(X_train, y_train)

# Predictions and evaluation
y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MAE : {mae:.2f}")
print(f"RMSE : {rmse:.2f}")
print(f"R² : {r2:.3f}")
print(f"Nombre de variables (num + dummies) : {X.shape[1]}")


MAE : 8482333.96
RMSE : 135710190689703.83
R² : 0.590
Nombre de variables (num + dummies) : 30
