In [285]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import matplotlib.pyplot as plt

# Cluster des équipes de football

Dans cet exercice, nous allons regrouper de façon non supervisée les équipes en agrégeant les statistiques des joueurs de chaque équipe. Le but sera d'observer s'il est possible, par ces méthodes et avec ce dataset, de regrouper les équipes en fonction de leur style de jeu. Dans un premier temps, nous n'allons pas définir de nombre de clusters et nous allons utiliser la méthode Mean Shift. Nous observerons les métriques de silhouette afin de mesurer la cohésion et la distinction des clusters, ainsi que les indices de Davies-Bouldin et de Calinski-Harabasz pour mesurer le ratio entre la dispersion intra-cluster et inter-cluster.

Dans un second temps, nous allons utiliser KMeans, une méthode de clustering non supervisé qui nécessite de fixer le nombre de clusters à l'avance : nous le fixons à 5, soit le nombre de championnats présents dans le dataset. À l'aide de la mesure ARI (Adjusted Rand Index), nous comparerons ensuite les clusters obtenus avec les championnats auxquels appartiennent les équipes.

Nous allons appliquer un nettoyage des données similaire à celui de l'exercice 6, mais en conservant cette fois les gardiens de but afin d'intégrer davantage de statistiques défensives. Pour les joueurs de champ, les colonnes liées aux gardiens seront complétées par la moyenne des statistiques de l'ensemble des gardiens du dataset. Par ailleurs, nous n'effectuerons pas d'augmentation des données en fonction des positions, car notre objectif est de représenter les équipes globalement.

In [326]:
def fill_missing_values(data):
    """Impute missing numeric values in ``data`` using position-level means.

    For every numeric column that contains NaN, each missing entry is replaced
    by the mean of that column among the rows sharing the same ``Pos``. Entries
    still missing afterwards (the whole position group is NaN, e.g. keeper
    statistics for outfield players) are replaced by the column mean over the
    rows where ``Pos == 'GK'``.

    Defined at module level so it can be called outside the loader as well.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``Pos`` column. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object, returned for convenience.
    """
    for col in data.select_dtypes(include='number').columns:
        if data[col].isnull().any():
            # Only fill the holes: observed values are never overwritten, even
            # for rows whose 'Pos' is missing.
            data[col] = data[col].fillna(data.groupby('Pos')[col].transform('mean'))
            if data[col].isnull().any():
                # Fallback for position groups with no observed value at all.
                gk_mean = data.loc[data['Pos'] == 'GK', col].mean()
                data[col] = data[col].fillna(gk_mean)
    return data


def load_and_clean_data(keep_gk=False, keep_market_value=False):
    """Load ``merged_data_clean.csv`` and return a cleaned player-level table.

    Parameters
    ----------
    keep_gk : bool, default False
        When False, goalkeeper rows (``Pos == 'GK'``) and the keeper-specific
        columns are removed.
    keep_market_value : bool, default False
        When False, the ``market_value`` column is dropped. When True, it is
        kept and rows without a market value are removed.

    Returns
    -------
    pandas.DataFrame
        Table without identity/metadata columns, without per-90 columns,
        with numeric NaNs imputed by ``fill_missing_values`` and without the
        ``Pos`` column.

    Raises
    ------
    KeyError
        If one of the explicitly listed columns is absent from the CSV.
    """
    data = pd.read_csv('merged_data_clean.csv')
    if keep_market_value:
        data = data.dropna(subset=["market_value"])
    else:
        data = data.drop(columns=["market_value"])
    data = data.drop(columns=["Nation", "id"])

    if not keep_gk:
        # Reassign instead of dropping in place on a filtered slice, which
        # triggers pandas' SettingWithCopyWarning.
        data = data[data['Pos'] != 'GK']
        keeper_named = [col for col in data.columns if "stats_keeper" in col or "GK" in col]
        data = data.drop(columns=keeper_named)
        gk_cols = [
            "PSxG+/-", "CS", "Stp%", "Launch%", "Save%", "Saves", "CS%", "AvgDist",
            "GA90", "GA", "Thr", "D", "PSxG", "SoTA", "AvgLen", "Stp", "/90",
            "PSxG/SoT", "#OPA", "PKm", "PKsv", "#OPA/90", "PKA", "Opp", "W", "L"
        ]
        data = data.drop(columns=gk_cols)

    # Identity / contract metadata that carries no playing-style information.
    data = data.drop(columns=[
        "preferred_foot", "normalized_player", "normalized_full_name", "full_name",
        "dob", "birth_year_y", "birth_year_x", "Born", "Player", "last_evaluation",
        "club_contract_valid_until", "value_updated", "success",
    ])

    # Suffixed duplicates of metadata columns produced by the table merge;
    # the plain 'Pos' and 'Age' columns are kept.
    metadata_markers = ("Nation", "Comp", "Born", "Rk")
    redundant = [
        col for col in data.columns
        if any(marker in col for marker in metadata_markers)
        or ("Pos" in col and col != "Pos")
        or ("Age" in col and col != "Age")
    ]
    data = data.drop(columns=redundant)

    # Per-90-minute columns.
    cols_to_drop_90 = [
        col for col in data.columns
        if '/90' in col or '90s_' in col or col.endswith('90')
    ]
    data = data.drop(columns=cols_to_drop_90)

    data = fill_missing_values(data)
    # 'Pos' was only needed for the imputation.
    return data.drop(columns=["Pos"])
# Player-level table with goalkeepers kept (market_value dropped by default).
data = load_and_clean_data(keep_gk=True)

In [287]:
# Inspect which columns remain after cleaning.
data.columns.tolist()

['height_cm',
 'weight_kg',
 'weak_foot',
 '#OPA',
 '+/-',
 '1/3',
 '1/3_stats_possession',
 '2CrdY',
 '90s',
 'A-xAG',
 'Age',
 'Ast',
 'Ast_stats_passing',
 'Att',
 'Att (GK)',
 'Att 3rd',
 'Att 3rd_stats_possession',
 'Att Pen',
 'Att_stats_defense',
 'Att_stats_keeper_adv',
 'Att_stats_passing_types',
 'Att_stats_possession',
 'AvgDist',
 'AvgLen',
 'Blocks',
 'Blocks_stats_defense',
 'CK',
 'CK_stats_keeper_adv',
 'CPA',
 'CS',
 'CS%',
 'Carries',
 'Clr',
 'Cmp',
 'Cmp%',
 'Cmp%_stats_keeper_adv',
 'Cmp_stats_keeper_adv',
 'Cmp_stats_passing_types',
 'CrdR',
 'CrdR_stats_misc',
 'CrdY',
 'CrdY_stats_misc',
 'Crs',
 'CrsPA',
 'Crs_stats_misc',
 'D',
 'Dead',
 'Def',
 'Def 3rd',
 'Def 3rd_stats_possession',
 'Def Pen',
 'Dis',
 'Dist',
 'Err',
 'FK',
 'FK_stats_keeper_adv',
 'FK_stats_passing_types',
 'Fld',
 'Fld_stats_misc',
 'Fls',
 'G+A',
 'G+A-PK',
 'G-PK',
 'G-xG',
 'G/Sh',
 'G/SoT',
 'GA',
 'GA_stats_keeper_adv',
 'GCA',
 'Gls',
 'Gls_stats_shooting',
 'In',
 'Int',
 'Int_sta

Pour le regroupement des statistiques, nous allons essayer deux méthodes : la première est de faire la moyenne des statistiques de chaque équipe, la seconde est de sommer certaines statistiques pour obtenir un score de performance par équipe.

In [288]:
# Statistics aggregated per team by summing over its players.
sum_cols = [
    'Gls', 'Ast', 'Sh', 'SoT', 'Tkl', 'Int', 'Blocks', 'Clr',
    'CrdY', 'CrdR', 'PK', 'PKatt', 'PKwon', 'Touches',
    'Pass', 'PassLive', 'PassDead', 'Carries', 'Recov',
    'SCA', 'GCA', 'Fls', 'Fld', 'Starts', 'Subs', 'MP', 'Min', 'xG', 'npxG', 'xA', 'npxG+xA', 'xGChain', 'xGBuildup', 'GA'
]

# Statistics aggregated per team by averaging over its players.
mean_cols = [
    'Cmp%', 'SoT%', 'Succ%', 'Won%', 'Tkl%',
    'Min%', 'G/Sh', 'G/SoT', 'xG/Sh', 'npxG/Sh',
    'height_cm', 'weight_kg',
]

# Only columns actually present in `data` are aggregated: sums first, then means.
agg_dict = {name: 'sum' for name in sum_cols if name in data.columns}
agg_dict.update({name: 'mean' for name in mean_cols if name in data.columns})

team_df_with_sum = (
    data.groupby('Squad')
    .agg(agg_dict)
    .reset_index()
    .set_index('Squad')
)


In [289]:
# Team-level features: mean of every column over the players of each squad.
team_df = data.groupby("Squad").mean()
team_df.head()

Unnamed: 0_level_0,height_cm,weight_kg,weak_foot,#OPA,+/-,1/3,1/3_stats_possession,2CrdY,90s,A-xAG,...,onxG,onxGA,unSub,xA,xAG,xAG_stats_passing,xG,xG+/-,xG+xAG,xG_stats_shooting
Squad,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Alavés,181.823529,73.647059,3.117647,17.767684,-4.705882,41.705882,13.823529,0.058824,16.058824,-0.158824,...,17.835294,19.823529,8.941176,1.058824,1.1,1.1,1.435294,-1.970588,0.212941,1.435294
Arsenal,181.75,75.5,3.15,21.692722,18.2,63.4,25.15,0.15,19.415,0.37,...,30.705,17.39,3.85,2.165,2.23,2.23,3.04,13.32,0.29,3.04
Aston Villa,184.652174,75.608696,3.086957,19.198954,2.956522,47.086957,16.608696,0.043478,15.530435,-0.052174,...,23.008696,20.447826,5.086957,1.369565,1.530435,1.530435,1.630435,2.595652,0.223478,1.630435
Atalanta,186.809524,80.666667,3.047619,19.261603,17.380952,56.809524,26.0,0.095238,15.957143,-0.047619,...,27.095238,16.219048,6.285714,1.828571,1.904762,1.904762,2.880952,10.87619,0.282857,2.880952
Athletic Club,182.576923,75.884615,2.961538,19.60224,9.346154,49.730769,13.730769,0.038462,14.892308,-0.073077,...,20.45,14.8,10.307692,0.992308,1.15,1.15,1.684615,5.669231,0.190769,1.684615


## Mean Shift

Pour la recherche du paramètre bandwidth de Mean Shift, nous avons dans un premier temps utilisé l'algorithme estimate_bandwidth implémenté par défaut dans sklearn, puis nous avons cherché le meilleur paramètre en utilisant Optuna.

In [290]:
from sklearn.cluster import MeanShift
from sklearn.preprocessing import StandardScaler

In [291]:
# Feature matrix: every numeric team-level column, used unscaled.
X = team_df.select_dtypes(include=[np.number])

# Mean Shift with default settings (no bandwidth supplied).
clusterer = MeanShift()
clusterer.fit(X)
labels = clusterer.labels_

# Store the cluster assignment next to the features.
team_df['Cluster'] = labels
team_df.head()

Unnamed: 0_level_0,height_cm,weight_kg,weak_foot,#OPA,+/-,1/3,1/3_stats_possession,2CrdY,90s,A-xAG,...,onxGA,unSub,xA,xAG,xAG_stats_passing,xG,xG+/-,xG+xAG,xG_stats_shooting,Cluster
Squad,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Alavés,181.823529,73.647059,3.117647,17.767684,-4.705882,41.705882,13.823529,0.058824,16.058824,-0.158824,...,19.823529,8.941176,1.058824,1.1,1.1,1.435294,-1.970588,0.212941,1.435294,0
Arsenal,181.75,75.5,3.15,21.692722,18.2,63.4,25.15,0.15,19.415,0.37,...,17.39,3.85,2.165,2.23,2.23,3.04,13.32,0.29,3.04,1
Aston Villa,184.652174,75.608696,3.086957,19.198954,2.956522,47.086957,16.608696,0.043478,15.530435,-0.052174,...,20.447826,5.086957,1.369565,1.530435,1.530435,1.630435,2.595652,0.223478,1.630435,1
Atalanta,186.809524,80.666667,3.047619,19.261603,17.380952,56.809524,26.0,0.095238,15.957143,-0.047619,...,16.219048,6.285714,1.828571,1.904762,1.904762,2.880952,10.87619,0.282857,2.880952,1
Athletic Club,182.576923,75.884615,2.961538,19.60224,9.346154,49.730769,13.730769,0.038462,14.892308,-0.073077,...,14.8,10.307692,0.992308,1.15,1.15,1.684615,5.669231,0.190769,1.684615,0


In [292]:
# Distinct cluster labels found by the default Mean Shift.
team_df['Cluster'].unique()

array([0, 1])

In [293]:
# Number of teams per cluster.
team_df["Cluster"].value_counts()

Cluster
0    59
1    34
Name: count, dtype: int64

In [294]:
# Print the member teams of every cluster, in order of first appearance.
for cluster in team_df['Cluster'].unique():
    in_cluster = team_df['Cluster'] == cluster
    teams_in_cluster = team_df.index[in_cluster].tolist()
    print(f"Cluster {cluster}: {teams_in_cluster}")

Cluster 0: ['Alavés', 'Athletic Club', 'Augsburg', 'Auxerre', 'Bochum', 'Bournemouth', 'Brentford', 'Brest', 'Cagliari', 'Como', 'Crystal Palace', 'Eint Frankfurt', 'Empoli', 'Espanyol', 'Everton', 'Fiorentina', 'Freiburg', 'Genoa', 'Getafe', 'Gladbach', 'Heidenheim', 'Hellas Verona', 'Hoffenheim', 'Holstein Kiel', 'Ipswich Town', 'Las Palmas', 'Le Havre', 'Lecce', 'Leganés', 'Leicester City', 'Lens', 'Mainz 05', 'Manchester Utd', 'Montpellier', 'Monza', 'Nantes', 'Nice', "Nott'ham Forest", 'Parma', 'RB Leipzig', 'Real Sociedad', 'Reims', 'Rennes', 'Saint-Étienne', 'Southampton', 'St. Pauli', 'Strasbourg', 'Torino', 'Tottenham', 'Toulouse', 'Udinese', 'Union Berlin', 'Valencia', 'Valladolid', 'Venezia', 'Villarreal', 'West Ham', 'Wolfsburg', 'Wolves']
Cluster 1: ['Arsenal', 'Aston Villa', 'Atalanta', 'Atlético Madrid', 'Barcelona', 'Bayern Munich', 'Betis', 'Bologna', 'Brighton', 'Celta Vigo', 'Chelsea', 'Dortmund', 'Fulham', 'Girona', 'Inter', 'Juventus', 'Lazio', 'Leverkusen', 'Lille

In [295]:
from sklearn.metrics import silhouette_score
from sklearn.metrics import davies_bouldin_score
from sklearn.metrics import calinski_harabasz_score

def my_silhouette_score(X, labels):
    """Return sklearn's ``silhouette_score`` for samples ``X`` and cluster ``labels``."""
    return silhouette_score(X, labels)

def my_davies_bouldin_score(X, labels):
    """Return sklearn's ``davies_bouldin_score`` for samples ``X`` and cluster ``labels``."""
    return davies_bouldin_score(X, labels)

def my_calinski_harabasz_score(X, labels):
    """Return sklearn's ``calinski_harabasz_score`` for samples ``X`` and cluster ``labels``."""
    return calinski_harabasz_score(X, labels)

def evaluate_clustering(X, labels):
    """Print the three internal clustering metrics for ``labels`` on ``X``.

    All scores are computed before anything is printed; the function returns
    nothing.
    """
    scores = {
        "Silhouette Score": my_silhouette_score(X, labels),
        "Davies-Bouldin Score": my_davies_bouldin_score(X, labels),
        "Calinski-Harabasz Score": my_calinski_harabasz_score(X, labels),
    }
    for metric_name, value in scores.items():
        print(f"{metric_name}: {value}")

# Internal quality metrics of the default Mean Shift partition.
evaluate_clustering(X, labels)

Silhouette Score: 0.5741940123390761
Davies-Bouldin Score: 0.5974331579725343
Calinski-Harabasz Score: 198.09203009554395


In [296]:
from sklearn.cluster import estimate_bandwidth
# Bandwidth estimated from the team feature matrix X with sklearn's default settings;
# later reused as the centre of the Optuna search range.
bandwidth = estimate_bandwidth(X)
print(f"Estimated bandwidth: {bandwidth}")

Estimated bandwidth: 1885.1147138312317


In [297]:
# Mean shift optuna
import optuna

# The search space is centred on the previously estimated bandwidth.
def objective(trial):
    """Optuna objective: silhouette score of Mean Shift for a sampled bandwidth.

    Samples ``bandwidth`` in [0.5, 2.0] times the estimated bandwidth, fits
    Mean Shift on the global ``X`` and returns the silhouette score, or -1
    when the partition cannot be scored.
    """
    bw = trial.suggest_float('bandwidth', bandwidth * 0.5, bandwidth * 2.0)
    model = MeanShift(bandwidth=bw)
    model.fit(X)
    found_labels = model.labels_

    n_found = len(np.unique(found_labels))
    # The silhouette is only defined for 2 .. n_samples - 1 clusters.
    if not (2 <= n_found < X.shape[0]):
        return -1
    return my_silhouette_score(X, found_labels)

# Seed the sampler so that the search, and therefore the reported best
# bandwidth, is identical on every Restart & Run All. TPESampler is the
# sampler Optuna uses by default; only the seed is added.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)

print("Best parameters:", study.best_params)
print("Best score:", study.best_value)

[I 2025-07-05 15:53:00,874] A new study created in memory with name: no-name-fc93a995-972b-4107-a56c-70b3de0217be
[I 2025-07-05 15:53:01,124] Trial 0 finished with value: -1.0 and parameters: {'bandwidth': 3743.680625224692}. Best is trial 0 with value: -1.0.
[I 2025-07-05 15:53:01,285] Trial 1 finished with value: 0.5925154367506982 and parameters: {'bandwidth': 2311.2917065775237}. Best is trial 1 with value: 0.5925154367506982.
[I 2025-07-05 15:53:01,348] Trial 2 finished with value: -1.0 and parameters: {'bandwidth': 3702.51703704804}. Best is trial 1 with value: 0.5925154367506982.
[I 2025-07-05 15:53:01,497] Trial 3 finished with value: -1.0 and parameters: {'bandwidth': 2052.0660666735153}. Best is trial 1 with value: 0.5925154367506982.
[I 2025-07-05 15:53:01,602] Trial 4 finished with value: -1.0 and parameters: {'bandwidth': 2225.4346747708123}. Best is trial 1 with value: 0.5925154367506982.
[I 2025-07-05 15:53:01,650] Trial 5 finished with value: 0.43201532440323015 and par

Best parameters: {'bandwidth': 2139.7295643653233}
Best score: 0.5981246650236934


In [298]:
# Refit Mean Shift with the bandwidth selected by the Optuna study.
best_bandwidth = study.best_params['bandwidth']
clustering_opt = MeanShift(bandwidth=best_bandwidth)
clustering_opt.fit(X)
labels_opt = clustering_opt.labels_

# Keep the optimised assignment alongside the default one.
team_df['Cluster_Opt'] = labels_opt
team_df.head()

Unnamed: 0_level_0,height_cm,weight_kg,weak_foot,#OPA,+/-,1/3,1/3_stats_possession,2CrdY,90s,A-xAG,...,unSub,xA,xAG,xAG_stats_passing,xG,xG+/-,xG+xAG,xG_stats_shooting,Cluster,Cluster_Opt
Squad,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Alavés,181.823529,73.647059,3.117647,17.767684,-4.705882,41.705882,13.823529,0.058824,16.058824,-0.158824,...,8.941176,1.058824,1.1,1.1,1.435294,-1.970588,0.212941,1.435294,0,0
Arsenal,181.75,75.5,3.15,21.692722,18.2,63.4,25.15,0.15,19.415,0.37,...,3.85,2.165,2.23,2.23,3.04,13.32,0.29,3.04,1,1
Aston Villa,184.652174,75.608696,3.086957,19.198954,2.956522,47.086957,16.608696,0.043478,15.530435,-0.052174,...,5.086957,1.369565,1.530435,1.530435,1.630435,2.595652,0.223478,1.630435,1,0
Atalanta,186.809524,80.666667,3.047619,19.261603,17.380952,56.809524,26.0,0.095238,15.957143,-0.047619,...,6.285714,1.828571,1.904762,1.904762,2.880952,10.87619,0.282857,2.880952,1,1
Athletic Club,182.576923,75.884615,2.961538,19.60224,9.346154,49.730769,13.730769,0.038462,14.892308,-0.073077,...,10.307692,0.992308,1.15,1.15,1.684615,5.669231,0.190769,1.684615,0,0


In [299]:
# Internal quality metrics of the Optuna-tuned partition.
evaluate_clustering(X, labels_opt)

Silhouette Score: 0.5981246650236934
Davies-Bouldin Score: 0.5488807372159662
Calinski-Harabasz Score: 204.00085055702667


In [300]:
# Print the member teams of every optimised cluster, in order of first appearance.
for cluster in team_df['Cluster_Opt'].unique():
    in_cluster = team_df['Cluster_Opt'] == cluster
    teams_in_cluster = team_df.index[in_cluster].tolist()
    print(f"Cluster_Opt {cluster}: {teams_in_cluster}")

Cluster_Opt 0: ['Alavés', 'Aston Villa', 'Athletic Club', 'Augsburg', 'Auxerre', 'Bochum', 'Bournemouth', 'Brentford', 'Brest', 'Brighton', 'Cagliari', 'Como', 'Crystal Palace', 'Eint Frankfurt', 'Empoli', 'Espanyol', 'Everton', 'Fiorentina', 'Freiburg', 'Fulham', 'Genoa', 'Getafe', 'Gladbach', 'Heidenheim', 'Hellas Verona', 'Hoffenheim', 'Holstein Kiel', 'Ipswich Town', 'Las Palmas', 'Le Havre', 'Lecce', 'Leganés', 'Leicester City', 'Lens', 'Mainz 05', 'Mallorca', 'Manchester Utd', 'Montpellier', 'Monza', 'Nantes', 'Newcastle Utd', 'Nice', "Nott'ham Forest", 'Parma', 'RB Leipzig', 'Real Sociedad', 'Reims', 'Rennes', 'Roma', 'Saint-Étienne', 'Sevilla', 'Southampton', 'St. Pauli', 'Strasbourg', 'Torino', 'Tottenham', 'Toulouse', 'Udinese', 'Union Berlin', 'Valencia', 'Valladolid', 'Venezia', 'Villarreal', 'Werder Bremen', 'West Ham', 'Wolfsburg', 'Wolves']
Cluster_Opt 1: ['Arsenal', 'Atalanta', 'Atlético Madrid', 'Barcelona', 'Bayern Munich', 'Betis', 'Bologna', 'Celta Vigo', 'Chelsea',

In [None]:
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score
# Agreement between the default and the optimised partitions.
ari_score = adjusted_rand_score(team_df['Cluster'], team_df['Cluster_Opt'])
print(f"Adjusted Rand Index entre Cluster et Cluster_Opt: {ari_score}")
nmi_score = normalized_mutual_info_score(team_df['Cluster'], team_df['Cluster_Opt'])
print(f"Normalized Mutual Information entre Cluster et Cluster_Opt: {nmi_score}")

# Label-by-label equality (unlike ARI/NMI, this is sensitive to label numbering).
identical = team_df['Cluster'].eq(team_df['Cluster_Opt']).all()
print(f"Les clusters sont identiques: {identical}")

# Cluster sizes for both assignments.
for label_column in ('Cluster', 'Cluster_Opt'):
    print(f"\nDistribution {label_column}:")
    print(team_df[label_column].value_counts().sort_index())

Adjusted Rand Index entre Cluster et Cluster_Opt: 1.0
Normalized Mutual Information entre Cluster et Cluster_Opt: 1.0
Les clusters sont identiques: True

Distribution Cluster:
Cluster
0    69
1    23
Name: count, dtype: int64

Distribution Cluster_Opt:
Cluster_Opt
0    69
1    23
Name: count, dtype: int64


In [301]:
# Reload the raw player table to get market_value back.
team_market_value = pd.read_csv('merged_data_clean.csv')
# Impute missing numeric values before averaging.
team_market_value = fill_missing_values(team_market_value)
# Mean player market value per squad (Series indexed by Squad).
team_market_value = team_market_value.groupby("Squad")["market_value"].mean()

In [302]:
# Display the per-squad mean market value.
team_market_value

Squad
Alavés           3.470836e+06
Angers           1.303971e+07
Arsenal          4.974517e+07
Aston Villa      2.300000e+07
Atalanta         1.958854e+07
                     ...     
Villarreal       7.871854e+06
Werder Bremen    5.690476e+06
West Ham         1.567273e+07
Wolfsburg        8.046741e+06
Wolves           1.693148e+07
Name: market_value, Length: 96, dtype: float64

In [303]:
# Attach each squad's mean market value to the clustered teams (left join on
# the Squad index, so only clustered teams are kept).
market_value_frame = team_market_value.to_frame()
team_df_with_market_value = team_df.merge(
    market_value_frame,
    how='left',
    left_index=True,
    right_index=True,
)

# Average market value inside each optimised cluster.
mean_market_value_per_cluster = (
    team_df_with_market_value
    .groupby('Cluster_Opt')['market_value']
    .mean()
)
print(mean_market_value_per_cluster)

Cluster_Opt
0    9.474055e+06
1    2.252095e+07
Name: market_value, dtype: float64


In [304]:
# Mean of every numeric column inside each optimised cluster.
cluster_shift = team_df.groupby('Cluster_Opt').mean(numeric_only=True)
# Variance of those cluster means, per feature, largest first. Computed on the
# raw (unscaled) values, so large-magnitude features rank first.
# NOTE(review): the name reads "iter_" but the quantity is between-cluster — presumably "inter"; confirm.
iter_cluster_variance = cluster_shift.var(axis=0).sort_values(ascending=False)
print(iter_cluster_variance.head(20))

TotDist                     1.136551e+07
TotDist_stats_possession    7.656885e+05
PrgDist                     7.073606e+05
PrgDist_stats_possession    2.300225e+05
Touches                     5.351636e+04
Live_stats_possession       5.347616e+04
Att                         4.921452e+04
Att_stats_passing_types     4.921452e+04
Live                        4.543062e+04
Cmp_stats_passing_types     4.500641e+04
Cmp                         4.500641e+04
Rec                         4.456716e+04
Min_stats_playing_time      2.880440e+04
Min                         2.880440e+04
Carries                     2.826806e+04
Mid 3rd_stats_possession    1.507268e+04
Att 3rd_stats_possession    6.661110e+03
Def 3rd_stats_possession    7.773588e+02
Min_stats_keeper            4.403494e+02
PrgR                        4.256535e+02
dtype: float64


Comparaison entre les deux méthodes de recherche du paramètre bandwidth :
| Méthode                                | Bandwidth  | Silhouette Score | Davies-Bouldin Score | Calinski-Harabasz Score |
|----------------------------------------|-------------|------------------|----------------------|-------------------------|
| Sklearn (par défaut)                   | 1885.11     | 0.574            | 0.597               | 198.09                 |
| Optuna (optimisé)                      | 2356.67     | 0.598            | 0.549               | 204.00                 |


L'optimisation avec Optuna permet d'obtenir une meilleure qualité de clustering en termes de cohésion et de séparation des clusters, ce que confirment les scores de silhouette, Davies-Bouldin et Calinski-Harabasz. Le nombre de clusters reste inchangé (deux), mais leur composition est légèrement ajustée : quelques équipes, comme Aston Villa, Brighton ou Fulham, passent du cluster 1 au cluster 0.

Dans le cluster 1, on trouve des clubs qui ont un budget important, qui sont en haut du classement de leur championnat national et qui jouent régulièrement la coupe d'Europe, contrairement aux équipes du cluster 0.

L’analyse des moyennes par cluster montre que les variables expliquant le mieux la séparation sont la distance totale parcourue (TotDist), la distance parcourue en possession (TotDist_stats_possession), la distance progressive totale (PrgDist), ainsi que le nombre de touches (Touches). Ces variables traduisent la capacité des clubs à dominer la possession, à progresser efficacement vers l’avant et à maintenir une forte activité collective.

Le clustering isole un groupe d'élite. Il est intéressant de noter que la valeur marchande n’a pas été intégrée directement dans le modèle. Les résultats montrent néanmoins une séparation cohérente, où les clubs les plus performants et les plus médiatisés forment un cluster distinct. Cette structure suggère que les variables utilisées portent déjà une information suffisante pour capturer implicitement la hiérarchie économique et sportive des clubs.

On remarque que les statistiques des gardiens de but apportent peu d'information pour la séparation des équipes : les gardiens participent peu au jeu collectif et leurs statistiques sont très différentes de celles des joueurs de champ. Essayons de faire un clustering sans les gardiens de but et de comparer les résultats.

## Sans les gardiens de buts

In [305]:
# Rebuild the team-level features from outfield players only.
data = load_and_clean_data(keep_gk=False)
team_df = data.groupby("Squad").mean()
X = team_df.select_dtypes(include=[np.number])

# Default Mean Shift on the goalkeeper-free features.
clusterer = MeanShift()
clusterer.fit(X)
labels = clusterer.labels_
team_df['Cluster_nogk'] = labels

evaluate_clustering(X, labels)

# Member teams of each cluster, in order of first appearance.
for cluster in team_df['Cluster_nogk'].unique():
    in_cluster = team_df['Cluster_nogk'] == cluster
    teams_in_cluster = team_df.index[in_cluster].tolist()
    print(f"Cluster_nogk {cluster}: {teams_in_cluster}")

Silhouette Score: 0.6056121865141645
Davies-Bouldin Score: 0.5384510665105253
Calinski-Harabasz Score: 196.54653845068182
Cluster_nogk 0: ['Alavés', 'Aston Villa', 'Athletic Club', 'Augsburg', 'Auxerre', 'Bochum', 'Bologna', 'Bournemouth', 'Brentford', 'Brest', 'Brighton', 'Cagliari', 'Como', 'Crystal Palace', 'Eint Frankfurt', 'Empoli', 'Espanyol', 'Everton', 'Fiorentina', 'Freiburg', 'Fulham', 'Genoa', 'Getafe', 'Gladbach', 'Heidenheim', 'Hellas Verona', 'Hoffenheim', 'Holstein Kiel', 'Ipswich Town', 'Las Palmas', 'Le Havre', 'Lecce', 'Leganés', 'Leicester City', 'Lyon', 'Mainz 05', 'Mallorca', 'Manchester Utd', 'Montpellier', 'Monza', 'Nantes', 'Newcastle Utd', 'Nice', "Nott'ham Forest", 'Parma', 'RB Leipzig', 'Rayo Vallecano', 'Real Sociedad', 'Reims', 'Rennes', 'Roma', 'Saint-Étienne', 'Sevilla', 'Southampton', 'St. Pauli', 'Strasbourg', 'Torino', 'Tottenham', 'Toulouse', 'Udinese', 'Union Berlin', 'Valencia', 'Valladolid', 'Venezia', 'Villarreal', 'Werder Bremen', 'West Ham', 'Wo

In [306]:
# Optuna study for the bandwidth on the goalkeeper-free features.
# Re-estimate the reference bandwidth on the current X: the global `bandwidth`
# was computed on the with-goalkeeper feature matrix and is stale here.
bandwidth_nogk = estimate_bandwidth(X)


def objective(trial):
    """Optuna objective: silhouette score of Mean Shift for a sampled bandwidth.

    Samples ``bandwidth`` in [0.5, 2.0] times ``bandwidth_nogk``, fits Mean
    Shift on the global ``X`` and returns the silhouette score, or -1 when the
    partition cannot be scored.
    """
    bw = trial.suggest_float('bandwidth', bandwidth_nogk * 0.5, bandwidth_nogk * 2.0)
    clusterer = MeanShift(bandwidth=bw)
    clusterer.fit(X)
    labels = clusterer.labels_

    n_clusters = len(np.unique(labels))
    n_samples = X.shape[0]
    # Silhouette score requires at least 2 clusters and at most n_samples - 1 clusters
    if n_clusters < 2 or n_clusters >= n_samples:
        return -1  # Return a low score if only one cluster or all samples are their own cluster
    return my_silhouette_score(X, labels)


# Seeded sampler (TPESampler is Optuna's default) so that the study is
# reproducible on every Restart & Run All.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)

print("Best parameters:", study.best_params)
print("Best score:", study.best_value)

[I 2025-07-05 15:53:06,173] A new study created in memory with name: no-name-4165e316-e5fb-4eb9-b50e-52f0b2511674
[I 2025-07-05 15:53:06,241] Trial 0 finished with value: 0.48869596320080605 and parameters: {'bandwidth': 1594.4750675855869}. Best is trial 0 with value: 0.48869596320080605.
[I 2025-07-05 15:53:06,303] Trial 1 finished with value: -1.0 and parameters: {'bandwidth': 3719.673869326111}. Best is trial 0 with value: 0.48869596320080605.
[I 2025-07-05 15:53:06,390] Trial 2 finished with value: 0.5299867619038764 and parameters: {'bandwidth': 1758.211642658517}. Best is trial 2 with value: 0.5299867619038764.
[I 2025-07-05 15:53:06,454] Trial 3 finished with value: -1.0 and parameters: {'bandwidth': 3263.7226830447858}. Best is trial 2 with value: 0.5299867619038764.
[I 2025-07-05 15:53:06,545] Trial 4 finished with value: -1.0 and parameters: {'bandwidth': 2956.935424391059}. Best is trial 2 with value: 0.5299867619038764.
[I 2025-07-05 15:53:06,627] Trial 5 finished with val

Best parameters: {'bandwidth': 1928.5886483582547}
Best score: 0.6056121865141645


In [307]:
# Refit Mean Shift on the goalkeeper-free features with the tuned bandwidth.
best_bandwidth = study.best_params['bandwidth']
clustering_opt = MeanShift(bandwidth=best_bandwidth)
clustering_opt.fit(X)
labels_opt = clustering_opt.labels_

team_df['Cluster_Opt_nogk'] = labels_opt
evaluate_clustering(X, labels_opt)

Silhouette Score: 0.6056121865141645
Davies-Bouldin Score: 0.5384510665105253
Calinski-Harabasz Score: 196.54653845068182


In [308]:
from sklearn.metrics import adjusted_rand_score, normalized_mutual_info_score


# Member teams of each optimised no-goalkeeper cluster.
for cluster in team_df['Cluster_Opt_nogk'].unique():
    teams_in_cluster = team_df[team_df['Cluster_Opt_nogk'] == cluster].index.tolist()
    print(f"Cluster_Opt_nogk {cluster}: {teams_in_cluster}")

print("\n" + "="*50)
print("COMPARISON BETWEEN CLUSTERS")
print("="*50)

# Compare Cluster_nogk vs Cluster_Opt_nogk
ari_nogk = adjusted_rand_score(team_df['Cluster_nogk'], team_df['Cluster_Opt_nogk'])
print(f"\nAdjusted Rand Index entre Cluster_nogk et Cluster_Opt_nogk: {ari_nogk}")
nmi_nogk = normalized_mutual_info_score(team_df['Cluster_nogk'], team_df['Cluster_Opt_nogk'])
print(f"Normalized Mutual Information entre Cluster_nogk et Cluster_Opt_nogk: {nmi_nogk}")
# Label-by-label equality (sensitive to label numbering, unlike ARI/NMI).
identical_nogk = (team_df['Cluster_nogk'] == team_df['Cluster_Opt_nogk']).all()
print(f"Les clusters nogk sont identiques: {identical_nogk}")

print("\nDistribution Cluster_nogk:")
print(team_df['Cluster_nogk'].value_counts().sort_index())
print("\nDistribution Cluster_Opt_nogk:")
print(team_df['Cluster_Opt_nogk'].value_counts().sort_index())

# Compare with GK vs without GK (optimized versions).
# `team_df` and `study` now describe the no-goalkeeper data, so the
# with-goalkeeper clustering has to be rebuilt here.
# NOTE(review): bandwidth hard-coded from an earlier Optuna run on the
# with-goalkeeper features; confirm it still matches that study.
BANDWIDTH_WITH_GK = 2356.67
team_df_with_gk = load_and_clean_data(keep_gk=True).groupby("Squad").mean()

X_with_gk = team_df_with_gk.select_dtypes(include=[np.number])
clustering_with_gk = MeanShift(bandwidth=BANDWIDTH_WITH_GK)
clustering_with_gk.fit(X_with_gk)
labels_with_gk = clustering_with_gk.labels_
team_df_with_gk['Cluster_Opt'] = labels_with_gk

# Restrict both assignments to the teams present in both tables, in one
# deterministic (sorted) order shared by the two series.
teams_common = set(team_df.index) & set(team_df_with_gk.index)
teams_common_sorted = sorted(teams_common)
team_df_nogk_filtered = team_df.loc[teams_common_sorted, 'Cluster_Opt_nogk']
labels_with_gk_filtered = pd.Series(labels_with_gk, index=team_df_with_gk.index).loc[teams_common_sorted]

ari_gk_vs_nogk = adjusted_rand_score(labels_with_gk_filtered, team_df_nogk_filtered)
print("\n" + "="*50)
print(f"Adjusted Rand Index entre Cluster_Opt (avec GK) et Cluster_Opt_nogk: {ari_gk_vs_nogk}")
print(f"Nombre d'équipes communes: {len(teams_common)}")
print(f"Cluster_Opt (avec GK): {labels_with_gk_filtered.value_counts().sort_index()}")
print(f"Cluster_Opt_nogk: {team_df_nogk_filtered.value_counts().sort_index()}")

# Teams labelled 1 with goalkeepers but not labelled 1 without them.
# NOTE(review): assumes label 1 designates the same group in both runs — confirm.
teams_in_cluster_1_with_gk = team_df_with_gk[team_df_with_gk['Cluster_Opt'] == 1].index.tolist()
teams_in_cluster_1_nogk = team_df[team_df['Cluster_Opt_nogk'] == 1].index.tolist()
teams_in_cluster_1_only_with_gk = set(teams_in_cluster_1_with_gk) - set(teams_in_cluster_1_nogk)
print(f"Équipes dans le cluster 1 avec GK mais pas dans nogk: {teams_in_cluster_1_only_with_gk}")

Cluster_Opt_nogk 0: ['Alavés', 'Aston Villa', 'Athletic Club', 'Augsburg', 'Auxerre', 'Bochum', 'Bologna', 'Bournemouth', 'Brentford', 'Brest', 'Brighton', 'Cagliari', 'Como', 'Crystal Palace', 'Eint Frankfurt', 'Empoli', 'Espanyol', 'Everton', 'Fiorentina', 'Freiburg', 'Fulham', 'Genoa', 'Getafe', 'Gladbach', 'Heidenheim', 'Hellas Verona', 'Hoffenheim', 'Holstein Kiel', 'Ipswich Town', 'Las Palmas', 'Le Havre', 'Lecce', 'Leganés', 'Leicester City', 'Lyon', 'Mainz 05', 'Mallorca', 'Manchester Utd', 'Montpellier', 'Monza', 'Nantes', 'Newcastle Utd', 'Nice', "Nott'ham Forest", 'Parma', 'RB Leipzig', 'Rayo Vallecano', 'Real Sociedad', 'Reims', 'Rennes', 'Roma', 'Saint-Étienne', 'Sevilla', 'Southampton', 'St. Pauli', 'Strasbourg', 'Torino', 'Tottenham', 'Toulouse', 'Udinese', 'Union Berlin', 'Valencia', 'Valladolid', 'Venezia', 'Villarreal', 'Werder Bremen', 'West Ham', 'Wolfsburg', 'Wolves']
Cluster_Opt_nogk 1: ['Arsenal', 'Atalanta', 'Atlético Madrid', 'Barcelona', 'Bayern Munich', 'Beti

Sans les gardiens, le bandwidth estimé par défaut et celui optimisé par Optuna conduisent aux mêmes métriques de clustering.

Avec gardiens :
| Méthode                                | Bandwidth  | Silhouette Score | Davies-Bouldin Score | Calinski-Harabasz Score |
|----------------------------------------|-------------|------------------|----------------------|-------------------------|
| Sklearn (par défaut)                   | 1885.11     | 0.574            | 0.597               | 198.09                 |
| Optuna (optimisé)                      | 2356.67     | 0.598            | 0.549               | 204.00                 |

Sans gardiens :
| Méthode                                | Silhouette Score | Davies-Bouldin Score | Calinski-Harabasz Score |
|----------------------------------------|------------------|----------------------|-------------------------|
| Sklearn + Optuna                       | 0.606            | 0.538               | 196.55                 |

Après avoir retiré les gardiens de but ainsi que leurs statistiques associées, nous conservons deux clusters. Ceux-ci apparaissent légèrement plus nets et mieux séparés d'après les scores de silhouette (0.606 contre 0.598) et de Davies-Bouldin (0.538 contre 0.549) ; l'indice de Calinski-Harabasz diminue en revanche légèrement (196.55 contre 204.00), l'amélioration reste donc modeste.

La composition des clusters évolue légèrement : les équipes de Rayo Vallecano, Lyon et Bologne passent du cluster 1 au cluster 0. Ce repositionnement semble plus cohérent au regard des performances sportives récentes. Rayo Vallecano et Bologne occupent plutôt le milieu de tableau dans leurs championnats respectifs (Espagne et Italie), tandis que Lyon, après avoir dominé la Ligue 1 dans le passé, traverse actuellement une période difficile. Il est d’ailleurs notable que, malgré sa victoire en Coupe d’Italie, Bologne a terminé seulement 9ᵉ en Serie A et ne s'est pas qualifié en coupe d'Europe, ce qui justifie pleinement son repositionnement.

Comme mentionné précédemment, nous allons essayer de sommer certaines statistiques pour obtenir un score de performance par équipe.

In [309]:
# Statistics aggregated per team by summing over its players.
sum_cols = [
    'Gls', 'Ast', 'Sh', 'SoT', 'Tkl', 'Int', 'Blocks', 'Clr',
    'CrdY', 'CrdR', 'PK', 'PKatt', 'PKwon', 'Touches',
    'Pass', 'PassLive', 'PassDead', 'Carries', 'Recov',
    'SCA', 'GCA', 'Fls', 'Fld', 'Starts', 'Subs', 'MP', 'Min', 'xG', 'npxG', 'xA', 'npxG+xA', 'xGChain', 'xGBuildup', 'GA'
]

# Statistics aggregated per team by averaging over its players.
mean_cols = [
    'Cmp%', 'SoT%', 'Succ%', 'Won%', 'Tkl%',
    'Min%', 'G/Sh', 'G/SoT', 'xG/Sh', 'npxG/Sh',
    'height_cm', 'weight_kg',
]

# Only columns actually present in `data` are aggregated: sums first, then means.
agg_dict = {name: 'sum' for name in sum_cols if name in data.columns}
agg_dict.update({name: 'mean' for name in mean_cols if name in data.columns})

team_df_with_sum = (
    data.groupby('Squad')
    .agg(agg_dict)
    .reset_index()
    .set_index('Squad')
)

In [310]:
X_sum = team_df_with_sum.select_dtypes(include=[np.number])
# NOTE(review): this bandwidth was tuned on the mean-aggregated features;
# the summed features live on a different scale, so it may not suit X_sum —
# consider re-estimating it on X_sum.
clustering_opt = MeanShift(bandwidth=study.best_params['bandwidth'])
clustering_opt.fit(X_sum)
labels_opt = clustering_opt.labels_

# Keep the sum-based assignment in its own Series, indexed like X_sum, instead
# of overwriting team_df['Cluster_Opt_nogk'] (the mean-based labels) through a
# positional assignment across two different frames.
cluster_sum_nogk = pd.Series(labels_opt, index=X_sum.index, name='Cluster_Sum_nogk')

evaluate_clustering(X_sum, labels_opt)
for cluster in cluster_sum_nogk.unique():
    teams_in_cluster = cluster_sum_nogk.index[cluster_sum_nogk == cluster].tolist()
    print(f"Cluster_Sum_nogk {cluster}: {teams_in_cluster}")

Silhouette Score: 0.2830776099691316
Davies-Bouldin Score: 0.6922467421322542
Calinski-Harabasz Score: 90.12838460983706
Cluster_Opt_nogk 7: ['Alavés', 'Auxerre', 'Getafe', 'Leganés', 'Montpellier']
Cluster_Opt_nogk 3: ['Arsenal', 'Betis', 'Bologna', 'Chelsea', 'Napoli', 'Rennes']
Cluster_Opt_nogk 2: ['Aston Villa', 'Brighton', 'Lazio', 'Monza', 'Newcastle Utd', 'Valencia']
Cluster_Opt_nogk 4: ['Atalanta', 'Leverkusen', 'Lyon', 'Manchester Utd', 'Werder Bremen']
Cluster_Opt_nogk 13: ['Athletic Club', 'Cagliari', 'Leicester City']
Cluster_Opt_nogk 17: ['Atlético Madrid', 'Real Madrid']
Cluster_Opt_nogk 0: ['Augsburg', 'Bochum', 'Crystal Palace', 'Gladbach', 'Hellas Verona', 'Holstein Kiel', 'Las Palmas', 'Le Havre', 'Lecce', 'Mallorca', 'St. Pauli', 'Strasbourg', 'Toulouse', 'Villarreal', 'West Ham', 'Wolfsburg']
Cluster_Opt_nogk 16: ['Barcelona', 'Bayern Munich']
Cluster_Opt_nogk 9: ['Bournemouth', 'Everton', 'Torino', 'Union Berlin']
Cluster_Opt_nogk 6: ['Brentford', 'Fiorentina', 'Ge

L’agrégation via la somme met davantage en avant les dynamiques globales d’équipe et le volume total des contributions individuelles. Cette méthode révèle des micro-clusters correspondant à des profils très spécifiques, voire extrêmes (ex. PSG isolé, Real Madrid regroupé avec le seul Atlético Madrid, ou Barcelone avec le Bayern Munich).

L’utilisation des sommes accentue les volumes collectifs et met en évidence des clubs très spécifiques, mais induit une segmentation plus fragmentée et moins cohérente globalement.

## Preprocessing

Observons si un prétraitement similaire à celui de l'exercice 6 permet d'améliorer les résultats du clustering.

In [311]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Check whether standardising the features (as in exercise 6) helps Mean Shift.

# Single-step pipeline: zero-mean / unit-variance scaling of every feature.
preprocessing_pipeline = Pipeline(steps=[('scaler', StandardScaler())])

# Scale the feature matrix X (presumably the no-goalkeeper features; built in an earlier cell).
X_preprocessed = preprocessing_pipeline.fit_transform(X)

# Mean Shift with its default settings on the scaled features.
clusterer_preprocessed = MeanShift().fit(X_preprocessed)
labels_preprocessed = clusterer_preprocessed.labels_

# The quality metrics are only evaluated for 2 <= n_clusters < n_samples.
n_clusters_preprocessed = np.unique(labels_preprocessed).size
print(f"Number of clusters with preprocessing: {n_clusters_preprocessed}")

n_samples = X_preprocessed.shape[0]
if n_clusters_preprocessed < 2 or n_clusters_preprocessed >= n_samples:
    print("Clustering with preprocessing produced invalid number of clusters for evaluation")
else:
    print("Clustering results with preprocessing:")
    evaluate_clustering(X_preprocessed, labels_preprocessed)

# Keep the labels alongside the teams.
team_df['Cluster_Preprocessed'] = labels_preprocessed

# List the teams of every cluster, in order of first appearance in team_df.
for cluster in team_df['Cluster_Preprocessed'].unique():
    in_cluster = team_df['Cluster_Preprocessed'] == cluster
    teams_in_cluster = team_df.index[in_cluster].tolist()
    print(f"Cluster_Preprocessed {cluster}: {teams_in_cluster}")

Number of clusters with preprocessing: 6
Clustering results with preprocessing:
Silhouette Score: 0.15084675750907242
Davies-Bouldin Score: 1.0663812917340032
Calinski-Harabasz Score: 6.546986237693414
Cluster_Preprocessed 0: ['Alavés', 'Aston Villa', 'Athletic Club', 'Atlético Madrid', 'Augsburg', 'Auxerre', 'Betis', 'Bochum', 'Bologna', 'Bournemouth', 'Brentford', 'Brest', 'Brighton', 'Cagliari', 'Celta Vigo', 'Chelsea', 'Como', 'Crystal Palace', 'Dortmund', 'Eint Frankfurt', 'Empoli', 'Espanyol', 'Everton', 'Fiorentina', 'Freiburg', 'Fulham', 'Genoa', 'Girona', 'Gladbach', 'Heidenheim', 'Hellas Verona', 'Hoffenheim', 'Holstein Kiel', 'Ipswich Town', 'Juventus', 'Las Palmas', 'Le Havre', 'Lecce', 'Leganés', 'Leicester City', 'Leverkusen', 'Lille', 'Lyon', 'Mainz 05', 'Mallorca', 'Manchester City', 'Manchester Utd', 'Marseille', 'Montpellier', 'Monza', 'Nantes', 'Napoli', 'Newcastle Utd', 'Nice', "Nott'ham Forest", 'Parma', 'RB Leipzig', 'Rayo Vallecano', 'Real Sociedad', 'Reims', 'Re

Après ce prétraitement, le nombre de clusters détectés est passé à six, contre deux précédemment. Toutefois, les scores de qualité du clustering se sont globalement dégradés : le silhouette score diminue à 0.15, le Davies-Bouldin score augmente à 1.07, et le Calinski-Harabasz score chute fortement à 6.55, indiquant des clusters moins compacts et moins bien séparés.

La composition des clusters montre aussi une fragmentation excessive : on observe par exemple des clusters isolant des clubs individuellement (Liverpool seul dans un cluster, Osasuna seul dans un autre, Getafe isolé), tandis que la majorité des clubs reste regroupée dans un large cluster principal.

Ces résultats suggèrent que, même si la normalisation permet d’homogénéiser les échelles et d’éviter la domination de certaines variables, elle peut également introduire une sensibilité excessive aux variations mineures dans les données lorsque les structures initiales sont déjà stables. Dans ce cas précis, le prétraitement n’apporte donc pas d’amélioration, mais perturbe la structure robuste initialement observée sans normalisation.

En conclusion, il apparaît plus pertinent de conserver la version non standardisée des données pour ce type de clustering, les performances et la cohérence des groupes obtenus étant meilleures sans normalisation préalable.

# KMEANS

Dans cette partie, nous allons regrouper les équipes de façon non supervisée avec KMeans, puis comparer les clusters obtenus avec les championnats auxquels elles appartiennent. Le but est d'observer s'il est possible, avec ces méthodes et ce dataset, de retrouver le championnat d'une équipe à partir de ses statistiques.

In [347]:
# KMEANS
from sklearn.cluster import KMeans

In [348]:
# Load the merged dataset from CSV into `data`.
data = pd.read_csv('merged_data_clean.csv')

In [349]:
# Lookup table of distinct (Squad, Comp) pairs, i.e. which league each team plays in.
comp = data.loc[:, ["Squad", "Comp"]].drop_duplicates()

Essayons en conservant la market value, car notre objectif est de créer des clusters par championnat et nous avons constaté qu’il existe une forte disparité de valeur marchande entre les ligues.

In [353]:
# Rebind `data` to the output of the notebook's cleaning helper.
# keep_market_value=True skips the helper's drop of the "market_value" column;
# keep_gk=False presumably excludes goalkeepers (see load_and_clean_data).
data = load_and_clean_data(keep_gk=False, keep_market_value=True)

In [355]:
# One row per team: average every numeric statistic over the team's players.
# numeric_only=True is passed explicitly because, from pandas 2.0 on, a bare
# .mean() raises a TypeError if any non-numeric column is present in `data`.
team_df = data.groupby("Squad").mean(numeric_only=True)

In [357]:
# Attach each team's league ("Comp") to the per-team table, matching on the Squad index.
# If team_df already had a "Comp" column, the merge produces Comp_x / Comp_y:
# the right-hand copy (Comp_y) is discarded and the left-hand one is renamed back to "Comp".
team_df = (
    team_df
    .merge(comp.set_index('Squad'), left_index=True, right_index=True, how='left')
    .drop(columns=["Comp_y"], errors="ignore")
    .rename(columns={"Comp_x": "Comp"})
)

In [358]:
from sklearn.metrics import adjusted_rand_score

# Cluster the per-team feature means with KMeans, then compare the partition
# with each team's league ("Comp") using the Adjusted Rand Index (ARI).
n = 5 # One cluster per league: the dataset covers the five major leagues
kmeans = KMeans(n_clusters=n, random_state=42)

# Use the numeric columns as features. A 'Cluster_KMeans' column left over from
# a previous run of this cell is numeric as well, so it is excluded explicitly;
# otherwise re-running the cell would feed the old labels back in as a feature.
X_kmeans = (
    team_df.select_dtypes(include=[np.number])
    .drop(columns=["Cluster_KMeans"], errors="ignore")
)
kmeans.fit(X_kmeans)
labels_kmeans = kmeans.labels_

team_df['Cluster_KMeans'] = labels_kmeans

# ARI between leagues and clusters: about 0 for a chance-level match, 1 for identical partitions.
ari_kmeans = adjusted_rand_score(team_df["Comp"], labels_kmeans)
print(f"Adjusted Rand Index for KMeans: {ari_kmeans}")

# Print teams in each KMeans cluster
for cluster in team_df['Cluster_KMeans'].unique():
    teams_in_cluster = team_df[team_df['Cluster_KMeans'] == cluster].index.tolist()
    print(f"Cluster_KMeans {cluster}: {teams_in_cluster}")

Adjusted Rand Index for KMeans: 0.05555922453985504
Cluster_KMeans 0: ['Alavés', 'Augsburg', 'Auxerre', 'Betis', 'Bochum', 'Brest', 'Cagliari', 'Celta Vigo', 'Como', 'Empoli', 'Espanyol', 'Freiburg', 'Genoa', 'Getafe', 'Girona', 'Gladbach', 'Heidenheim', 'Hellas Verona', 'Hoffenheim', 'Holstein Kiel', 'Las Palmas', 'Le Havre', 'Lecce', 'Leganés', 'Leicester City', 'Mainz 05', 'Mallorca', 'Montpellier', 'Monza', 'Nantes', 'Nice', 'Osasuna', 'Parma', 'Rayo Vallecano', 'Reims', 'Rennes', 'Saint-Étienne', 'Southampton', 'St. Pauli', 'Torino', 'Toulouse', 'Udinese', 'Union Berlin', 'Valencia', 'Valladolid', 'Venezia', 'Villarreal', 'Werder Bremen', 'Wolfsburg']
Cluster_KMeans 2: ['Arsenal', 'Liverpool', 'Paris S-G', 'Real Madrid']
Cluster_KMeans 4: ['Aston Villa', 'Atalanta', 'Atlético Madrid', 'Bournemouth', 'Brentford', 'Brighton', 'Crystal Palace', 'Eint Frankfurt', "Nott'ham Forest", 'RB Leipzig', 'West Ham', 'Wolves']
Cluster_KMeans 3: ['Athletic Club', 'Bologna', 'Dortmund', 'Everton'

In [361]:
# Re-attach the *current* KMeans labels from team_df on every run. Any
# 'Cluster_KMeans' column already present is dropped first, so the cell never
# reports means for a stale clustering and the merge never creates _x/_y duplicates.
# Left join: rows with no match in team_df get NaN as their cluster label.
team_df_with_market_value = (
    team_df_with_market_value
    .drop(columns=["Cluster_KMeans"], errors="ignore")
    .merge(team_df[['Cluster_KMeans']], left_index=True, right_index=True, how='left')
)

# Average market value of the teams in each cluster.
mean_market_value_per_kmeans_cluster = team_df_with_market_value.groupby('Cluster_KMeans')['market_value'].mean()
print("Mean market value per KMeans cluster:")
print(mean_market_value_per_kmeans_cluster)

Mean market value per KMeans cluster:
Cluster_KMeans
0.0    5.870677e+06
1.0    2.891821e+07
2.0    4.742264e+07
3.0    1.287466e+07
4.0    1.858735e+07
Name: market_value, dtype: float64


La valeur marchande semble jouer un rôle déterminant dans la structuration des clusters, traduisant potentiellement des écarts économiques entre les championnats. Afin d'éviter ce biais, nous décidons de l'exclure de l'analyse.

In [344]:
from sklearn.metrics import adjusted_rand_score

# Second KMeans run, this time without the market value feature.
# errors="ignore" keeps the cell re-runnable once the column has been removed.
team_df.drop(columns=["market_value"], inplace=True, errors="ignore")

n = 5 # One cluster per league: the dataset covers the five major leagues
kmeans = KMeans(n_clusters=n, random_state=42)

# Use the numeric columns as features. team_df may still carry the numeric
# 'Cluster_KMeans' labels of an earlier KMeans run; they are excluded so that
# old cluster assignments are not used as an input feature.
X_kmeans = (
    team_df.select_dtypes(include=[np.number])
    .drop(columns=["Cluster_KMeans"], errors="ignore")
)
kmeans.fit(X_kmeans)
labels_kmeans = kmeans.labels_

team_df['Cluster_KMeans'] = labels_kmeans

# ARI between leagues and clusters: about 0 for a chance-level match, 1 for identical partitions.
ari_kmeans = adjusted_rand_score(team_df["Comp"], labels_kmeans)
print(f"Adjusted Rand Index for KMeans: {ari_kmeans}")

# Print teams in each KMeans cluster
for cluster in team_df['Cluster_KMeans'].unique():
    teams_in_cluster = team_df[team_df['Cluster_KMeans'] == cluster].index.tolist()
    print(f"Cluster_KMeans {cluster}: {teams_in_cluster}")

Adjusted Rand Index for KMeans: -0.0060721786324550425
Cluster_KMeans 3: ['Alavés', 'Auxerre', 'Bournemouth', 'Brentford', 'Brest', 'Cagliari', 'Como', 'Everton', 'Fiorentina', 'Freiburg', 'Getafe', 'Heidenheim', 'Lecce', 'Leganés', 'Leicester City', 'Monza', 'Nantes', "Nott'ham Forest", 'Real Sociedad', 'Southampton', 'Toulouse', 'Villarreal', 'West Ham']
Cluster_KMeans 2: ['Arsenal', 'Atalanta', 'Atlético Madrid', 'Betis', 'Celta Vigo', 'Chelsea', 'Dortmund', 'Lille', 'Lyon', 'Marseille', 'Napoli', 'Newcastle Utd', 'Osasuna', 'Sevilla', 'Stuttgart']
Cluster_KMeans 4: ['Aston Villa', 'Athletic Club', 'Bologna', 'Brighton', 'Eint Frankfurt', 'Fulham', 'Gladbach', 'Las Palmas', 'Mainz 05', 'Mallorca', 'Manchester Utd', 'RB Leipzig', 'Rayo Vallecano', 'Rennes', 'Roma', 'Strasbourg', 'Torino', 'Tottenham', 'Valencia', 'Werder Bremen', 'Wolves']
Cluster_KMeans 0: ['Augsburg', 'Bochum', 'Crystal Palace', 'Empoli', 'Espanyol', 'Genoa', 'Hellas Verona', 'Hoffenheim', 'Holstein Kiel', 'Ipswich

L'ARI est proche de 0 : les clusters ne recoupent pas les championnats mieux qu'une répartition aléatoire. Les équipes semblent plutôt regroupées par style de jeu, ce qui peut indiquer soit que les championnats n'ont pas de style de jeu propre et que les mêmes styles se retrouvent d'un championnat à l'autre, soit que ces features ne sont pas adaptées pour distinguer les championnats.