In [7]:
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error

# =====================================================
# 0) Paths & loading of the star-schema fact table
# =====================================================

# Every path is relative to the current working directory ("CSV" folder).
BASE_DIR = Path("CSV")
STAR_DIR = BASE_DIR.joinpath("star_schema")
MODEL_DIR = BASE_DIR.joinpath("model_data")
# Make sure the output folder exists before anything is written to it;
# exist_ok=True keeps the cell re-runnable.
MODEL_DIR.mkdir(exist_ok=True, parents=True)

# low_memory=False: let pandas infer each column's dtype from the whole file
# rather than chunk by chunk.
fpw = pd.read_csv(
    STAR_DIR.joinpath("fact_player_wyscout.csv"),
    low_memory=False,
)

# Quick look at what was loaded: dimensions and the first 20 column names.
print("fact_player_wyscout shape:", fpw.shape)
print("Colonnes principales :", list(fpw.columns[:20]))


# =====================================================
# 1) Build segment_index (1..6)
# =====================================================
# Reminder of the labels found in the raw columns:
#   periode : "1ére mi temps", "2éme mi temps", "MATCH COMPLET"
#   segment : "1ére 15 min", "2éme 15 min", "3éme 15 min",
#             "mitemps complet", "MATCH COMPLET"
# The aggregate labels ("mitemps complet", "MATCH COMPLET") are not in the
# lookup tables below, so .map() yields NaN for them.

half_map = {"1ére mi temps": 1, "2éme mi temps": 2}
seg_map = {"1ére 15 min": 1, "2éme 15 min": 2, "3éme 15 min": 3}

fpw["half"] = fpw["periode"].map(half_map)
fpw["seg_in_half"] = fpw["segment"].map(seg_map)

# (half - 1) * 3 + seg_in_half: half 1 -> 1..3, half 2 -> 4..6.
# The result is NaN whenever either component is NaN.
fpw["segment_index"] = fpw["half"].sub(1).mul(3).add(fpw["seg_in_half"])

# Distribution of the raw labels and of the derived index.
print("\nDistrib `periode` :")
print(fpw["periode"].value_counts())

print("\nDistrib `segment` :")
print(fpw["segment"].value_counts())

print("\nDistrib `segment_index` (1..6 attendus) :")
print(fpw["segment_index"].value_counts().sort_index())


# =====================================================
# 2) Keep only the usable rows
#    - player_key, match_key and segment_index must be non-NaN
#    - only (player, match) pairs that have all 6 segments are kept
# =====================================================

# Row-wise "all three columns present" mask (same rows as a dropna on them).
fpw_valid = fpw[fpw[["player_key", "match_key", "segment_index"]].notna().all(axis=1)]

print("\nShape après filtre (player_key, match_key, segment_index non NaN) :", fpw_valid.shape)

# Number of distinct segments available for each (player, match) pair.
group_counts = (
    fpw_valid
    .groupby(["player_key", "match_key"])["segment_index"]
    .nunique()
)
print("\nNb de segments distincts par (player, match) :")
print(group_counts.value_counts().sort_index())

# (player, match) pairs for which exactly 6 distinct segments are present.
valid_pairs = group_counts.loc[group_counts.eq(6)].index

# Select the rows of those pairs through a (player_key, match_key) index,
# then turn the two keys back into ordinary columns.
fpw_6 = fpw_valid.set_index(["player_key", "match_key"]).loc[valid_pairs].reset_index()

print("\nShape fpw_6 (seulement (player,match) avec 6 segments) :", fpw_6.shape)

# Small example: show the segments carried by the first valid pair.
example_pair = valid_pairs[0]
print("\nExemple pour player_key, match_key =", example_pair)
print(
    fpw_6.loc[
        fpw_6["player_key"].eq(example_pair[0]) & fpw_6["match_key"].eq(example_pair[1]),
        ["periode", "segment", "segment_index"],
    ].sort_values("segment_index")
)


# =====================================================
# 3) Pick the metric columns
#    - start from every numeric column
#    - remove the keys and the technical columns
# =====================================================

# "number" selects the same dtypes as np.number.
numeric_cols = list(fpw_6.select_dtypes(include="number").columns)

# Key columns plus the helper columns created in step 1 are not metrics.
cols_to_exclude = [
    "fact_player_wyscout_key",
    "date_key",
    "team_key",
    "player_key",
    "competition_key",
    "match_key",
    "half",
    "seg_in_half",
    "segment_index",
]

# Keep the original column order of numeric_cols.
metric_cols = list(filter(lambda col: col not in cols_to_exclude, numeric_cols))

print("\nNb de métriques utilisées :", len(metric_cols))
print("Exemple de métriques :", metric_cols[:20])


# =====================================================
# 4) Build X (segments 1 to 5) and y (segment 6)
# =====================================================

# X : rows of segments 1..5
fpw_feat = fpw_6[fpw_6["segment_index"] <= 5].copy()
# segment_index is float after the NaN-aware mapping; cast to int so the
# flattened column names read "_seg1" rather than "_seg1.0".
fpw_feat["segment_index"] = fpw_feat["segment_index"].astype(int)

# y : rows of segment 6.
# Step 2 only checks the number of *distinct* segments per (player, match),
# so a pair may still carry several segment-6 rows. X_pivot (aggfunc="first")
# and meta (drop_duplicates) already collapse duplicates; do the same here,
# otherwise the joins in step 5 would emit one dataset row per duplicate.
fpw_target = (
    fpw_6[fpw_6["segment_index"] == 6]
    .drop_duplicates(subset=["player_key", "match_key"], keep="first")
    .copy()
)

# ---- Pivot X : index = (player_key, match_key), columns = metric_seg{1..5}
# aggfunc="first" keeps the first non-null value per (pair, segment).
# NOTE: with pivot_table's default dropna=True, a metric/segment column that
# is entirely NaN is omitted from the result.
X_pivot = fpw_feat.pivot_table(
    index=["player_key", "match_key"],
    columns="segment_index",
    values=metric_cols,
    aggfunc="first",
)

# Flatten the (metric, segment) column MultiIndex into "metric_segN" names.
X_pivot.columns = [
    f"{metric}_seg{seg}"
    for (metric, seg) in X_pivot.columns.to_flat_index()
]

print("\nShape X_pivot (features) :", X_pivot.shape)

# ---- Pivot y : same index, columns = target_<metric> (values of segment 6)
y_pivot = (
    fpw_target
    .set_index(["player_key", "match_key"])[metric_cols]
    .add_prefix("target_")
)

# One target row per (player, match) is required for a 1:1 join with X.
assert y_pivot.index.is_unique, "y_pivot must have one row per (player_key, match_key)"

print("Shape y_pivot (targets) :", y_pivot.shape)


# =====================================================
# 5) Attach the metadata and build the final dataset
# =====================================================

meta_cols = [
    "player_key",
    "match_key",
    "team_key",
    "competition_key",
    "date_key",
    "position_type",
]

# One metadata row per (player, match), taken from the segment-6 rows.
meta = (
    fpw_target[meta_cols]
    .drop_duplicates(subset=["player_key", "match_key"])
    .set_index(["player_key", "match_key"])
)

# Inner joins on the (player_key, match_key) index: only pairs present in
# meta, X_pivot and y_pivot survive. reset_index() restores the keys as columns.
dataset = (
    meta
    .join(X_pivot, how="inner")
    .join(y_pivot, how="inner")
    .reset_index()
)

print("\nShape dataset final :", dataset.shape)

# Take the column lists straight from the frames that were joined instead of
# testing for the substring "_seg": a target column whose metric name happens
# to contain "_seg" would otherwise be counted as a feature (target leakage).
# The join keeps X_pivot's then y_pivot's columns in order, so the lists are
# ordered exactly as in dataset.columns.
feature_cols = X_pivot.columns.tolist()
target_cols = y_pivot.columns.tolist()

print("\nNb features (segments 1..5) :", len(feature_cols))
print("Nb targets (segment 6)       :", len(target_cols))
print("\nExemple features :", feature_cols[:10])
print("Exemple targets  :", target_cols[:10])


# =====================================================
# 6) Save the dataset for the modelling step
# =====================================================

out_path = MODEL_DIR.joinpath("wyscout_player_5x15_v1.csv")
# index=False: do not write the row index as an extra CSV column.
dataset.to_csv(out_path, encoding="utf-8", index=False)

print("\n✅ Dataset sauvegardé dans :", out_path)




fact_player_wyscout shape: (156881, 93)
Colonnes principales : ['fact_player_wyscout_key', 'team_name', 'date', 'match', 'competition', 'player', 'periode', 'segment', 'goalkeeper_but_coup_franc', 'goalkeeper_but_coup_franc_courtes', 'goalkeeper_but_coup_franc_longues', 'goalkeeper_buts_concedes', 'goalkeeper_passes_courtes_precises', 'goalkeeper_passes_courtes_total', 'goalkeeper_passes_longues_precises', 'goalkeeper_passes_longues_total', 'goalkeeper_place', 'goalkeeper_sorties_total', 'goalkeeper_tirs_contre_cadres', 'goalkeeper_tirs_contre_total']

Distrib `periode` :
periode
2éme mi temps    70852
1ére mi temps    64875
MATCH COMPLET    21154
Name: count, dtype: int64

Distrib `segment` :
segment
mitemps complet    36173
3éme 15 min        33776
2éme 15 min        33360
1ére 15 min        32418
MATCH COMPLET      21154
Name: count, dtype: int64

Distrib `segment_index` (1..6 attendus) :
segment_index
1.0    16148
2.0    16150
3.0    16208
4.0    16270
5.0    17210
6.0    17568
Nam

In [10]:
import pandas as pd
import numpy as np
from pathlib import Path

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error

# =====================================================
# 0) Paths & loading of the star-schema fact table
# =====================================================

# Every path is relative to the current working directory ("CSV" folder).
BASE_DIR = Path("CSV")
STAR_DIR = BASE_DIR.joinpath("star_schema")
MODEL_DIR = BASE_DIR.joinpath("model_data")
# Make sure the output folder exists before anything is written to it;
# exist_ok=True keeps the cell re-runnable.
MODEL_DIR.mkdir(exist_ok=True, parents=True)

# low_memory=False: let pandas infer each column's dtype from the whole file
# rather than chunk by chunk.
fpw = pd.read_csv(
    STAR_DIR.joinpath("fact_player_wyscout.csv"),
    low_memory=False,
)

# Quick look at what was loaded: dimensions and the first 20 column names.
print("fact_player_wyscout shape:", fpw.shape)
print("Colonnes principales :", list(fpw.columns[:20]))


# =====================================================
# 1) Build segment_index (1..6)
# =====================================================

# Labels missing from these lookup tables are turned into NaN by .map().
half_map = {"1ére mi temps": 1, "2éme mi temps": 2}
seg_map = {"1ére 15 min": 1, "2éme 15 min": 2, "3éme 15 min": 3}

fpw["half"] = fpw["periode"].map(half_map)
fpw["seg_in_half"] = fpw["segment"].map(seg_map)

# (half - 1) * 3 + seg_in_half: half 1 -> 1..3, half 2 -> 4..6.
# The result is NaN whenever either component is NaN.
fpw["segment_index"] = fpw["half"].sub(1).mul(3).add(fpw["seg_in_half"])

# Distribution of the raw labels and of the derived index.
print("\nDistrib `periode` :")
print(fpw["periode"].value_counts())

print("\nDistrib `segment` :")
print(fpw["segment"].value_counts())

print("\nDistrib `segment_index` (1..6 attendus) :")
print(fpw["segment_index"].value_counts().sort_index())


# =====================================================
# 2) Keep only the usable rows
#    - player_key, match_key and segment_index must be non-NaN
#    - only (player, match) pairs that have all 6 segments are kept
# =====================================================

# Row-wise "all three columns present" mask (same rows as a dropna on them).
fpw_valid = fpw[fpw[["player_key", "match_key", "segment_index"]].notna().all(axis=1)]

print("\nShape après filtre (player_key, match_key, segment_index non NaN) :", fpw_valid.shape)

# Number of distinct segments available for each (player, match) pair.
group_counts = (
    fpw_valid
    .groupby(["player_key", "match_key"])["segment_index"]
    .nunique()
)
print("\nNb de segments distincts par (player, match) :")
print(group_counts.value_counts().sort_index())

# (player, match) pairs for which exactly 6 distinct segments are present.
valid_pairs = group_counts.loc[group_counts.eq(6)].index

# Select the rows of those pairs through a (player_key, match_key) index,
# then turn the two keys back into ordinary columns.
fpw_6 = fpw_valid.set_index(["player_key", "match_key"]).loc[valid_pairs].reset_index()

print("\nShape fpw_6 (seulement (player,match) avec 6 segments) :", fpw_6.shape)

# Visual check: show the segments carried by the first valid pair.
example_pair = valid_pairs[0]
print("\nExemple pour player_key, match_key =", example_pair)
print(
    fpw_6.loc[
        fpw_6["player_key"].eq(example_pair[0]) & fpw_6["match_key"].eq(example_pair[1]),
        ["periode", "segment", "segment_index"],
    ].sort_values("segment_index")
)


# =====================================================
# 3) Pick the metric columns
#    -> the same variables are used for X (seg1..5) and Y (seg6)
# =====================================================

# "number" selects the same dtypes as np.number.
numeric_cols = list(fpw_6.select_dtypes(include="number").columns)

# Key columns plus the helper columns created in step 1 are not metrics.
cols_to_exclude = [
    "fact_player_wyscout_key",
    "date_key",
    "team_key",
    "player_key",
    "competition_key",
    "match_key",
    "half",
    "seg_in_half",
    "segment_index",
]

# Keep the original column order of numeric_cols.
metric_cols = list(filter(lambda col: col not in cols_to_exclude, numeric_cols))

print("\nNb de métriques utilisées :", len(metric_cols))
print("Exemple de métriques :", metric_cols[:20])


# =====================================================
# 4) Build X (segments 1 to 5) and Y (segment 6)
# =====================================================

# X : rows of segments 1..5
fpw_feat = fpw_6[fpw_6["segment_index"] <= 5].copy()
# segment_index is float after the NaN-aware mapping; cast to int so the
# flattened column names read "_seg1" rather than "_seg1.0".
fpw_feat["segment_index"] = fpw_feat["segment_index"].astype(int)

# Y : rows of segment 6.
# Step 2 only checks the number of *distinct* segments per (player, match),
# so a pair may still carry several segment-6 rows. X_pivot (aggfunc="first")
# and meta (drop_duplicates) already collapse duplicates; do the same here,
# otherwise the joins in step 5 would emit one dataset row per duplicate.
fpw_target = (
    fpw_6[fpw_6["segment_index"] == 6]
    .drop_duplicates(subset=["player_key", "match_key"], keep="first")
    .copy()
)

# Pivot X : index = (player_key, match_key), columns = metric_seg{1..5}
# aggfunc="first" keeps the first non-null value per (pair, segment).
# NOTE: with pivot_table's default dropna=True, a metric/segment column that
# is entirely NaN is omitted from the result.
X_pivot = fpw_feat.pivot_table(
    index=["player_key", "match_key"],
    columns="segment_index",
    values=metric_cols,
    aggfunc="first",
)

# Flatten the (metric, segment) column MultiIndex into "metric_segN" names.
X_pivot.columns = [
    f"{metric}_seg{seg}"
    for (metric, seg) in X_pivot.columns.to_flat_index()
]

print("\nShape X_pivot (features) :", X_pivot.shape)

# Pivot Y : same metrics, taken on segment 6 -> target_<metric>
Y_pivot = (
    fpw_target
    .set_index(["player_key", "match_key"])[metric_cols]
    .add_prefix("target_")
)

# One target row per (player, match) is required for a 1:1 join with X.
assert Y_pivot.index.is_unique, "Y_pivot must have one row per (player_key, match_key)"

print("Shape Y_pivot (multi-target) :", Y_pivot.shape)


# =====================================================
# 5) Attach the metadata and build the final dataset
# =====================================================

meta_cols = [
    "player_key",
    "match_key",
    "team_key",
    "competition_key",
    "date_key",
    "position_type",
]

# One metadata row per (player, match), taken from the segment-6 rows.
meta = (
    fpw_target[meta_cols]
    .drop_duplicates(subset=["player_key", "match_key"])
    .set_index(["player_key", "match_key"])
)

# Inner joins on the (player_key, match_key) index: only pairs present in
# meta, X_pivot and Y_pivot survive. reset_index() restores the keys as columns.
dataset = (
    meta
    .join(X_pivot, how="inner")
    .join(Y_pivot, how="inner")
    .reset_index()
)

print("\nShape dataset final :", dataset.shape)

# Take the column lists straight from the frames that were joined instead of
# testing for the substring "_seg": a target column whose metric name happens
# to contain "_seg" would otherwise be counted as a feature (target leakage).
# The join keeps X_pivot's then Y_pivot's columns in order, so the lists are
# ordered exactly as in dataset.columns.
feature_cols = X_pivot.columns.tolist()   # X
target_cols = Y_pivot.columns.tolist()    # Y (multi-target)

print("\nNb features (segments 1..5) :", len(feature_cols))
print("Nb targets (segment 6)       :", len(target_cols))
print("Exemple features :", feature_cols[:10])
print("Exemple targets  :", target_cols[:10])


# =====================================================
# 6) Save the dataset for the modelling step
# =====================================================

out_path = MODEL_DIR.joinpath("wyscout_player_5x15_v1.csv")
# index=False: do not write the row index as an extra CSV column.
dataset.to_csv(out_path, encoding="utf-8", index=False)
print("\n✅ Dataset sauvegardé dans :", out_path)


# =====================================================
# 7) Build X and Y (multi-target)
# =====================================================

# Reload the file written in step 6 so X / Y reflect what is on disk.
dataset = pd.read_csv(out_path)

# Feature columns were named "<metric>_seg<k>" with k in 1..5 (step 4) and
# target columns "target_<metric>". Match features on that exact suffix and
# never on a target column: the bare substring test `"_seg" in c` would also
# select a target whose metric name contains "_seg" and leak it into X.
seg_suffixes = tuple(f"_seg{seg}" for seg in range(1, 6))

target_cols = [c for c in dataset.columns if c.startswith("target_")]
feature_cols = [
    c for c in dataset.columns
    if c.endswith(seg_suffixes) and not c.startswith("target_")
]

X = dataset[feature_cols].copy()
Y = dataset[target_cols].copy()   # Y holds *all* the target variables (multi-output)

print("\nShape X :", X.shape)
print("Shape Y :", Y.shape)
print("Nb targets :", Y.shape[1])




fact_player_wyscout shape: (156881, 93)
Colonnes principales : ['fact_player_wyscout_key', 'team_name', 'date', 'match', 'competition', 'player', 'periode', 'segment', 'goalkeeper_but_coup_franc', 'goalkeeper_but_coup_franc_courtes', 'goalkeeper_but_coup_franc_longues', 'goalkeeper_buts_concedes', 'goalkeeper_passes_courtes_precises', 'goalkeeper_passes_courtes_total', 'goalkeeper_passes_longues_precises', 'goalkeeper_passes_longues_total', 'goalkeeper_place', 'goalkeeper_sorties_total', 'goalkeeper_tirs_contre_cadres', 'goalkeeper_tirs_contre_total']

Distrib `periode` :
periode
2éme mi temps    70852
1ére mi temps    64875
MATCH COMPLET    21154
Name: count, dtype: int64

Distrib `segment` :
segment
mitemps complet    36173
3éme 15 min        33776
2éme 15 min        33360
1ére 15 min        32418
MATCH COMPLET      21154
Name: count, dtype: int64

Distrib `segment_index` (1..6 attendus) :
segment_index
1.0    16148
2.0    16150
3.0    16208
4.0    16270
5.0    17210
6.0    17568
Nam

In [12]:
# Preview the first five rows of the multi-target frame (n=5 is head()'s default).
Y.head(n=5)

Unnamed: 0,target_goalkeeper_but_coup_franc,target_goalkeeper_but_coup_franc_courtes,target_goalkeeper_but_coup_franc_longues,target_goalkeeper_buts_concedes,target_goalkeeper_passes_courtes_precises,target_goalkeeper_passes_courtes_total,target_goalkeeper_passes_longues_precises,target_goalkeeper_passes_longues_total,target_goalkeeper_sorties_total,target_goalkeeper_tirs_contre_cadres,...,target_pass_passes_longues_precises,target_pass_passes_longues_total,target_pass_passes_precises,target_pass_passes_profondeur_precises,target_pass_passes_profondeur_total,target_pass_passes_tiers3_precises,target_pass_passes_tiers3_total,target_pass_passes_total,target_pass_secondes_passes_decisives,target_pass_xa
0,,,,,,,,,,,...,0.0,0.0,4.0,0.0,0.0,1.0,1.0,4.0,0.0,0.0
1,,,,,,,,,,,...,0.0,1.0,7.0,0.0,0.0,0.0,1.0,10.0,0.0,0.0
2,,,,,,,,,,,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
3,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,,,,,,,,,,,...,0.0,0.0,7.0,0.0,1.0,0.0,0.0,8.0,0.0,0.0
