In [6]:
import pandas as pd

df = pd.read_csv("../data/raw/Arsenal/players.csv")

df.isnull().sum()


Player     2
Nation     2
Pos        0
Age        0
MP         0
Starts     0
Min       13
90s       15
Gls       15
Ast       15
G+A       15
G-PK      15
PK        15
PKatt     15
CrdY      15
CrdR      15
dtype: int64

# Transformation des données Footballistiques

Ce notebook est dédié au **nettoyage et à la standardisation** des données brutes extraites pour chaque équipe de la Premier League.


In [22]:
import os
import pandas as pd
import numpy as np

# Path to raw data folders
data_path = "../data/raw"
teams = os.listdir(data_path)

players_file = os.path.join(data_path, "liverpool/players.csv")

    # Load raw data
df_players = pd.read_csv(players_file)
new = {}
# if "Nation" in df_players.columns:
#         new["Nation"] = df_players["Nation"].str.extract(r'([A-Z]{3})')

new = {}

if "Pos" in df_players.columns:
            positions = ["DF", "MF", "FW", "GK"]
            for pos in positions:
                df_players[f"is_{pos.lower()}"] = df_players["Pos"].apply(
                    lambda x: pos in str(x).split(",") if pd.notna(x) else False
                )
print(df_players["is_df"])
# print(len(teams) , "\n" , teams)



0     False
1      True
2     False
3     False
4      True
5     False
6      True
7     False
8     False
9      True
10    False
11     True
12    False
13    False
14     True
15    False
16     True
17     True
18     True
19    False
20     True
21    False
22    False
23    False
24    False
25    False
26    False
27     True
28    False
Name: is_df, dtype: bool


In [None]:
for team in teams:
    team_folder = os.path.join(data_path, team)
    players_file = os.path.join(team_folder, "players.csv")
    matches_file = os.path.join(team_folder, "matches.csv")


In [28]:
import os
import pandas as pd
import numpy as np

# === Paths ===
base_path = "../data/raw"
cleaned_path = "../data/cleaned"
os.makedirs(cleaned_path, exist_ok=True)

# Path to raw data folders
teams = os.listdir(base_path)


# === Helper function for cleaning strings ===
def clean_string(text):
    if isinstance(text, str):
        return text.strip().replace("\n", " ").replace("\xa0", " ")
    return text


# === Loop through all teams ===
for team_folder in teams:
    team_path = os.path.join(base_path, team_folder)
    if not os.path.isdir(team_path):
        continue

    print(f"\n Cleaning data for team: {team_folder}")

    #  1. Clean MATCHES

    matches_path = os.path.join(team_path, "matches.csv")
    if os.path.exists(matches_path):
        matches = pd.read_csv(matches_path)

        # --- Clean column names ---
        matches.columns = [c.strip().replace("\n", "_").replace(" ", "_") for c in matches.columns]
        matches = matches.applymap(clean_string)

        # --- Handle missing or placeholder values ---
        matches.replace(["", "NaN", "None", "-", "--"], np.nan, inplace=True)

        # --- Fix the Time column keep first time only 
        if "Time" in matches.columns:
            matches["Time"] = matches["Time"].astype(str).str.replace(r"\s*\(.*?\)", "", regex=True).str.strip()

        # --- Combine Date + Time into datetime ---
        if {"Date", "Time"}.issubset(matches.columns):
            matches["datetime"] = pd.to_datetime(
                matches["Date"].astype(str) + " " + matches["Time"].astype(str),
                errors="coerce"
            )
            matches.drop(["Date", "Time"], axis=1, inplace=True)

        # --- Standardize Round column (clean text) ---
        if "Round" in matches.columns:
            matches["Round"] = matches["Round"].str.replace("round proper", "Round", case=False)
            matches["Round"] = matches["Round"].str.replace("Matchweek", "Week", case=False)
            matches["Round"] = matches["Round"].str.strip()

        # --- Clean GF and GA (remove parentheses content like '1 (4)' -> '1') ---
        for col in ["GF", "GA"]:
            if col in matches.columns:
                matches[col] = matches[col].astype(str).str.extract(r"^(\d+)")[0]
                matches[col] = pd.to_numeric(matches[col], errors="coerce").fillna(0).astype(int)

        # --- Clean Attendance (remove commas and convert to int) ---
        if "Attendance" in matches.columns:
            matches["Attendance"] = (
                matches["Attendance"]
                .astype(str)
                .str.replace(",", "", regex=False)
                .replace("nan", np.nan)
            )
            matches["Attendance"] = pd.to_numeric(matches["Attendance"], errors="coerce").fillna(0).astype(int)

        # --- Rename app formation column if exists ---
        for col in matches.columns:
            if "Opp Formation" in col:
                matches.rename(columns={col: "Opp_Formation"}, inplace=True)

        # --- Convert numeric columns ---
        numeric_cols = ["xG", "xGA" , "Poss"]
        for col in numeric_cols:
            if col in matches.columns:
                matches[col] = pd.to_numeric(matches[col], errors="coerce").fillna(0)

        # --- Handle missing categorical data ---
        for cat_col in ["Referee", "Captain", "Opponent", "Venue", "Result", "Comp", "Round"]:
            if cat_col in matches.columns:
                matches[cat_col] = matches[cat_col].fillna("Unknown")

        # --- Save cleaned matches ---
        clean_team_path = os.path.join(cleaned_path, team_folder)
        os.makedirs(clean_team_path, exist_ok=True)
        matches.to_csv(os.path.join(clean_team_path, "matches.csv"), index=False)
        print(f"✅ Matches cleaned for {team_folder}")

    # 2. Clean PLAYERS

    players_path = os.path.join(team_path, "players.csv")
    if os.path.exists(players_path):
        players = pd.read_csv(players_path)

        # --- Clean column names ---
        players.columns = [c.strip().replace("\n", "_").replace(" ", "_") for c in players.columns]
        players = players.applymap(clean_string)

        # --- Handle missing values ---
        players.replace(["", "NaN", "None", "-", "--"], np.nan, inplace=True)

        # --- Extract Nation 3-letter code ---
        if "Nation" in players.columns:
            players["Nation"] = players["Nation"].str.extract(r"([A-Z]{3})")

  
        # --- Convert numeric columns ---
        numeric_cols = ["Age", "MP", "Starts", "Min", "90s", "Gls", "Ast", "G-PK", "PK", "PKatt", "CrdY", "CrdR" , "G+A"]
        for col in numeric_cols:
            if col in players.columns:
                players[col] = pd.to_numeric(players[col], errors="coerce").fillna(0)

        # --- Handle missing categorical values ---
        for cat_col in ["Player", "Nation", "Pos"]:
            if cat_col in players.columns:
                players[cat_col] = players[cat_col].fillna("Unknown")

        # --- Save cleaned players ---
        clean_team_path = os.path.join(cleaned_path, team_folder)
        os.makedirs(clean_team_path, exist_ok=True)
        players.to_csv(os.path.join(clean_team_path, "players.csv"), index=False)
        print(f"✅ Players cleaned for {team_folder}")

print("\n All teams cleaned and saved successfully to:", cleaned_path)



 Cleaning data for team: Arsenal
✅ Matches cleaned for Arsenal
✅ Players cleaned for Arsenal

 Cleaning data for team: Aston_Villa
✅ Matches cleaned for Aston_Villa
✅ Players cleaned for Aston_Villa

 Cleaning data for team: Bournemouth
✅ Matches cleaned for Bournemouth
✅ Players cleaned for Bournemouth

 Cleaning data for team: Brentford


  matches = matches.applymap(clean_string)
  players = players.applymap(clean_string)
  matches = matches.applymap(clean_string)
  players = players.applymap(clean_string)
  matches = matches.applymap(clean_string)
  players = players.applymap(clean_string)
  matches = matches.applymap(clean_string)


✅ Matches cleaned for Brentford
✅ Players cleaned for Brentford

 Cleaning data for team: Brighton
✅ Matches cleaned for Brighton
✅ Players cleaned for Brighton

 Cleaning data for team: Chelsea
✅ Matches cleaned for Chelsea
✅ Players cleaned for Chelsea

 Cleaning data for team: Crystal_Palace
✅ Matches cleaned for Crystal_Palace


  players = players.applymap(clean_string)
  matches = matches.applymap(clean_string)
  players = players.applymap(clean_string)
  matches = matches.applymap(clean_string)
  players = players.applymap(clean_string)
  matches = matches.applymap(clean_string)
  players = players.applymap(clean_string)


✅ Players cleaned for Crystal_Palace

 Cleaning data for team: Everton
✅ Matches cleaned for Everton
✅ Players cleaned for Everton

 Cleaning data for team: Fulham
✅ Matches cleaned for Fulham
✅ Players cleaned for Fulham

 Cleaning data for team: Ipswich_Town
✅ Matches cleaned for Ipswich_Town
✅ Players cleaned for Ipswich_Town

 Cleaning data for team: Leicester_City
✅ Matches cleaned for Leicester_City


  matches = matches.applymap(clean_string)
  players = players.applymap(clean_string)
  matches = matches.applymap(clean_string)
  players = players.applymap(clean_string)
  matches = matches.applymap(clean_string)
  players = players.applymap(clean_string)
  matches = matches.applymap(clean_string)
  players = players.applymap(clean_string)
  matches = matches.applymap(clean_string)
  players = players.applymap(clean_string)
  matches = matches.applymap(clean_string)
  players = players.applymap(clean_string)
  matches = matches.applymap(clean_string)
  players = players.applymap(clean_string)
  matches = matches.applymap(clean_string)
  players = players.applymap(clean_string)


✅ Players cleaned for Leicester_City

 Cleaning data for team: Liverpool
✅ Matches cleaned for Liverpool
✅ Players cleaned for Liverpool

 Cleaning data for team: Manchester_City
✅ Matches cleaned for Manchester_City
✅ Players cleaned for Manchester_City

 Cleaning data for team: Manchester_Utd
✅ Matches cleaned for Manchester_Utd
✅ Players cleaned for Manchester_Utd

 Cleaning data for team: Newcastle_Utd
✅ Matches cleaned for Newcastle_Utd
✅ Players cleaned for Newcastle_Utd

 Cleaning data for team: Nott'ham_Forest
✅ Matches cleaned for Nott'ham_Forest
✅ Players cleaned for Nott'ham_Forest

 Cleaning data for team: Southampton
✅ Matches cleaned for Southampton
✅ Players cleaned for Southampton

 Cleaning data for team: Tottenham
✅ Matches cleaned for Tottenham
✅ Players cleaned for Tottenham

 Cleaning data for team: West_Ham
✅ Matches cleaned for West_Ham
✅ Players cleaned for West_Ham

 Cleaning data for team: Wolves
✅ Matches cleaned for Wolves
✅ Players cleaned for Wolves

 All 

  matches = matches.applymap(clean_string)
  players = players.applymap(clean_string)
  matches = matches.applymap(clean_string)
  players = players.applymap(clean_string)
  matches = matches.applymap(clean_string)
  players = players.applymap(clean_string)
  matches = matches.applymap(clean_string)
  players = players.applymap(clean_string)
  matches = matches.applymap(clean_string)
  players = players.applymap(clean_string)


In [52]:
import os
import pandas as pd

# === Paths ===
BASE_DIR = "../data/cleaned"      # Folder where each team has its own subfolder
READY_DIR = "../data/ready"
os.makedirs(READY_DIR, exist_ok=True)

all_players = []
all_matches = []
teams = []

# === Loop through team folders ===
for team_name in os.listdir(BASE_DIR):
    team_folder = os.path.join(BASE_DIR, team_name)
    if not os.path.isdir(team_folder):
        continue

    # Save team info
    clean_team_name = team_name.replace("_", " ").strip().title() 

    teams.append({"team_name": clean_team_name})
    # teams.append({"team_name": team_name})

    players_path = os.path.join(team_folder, "players.csv")
    matches_path = os.path.join(team_folder, "matches.csv")

    if os.path.exists(players_path):
        players_df = pd.read_csv(players_path)
        players_df["team_name"] = clean_team_name
        all_players.append(players_df)

    if os.path.exists(matches_path):
        matches_df = pd.read_csv(matches_path)
    
        # Keep only Premier League matches
        if "Comp" in matches_df.columns:
            matches_df = matches_df[matches_df["Comp"].str.lower().eq("premier league")]
    
        # Keep only 'Home' matches to avoid duplicates
        if "Venue" in matches_df.columns:
            matches_df = matches_df[matches_df["Venue"].str.lower().eq("home")]
    
        # Add team name (for reference)
        matches_df["team_name"] = clean_team_name
    
        all_matches.append(matches_df)



# === Concatenate ===
all_players_df = pd.concat(all_players, ignore_index=True)
all_matches_df = pd.concat(all_matches, ignore_index=True)

# === Create Teams DataFrame ===
teams_df = pd.DataFrame(teams).drop_duplicates().reset_index(drop=True)
teams_df.insert(0, "team_id", range(1, len(teams_df) + 1))

print(f"Players shape: {all_players_df.shape}")
print(f"Matches shape: {all_matches_df.shape}")
print(f"Teams shape: {teams_df.shape}")

# === Optional: save for the ready phase ===
all_players_df.to_csv(os.path.join(READY_DIR, "all_players.csv"), index=False)
all_matches_df.to_csv(os.path.join(READY_DIR, "all_matches.csv"), index=False)
teams_df.to_csv(os.path.join(READY_DIR, "teams.csv"), index=False)


Players shape: (702, 17)
Matches shape: (380, 18)
Teams shape: (20, 2)


In [55]:
import pandas as pd
import os
import re

READY_DIR = "../data/ready"

# === Load data ===
all_players_df = pd.read_csv(os.path.join(READY_DIR, "all_players.csv"))
all_matches_df = pd.read_csv(os.path.join(READY_DIR, "all_matches.csv"))
teams_df = pd.read_csv(os.path.join(READY_DIR, "teams.csv"))

# 1️⃣  SAISON TABLE

saison_df = pd.DataFrame([{"saison_id": 1, "year": "2024-2025"}])

# 2️⃣  COMPETITION TABLE

competition_names = all_matches_df["Comp"].dropna().unique()
competition_df = pd.DataFrame({
    "competition_id": range(1, len(competition_names) + 1),
    "competition_name": competition_names
})

# 3️⃣  TEAM TABLE (link to saison and competition)
# Assume all teams play in the same competition & season
teams_df["saison_id"] = 1
# teams_df["competition_id"] = 1  # Premier League for now

# 4️⃣  PLAYER + PLAYER_STATISTICS TABLES
# Player identity
player_df = all_players_df[["Player", "Nation", "Age", "Pos", "team_name"]].drop_duplicates()
player_df = player_df.merge(teams_df[["team_id", "team_name"]], on="team_name", how="left")
player_df.insert(0, "player_id", range(1, len(player_df) + 1))
player_df.drop(columns=["team_name"], inplace=True)

# Player statistics (aggregate-level, not per match)
stat_cols = [c for c in all_players_df.columns if c not in ["Player", "Nation", "Age", "Pos", "team_name"]]
player_statistics_df = all_players_df.merge(
    player_df[["player_id", "Player"]], on="Player", how="left"
)[["player_id"] + stat_cols]



# 5️⃣ MATCH TABLE
# Create match table using only the "Home" matches we kept earlier

match_df = all_matches_df.copy()

# Merge to get the team_id for the main team (the one whose folder we scraped)
match_df = match_df.merge(
    teams_df[["team_id", "team_name"]],
    on="team_name",
    how="left"
)

# Clean and map opponent name to team_id (if opponent exists in teams_df)
match_df["Opponent"] = match_df["Opponent"].astype(str).str.strip()
match_df = match_df.merge(
    teams_df.rename(columns={"team_id": "opponent_id", "team_name": "Opponent"}),
    on="Opponent",
    how="left"
)

# Create match_id
match_df.insert(0, "match_id", range(1, len(match_df) + 1))

# Standardize attendance (remove commas and cast to int)
if "Attendance" in match_df.columns:
    match_df["Attendance"] = (
        match_df["Attendance"].astype(str).str.replace(",", "", regex=False).replace("", None)
    )
    match_df["Attendance"] = pd.to_numeric(match_df["Attendance"], errors="coerce")

# Assign saison and competition (for now we assume single season & competition)
match_df["saison_id"] = 1
match_df["competition_id"] = 1

# Select columns relevant to the 'match' table
match_cols = [
    "match_id", "team_id", "opponent_id", "datetime",
    "Attendance", "Referee", "saison_id", "competition_id"
]
# match_df = match_df[match_cols]

# 6️⃣ MATCH RESULT TABLE
# Each match_id has its result stats (GF, GA, etc.)

match_result_df = all_matches_df.copy()

# Merge with match_df to get the match_id for each row
match_result_df = match_result_df.merge(
    teams_df[["team_id", "team_name"]],
    on="team_name",
    how="left"
)



match_result_df = match_result_df.merge(
    match_df[["match_id", "datetime", "team_id"]],
    on=["datetime", "team_id"],
    how="left"
)

# Keep relevant result columns
result_cols = ["match_id", "GF", "GA", "Result", "xG", "xGA"]
match_result_df = match_result_df[result_cols].drop_duplicates().reset_index(drop=True)

#  SAVE ALL TABLES

READY_DIR = "../data/processed"


saison_df.to_csv(os.path.join(READY_DIR, "saison.csv"), index=False)
competition_df.to_csv(os.path.join(READY_DIR, "competition.csv"), index=False)
teams_df.to_csv(os.path.join(READY_DIR, "team.csv"), index=False)
player_df.to_csv(os.path.join(READY_DIR, "player.csv"), index=False)
player_statistics_df.to_csv(os.path.join(READY_DIR, "player_statistics.csv"), index=False)
match_df.to_csv(os.path.join(READY_DIR, "match.csv"), index=False)
match_result_df.to_csv(os.path.join(READY_DIR, "match_result.csv"), index=False)

print("✅ All relational DataFrames successfully created and saved!")


✅ All relational DataFrames successfully created and saved!
