In [2]:
from pathlib import Path
# Show where the kernel is running; the data paths in the next cell are derived from Path.cwd().
print("Current working directory:", Path.cwd())

Current working directory: c:\Users\User\Desktop\my-local-repo\sports_modelling\models


In [3]:
import numpy as np
import pandas as pd
from pathlib import Path

# The data folder is taken to be a sibling of the working directory (../data);
# the processed CSVs live in its "processed" subfolder.
data_dir = Path.cwd().parent.joinpath("data").resolve()
processed_dir = data_dir.joinpath("processed")
print(processed_dir)

# Helper to locate the project root: the nearest ancestor directory that contains 'data'
def find_project_root(start_path: Path) -> Path:
    """
    Walk upward from ``start_path`` until a directory containing a 'data' folder is found.

    Parameters
    ----------
    start_path : Path
        Directory to start searching from. It is resolved to an absolute path
        first, so a relative path still walks all the way up to the filesystem root.

    Returns
    -------
    Path
        The closest directory (``start_path`` itself or one of its ancestors)
        that has a 'data' subdirectory.

    Raises
    ------
    RuntimeError
        If neither ``start_path`` nor any ancestor contains a 'data' folder.
    """
    # A relative path such as Path("models") has almost no .parents, so the
    # search would stop early without resolving first.
    start_path = Path(start_path).resolve()
    for candidate in (start_path, *start_path.parents):
        # is_dir() rather than exists(): a plain file that happens to be called
        # 'data' must not be mistaken for the data folder.
        if (candidate / "data").is_dir():
            return candidate
    raise RuntimeError("Could not find project root containing a 'data' folder.")

# Load all requested leagues
leagues = ["ENG", "FRA", "SPA"]
league_dfs = {}

# The project root is the same for every league, so look it up once instead of
# re-walking the directory tree on each loop iteration.
processed_root = find_project_root(Path.cwd()) / "data" / "processed"

for code in leagues:
    path = processed_root / f"{code}_combined.csv"
    df = pd.read_csv(path)
    # Keep only rows with a positive Minute. The explicit .copy() makes each
    # league frame independent of the frame read from disk, so adding columns
    # to it later does not raise SettingWithCopyWarning.
    df = df[df["Minute"] > 0].copy()
    league_dfs[code] = df

# Unpack for convenience
premier_league_df = league_dfs["ENG"]
league1_df = league_dfs["FRA"]
laliga_df = league_dfs["SPA"]

print("Loaded leagues:", ", ".join(league_dfs.keys()))
print(f"ENG columns: {premier_league_df.columns.tolist()}")
print(f"FRA columns: {league1_df.columns.tolist()}")
print(f"SPA columns: {laliga_df.columns.tolist()}")

C:\Users\User\Desktop\my-local-repo\sports_modelling\data\processed
Loaded leagues: ENG, FRA, SPA
ENG columns: ['RBallID', 'HomeTeam', 'AwayTeam', 'Timestamp', 'Incident', 'IncidentNumber', 'Minute']
FRA columns: ['RBallID', 'HomeTeam', 'AwayTeam', 'Timestamp', 'Incident', 'IncidentNumber', 'Minute']
SPA columns: ['RBallID', 'HomeTeam', 'AwayTeam', 'Timestamp', 'Incident', 'IncidentNumber', 'Minute']


In [7]:
def assign_season(dt):
    """
    Return the season label ("YYYY-YYYY") for a date-like value.

    Dates in August or later belong to the season starting that calendar year;
    dates in January-July belong to the season that started the previous year.
    ``dt`` only needs ``.month`` and ``.year`` attributes.
    """
    start_year = dt.year if dt.month >= 8 else dt.year - 1
    return f"{start_year}-{start_year + 1}"

def create_basic_dataset(df: pd.DataFrame) -> pd.DataFrame:
    """
    Aggregate an incident-level event log into one row per match.

    The input frame is NOT modified: all derived columns are built on an
    internal working copy, so the function can be re-run on the same frame
    and the caller's data keeps its original columns.

    Parameters
    ----------
    df : pd.DataFrame
        Event log with at least the columns ``RBallID``, ``HomeTeam``,
        ``AwayTeam``, ``Timestamp`` and ``Incident``.

    Returns
    -------
    pd.DataFrame
        One row per (RBallID, HomeTeam, AwayTeam, Season) with a count of each
        tracked incident type, plus a ``Result`` column: "H" when home goals
        exceed away goals, "A" when away goals exceed home goals, else "D".
    """
    # Output column name -> incident code that is counted for it.
    incident_codes = {
        "Home Goals": "GOAL1",
        "Away Goals": "GOAL2",
        "Home Corners": "CR1",
        "Away Corners": "CR2",
        "Home Shots on Target": "SHG1",
        "Away Shots on Target": "SHG2",
        "Home Fouls": "F1",
        "Away Fouls": "F2",
        "Home Offsides": "O1",
        "Away Offsides": "O2",
        "Home Penalties": "PEN1",
        "Away Penalties": "PEN2",
        "Home Shots Blocked": "BLOCKED1",
        "Away Shots Blocked": "BLOCKED2",
        "Home Goal Kicks": "GK1",
        "Away Goal Kicks": "GK2",
        "Home Shots off Target": "SHB1",
        "Away Shots off Target": "SHB2",
        "Home Throwins": "TI1",
        "Away Throwins": "TI2",
        "Home Shots Woodwork": "SHW1",
        "Away Shots Woodwork": "SHW2",
    }
    group_keys = ["RBallID", "HomeTeam", "AwayTeam", "Season"]

    # Work on a private copy of the identifying columns so that the caller's
    # frame is never written to.
    events = df[["RBallID", "HomeTeam", "AwayTeam"]].copy()
    events["Season"] = pd.to_datetime(df["Timestamp"]).apply(assign_season)

    # One 0/1 indicator per incident type; summing them per match gives counts.
    for column, code in incident_codes.items():
        events[column] = (df["Incident"] == code).astype(int)

    agg_cols = list(incident_codes)
    aggregated = events.groupby(group_keys)[agg_cols].sum().reset_index()

    home_goals = aggregated["Home Goals"]
    away_goals = aggregated["Away Goals"]
    aggregated["Result"] = np.select(
        [home_goals > away_goals, home_goals < away_goals],
        ["H", "A"],
        default="D",
    )
    return aggregated

# Build one per-match summary table for each league's event log.
premier_basic_df, league1_basic_df, laliga_basic_df = (
    create_basic_dataset(league_events)
    for league_events in (premier_league_df, league1_df, laliga_df)
)


In [8]:
# Preview the first rows of the per-match Premier League table returned by create_basic_dataset.
premier_basic_df.head()

Unnamed: 0,RBallID,HomeTeam,AwayTeam,Season,Home Goals,Away Goals,Home Corners,Away Corners,Home Shots on Target,Away Shots on Target,...,Away Shots Blocked,Home Goal Kicks,Away Goal Kicks,Home Shots off Target,Away Shots off Target,Home Throwins,Away Throwins,Home Shots Woodwork,Away Shots Woodwork,Result
0,849137,Arsenal FC,Leicester City FC,2017-2018,4,3,9,4,6,1,...,0,6,12,10,2,21,32,0,0,H
1,849138,Watford FC,Liverpool FC,2017-2018,3,3,3,3,2,2,...,1,12,7,2,7,27,27,0,1,D
2,849139,Chelsea FC,Burnley FC,2017-2018,3,4,8,5,10,2,...,1,4,10,7,3,12,8,0,1,A
3,849140,Crystal Palace FC,Huddersfield Town FC,2017-2018,0,3,12,9,3,4,...,0,3,10,7,2,19,29,0,0,A
4,849141,Everton FC,Stoke City FC,2017-2018,1,0,6,7,3,2,...,2,10,8,3,4,25,28,0,0,H


In [13]:
def update_elo(home_elo, away_elo, result, k = 30, home_advantage = 80):
    """
    Return the post-match ratings ``(new_home_elo, new_away_elo)`` for one match.

    Parameters
    ----------
    home_elo, away_elo : number
        Ratings of the two teams before the match.
    result : str
        'H' for a home win, 'A' for an away win; any other value is scored as a draw.
    k : number
        Step size applied to the difference between actual and expected score.
    home_advantage : number
        Rating points added to the home side when computing the expected score
        (it does not change the stored rating itself).
    """
    # Gap as seen from the away side, after boosting the home rating.
    rating_gap = away_elo - (home_elo + home_advantage)
    expected_home = 1/(1 + 10 ** (rating_gap/400))
    expected_away = 1 - expected_home

    # Actual scores: win = 1, loss = 0, draw = 0.5; the two always sum to 1.
    score_home = 1 if result == 'H' else (0 if result == 'A' else 0.5)
    score_away = 1 - score_home

    return (
        home_elo + k * (score_home - expected_home),
        away_elo + k * (score_away - expected_away),
    )

In [18]:
# ---------------------------------------------
# Initialize parameters
# ---------------------------------------------
initial_elo = 1500
k = 30
home_advantage = 80

# Pre-match ELOs in processing order, plus the row label each pair belongs to.
# The labels are needed because matches are visited season by season, which is
# not necessarily the row order of premier_basic_df.
home_elos_list = []
away_elos_list = []
row_labels = []

# Process each season separately (every team restarts at initial_elo each season)
for season, season_df in premier_basic_df.groupby("Season", sort=True):
    # Get all teams in this season
    teams = set(season_df['HomeTeam']) | set(season_df['AwayTeam'])
    team_elos = dict.fromkeys(teams, initial_elo)

    # Iterate over matches in the frame's existing row order.
    # NOTE(review): this presumes the rows within a season are already in
    # chronological order - confirm, since the frame carries no match date.
    for idx, row in season_df.iterrows():
        home = row["HomeTeam"]
        away = row["AwayTeam"]
        result = row["Result"]

        home_elo = team_elos[home]
        away_elo = team_elos[away]

        # Record the ratings both teams had going INTO this match
        row_labels.append(idx)
        home_elos_list.append(home_elo)
        away_elos_list.append(away_elo)

        # Update ELOs
        new_home_elo, new_away_elo = update_elo(home_elo, away_elo, result,
                                                k=k, home_advantage=home_advantage)
        team_elos[home] = new_home_elo
        team_elos[away] = new_away_elo

# Assign back to dataframe by row label rather than by position, so every
# rating lands on the match it was computed for even when the season-grouped
# processing order differs from the frame's row order.
# (Requires unique row labels, which holds for a frame built with reset_index().)
premier_basic_df['HomeElo'] = pd.Series(home_elos_list, index=row_labels, dtype=float)
premier_basic_df['AwayElo'] = pd.Series(away_elos_list, index=row_labels, dtype=float)

premier_basic_df[premier_basic_df['Season'] == "2017-2018"].tail(50)

Unnamed: 0,RBallID,HomeTeam,AwayTeam,Season,Home Goals,Away Goals,Home Corners,Away Corners,Home Shots on Target,Away Shots on Target,...,Away Goal Kicks,Home Shots off Target,Away Shots off Target,Home Throwins,Away Throwins,Home Shots Woodwork,Away Shots Woodwork,Result,HomeElo,AwayElo
323,958790,Arsenal FC,Southampton FC,2017-2018,3,3,8,6,5,7,...,5,3,2,24,20,0,0,D,1515.73638,1403.992051
324,961178,Burnley FC,Leicester City FC,2017-2018,2,1,4,8,1,4,...,7,2,3,31,31,0,0,H,1539.558404,1518.393279
325,961179,Crystal Palace FC,Brighton & Hove Albion FC,2017-2018,3,2,5,6,4,3,...,9,4,5,16,28,0,0,H,1449.532957,1443.066765
326,961180,Huddersfield Town FC,Watford FC,2017-2018,1,0,9,6,0,1,...,8,4,0,27,28,0,0,H,1396.938946,1414.765687
327,961181,Swansea City AFC,Everton FC,2017-2018,1,1,6,3,7,4,...,6,5,5,29,19,0,1,D,1446.161331,1471.920277
328,961182,Liverpool FC,AFC Bournemouth,2017-2018,3,0,7,5,4,1,...,14,11,4,14,17,0,0,H,1661.535911,1474.133167
329,961183,Tottenham Hotspur FC,Manchester City FC,2017-2018,1,3,5,7,3,5,...,7,3,6,20,19,0,1,A,1668.03838,1706.458091
330,962194,Newcastle United FC,Arsenal FC,2017-2018,2,1,2,5,4,4,...,6,2,7,18,22,1,0,H,1492.605507,1508.20741
331,962195,Manchester United FC,West Bromwich Albion FC,2017-2018,0,1,4,4,4,3,...,12,5,3,17,16,0,0,A,1695.105103,1318.59899
332,962866,West Ham United FC,Stoke City FC,2017-2018,4,1,10,2,6,5,...,5,5,2,31,17,0,0,H,1447.075165,1387.387724
