In [79]:
import pandas as pd

df = pd.read_csv("../data/cleaned_premier_league.csv")
print(df.shape)
print(df.columns)
print(df.head)

(9500, 26)
Index(['MatchID', 'Season', 'MatchWeek', 'Date', 'HomeTeam', 'AwayTeam',
       'FullTimeHomeTeamGoals', 'FullTimeAwayTeamGoals', 'FullTimeResult',
       'HalfTimeHomeTeamGoals', 'HalfTimeAwayTeamGoals', 'HalfTimeResult',
       'HomeTeamShots', 'AwayTeamShots', 'HomeTeamShotsOnTarget',
       'AwayTeamShotsOnTarget', 'HomeTeamCorners', 'AwayTeamCorners',
       'HomeTeamFouls', 'AwayTeamFouls', 'HomeTeamYellowCards',
       'AwayTeamYellowCards', 'HomeTeamRedCards', 'AwayTeamRedCards',
       'HomeTeamPoints', 'AwayTeamPoints'],
      dtype='object')
<bound method NDFrame.head of                                MatchID     Season  MatchWeek        Date  \
0          2000-2001_Charlton_Man City  2000-2001          1  2000-08-19   
1           2000-2001_Chelsea_West Ham  2000-2001          1  2000-08-19   
2     2000-2001_Coventry_Middlesbrough  2000-2001          1  2000-08-19   
3          2000-2001_Derby_Southampton  2000-2001          1  2000-08-19   
4              2000-

In [80]:
# rolling average goal scoring form in last 5 games for both Home and Away
df = df.sort_values("Date").reset_index(drop=True)

df["HomeGoalScoringForm"] = (
    df.groupby("HomeTeam")["FullTimeHomeTeamGoals"]
    .rolling(window=5, min_periods=1)
    .mean()
    .shift(1)
    .reset_index(level=0, drop=True)
)

df["AwayGoalScoringForm"] = (
    df.groupby("AwayTeam")["FullTimeAwayTeamGoals"]
    .rolling(window=5, min_periods=1)
    .mean()
    .shift(1)
    .reset_index(level=0, drop=True)
)

print(df["HomeGoalScoringForm"])
print(df["AwayGoalScoringForm"])

0       1.0
1       0.8
2       1.6
3       1.6
4       0.8
       ... 
9495    0.8
9496    1.4
9497    0.8
9498    2.6
9499    1.8
Name: HomeGoalScoringForm, Length: 9500, dtype: float64
0       0.8
1       1.8
2       1.0
3       0.8
4       0.6
       ... 
9495    1.2
9496    0.6
9497    0.8
9498    1.4
9499    1.4
Name: AwayGoalScoringForm, Length: 9500, dtype: float64


In [81]:
# rolling average goals conceded in last 5 games for both Home and Away
df = df.sort_values("Date").reset_index(drop=True)

df["HomeGoalsConceded"] = (
    df.groupby("HomeTeam")["FullTimeAwayTeamGoals"]
    .rolling(window=5, min_periods=1)
    .mean()
    .shift(1)
    .reset_index(level=0, drop=True)
)

df["AwayGoalsConceded"] = (
    df.groupby("AwayTeam")["FullTimeHomeTeamGoals"]
    .rolling(window=5, min_periods=1)
    .mean()
    .shift(1)
    .reset_index(level=0, drop=True)
)

print(df["HomeGoalsConceded"])
print(df["AwayGoalsConceded"])

0       2.0
1       0.6
2       0.6
3       0.8
4       2.8
       ... 
9495    1.0
9496    1.6
9497    0.4
9498    1.2
9499    1.0
Name: HomeGoalsConceded, Length: 9500, dtype: float64
0       2.8
1       2.6
2       2.2
3       3.0
4       2.6
       ... 
9495    1.2
9496    0.8
9497    0.8
9498    0.4
9499    0.8
Name: AwayGoalsConceded, Length: 9500, dtype: float64


In [82]:
# rolling average points in last 5 matches for both Home and Away
df = df.sort_values("Date").reset_index(drop=True)

df["HomeWinningForm"] = (
    df.groupby("HomeTeam")["HomeTeamPoints"]
    .rolling(window=5, min_periods=1)
    .mean()
    .shift(1)
    .reset_index(level=0, drop=True)
)

df["AwayWinningForm"] = (
    df.groupby("AwayTeam")["AwayTeamPoints"]
    .rolling(window=5, min_periods=1)
    .mean()
    .shift(1)
    .reset_index(level=0, drop=True)
)

print(df["HomeWinningForm"])
print(df["AwayWinningForm"])




0       0.6
1       1.6
2       2.6
3       2.2
4       0.0
       ... 
9495    0.8
9496    0.0
9497    1.2
9498    0.8
9499    2.0
Name: HomeWinningForm, Length: 9500, dtype: float64
0       0.0
1       0.8
2       0.2
3       0.0
4       0.0
       ... 
9495    0.4
9496    0.8
9497    1.6
9498    1.4
9499    2.0
Name: AwayWinningForm, Length: 9500, dtype: float64


In [83]:
# overall average recent form for goals scored, goals conceded and points won in the last 5 games

home = df[["Date", "HomeTeam", "FullTimeHomeTeamGoals", "FullTimeAwayTeamGoals", "HomeTeamPoints"]].copy()
home.rename(
    columns={
        "HomeTeam": "Team",
        "FullTimeHomeTeamGoals": "GoalsScored",
        "FullTimeAwayTeamGoals": "GoalsConceded",
        "HomeTeamPoints": "Points",
    },
    inplace=True,
)

away = df[["Date", "AwayTeam", "FullTimeAwayTeamGoals", "FullTimeHomeTeamGoals", "AwayTeamPoints"]].copy()
away.rename(
    columns={
        "AwayTeam": "Team",
        "FullTimeAwayTeamGoals": "GoalsScored",
        "FullTimeHomeTeamGoals": "GoalsConceded",
        "AwayTeamPoints": "Points"
    },
    inplace=True,
)

# build combined team-level match history
combined_df = pd.concat([home, away], ignore_index=True)
combined_df.sort_values(["Team", "Date"], inplace=True)

print(combined_df)

combined_df["RecentGoalScoringForm"] = (
    combined_df.groupby("Team")["GoalsScored"]
    .rolling(window=5, min_periods=1)
    .mean()
    .shift(1)
    .reset_index(level=0, drop=True)
)

combined_df["RecentGoalsConceded"] = (
    combined_df.groupby("Team")["GoalsConceded"]
    .rolling(window=5, min_periods=1)
    .mean()
    .shift(1)
    .reset_index(level=0, drop=True)
)

combined_df["RecentWinningForm"] = (
    combined_df.groupby("Team")["Points"]
    .rolling(window=5, min_periods=1)
    .mean()
    .shift(1)
    .reset_index(level=0, drop=True)
)

print(combined_df["RecentGoalScoringForm"])
print(combined_df["RecentGoalsConceded"])
print(combined_df["RecentWinningForm"])

# merge back into main dataset
# Home team recent form
home_recent = combined_df.copy()
home_recent.rename(columns={
    "Team": "HomeTeam",
    "RecentGoalScoringForm": "HomeRecentGoalScoringForm",
    "RecentGoalsConceded": "HomeRecentGoalsConceded",
    "RecentWinningForm": "HomeRecentWinningForm"
}, inplace=True)

home_recent = home_recent[["Date", "HomeTeam", "HomeRecentGoalScoringForm", "HomeRecentGoalsConceded", "HomeRecentWinningForm"]]

df = df.merge(home_recent, on=["Date", "HomeTeam"], how="left")

# Away team recent form
away_recent = combined_df.copy()
away_recent.rename(columns={
    "Team": "AwayTeam",
    "RecentGoalScoringForm": "AwayRecentGoalScoringForm",
    "RecentGoalsConceded": "AwayRecentGoalsConceded",
    "RecentWinningForm": "AwayRecentWinningForm"
}, inplace=True)

away_recent = away_recent[["Date", "AwayTeam", "AwayRecentGoalScoringForm", "AwayRecentGoalsConceded", "AwayRecentWinningForm"]]

df = df.merge(away_recent, on=["Date", "AwayTeam"], how="left")

print(df.columns)

# Fill na with 0 since first match has no form precedence
df.fillna(0, inplace=True)

             Date     Team  GoalsScored  GoalsConceded  Points
9507   2000-08-19  Arsenal            0              1       0
10     2000-08-21  Arsenal            2              0       3
22     2000-08-26  Arsenal            5              3       3
9538   2000-09-06  Arsenal            2              2       1
9539   2000-09-09  Arsenal            1              1       1
...           ...      ...          ...            ...     ...
9452   2025-04-26   Wolves            3              0       3
18960  2025-05-02   Wolves            0              1       0
9470   2025-05-10   Wolves            0              2       0
18988  2025-05-20   Wolves            2              4       0
9499   2025-05-25   Wolves            1              1       1

[19000 rows x 5 columns]
9507          NaN
10       0.000000
22       1.000000
9538     2.333333
9539     2.250000
           ...   
9452     2.000000
18960    2.200000
9470     2.000000
18988    1.600000
9499     1.200000
Name: RecentGoalScor

In [84]:
# days since last match played

# build team-level match history log
home = df[["Date", "HomeTeam"]].copy()
home.rename(columns={"HomeTeam": "Team"}, inplace=True)

away = df[["Date", "AwayTeam"]].copy()
away.rename(columns={"AwayTeam": "Team"}, inplace=True)

# combine home and away into one team match history
team_matches = pd.concat([home, away], ignore_index=True)

team_matches = team_matches.sort_values(["Team", "Date"]).reset_index(drop=True)

team_matches["LastMatchDate"] = team_matches.groupby("Team")["Date"].shift(1)

team_matches["DaysSinceLastMatch"] = (
    pd.to_datetime(team_matches["Date"]) - pd.to_datetime(team_matches["LastMatchDate"])
).dt.days

print(team_matches["DaysSinceLastMatch"])
print(team_matches["DaysSinceLastMatch"].median())


# merge back into main dataset
home_rest = team_matches.rename(
    columns={
        "Team": "HomeTeam", 
        "DaysSinceLastMatch": "HomeDaysSinceLastMatch"
    })[["Date", "HomeTeam", "HomeDaysSinceLastMatch"]]
df = df.merge(home_rest, on=["Date", "HomeTeam"], how="left")

away_rest = team_matches.rename(
    columns={
        "Team": "AwayTeam", 
        "DaysSinceLastMatch": "AwayDaysSinceLastMatch"
    })[["Date", "AwayTeam", "AwayDaysSinceLastMatch"]]
df = df.merge(away_rest, on=["Date", "AwayTeam"], how="left")

print(df.columns)

# impute first match rest day with median value of 7 since no prior match
df["HomeDaysSinceLastMatch"].fillna(7, inplace=True)
df["AwayDaysSinceLastMatch"].fillna(7, inplace=True)


0         NaN
1         2.0
2         5.0
3        11.0
4         3.0
         ... 
18995     6.0
18996     6.0
18997     8.0
18998    10.0
18999     5.0
Name: DaysSinceLastMatch, Length: 19000, dtype: float64
7.0
Index(['MatchID', 'Season', 'MatchWeek', 'Date', 'HomeTeam', 'AwayTeam',
       'FullTimeHomeTeamGoals', 'FullTimeAwayTeamGoals', 'FullTimeResult',
       'HalfTimeHomeTeamGoals', 'HalfTimeAwayTeamGoals', 'HalfTimeResult',
       'HomeTeamShots', 'AwayTeamShots', 'HomeTeamShotsOnTarget',
       'AwayTeamShotsOnTarget', 'HomeTeamCorners', 'AwayTeamCorners',
       'HomeTeamFouls', 'AwayTeamFouls', 'HomeTeamYellowCards',
       'AwayTeamYellowCards', 'HomeTeamRedCards', 'AwayTeamRedCards',
       'HomeTeamPoints', 'AwayTeamPoints', 'HomeGoalScoringForm',
       'AwayGoalScoringForm', 'HomeGoalsConceded', 'AwayGoalsConceded',
       'HomeWinningForm', 'AwayWinningForm', 'HomeRecentGoalScoringForm',
       'HomeRecentGoalsConceded', 'HomeRecentWinningForm',
       'AwayRecentGoal

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["HomeDaysSinceLastMatch"].fillna(7, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["AwayDaysSinceLastMatch"].fillna(7, inplace=True)


In [85]:
# building the ELO system to estimate team strength and rankings

# starting elo for all teams
BASE_ELO = 1500

# sensitivity factor
K = 30

# elo points added for home advantage
HOME_ADVANTAGE = 50

elo_dict = {}

df["HomeEloBefore"] = 0
df["HomeEloAfter"] = 0

df = df.sort_values("Date").reset_index(drop=True)

# calculate each team's elo and how it changes after each match
for index, row in df.iterrows():
    home = row["HomeTeam"]
    away = row["AwayTeam"]
    
    home_elo = elo_dict.get(home, BASE_ELO)
    away_elo = elo_dict.get(away, BASE_ELO)
    
    # store elo ratings before match
    df.loc[index, "HomeEloBefore"] = home_elo
    df.loc[index, "AwayEloBefore"] = away_elo
    
    # adjusts home elo value for home advantage
    home_elo_adjusted = home_elo + HOME_ADVANTAGE
    
    # calculate expected result for home team
    expected_home_result = 1 / (1 + 10 ** ((away_elo - home_elo_adjusted) / 400))
    
    # actual result of game
    if row["FullTimeResult"] == "H":
        actual_home_result = 1
    elif row["FullTimeResult"] == "A":
        actual_home_result = 0
    else:
        actual_home_result = 0.5
        
    # calculate goal margin adjustment: higher goal difference, bigger elo change
    margin = abs(row["FullTimeHomeTeamGoals"] - row["FullTimeAwayTeamGoals"])
    margin = max(1, margin)
    
    # square root margin weight
    margin_multiplier = margin ** 0.5
    
    # calculating elo change with margin adjustment
    elo_change = K * (actual_home_result - expected_home_result) * margin_multiplier
    
    updated_home_elo = home_elo + elo_change
    updated_away_elo = away_elo - elo_change
    
    elo_dict[home] = updated_home_elo
    elo_dict[away] = updated_away_elo
    

df[["Date", "HomeTeam", "HomeEloBefore", "AwayTeam", "AwayEloBefore", "FullTimeResult"]].tail(20)
    
    
    



  df.loc[index, "HomeEloBefore"] = home_elo


Unnamed: 0,Date,HomeTeam,HomeEloBefore,AwayTeam,AwayEloBefore,FullTimeResult
9480,2025-05-16,Aston Villa,1736.752047,Tottenham,1548.412891,H
9481,2025-05-16,Chelsea,1711.858308,Man United,1528.769224,H
9482,2025-05-18,West Ham,1548.081415,Nott'm Forest,1651.691986,A
9483,2025-05-18,Everton,1622.20086,Southampton,1298.678692,H
9484,2025-05-18,Brentford,1677.253626,Fulham,1604.173799,A
9485,2025-05-18,Arsenal,1822.927858,Newcastle,1742.413798,H
9486,2025-05-18,Leicester,1380.923668,Ipswich,1401.76221,H
9487,2025-05-19,Brighton,1635.344337,Liverpool,1874.774962,H
9488,2025-05-20,Man City,1812.743299,Bournemouth,1656.253499,H
9489,2025-05-20,Crystal Palace,1670.230107,Wolves,1572.12328,H


In [86]:
# building Tilt feature: measures a team's offensiveness, whether a team tends to play high-scoring or low-scoring games, independant of match outcome

tilt_dict = {}
TILT_DECAY = 0.98
TILT_WEIGHT = 0.02
EXPECTED_TOTAL_GOALS = df["FullTimeHomeTeamGoals"].add(df["FullTimeAwayTeamGoals"]).mean()


df["HomeTiltBefore"] = 1
df["AwayTiltBefore"] = 1

df = df.sort_values("Date").reset_index(drop=True)

for index, row in df.iterrows():
    home = row["HomeTeam"]
    away = row["AwayTeam"]
    
    home_tilt = tilt_dict.get(home, 1)
    away_tilt = tilt_dict.get(away, 1)
    df.loc[index, "HomeTiltBefore"] = home_tilt
    df.loc[index, "AwayTiltBefore"] = away_tilt
    
    # total goals in this matchup
    total_goals = row["FullTimeHomeTeamGoals"] + row["FullTimeAwayTeamGoals"]
    
    # update new tilt values for home and away
    home_tilt_new = (
        TILT_DECAY * home_tilt +
        TILT_WEIGHT * total_goals / (away_tilt * EXPECTED_TOTAL_GOALS)
    )

    away_tilt_new = (
        TILT_DECAY * away_tilt +
        TILT_WEIGHT * total_goals / (home_tilt * EXPECTED_TOTAL_GOALS)
    )

    tilt_dict[home] = home_tilt_new
    tilt_dict[away] = away_tilt_new

df[["Date", "HomeTeam", "HomeEloBefore", "HomeTiltBefore",
    "AwayTeam", "AwayEloBefore", "AwayTiltBefore", "FullTimeResult"]].tail(20)

  df.loc[index, "HomeTiltBefore"] = home_tilt
  df.loc[index, "AwayTiltBefore"] = away_tilt


Unnamed: 0,Date,HomeTeam,HomeEloBefore,HomeTiltBefore,AwayTeam,AwayEloBefore,AwayTiltBefore,FullTimeResult
9480,2025-05-16,Aston Villa,1736.752047,1.057039,Tottenham,1548.412891,1.195287,H
9481,2025-05-16,Chelsea,1711.858308,1.056052,Man United,1528.769224,0.993733,H
9482,2025-05-18,Leicester,1380.923668,1.081445,Ipswich,1401.76221,1.055143,H
9483,2025-05-18,West Ham,1548.081415,1.038146,Nott'm Forest,1651.691986,1.0139,A
9484,2025-05-18,Everton,1622.20086,0.836306,Southampton,1298.678692,1.047147,H
9485,2025-05-18,Brentford,1677.253626,1.07353,Fulham,1604.173799,0.99659,A
9486,2025-05-18,Arsenal,1822.927858,1.045011,Newcastle,1742.413798,1.158434,H
9487,2025-05-19,Brighton,1635.344337,1.086625,Liverpool,1874.774962,1.166282,H
9488,2025-05-20,Crystal Palace,1670.230107,0.954035,Wolves,1572.12328,1.00911,H
9489,2025-05-20,Man City,1812.743299,1.133852,Bournemouth,1656.253499,1.002145,H


In [87]:
# head to head matchup feature: last 5 encounters

# matchup key
df["Matchup"] = df["HomeTeam"] + " vs " + df["AwayTeam"]
df = df.sort_values("Date").reset_index(drop=True)

# dataframe to store historical matchups
h2h = []

for index, row in df.iterrows():
    home = row["HomeTeam"]
    away = row["AwayTeam"]
    date = row["Date"]
    
    # filter for previous matches between these 2 teams before current match
    mask = (
        (df["Date"] < date) &
        (((df["HomeTeam"] == home) & (df["AwayTeam"] == away)) |
         ((df["HomeTeam"] == away) & (df["AwayTeam"] == home)))
    )
    
    previous_matches = df.loc[mask].sort_values("Date", ascending=False).head(5)
    
    total_matches = previous_matches.shape[0]
    total_home_goals = 0
    total_away_goals = 0
    total_home_wins = 0
    
    for _, prev_row in previous_matches.iterrows():
        if prev_row["HomeTeam"] == home:
            total_home_goals += prev_row["FullTimeHomeTeamGoals"]
            total_away_goals += prev_row["FullTimeAwayTeamGoals"]
            if prev_row["FullTimeResult"] == "H":
                total_home_wins += 1
            elif prev_row["FullTimeResult"] == "D":
                total_home_wins += 0.5
        # reverse fixture
        else:
            total_home_goals += prev_row["FullTimeAwayTeamGoals"]
            total_away_goals += prev_row["FullTimeHomeTeamGoals"]
            if prev_row["FullTimeResult"] == "A":
                total_home_wins += 1
            elif prev_row["FullTimeResult"] == "D":
                total_home_wins += 0.5
    
    h2h.append(
        {
            "Index": index,
            "H2H_TotalMatches": total_matches,
            "H2H_HomeGoalsScored": total_home_goals / total_matches if total_matches > 0 else 0,
            "H2H_AwayGoalsScored": total_away_goals / total_matches if total_matches > 0 else 0,
            "H2H_HomeWinRate": total_home_wins / total_matches if total_matches > 0 else 0
        }
    )

# merge h2h data into main dataframe
h2h_df = pd.DataFrame(h2h)
df = df.merge(h2h_df, left_index=True, right_on="Index", how="left")
df.drop(columns=["Index"], inplace=True)
print(df.columns)
print(df.tail(10))


Index(['MatchID', 'Season', 'MatchWeek', 'Date', 'HomeTeam', 'AwayTeam',
       'FullTimeHomeTeamGoals', 'FullTimeAwayTeamGoals', 'FullTimeResult',
       'HalfTimeHomeTeamGoals', 'HalfTimeAwayTeamGoals', 'HalfTimeResult',
       'HomeTeamShots', 'AwayTeamShots', 'HomeTeamShotsOnTarget',
       'AwayTeamShotsOnTarget', 'HomeTeamCorners', 'AwayTeamCorners',
       'HomeTeamFouls', 'AwayTeamFouls', 'HomeTeamYellowCards',
       'AwayTeamYellowCards', 'HomeTeamRedCards', 'AwayTeamRedCards',
       'HomeTeamPoints', 'AwayTeamPoints', 'HomeGoalScoringForm',
       'AwayGoalScoringForm', 'HomeGoalsConceded', 'AwayGoalsConceded',
       'HomeWinningForm', 'AwayWinningForm', 'HomeRecentGoalScoringForm',
       'HomeRecentGoalsConceded', 'HomeRecentWinningForm',
       'AwayRecentGoalScoringForm', 'AwayRecentGoalsConceded',
       'AwayRecentWinningForm', 'HomeDaysSinceLastMatch',
       'AwayDaysSinceLastMatch', 'HomeEloBefore', 'HomeEloAfter',
       'AwayEloBefore', 'HomeTiltBefore', 'AwayTi

In [88]:
# adding goal difference as rolling average for last 5 games 
df["HomeGoalDifferenceForm"] = df["HomeRecentGoalScoringForm"] - df["HomeRecentGoalsConceded"]
df["AwayGoalDifferenceForm"] = df["AwayRecentGoalScoringForm"] - df["AwayRecentGoalsConceded"]

# adding season to date cumulative form to smooth out short-term noise
home_df = df[["Date", "Season", "HomeTeam", "FullTimeHomeTeamGoals", "FullTimeAwayTeamGoals", "HomeTeamPoints"]].copy()
home_df.rename(columns={
    "HomeTeam": "Team",
    "FullTimeHomeTeamGoals": "GoalsScored",
    "FullTimeAwayTeamGoals": "GoalsConceded",
    "HomeTeamPoints": "Points"
}, inplace=True)


away_df = df[["Date", "Season", "AwayTeam", "FullTimeAwayTeamGoals", "FullTimeHomeTeamGoals", "AwayTeamPoints"]].copy()
away_df.rename(columns={
    "AwayTeam": "Team",
    "FullTimeAwayTeamGoals": "GoalsScored",
    "FullTimeHomeTeamGoals": "GoalsConceded",
    "AwayTeamPoints": "Points"
}, inplace=True)

# combine into team-level match history log
combined_df = pd.concat([home_df, away_df], ignore_index=True)
combined_df.sort_values(["Team", "Season", "Date"], inplace=True)

combined_df["CumulativeGoalsScored"] = (
    combined_df.groupby(["Season", "Team"])["GoalsScored"].cumsum().shift(1)
)

combined_df["CumulativeGoalsConceded"] = (
    combined_df.groupby(["Season", "Team"])["GoalsConceded"].cumsum().shift(1)
)

combined_df["CumulativeGoalDifference"] = combined_df["CumulativeGoalsScored"] - combined_df["CumulativeGoalsConceded"]

combined_df["CumulativePoints"] = (
    combined_df.groupby(["Season", "Team"])["Points"].cumsum().shift(1)
)

combined_df.fillna(0, inplace=True)


home_cumulative = combined_df.rename(columns={
    "Team": "HomeTeam",
    "CumulativeGoalsScored": "HomeSeasonToDateGoalsScored",
    "CumulativeGoalsConceded": "HomeSeasonToDateGoalsConceded",
    "CumulativeGoalDifference": "HomeSeasonToDateGoalDifference",
    "CumulativePoints": "HomeSeasonToDatePoints",
})[["Date", "Season", "HomeTeam", "HomeSeasonToDateGoalsScored", "HomeSeasonToDateGoalsConceded", "HomeSeasonToDateGoalDifference", "HomeSeasonToDatePoints"]]

df = df.merge(home_cumulative, on=["Date", "Season", "HomeTeam"], how="left")

away_cumulative = combined_df.rename(columns={
    "Team": "AwayTeam",
    "CumulativeGoalsScored": "AwaySeasonToDateGoalsScored",
    "CumulativeGoalsConceded": "AwaySeasonToDateGoalsConceded",
    "CumulativeGoalDifference": "AwaySeasonToDateGoalDifference",
    "CumulativePoints": "AwaySeasonToDatePoints",
})[["Date", "Season", "AwayTeam", "AwaySeasonToDateGoalsScored", "AwaySeasonToDateGoalsConceded", "AwaySeasonToDateGoalDifference", "AwaySeasonToDatePoints"]]

df = df.merge(away_cumulative, on=["Date", "Season", "AwayTeam"], how="left")

print(df.tail(10))
print(df.columns)


                                 MatchID     Season  MatchWeek        Date  \
9490     2024-2025_Bournemouth_Leicester  2024-2025         38  2025-05-25   
9491           2024-2025_Fulham_Man City  2024-2025         38  2025-05-25   
9492          2024-2025_Ipswich_West Ham  2024-2025         38  2025-05-25   
9493  2024-2025_Liverpool_Crystal Palace  2024-2025         38  2025-05-25   
9494    2024-2025_Man United_Aston Villa  2024-2025         38  2025-05-25   
9495       2024-2025_Southampton_Arsenal  2024-2025         38  2025-05-25   
9496         2024-2025_Newcastle_Everton  2024-2025         38  2025-05-25   
9497     2024-2025_Nott'm Forest_Chelsea  2024-2025         38  2025-05-25   
9498        2024-2025_Tottenham_Brighton  2024-2025         38  2025-05-25   
9499          2024-2025_Wolves_Brentford  2024-2025         38  2025-05-25   

           HomeTeam        AwayTeam  FullTimeHomeTeamGoals  \
9490    Bournemouth       Leicester                      2   
9491         Fulh

In [89]:
# features for match statistics: average corners taken, average shots taken, average shots on target over last 5 games

home_stats = df[["Date", "HomeTeam", "HomeTeamShots", "HomeTeamShotsOnTarget", "HomeTeamCorners"]].copy()
home_stats.rename(columns={
    "HomeTeam": "Team",
    "HomeTeamShots": "Shots",
    "HomeTeamShotsOnTarget": "ShotsOnTarget",
    "HomeTeamCorners": "Corners"
}, inplace=True)

away_stats = df[["Date", "AwayTeam", "AwayTeamShots", "AwayTeamShotsOnTarget", "AwayTeamCorners"]].copy()
away_stats.rename(columns={
    "AwayTeam": "Team",
    "AwayTeamShots": "Shots",
    "AwayTeamShotsOnTarget": "ShotsOnTarget",
    "AwayTeamCorners": "Corners"
}, inplace=True)

combined_stats = pd.concat([home_stats, away_stats], ignore_index=True)
combined_stats.sort_values(["Team", "Date"], inplace=True)

combined_stats["AvgShotsTaken"] = (
    combined_stats.groupby("Team")["Shots"].rolling(window=5, min_periods=1).mean().shift(1).reset_index(level=0, drop=True)
)


combined_stats["AvgShotsOnTarget"] = (
    combined_stats.groupby("Team")["ShotsOnTarget"].rolling(window=5, min_periods=1).mean().shift(1).reset_index(level=0, drop=True)
)

combined_stats["AvgCornersTaken"] = (
    combined_stats.groupby("Team")["Corners"].rolling(window=5, min_periods=1).mean().shift(1).reset_index(level=0, drop=True)
)

home_form = combined_stats.rename(columns={
    "Team": "HomeTeam",
    "AvgShotsTaken": "HomeAvgShotsTaken",
    "AvgShotsOnTarget": "HomeAvgShotsOnTarget",
    "AvgCornersTaken": "HomeAvgCornersTaken"
})[["Date", "HomeTeam", "HomeAvgShotsTaken", "HomeAvgShotsOnTarget", "HomeAvgCornersTaken"]]

df = df.merge(home_form, on=["Date", "HomeTeam"], how="left")

away_form = combined_stats.rename(columns={
    "Team": "AwayTeam",
    "AvgShotsTaken": "AwayAvgShotsTaken",
    "AvgShotsOnTarget": "AwayAvgShotsOnTarget",
    "AvgCornersTaken": "AwayAvgCornersTaken"
})[["Date", "AwayTeam", "AwayAvgShotsTaken", "AwayAvgShotsOnTarget", "AwayAvgCornersTaken"]]

df = df.merge(away_form, on=["Date", "AwayTeam"], how="left")

print(df.columns)
print(df.tail(10))




Index(['MatchID', 'Season', 'MatchWeek', 'Date', 'HomeTeam', 'AwayTeam',
       'FullTimeHomeTeamGoals', 'FullTimeAwayTeamGoals', 'FullTimeResult',
       'HalfTimeHomeTeamGoals', 'HalfTimeAwayTeamGoals', 'HalfTimeResult',
       'HomeTeamShots', 'AwayTeamShots', 'HomeTeamShotsOnTarget',
       'AwayTeamShotsOnTarget', 'HomeTeamCorners', 'AwayTeamCorners',
       'HomeTeamFouls', 'AwayTeamFouls', 'HomeTeamYellowCards',
       'AwayTeamYellowCards', 'HomeTeamRedCards', 'AwayTeamRedCards',
       'HomeTeamPoints', 'AwayTeamPoints', 'HomeGoalScoringForm',
       'AwayGoalScoringForm', 'HomeGoalsConceded', 'AwayGoalsConceded',
       'HomeWinningForm', 'AwayWinningForm', 'HomeRecentGoalScoringForm',
       'HomeRecentGoalsConceded', 'HomeRecentWinningForm',
       'AwayRecentGoalScoringForm', 'AwayRecentGoalsConceded',
       'AwayRecentWinningForm', 'HomeDaysSinceLastMatch',
       'AwayDaysSinceLastMatch', 'HomeEloBefore', 'HomeEloAfter',
       'AwayEloBefore', 'HomeTiltBefore', 'AwayTi

In [91]:
# save feature-engineered data 
import openpyxl

df.to_excel("../data/final_premier_league.xlsx", index=False)
df.to_csv("../data/final_premier_league.csv", index=False)