<a href="https://colab.research.google.com/github/husainal1/epl-predictor-app/blob/main/Predicting_Football_Matches.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# basic setup - install libraries
!pip install pandas==2.2.2 numpy==1.26.4 requests==2.32.3 tqdm==4.66.5 scikit-learn==1.5.2 xgboost==2.1.1

In [7]:
#imports - pandas/numpy for data, requests for api, xgboost + sklearn for model
import pandas as pd, numpy as np, requests, time, json, math, datetime as dt
from tqdm import tqdm
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from xgboost import XGBRegressor

pd.set_option("display.max_columns", 200)


In [8]:
# function to grab json data from the FPL api with a retry
BASE = "https://fantasy.premierleague.com/api"

def get_json(url, retries=5, sleep=0.5):
    for i in range(retries):
        r = requests.get(url, timeout=30)
        if r.status_code == 200:
            return r.json()
        time.sleep(sleep*(i+1))
    r.raise_for_status()

In [9]:
# grab static data: players, teams, fixtures
bootstrap = get_json(f"{BASE}/bootstrap-static/")
players_meta = pd.DataFrame(bootstrap['elements'])
teams_meta   = pd.DataFrame(bootstrap['teams'])
fixtures     = pd.DataFrame(get_json(f"{BASE}/fixtures/"))

# keep only useful team cols
teams = teams_meta[['id','name','short_name','strength',
                    'strength_attack_home','strength_attack_away',
                    'strength_defence_home','strength_defence_away']].rename(columns={'id':'team_id'})

# minimal player info
players = players_meta[['id','first_name','second_name','web_name','team','element_type']] \
            .rename(columns={'id':'player_id','team':'team_id'}) \
            .merge(teams, on='team_id', how='left')

players.head()


Unnamed: 0,player_id,first_name,second_name,web_name,team_id,element_type,name,short_name,strength,strength_attack_home,strength_attack_away,strength_defence_home,strength_defence_away
0,1,David,Raya Martín,Raya,1,1,Arsenal,ARS,4,1350,1350,1290,1300
1,2,Kepa,Arrizabalaga Revuelta,Arrizabalaga,1,1,Arsenal,ARS,4,1350,1350,1290,1300
2,3,Karl,Hein,Hein,1,1,Arsenal,ARS,4,1350,1350,1290,1300
3,4,Tommy,Setford,Setford,1,1,Arsenal,ARS,4,1350,1350,1290,1300
4,5,Gabriel,dos Santos Magalhães,Gabriel,1,2,Arsenal,ARS,4,1350,1350,1290,1300


In [11]:
# function to get match-by-match history for a player
def fetch_player_history(pid):
    j = get_json(f"{BASE}/element-summary/{pid}/")
    df = pd.DataFrame(j.get('history', []))
    if df.empty:
        return df
    needed = ['element','opponent_team','round','minutes','total_points','goals_scored','assists',
              'ict_index','creativity','influence','threat',
              'expected_goals','expected_assists','expected_goal_involvements',
              'expected_goals_conceded','was_home','kickoff_time']
    for c in needed:
        if c not in df.columns: df[c] = np.nan
    df['player_id'] = pid
    return df


In [12]:
# loop over all players and get their match history
all_hist = []
for pid in tqdm(players['player_id'], desc="fetching players"):
    try:
        h = fetch_player_history(pid)
        if not h.empty: all_hist.append(h)
    except:
        pass  # if one player fails, skip

hist = pd.concat(all_hist, ignore_index=True)
hist['kickoff_time'] = pd.to_datetime(hist['kickoff_time'], errors='coerce')
hist['round'] = pd.to_numeric(hist['round'], errors='coerce')
hist['was_home'] = hist['was_home'].astype('Int64')
hist = hist[hist['kickoff_time'].notna()].sort_values(['player_id','kickoff_time']).reset_index(drop=True)

hist.head()


fetching players: 100%|██████████| 740/740 [00:50<00:00, 14.74it/s]


Unnamed: 0,element,fixture,opponent_team,total_points,was_home,kickoff_time,team_h_score,team_a_score,round,modified,minutes,goals_scored,assists,clean_sheets,goals_conceded,own_goals,penalties_saved,penalties_missed,yellow_cards,red_cards,saves,bonus,bps,influence,creativity,threat,ict_index,clearances_blocks_interceptions,recoveries,tackles,defensive_contribution,starts,expected_goals,expected_assists,expected_goal_involvements,expected_goals_conceded,value,transfers_balance,selected,transfers_in,transfers_out,player_id
0,1,9,14,10,0,2025-08-17 15:30:00+00:00,0,1,1,False,90,0,0,1,0,0,0,0,1,0,7,3,38,49.2,0.0,0.0,4.9,1,13,0,0,1,0.0,0.0,0.0,1.52,55,0,1531911,0,0,1
1,1,11,11,6,1,2025-08-23 16:30:00+00:00,5,0,2,False,90,0,0,1,0,0,0,0,0,0,1,0,28,13.4,0.0,0.0,1.3,0,3,0,0,1,0.0,0.0,0.0,0.17,55,218659,2284634,277339,58680,1
2,1,25,12,2,0,2025-08-31 15:30:00+00:00,1,0,3,False,90,0,0,0,1,0,0,0,0,0,2,0,12,20.0,10.0,0.0,3.0,0,12,0,0,1,0.0,0.02,0.02,0.52,55,-12311,2406964,146739,159050,1
3,1,31,16,6,1,2025-09-13 11:30:00+00:00,3,0,4,False,90,0,0,1,0,0,0,0,0,0,1,0,24,12.8,0.0,0.0,1.3,0,9,0,0,1,0.0,0.0,0.0,0.2,55,171289,2765759,289041,117752,1
4,2,9,14,0,0,2025-08-17 15:30:00+00:00,0,1,1,False,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0,0,0,0,0,0.0,0.0,0.0,0.0,45,0,90618,0,0,2


In [13]:
# add opponent info (strength etc.)
opp = teams.rename(columns={'team_id':'opp_team_id','name':'opp_name','short_name':'opp_short_name',
                            'strength':'opp_strength',
                            'strength_defence_home':'opp_strength_defence_home',
                            'strength_defence_away':'opp_strength_defence_away'})

hist = hist.merge(players[['player_id','team_id','web_name','element_type',
                           'strength','strength_attack_home','strength_attack_away',
                           'strength_defence_home','strength_defence_away']],
                  on='player_id', how='left')

hist = hist.merge(opp[['opp_team_id','opp_strength','opp_strength_defence_home','opp_strength_defence_away']],
                  left_on='opponent_team', right_on='opp_team_id', how='left')

hist['team_strength_diff'] = hist['strength'] - hist['opp_strength']
hist[['web_name','round','total_points','minutes','team_strength_diff']].head(10)


Unnamed: 0,web_name,round,total_points,minutes,team_strength_diff
0,Raya,1,10,90,1
1,Raya,2,6,90,2
2,Raya,3,2,90,-1
3,Raya,4,6,90,1
4,Arrizabalaga,1,0,0,1
5,Arrizabalaga,2,0,0,2
6,Arrizabalaga,3,0,0,-1
7,Arrizabalaga,4,0,0,1
8,Hein,1,0,0,1
9,Hein,2,0,0,2


In [14]:
# add lag + rolling features so model can see "recent form"
def add_player_features(df, lags=(1,2,3), windows=(3,5,8)):
    df = df.copy()
    grp = df.groupby('player_id', group_keys=False)
    base_cols = ['total_points','minutes','goals_scored','assists',
                 'ict_index','creativity','influence','threat',
                 'expected_goals','expected_assists','expected_goal_involvements']

    # lag features
    for col in base_cols:
        for L in lags:
            df[f'{col}_lag{L}'] = grp[col].shift(L)

    # rolling means/sums
    for W in windows:
        for col in base_cols:
            df[f'{col}_roll{W}_mean'] = grp[col].shift(1).rolling(W).mean()
            df[f'{col}_roll{W}_sum']  = grp[col].shift(1).rolling(W).sum()

    # availability
    df['played_last_match'] = grp['minutes'].shift(1).fillna(0).gt(0).astype(int)
    df['played_last3_pct']  = grp['minutes'].shift(1).rolling(3).apply(lambda x: np.mean(x>0), raw=True)

    # attack vs defence diff
    df['attack_v_def_diff'] = np.where(
        df['was_home']==1,
        df['strength_attack_home'] - df['opp_strength_defence_away'],
        df['strength_attack_away'] - df['opp_strength_defence_home']
    )

    # time features
    df['month'] = df['kickoff_time'].dt.month
    df['dow'] = df['kickoff_time'].dt.dayofweek
    return df

fe = add_player_features(hist)


  df[f'{col}_roll{W}_mean'] = grp[col].shift(1).rolling(W).mean()
  df[f'{col}_roll{W}_sum']  = grp[col].shift(1).rolling(W).sum()
  df['played_last_match'] = grp['minutes'].shift(1).fillna(0).gt(0).astype(int)
  df['played_last3_pct']  = grp['minutes'].shift(1).rolling(3).apply(lambda x: np.mean(x>0), raw=True)
  df['attack_v_def_diff'] = np.where(
  df['month'] = df['kickoff_time'].dt.month
  df['dow'] = df['kickoff_time'].dt.dayofweek


In [15]:
# we want to predict NEXT match total_points
fe['y_next_points'] = fe.groupby('player_id')['total_points'].shift(-1)

# drop rows that don’t have enough history/future
model_df = fe.dropna(subset=['y_next_points','total_points_lag1','minutes_lag1']).copy()
model_df.shape


  fe['y_next_points'] = fe.groupby('player_id')['total_points'].shift(-1)


(1395, 160)

In [16]:
exclude = {'y_next_points','total_points','kickoff_time','web_name','opp_name','opp_short_name',
           'opp_team_id','team_id','opponent_team','name','short_name'}
feature_cols = [c for c in model_df.columns if c not in exclude and c != 'was_home'
                and pd.api.types.is_numeric_dtype(model_df[c])]

X = model_df[feature_cols].fillna(0)
y = model_df['y_next_points'].astype(float)
groups = model_df['player_id']

# baseline = 3 game avg
baseline = model_df['total_points_roll3_mean'].fillna(model_df['total_points_lag1'])


In [17]:
# groupkfold so same player doesn't leak train/val
gkf = GroupKFold(n_splits=5)
oof_pred = np.zeros(len(model_df))

for tr, va in gkf.split(X, y, groups):
    model = XGBRegressor(
        n_estimators=600, learning_rate=0.05, max_depth=6,
        subsample=0.8, colsample_bytree=0.8,
        random_state=42, n_jobs=-1, tree_method="hist"
    )
    model.fit(X.iloc[tr], y.iloc[tr], eval_set=[(X.iloc[va], y.iloc[va])], verbose=False)
    oof_pred[va] = model.predict(X.iloc[va])

print("Model MAE:", mean_absolute_error(y, oof_pred))
print("Baseline MAE:", mean_absolute_error(y, baseline))


Model MAE: 1.0945412781540602
Baseline MAE: 1.3512544802867383


In [18]:
final_model = XGBRegressor(
    n_estimators=800, learning_rate=0.04, max_depth=6,
    subsample=0.9, colsample_bytree=0.9,
    random_state=42, n_jobs=-1, tree_method="hist"
)
final_model.fit(X, y, verbose=False)


In [24]:
# get latest row per player
latest = fe.sort_values(['player_id','kickoff_time']).groupby('player_id').tail(1).copy()

# Merge latest with players to get team_id and other player info
latest = latest.merge(players[['player_id','team_id','web_name','element_type',
                               'strength','strength_attack_home','strength_attack_away',
                               'strength_defence_home','strength_defence_away']],
                      on='player_id', how='left')


# next gameweek fixtures
upcoming = fixtures.copy()
next_gw = upcoming.loc[(~upcoming['finished']) & (upcoming['event'].notna()), 'event'].min()
upcoming_next = upcoming[(upcoming['event']==next_gw) & (~upcoming['finished'])].copy()

# team vs opponent
home = upcoming_next[['team_h','team_a']].rename(columns={'team_h':'team_id','team_a':'opp_team_id'}).assign(was_home=1)
away = upcoming_next[['team_a','team_h']].rename(columns={'team_a':'team_id','team_h':'opp_team_id'}).assign(was_home=0)
team_next = pd.concat([home,away])


# attach team_next to players' latest data based on the player's team_id
# Use left_on to specify the column from 'latest' and right_on for the column from 'team_next'
latest2 = latest.merge(team_next, left_on='team_id_x', right_on='team_id', how='left')

# recompute attack vs def for the new fixture
opp = teams.rename(columns={'team_id':'opp_team_id','strength':'opp_strength',
                            'strength_defence_home':'opp_strength_defence_home',
                            'strength_defence_away':'opp_strength_defence_away'})
# Use the correct column name 'opp_team_id_y' for the merge
latest2 = latest2.merge(opp[['opp_team_id','opp_strength','opp_strength_defence_home','opp_strength_defence_away']],
                        left_on='opp_team_id_y', right_on='opp_team_id', how='left')

# Recompute attack vs defence diff using the correctly named strength columns
latest2['attack_v_def_diff'] = np.where(
    latest2['was_home_y']==1, # Use was_home_y from the first merge
    latest2['strength_attack_home_x'] - latest2['opp_strength_defence_away_y'], # Use suffixed columns
    latest2['strength_attack_away_x'] - latest2['opp_strength_defence_home_y'] # Use suffixed columns
)

# predict
# Ensure X_pred has the same columns as X used for training
X_pred = latest2.reindex(columns=feature_cols).fillna(0)
latest2['pred_next_points'] = final_model.predict(X_pred)

# show top 20
# Use the correct column names for the final display
latest2[['web_name_x','team_id_x','opp_team_id_y','was_home_y','pred_next_points']].sort_values('pred_next_points', ascending=False).head(20).rename(columns={'web_name_x':'web_name','team_id_x':'team_id','opp_team_id_y':'opp_team_id','was_home_y':'was_home'})

Unnamed: 0,web_name,team_id,opp_team_id,was_home,pred_next_points
71,Senesi,4,15,1,8.501292
316,Andersen,10,5,1,7.671119
435,De Ligt,14,7,1,7.434175
347,Rodon,11,20,0,6.736757
4,Gabriel,1,13,1,6.709008
487,Bruno G.,15,4,0,6.621367
469,Dúbravka,3,16,1,6.387139
66,Petrović,4,15,1,6.341785
298,Ndiaye,9,12,0,6.308882
83,Tavernier,4,15,1,6.204838
