## Preparations

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sqlite3

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.metrics import r2_score, mean_squared_error

In [3]:
con = sqlite3.connect("../laliga.sqlite")
df = pd.read_sql_query("SELECT * from Matches", con)

In [4]:
def get_result(score: str):
    if score is None:
        return None
    goals = list(map(int, score.split(':')))
    if goals[0]>goals[1]:
        return "1"
    elif goals[1]>goals[0]:
        return "2"
    else:
        return "X"

In [5]:
df['result'] = df['score'].apply(get_result)
df['result'] = df['result'].map(str)

In [6]:
# Function to compute the accuracy of a model
def model_accuracy(y_test, y_pred):
    return sum(y_test==y_pred)/len(y_test)

We have to represent the result value by numbers, so it can be predicted by sklearn. For that, we use 1 for a home win, 0 for a tie and -1 for an away win. As the output value will most likely be a float, we can use the prediction value as the confidence of the prediction.

In [7]:
#df['result'] = df['result'].replace({'1': 1, 'X': 0, '2': -1})

## Last 5 games and current rank

First, we want to include the last 5 games and the rank of both teams before the game in question. For that, we use the standings table computed in the analysis exercises. 

For the last 5 games, we will produce a number representing a form of a team: A win gets the value 1, a loss the value 0 and a loss the value -1. These are then added up for the last 5 games. 

In [8]:
def get_form(last_5):
    if last_5 != "[]":
        last_5 = list(last_5)
        form = 0
        for result in last_5:
            if result == "W":
                form += 1
            elif result == "T":
                pass
            elif result == "L":
                form -= 1
        return form
    else:
        return None

In [9]:
df_standings = pd.read_excel('../reports/MatchdayStandings.xlsx', engine='openpyxl')
df_standings['form'] = df_standings['last_5'].map(get_form)
df_standings['matchday'] += 1
df_standings.head()

Unnamed: 0,season,division,matchday,rank,team,GF,GA,GD,W,L,T,Pts,last_5,form
0,1928-1929,1,2,1,Real Madrid,5,0,5,1,0,0,3,['W'],1.0
1,1928-1929,1,2,2,Barcelona,2,0,2,1,0,0,3,['W'],1.0
2,1928-1929,1,2,3,Espanyol,3,2,1,1,0,0,3,['W'],1.0
3,1928-1929,1,2,4,Athletic Madrid,3,2,1,1,0,0,3,['W'],1.0
4,1928-1929,1,2,5,Donostia,1,1,0,0,0,1,1,['T'],0.0


In [10]:
df_standings.rename(columns={'team': 'home_team'}, inplace=True)
df = df.merge(df_standings[['season', 'division', 'matchday', 'rank', 'home_team', 'form']], left_on=['season', 'division', 'matchday', 'home_team'], right_on=['season', 'division', 'matchday', 'home_team'], how='left')
df.rename(columns={'rank': 'rank_home', 'form': 'form_home'}, inplace=True)
df_standings.rename(columns={'home_team': 'away_team'}, inplace=True)
df = df.merge(df_standings[['season', 'division', 'matchday', 'rank', 'away_team', 'form']], left_on=['season', 'division', 'matchday', 'away_team'], right_on=['season', 'division', 'matchday', 'away_team'], how='left')
df.rename(columns={'rank': 'rank_away', 'form': 'form_away'}, inplace=True)


In [11]:
df.columns

Index(['season', 'division', 'matchday', 'date', 'time', 'home_team',
       'away_team', 'score', 'result', 'rank_home', 'form_home', 'rank_away',
       'form_away'],
      dtype='object')

First, we train a Gradient Boosted regression model on the ranks and forms of both teams.

In [12]:
gbm_hyperparams = {
    'n_estimators': 100,
    'max_depth': 10,
    'learning_rate': 0.05,
    'loss': 'deviance'
}
features = ['rank_home', 'form_home', 'rank_away', 'form_away']
target = 'result'
df_nona = df.dropna()
X = df_nona[features]
y = df_nona[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print(X_train.shape)
gbm_model = GradientBoostingClassifier(**gbm_hyperparams)
gbm_model.fit(X_train, y_train)
gbm_y_pred = gbm_model.predict(X_test)

(12964, 4)


In [13]:
results_df = X_test.copy()
results_df["y_real"] = y_test
results_df["y_pred"] = gbm_y_pred
results_df.head()

Unnamed: 0,rank_home,form_home,rank_away,form_away,y_real,y_pred
19932,11.0,1.0,13.0,3.0,1,2
20541,9.0,-1.0,13.0,-2.0,1,1
19756,5.0,0.0,6.0,3.0,1,1
46773,4.0,2.0,5.0,4.0,1,1
44236,13.0,1.0,7.0,3.0,1,1


In [14]:
print(f"Model accuracy: {model_accuracy(results_df['y_real'], results_df['y_pred'])}")

Model accuracy: 0.42702869484726935


In [15]:
importances = pd.Series(gbm_model.feature_importances_, index=features)
importances.sort_values(ascending=False)

rank_home    0.285115
rank_away    0.278369
form_home    0.226369
form_away    0.210147
dtype: float64

## Only difference of rank and form

In [16]:
df['rank_diff'] = df['rank_home'] - df['rank_away']
df['form_diff'] = df['form_home'] - df['form_away']

In [17]:
gbm_hyperparams = {
    'n_estimators': 100,
    'max_depth': 10,
    'learning_rate': 0.005,
    'loss': 'deviance'
}
features = ['rank_diff', 'form_diff']
target = 'result'
df_nona = df.dropna()
X = df_nona[features]
y = df_nona[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print(X_train.shape)
gbm_model_diff = GradientBoostingClassifier(**gbm_hyperparams)
gbm_model_diff.fit(X_train, y_train)
gbm_y_pred_diff = gbm_model_diff.predict(X_test)

results_df_diff = X_test.copy()
results_df_diff["y_real"] = y_test
results_df_diff["y_pred"] = gbm_y_pred_diff
results_df_diff["y_conf"] = gbm_y_pred_diff
print(f"Model accuracy: {model_accuracy(results_df_diff['y_real'], results_df_diff['y_pred'])}")
# print(f"RMSE: {mean_squared_error(y_test, gbm_y_pred_diff)**0.5}")
# print(f"R^2: {r2_score(y_test, gbm_y_pred_diff)}")

(12964, 2)
Model accuracy: 0.4634372107374267


This increases the model accuracy by a little bit.

## With GF/GA
The next check will be to include the goals for and goals against the teams in the respective point of the season

In [18]:
df_standings.rename(columns={'team': 'home_team', 'away_team': 'home_team'}, inplace=True)
df = df.merge(df_standings[['season', 'division', 'matchday', 'home_team', 'GF', 'GA']], left_on=['season', 'division', 'matchday', 'home_team'], right_on=['season', 'division', 'matchday', 'home_team'], how='left')
df.rename(columns={'GF': 'GF_home', 'GA': 'GA_home'}, inplace=True)
df_standings.rename(columns={'home_team': 'away_team'}, inplace=True)
df = df.merge(df_standings[['season', 'division', 'matchday', 'away_team', 'GF', 'GA']], left_on=['season', 'division', 'matchday', 'away_team'], right_on=['season', 'division', 'matchday', 'away_team'], how='left')
df.rename(columns={'GF': 'GF_away', 'GA': 'GA_away'}, inplace=True)

In [19]:
gbm_hyperparams = {
    'n_estimators': 100,
    'max_depth': 10,
    'learning_rate': 0.005,
    'loss': 'deviance'
}
features = ['rank_home', 'form_home', 'rank_away', 'form_away', 'GF_home', 'GA_home', 'GF_away', 'GA_away']
target = 'result'
df_nona = df.dropna()
X = df_nona[features]
y = df_nona[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print(X_train.shape)
gbm_model_goals = GradientBoostingClassifier(**gbm_hyperparams)
gbm_model_goals.fit(X_train, y_train)
gbm_y_pred_goals = gbm_model_goals.predict(X_test)

results_df_goals = X_test.copy()
results_df_goals["y_real"] = y_test
results_df_goals["y_pred"] = gbm_y_pred_goals
print(f"Model accuracy: {model_accuracy(results_df_goals['y_real'], results_df_goals['y_pred'])}")

(12964, 8)
Model accuracy: 0.4794816414686825


## Goals scored and conceded in last 5 matches

In [20]:
df_standings = pd.read_excel('../reports/MatchdayStandings.xlsx', engine='openpyxl')
df_standings['form'] = df_standings['last_5'].map(get_form)

In [21]:
def get_goals(score: str, home_away: int):
    if score is None:
        return None
    goals = list(map(int, score.split(':')))
    return goals[home_away]

In [22]:
df['home_goals'] = df['score'].apply(get_goals, args=(0,))
df['away_goals'] = df['score'].apply(get_goals, args=(1,))

In [23]:
df_past = df[df['season']!='2021-2022'].copy()
dfs = []
for season in df_past['season'].drop_duplicates():
    for division in df_past.loc[(df_past['season']==season), 'division'].drop_duplicates():
        df_games = df_past.loc[(df_past['season']==season) & (df_past['division']==division)]
        team = df_games['home_team'].drop_duplicates().rename('team')
        init_data = [([], []) for _ in team]
        df_goals = pd.DataFrame(init_data, columns=['last_5_goals_scored', 'last_5_goals_conceded'], index=team)
        last_5_goals_scored = df_goals['last_5_goals_scored'].copy()
        last_5_goals_conceded = df_goals['last_5_goals_conceded'].copy()
        for matchday in df_games['matchday'].drop_duplicates():
            df_standings_matchday = df_standings.loc[(df_standings['season']==season) & (df_standings['division']==division) & (df_standings['matchday']==matchday)].copy()
            df_matchday = df_games.loc[df_games['matchday']==matchday]
            last_5_goals_scored = df_goals['last_5_goals_scored'].apply(lambda x: x[:4])
            last_5_goals_conceded = df_goals['last_5_goals_conceded'].apply(lambda x: x[:4])
            for i in df_matchday.index:
                game = df_matchday.loc[i, :]
                last_5_goals_scored.loc[game['home_team']] = [game['home_goals']] + last_5_goals_scored.loc[game['home_team']]
                last_5_goals_scored.loc[game['away_team']] = [game['away_goals']] + last_5_goals_scored.loc[game['away_team']]
                last_5_goals_conceded.loc[game['home_team']] = [game['away_goals']] + last_5_goals_conceded.loc[game['home_team']]
                last_5_goals_conceded.loc[game['away_team']] = [game['home_goals']] + last_5_goals_conceded.loc[game['away_team']]
            df_goals['last_5_goals_scored'] = last_5_goals_scored
            df_goals['last_5_goals_conceded'] = last_5_goals_conceded
            df_goals.reset_index(drop=False, inplace=True)
            df_standings_matchday = df_standings_matchday.merge(df_goals, left_on='team', right_on='team', how='left')
            dfs.append(df_standings_matchday)
            df_goals = df_goals.set_index('team', drop=True)
df_standings = pd.concat(dfs, ignore_index=True)


In [24]:
df_standings['matchday'] += 1

In [25]:
df_standings.rename(columns={'team': 'home_team'}, inplace=True)
df = df.merge(df_standings[['season', 'division', 'matchday', 'home_team', 'last_5_goals_scored', 'last_5_goals_conceded']], left_on=['season', 'division', 'matchday', 'home_team'], right_on=['season', 'division', 'matchday', 'home_team'], how='left')
df.rename(columns={'last_5_goals_scored': 'l5_goals_scored_home', 'last_5_goals_conceded': 'l5_goals_conceded_home'}, inplace=True)
df_standings.rename(columns={'home_team': 'away_team'}, inplace=True)
df = df.merge(df_standings[['season', 'division', 'matchday', 'away_team', 'last_5_goals_scored', 'last_5_goals_conceded']], left_on=['season', 'division', 'matchday', 'away_team'], right_on=['season', 'division', 'matchday', 'away_team'], how='left')
df.rename(columns={'last_5_goals_scored': 'l5_goals_scored_away', 'last_5_goals_conceded': 'l5_goals_conceded_away'}, inplace=True)

In [26]:
def sum_last_goals(goals_list):
    if isinstance(goals_list, list):
        return sum(goals_list)
    else:
        return np.nan

In [27]:
df[['l5_goals_scored_home', 'l5_goals_conceded_home', 'l5_goals_scored_away', 'l5_goals_conceded_away']] = df[['l5_goals_scored_home', 'l5_goals_conceded_home', 'l5_goals_scored_away', 'l5_goals_conceded_away']].applymap(sum_last_goals)


In [28]:
gbm_hyperparams = {
    'n_estimators': 100,
    'max_depth': 10,
    'learning_rate': 0.01,
    'loss': 'deviance'
}
features = ['rank_home', 'form_home', 'rank_away', 'form_away', 'l5_goals_scored_home', 'l5_goals_conceded_home', 'l5_goals_scored_away', 'l5_goals_conceded_away']
#features = ['rank_diff', 'form_diff', 'l5_goals_scored_home', 'l5_goals_conceded_home', 'l5_goals_scored_away', 'l5_goals_conceded_away']
target = 'result'
df_nona = df.dropna()
X = df_nona[features]
y = df_nona[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print(X_train.shape)
gbm_model_l5goals = GradientBoostingClassifier(**gbm_hyperparams)
gbm_model_l5goals.fit(X_train, y_train)
gbm_y_pred_l5goals = gbm_model_l5goals.predict(X_test)

results_df_l5goals = X_test.copy()
results_df_l5goals["y_real"] = y_test
results_df_l5goals["y_pred"] = gbm_y_pred_l5goals
print(f"Model accuracy: {model_accuracy(results_df_l5goals['y_real'], results_df_l5goals['y_pred'])}")

(12964, 8)
Model accuracy: 0.4634372107374267


In [29]:
importances = pd.Series(gbm_model_l5goals.feature_importances_, index=features)
importances.sort_values(ascending=False)

rank_home                 0.167082
rank_away                 0.166079
l5_goals_scored_away      0.131823
l5_goals_conceded_home    0.125621
l5_goals_scored_home      0.121078
l5_goals_conceded_away    0.119641
form_home                 0.086445
form_away                 0.082231
dtype: float64

## Only difference with GF/GA

In [30]:
df['GF_diff'] = df['GF_home'] - df['GF_away']
df['GA_diff'] = df['GA_home'] - df['GA_away']

In [31]:
gbm_hyperparams = {
    'n_estimators': 100,
    'max_depth': 10,
    'learning_rate': 0.01,
    'loss': 'deviance'
}
features = ['rank_diff', 'form_diff', 'GF_diff', 'GA_diff']
target = 'result'
df_nona = df.dropna()
X = df_nona[features]
y = df_nona[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print(X_train.shape)
gbm_model_goals_diff = GradientBoostingClassifier(**gbm_hyperparams)
gbm_model_goals_diff.fit(X_train, y_train)
gbm_y_pred_goals_diff = gbm_model_goals_diff.predict(X_test)

results_df_goals_diff = X_test.copy()
results_df_goals_diff["y_real"] = y_test
results_df_goals_diff["y_pred"] = gbm_y_pred_goals_diff
print(f"Model accuracy: {model_accuracy(results_df_goals_diff['y_real'], results_df_goals_diff['y_pred'])}")

(12964, 4)
Model accuracy: 0.46436285097192226


## Only difference with goals in last 5 matches

In [32]:
df['l5_goals_scored_diff'] = df['l5_goals_scored_home'] - df['l5_goals_scored_away']
df['l5_goals_conceded_diff'] = df['l5_goals_conceded_home'] - df['l5_goals_conceded_away']

In [33]:
gbm_hyperparams = {
    'n_estimators': 100,
    'max_depth': 10,
    'learning_rate': 0.01,
    'loss': 'deviance'
}
features = ['rank_diff', 'form_diff', 'l5_goals_scored_diff', 'l5_goals_conceded_diff', 'GF_diff', 'GA_diff']
target = 'result'
df_nona = df.dropna()
X = df_nona[features]
y = df_nona[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print(X_train.shape)
gbm_model_l5goals_diff = GradientBoostingClassifier(**gbm_hyperparams)
gbm_model_l5goals_diff.fit(X_train, y_train)
gbm_y_pred_l5goals_diff = gbm_model_l5goals_diff.predict(X_test)

results_df_l5goals_diff = X_test.copy()
results_df_l5goals_diff["y_real"] = y_test
results_df_l5goals_diff["y_pred"] = gbm_y_pred_l5goals_diff
print(f"Model accuracy: {model_accuracy(results_df_l5goals_diff['y_real'], results_df_l5goals_diff['y_pred'])}")

(12964, 6)
Model accuracy: 0.4581919160752854


In [34]:
importances = pd.Series(gbm_model_l5goals_diff.feature_importances_, index=features)
importances.sort_values(ascending=False)

GF_diff                   0.234492
rank_diff                 0.212704
GA_diff                   0.181593
l5_goals_scored_diff      0.138275
l5_goals_conceded_diff    0.128383
form_diff                 0.104554
dtype: float64

We get very similar accuracies for the models with the different features.

## Include Rank of last season
We now also want to use the rank of last season, mainly to have something to use for the first matchday. All other features we used until now don't have data on the first matchday

## Random Forest

In [35]:
from sklearn.ensemble import RandomForestClassifier

In [36]:
rforest_hyperparams = {
    'n_estimators': 500,
}
#features = ['rank_diff', 'form_diff', 'l5_goals_scored_diff', 'l5_goals_conceded_diff', 'GF_diff', 'GA_diff']
features = ['rank_diff', 'form_diff']
target = 'result'
df_nona = df.dropna()
X = df_nona[features]
y = df_nona[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print(X_train.shape)
rforest_model_l5goals_diff = RandomForestClassifier(**rforest_hyperparams)
rforest_model_l5goals_diff.fit(X_train, y_train)
rforest_y_pred_l5goals_diff = rforest_model_l5goals_diff.predict(X_test)

results_df_l5goals_diff = X_test.copy()
results_df_l5goals_diff["y_real"] = y_test
results_df_l5goals_diff["y_pred"] = rforest_y_pred_l5goals_diff
print(f"Model accuracy: {model_accuracy(results_df_l5goals_diff['y_real'], results_df_l5goals_diff['y_pred'])}")

(12964, 2)
Model accuracy: 0.44955260721999385
