## Preparations

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sqlite3

We want to use the sklearn package to implement our models, here we import the needed functions.

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix

In [3]:
con = sqlite3.connect("../laliga.sqlite")
df = pd.read_sql_query("SELECT * from Matches", con)

First we need to extract the result of the matches from the database. We save this as "1" for a home win, "2" for a win of the away team and "X" for a tie.

In [4]:
def get_result(score: str):
    if score is None:
        return None
    goals = list(map(int, score.split(':')))
    if goals[0]>goals[1]:
        return "1"
    elif goals[1]>goals[0]:
        return "2"
    else:
        return "X"

In [5]:
df['result'] = df['score'].apply(get_result)
df['result'] = df['result'].map(str)

## Computing the features
### Rank and form

As features to train the model we want to use different values from the standings, most of them were already computed in the exercises of the analytical work.

The most obvious one is the rank of the two teams before the match. We make this rank relative, meaning we divide it by the number of teams in the respective division in the same season to make it comparable over the seasons.

We also want to use the results of the last 5 matches, which we convert into a numerical value by assigning the values +1, 0, -1 to a win, tie and a loss and summing them up. We'll call this value the "form" of the team.

In [6]:
def get_form(last_5):
    if last_5 != "[]":
        last_5 = list(last_5)
        form = 0
        for result in last_5:
            if result == "W":
                form += 1
            elif result == "T":
                pass
            elif result == "L":
                form -= 1
        return form
    else:
        return None

In [7]:
def calc_relative_rank(df_standings):
    for season in df_standings.season.drop_duplicates():
        num_teams = df_standings.loc[df_standings['season']==season, 'team'].drop_duplicates().count()
        df_standings.loc[df_standings['season']==season, 'rank'] /= num_teams
    return df_standings

In [8]:
df_standings = pd.read_excel('../reports/MatchdayStandings.xlsx', engine='openpyxl')
df_standings['form'] = df_standings['last_5'].map(get_form)
df_standings = calc_relative_rank(df_standings)
df_standings['GF'] /= df_standings['matchday']
df_standings['GA'] /= df_standings['matchday']
df_standings['matchday'] += 1
df_standings.head()

Unnamed: 0,season,division,matchday,rank,team,GF,GA,GD,W,L,T,Pts,last_5,form
0,1928-1929,1,2,0.1,Real Madrid,5.0,0.0,5,1,0,0,3,['W'],1.0
1,1928-1929,1,2,0.2,Barcelona,2.0,0.0,2,1,0,0,3,['W'],1.0
2,1928-1929,1,2,0.3,Espanyol,3.0,2.0,1,1,0,0,3,['W'],1.0
3,1928-1929,1,2,0.4,Athletic Madrid,3.0,2.0,1,1,0,0,3,['W'],1.0
4,1928-1929,1,2,0.5,Donostia,1.0,1.0,0,0,0,1,1,['T'],0.0


In [9]:
df_standings.rename(columns={'team': 'home_team'}, inplace=True)
df = df.merge(df_standings[['season', 'division', 'matchday', 'rank', 'home_team', 'form']], left_on=['season', 'division', 'matchday', 'home_team'], right_on=['season', 'division', 'matchday', 'home_team'], how='left')
df.rename(columns={'rank': 'rank_home', 'form': 'form_home'}, inplace=True)
df_standings.rename(columns={'home_team': 'away_team'}, inplace=True)
df = df.merge(df_standings[['season', 'division', 'matchday', 'rank', 'away_team', 'form']], left_on=['season', 'division', 'matchday', 'away_team'], right_on=['season', 'division', 'matchday', 'away_team'], how='left')
df.rename(columns={'rank': 'rank_away', 'form': 'form_away'}, inplace=True)


In [10]:
df.columns

Index(['season', 'division', 'matchday', 'date', 'time', 'home_team',
       'away_team', 'score', 'result', 'rank_home', 'form_home', 'rank_away',
       'form_away'],
      dtype='object')

### Goals scored and conceded
We also want to include the average goals scored and conceded by each team in the season up until the match.

In [11]:
df_standings.rename(columns={'team': 'home_team', 'away_team': 'home_team'}, inplace=True)
df = df.merge(df_standings[['season', 'division', 'matchday', 'home_team', 'GF', 'GA']], left_on=['season', 'division', 'matchday', 'home_team'], right_on=['season', 'division', 'matchday', 'home_team'], how='left')
df.rename(columns={'GF': 'GF_home', 'GA': 'GA_home'}, inplace=True)
df_standings.rename(columns={'home_team': 'away_team'}, inplace=True)
df = df.merge(df_standings[['season', 'division', 'matchday', 'away_team', 'GF', 'GA']], left_on=['season', 'division', 'matchday', 'away_team'], right_on=['season', 'division', 'matchday', 'away_team'], how='left')
df.rename(columns={'GF': 'GF_away', 'GA': 'GA_away'}, inplace=True)

Another idea is using only the average goals scored and conceded in the last 5 matches so that the form of the teams in the last matches is more important.

In [12]:
df_standings = pd.read_excel('../reports/MatchdayStandings.xlsx', engine='openpyxl')
df_standings['form'] = df_standings['last_5'].map(get_form)

In [13]:
def get_goals(score: str, home_away: int):
    if score is None:
        return None
    goals = list(map(int, score.split(':')))
    return goals[home_away]

In [14]:
df['home_goals'] = df['score'].apply(get_goals, args=(0,))
df['away_goals'] = df['score'].apply(get_goals, args=(1,))

In [15]:
df_past = df[df['season']!='2021-2022'].copy()
dfs = []
for season in df_past['season'].drop_duplicates():
    for division in df_past.loc[(df_past['season']==season), 'division'].drop_duplicates():
        df_games = df_past.loc[(df_past['season']==season) & (df_past['division']==division)]
        team = df_games['home_team'].drop_duplicates().rename('team')
        init_data = [([], []) for _ in team]
        df_goals = pd.DataFrame(init_data, columns=['last_5_goals_scored', 'last_5_goals_conceded'], index=team)
        last_5_goals_scored = df_goals['last_5_goals_scored'].copy()
        last_5_goals_conceded = df_goals['last_5_goals_conceded'].copy()
        for matchday in df_games['matchday'].drop_duplicates():
            df_standings_matchday = df_standings.loc[(df_standings['season']==season) & (df_standings['division']==division) & (df_standings['matchday']==matchday)].copy()
            df_matchday = df_games.loc[df_games['matchday']==matchday]
            last_5_goals_scored = df_goals['last_5_goals_scored'].apply(lambda x: x[:4])
            last_5_goals_conceded = df_goals['last_5_goals_conceded'].apply(lambda x: x[:4])
            for i in df_matchday.index:
                game = df_matchday.loc[i, :]
                last_5_goals_scored.loc[game['home_team']] = [game['home_goals']] + last_5_goals_scored.loc[game['home_team']]
                last_5_goals_scored.loc[game['away_team']] = [game['away_goals']] + last_5_goals_scored.loc[game['away_team']]
                last_5_goals_conceded.loc[game['home_team']] = [game['away_goals']] + last_5_goals_conceded.loc[game['home_team']]
                last_5_goals_conceded.loc[game['away_team']] = [game['home_goals']] + last_5_goals_conceded.loc[game['away_team']]
            df_goals['last_5_goals_scored'] = last_5_goals_scored
            df_goals['last_5_goals_conceded'] = last_5_goals_conceded
            df_goals.reset_index(drop=False, inplace=True)
            df_standings_matchday = df_standings_matchday.merge(df_goals, left_on='team', right_on='team', how='left')
            dfs.append(df_standings_matchday)
            df_goals = df_goals.set_index('team', drop=True)
df_standings = pd.concat(dfs, ignore_index=True)


In [16]:
df_standings['GF'] /= df_standings['matchday']
df_standings['GA'] /= df_standings['matchday']
df_standings['matchday'] += 1
df_standings.rename(columns={'team': 'home_team'}, inplace=True)
df = df.merge(df_standings[['season', 'division', 'matchday', 'home_team', 'last_5_goals_scored', 'last_5_goals_conceded']], left_on=['season', 'division', 'matchday', 'home_team'], right_on=['season', 'division', 'matchday', 'home_team'], how='left')
df.rename(columns={'last_5_goals_scored': 'l5_goals_scored_home', 'last_5_goals_conceded': 'l5_goals_conceded_home'}, inplace=True)
df_standings.rename(columns={'home_team': 'away_team'}, inplace=True)
df = df.merge(df_standings[['season', 'division', 'matchday', 'away_team', 'last_5_goals_scored', 'last_5_goals_conceded']], left_on=['season', 'division', 'matchday', 'away_team'], right_on=['season', 'division', 'matchday', 'away_team'], how='left')
df.rename(columns={'last_5_goals_scored': 'l5_goals_scored_away', 'last_5_goals_conceded': 'l5_goals_conceded_away'}, inplace=True)

In [17]:
def sum_last_goals(goals_list):
    if isinstance(goals_list, list):
        if len(goals_list)==0:
            return 0
        else:
            return sum(goals_list)/len(goals_list)
    else:
        return np.nan

In [18]:
df[['l5_goals_scored_home', 'l5_goals_conceded_home', 'l5_goals_scored_away', 'l5_goals_conceded_away']] = df[['l5_goals_scored_home', 'l5_goals_conceded_home', 'l5_goals_scored_away', 'l5_goals_conceded_away']].applymap(sum_last_goals)


### Rank of last season
The features computed so far have no values for the first matchday of each season. To cope with that, we also introduce the rank of each team in the previous season.

In [19]:
df_season_standings = pd.read_excel('../reports/SeasonStandings.xlsx', engine='openpyxl')
seasons = df['season'].drop_duplicates().copy().to_numpy()
df['last_rank_home'] = [0 for _ in df.index]
df['last_rank_away'] = [0 for _ in df.index]
for i in np.arange(len(seasons)-1):
    df_this_season = df_season_standings.loc[df_season_standings['season']==seasons[i],:]
    teams = df_this_season['team'].drop_duplicates().copy()
    for team in teams:
        rank_team = ((df_this_season.loc[(df_this_season['team']==team), 'division'])*df_this_season.loc[(df_this_season['team']==team), 'rank']).to_numpy()[0]
        df.loc[(df['season']==seasons[i+1]) & (df['home_team']==team), 'last_rank_home'] = rank_team
        df.loc[(df['season']==seasons[i+1]) & (df['away_team']==team), 'last_rank_away'] = rank_team

### Using the differences of the features
To reduce the amount of features we will try to train the model not with the features for each team but the differences of the features between the teams. Intuitively this should retain most of the information.

In [20]:
df['rank_diff'] = df['rank_home'] - df['rank_away']
df['form_diff'] = df['form_home'] - df['form_away']
df['GF_diff'] = df['GF_home'] - df['GF_away']
df['GA_diff'] = df['GA_home'] - df['GA_away']
df['l5_goals_scored_diff'] = df['l5_goals_scored_home'] - df['l5_goals_scored_away']
df['l5_goals_conceded_diff'] = df['l5_goals_conceded_home'] - df['l5_goals_conceded_away']
df['last_rank_diff'] = df['last_rank_home'] - df['last_rank_away']

## Training the model
### GradientBoostedClassifier
As in the lectures, we first experiment with a gradient-boosted model to classify the matches. For that we introduce a function that fills NaN values in the data with zeros to use more data points, does a train/test split of the data, trains the model and returns the it as well as the predictions on the test set. We also introduce a function to compute the accuracy of the model, that is, the percentage of correctly predicted results.

In [21]:
def model_accuracy(y_test, y_pred):
    return sum(y_test==y_pred)/len(y_test)

In [22]:
def train_gbm(features, target, gbm_hyperparams):
    df_past = df.loc[df['season']!='2021-2022']
    X = df_past[features].copy()
    y = df_past[target].copy()
    X[X.isna()] = 0
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    gbm_model = GradientBoostingClassifier(**gbm_hyperparams)
    gbm_model.fit(X_train, y_train)
    gbm_y_pred = gbm_model.predict(X_test)
    results_df = X_test.copy()
    results_df["y_real"] = y_test
    results_df["y_pred"] = gbm_y_pred
    print(f"Model accuracy: {model_accuracy(results_df['y_real'], results_df['y_pred'])}")
    return gbm_model, results_df
    

First, we train a model on all the features we computed in the first section except for the goals scored and conceded in the last 5 matches. After testing some different learning rates, we saw that a smaller learning rate than the default 0.1 yields much better results.

In [23]:
gbm_hyperparams = {
    'n_estimators': 100,
    'max_depth': 10,
    'learning_rate': 0.01,
    'loss': 'deviance'
}
features = ['rank_home', 'form_home', 'rank_away', 'form_away', 'GF_home', 'GA_home', 'GF_away', 'GA_away', 'last_rank_home', 'last_rank_away']
target = 'result'
model_all_features, results_all_features = train_gbm(features, target, gbm_hyperparams)

Model accuracy: 0.5321710253217102


The accuracy on this model already looks quite good. However we see a big problem, when we check the distribution of the predicted values: It nearly only predicts a home win.

In [24]:
results_all_features['y_pred'].value_counts()

1    9304
2     261
X      71
Name: y_pred, dtype: int64

This is caused by the class imbalance: As we have seen in the analysis exercises, about 52% of the games are home wins. To cope with that, we use the `imbalanced_learn` library. From there we use the `SMOTE` class, which implements the Synthetic Minority Over-sampling Technique to create artificial samples of the minority classes to balance out the dataset.

In [25]:
from imblearn.over_sampling import SMOTE

In [26]:
def train_gbm(features, target, gbm_hyperparams):
    sm = SMOTE()
    df_past = df.loc[df['season']!='2021-2022']
    X = df_past[features].copy()
    y = df_past[target].copy()
    X[X.isna()] = 0
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    X_train, y_train = sm.fit_resample(X_train, y_train)
    gbm_model = GradientBoostingClassifier(**gbm_hyperparams)
    gbm_model.fit(X_train, y_train)
    gbm_y_pred = gbm_model.predict(X_test)
    results_df = X_test.copy()
    results_df["y_real"] = y_test
    results_df["y_pred"] = gbm_y_pred
    print(f"Model accuracy: {model_accuracy(results_df['y_real'], results_df['y_pred'])}")
    return gbm_model, results_df

In [27]:
gbm_hyperparams = {
    'n_estimators': 100,
    'max_depth': 10,
    'learning_rate': 0.05,
    'loss': 'deviance'
}
features = ['rank_home', 'form_home', 'rank_away', 'form_away', 'GF_home', 'GA_home', 'GF_away', 'GA_away', 'last_rank_home', 'last_rank_away']
target = 'result'
model_all_features_smote, results_all_features_smote = train_gbm(features, target, gbm_hyperparams)

Model accuracy: 0.4950186799501868


In [28]:
results_all_features_smote['y_pred'].value_counts()

1    6136
X    1781
2    1719
Name: y_pred, dtype: int64

We observe that this makes the model predict a tie or an away win more often while preserving most of the accuracy.

Now we want to test some different combinations of the features. First we will drop the overall goals scored and conceded and consider the goals scored and conceded only in the last 5 matches.

In [29]:
gbm_hyperparams = {
    'n_estimators': 100,
    'max_depth': 10,
    'learning_rate': 0.05,
    'loss': 'deviance'
}
features = ['rank_home', 'form_home', 'rank_away', 'form_away', 'l5_goals_scored_home', 'l5_goals_conceded_home', 'l5_goals_scored_away', 'l5_goals_conceded_away', 'last_rank_home', 'last_rank_away']
target = 'result'
model_all_features_l5goals, results_all_features_l5goals = train_gbm(features, target, gbm_hyperparams)

Model accuracy: 0.5158779576587795


We see that the model performance is comparable. Because the overall goals scored and conceded requires much less computation time (as we can just extract it from our standings table), we will use them.

The next thing we check is how the model performs when we use only the differences in the features.

In [30]:
gbm_hyperparams = {
    'n_estimators': 100,
    'max_depth': 10,
    'learning_rate': 0.05,
    'loss': 'deviance'
}
features = ['rank_diff', 'form_diff', 'GF_diff', 'GA_diff', 'last_rank_diff', 'matchday']
target = 'result'
model_diffs, results_diffs = train_gbm(features, target, gbm_hyperparams)

Model accuracy: 0.47063096720630965


Also this model has a similar performance to the other ones, but we managed to achieve this with half of the features. Experimenting with the hyper-parameters gives the insight that a higher learning rate biases the model more towards a home-win but increases the overall accuracy. This trade-off has to be balanced.

## Model Analysis

The model still predicts mostly home wins, but not as often as our first model.

In [31]:
results_diffs['y_pred'].value_counts(normalize=True)

1    0.592258
X    0.213263
2    0.194479
Name: y_pred, dtype: float64

In [32]:
confusion_matrix(results_diffs['y_real'], results_diffs['y_pred'], normalize='true')

array([[0.65381589, 0.15224233, 0.19394178],
       [0.4929312 , 0.2869934 , 0.2200754 ],
       [0.55020576, 0.20205761, 0.24773663]])

In [40]:
from sklearn.metrics import classification_report

In [42]:
print(classification_report(results_diffs['y_real'], results_diffs['y_pred']))

              precision    recall  f1-score   support

           1       0.58      0.65      0.62      5084
           2       0.32      0.29      0.30      2122
           X       0.29      0.25      0.27      2430

    accuracy                           0.47      9636
   macro avg       0.40      0.40      0.40      9636
weighted avg       0.45      0.47      0.46      9636



We can see from the confusion matrix, that still about 50% of the away wins and 58% of the ties are wrongly classified as home wins.

In [33]:
importances = pd.Series(model_diffs.feature_importances_, index=features)
importances.sort_values(ascending=False)

rank_diff         0.224007
GF_diff           0.188819
form_diff         0.172966
GA_diff           0.168920
last_rank_diff    0.133766
matchday          0.111522
dtype: float64

## Random Forest

In [34]:
from sklearn.ensemble import RandomForestClassifier

In [35]:
rforest_hyperparams = {
    'n_estimators': 500,
    'criterion': 'gini',
    'class_weight': 'balanced'
}
#features = ['rank_diff', 'form_diff', 'l5_goals_scored_diff', 'l5_goals_conceded_diff', 'GF_diff', 'GA_diff']
features = ['rank_diff', 'form_diff', 'GF_diff', 'GA_diff', 'last_rank_diff']
target = 'result'
df_past = df.loc[df['season']!='2021-2022']
X = df_past[features].copy()
y = df_past[target].copy()
X[X.isna()] = 0
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
sm = SMOTE()
X_train, y_train = sm.fit_resample(X_train, y_train)
rforest_model_diff = RandomForestClassifier(**rforest_hyperparams)
rforest_model_diff.fit(X_train, y_train)
rforest_y_pred_diff = rforest_model_diff.predict(X_test)

results_rforest_diff = X_test.copy()
results_rforest_diff["y_real"] = y_test
results_rforest_diff["y_pred"] = rforest_y_pred_diff
print(f"Model accuracy: {model_accuracy(results_rforest_diff['y_real'], results_rforest_diff['y_pred'])}")

Model accuracy: 0.43295973432959733


In [36]:
results_rforest_diff['y_pred'].value_counts(normalize=True)

1    0.518161
X    0.242217
2    0.239622
Name: y_pred, dtype: float64

## k-Nearest-Neighbours Classifier

In [37]:
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier

In [38]:
knn_hyperparams = {
    'n_neighbors': 5,
    'weights': 'distance',
    'algorithm': 'auto',
    'metric': 'minkowski',
    'p': 2
}
sm = SMOTE()
#features = ['rank_diff', 'form_diff', 'l5_goals_scored_diff', 'l5_goals_conceded_diff', 'GF_diff', 'GA_diff']
features = ['rank_diff', 'form_diff', 'GF_diff', 'GA_diff', 'last_rank_diff']
#features = ['rank_home', 'form_home', 'rank_away', 'form_away', 'GF_home', 'GA_home', 'GF_away', 'GA_away', 'last_rank_home', 'last_rank_away']
target = 'result'
df_past = df.loc[df['season']!='2021-2022'].copy()
df_past.dropna(inplace=True)
X = df_past[features]
y = df_past[target]
#X[X.isna()] = 0
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
#X_train, y_train = sm.fit_resample(X_train, y_train)
print(X_train.shape)
knn_model = KNeighborsClassifier(**knn_hyperparams)
knn_model.fit(X_train, y_train)
knn_y_pred = knn_model.predict(X_test)

results_df_knn = X_test.copy()
results_df_knn["y_real"] = y_test
results_df_knn["y_pred"] = knn_y_pred
print(f"Model accuracy: {model_accuracy(results_df_knn['y_real'], results_df_knn['y_pred'])}")

(12964, 5)
Model accuracy: 0.39092872570194387


In [39]:
results_df_knn['y_pred'].value_counts(normalize=True)

1    0.520210
X    0.251774
2    0.228016
Name: y_pred, dtype: float64