In [1]:
from data_loader import DataLoader
from feature_factory import build_streaks, build_elo

# Build the working frame: DataLoader receives the match CSV plus a pickle
# file, and load() is handed the two feature builders imported above.
# NOTE(review): the pickle presumably holds an Elo lookup (going by its file
# name) -- confirm against data_loader.
dataset = (
    DataLoader('dataset.csv', 'elo_dict.pkl')
    .load([build_streaks, build_elo])
)
dataset.head()

Unnamed: 0,Home_Team,Away_Team,Result,Link,Season,Round,League,Home_Score,Away_Score,Home_Win_Streak,Away_Win_Streak,Home_Score_Streak,Away_Score_Streak,Home_Elo,Away_Elo
0,Watford,Middlesbrough,0,https://www.besoccer.com/match/watford-fc/midd...,2021,1,championship,1,0,0,0,0,0,65.0,60.0
1,Birmingham City,Brentford,0,https://www.besoccer.com/match/birmingham-city...,2021,1,championship,1,0,0,0,0,0,52.0,59.0
2,Wycombe Wanderers,Rotherham United,1,https://www.besoccer.com/match/wycombe-wandere...,2021,1,championship,0,1,0,0,0,0,41.0,48.0
3,AFC Bournemouth,Blackburn Rovers,0,https://www.besoccer.com/match/afc-bournemouth...,2021,1,championship,3,2,0,0,0,0,63.0,57.0
4,Barnsley,Luton Town,1,https://www.besoccer.com/match/barnsley-fc/lut...,2021,1,championship,0,1,0,0,0,0,47.0,50.0


In [2]:
# Take a subset of the data for now: keep only Premier League fixtures, then
# discard the columns that are not used as model inputs.
# The filter and the drop are chained (no ``inplace=True``) so the drop never
# mutates a boolean-mask slice of the original frame, which is the pattern
# that makes pandas emit SettingWithCopyWarning.
# NOTE(review): 'Result' is presumably removed because it follows directly
# from the two score columns -- confirm.
dataset = (
    dataset[dataset['League'] == 'premier_league']
    .drop(columns=['League', 'Season', 'Link', 'Round', 'Result'])
)
dataset.head()

Unnamed: 0,Home_Team,Away_Team,Home_Score,Away_Score,Home_Win_Streak,Away_Win_Streak,Home_Score_Streak,Away_Score_Streak,Home_Elo,Away_Elo
103177,Hull City,Leicester,2,1,0,0,0,0,70.0,79.0
103178,Burnley,Swansea City,0,1,0,0,0,0,68.0,77.0
103179,Crystal Palace,West Bromwich Albion,0,1,0,0,0,0,74.0,74.0
103180,Everton,Tottenham Hotspur,1,1,0,0,0,0,84.0,88.0
103181,Middlesbrough,Stoke City,1,1,0,0,0,0,70.0,79.0


In [3]:
from feature_factory import build_encoded_teams

# Pass the frame through build_encoded_teams; in the preview that follows,
# Home_Team / Away_Team show integer codes instead of club names.
dataset = dataset.pipe(build_encoded_teams)
dataset.head()

Unnamed: 0,Home_Team,Away_Team,Home_Score,Away_Score,Home_Win_Streak,Away_Win_Streak,Home_Score_Streak,Away_Score_Streak,Home_Elo,Away_Elo
103177,20,23,2,1,0,0,0,0,70.0,79.0
103178,10,43,0,1,0,0,0,0,68.0,77.0
103179,15,47,0,1,0,0,0,0,74.0,74.0
103180,17,45,1,1,0,0,0,0,84.0,88.0
103181,28,41,1,1,0,0,0,0,70.0,79.0


In [4]:
import numpy as np

# Shuffle the rows once, then cut the shuffled frame into 60% train,
# 20% validation and 20% test.
# * A fixed random_state makes the partition identical on every
#   Restart & Run All (the seed value itself is arbitrary).
# * Plain .iloc slices replace np.split, which reaches into the deprecated
#   DataFrame.swapaxes when it is handed a DataFrame.
shuffled = dataset.sample(frac=1, random_state=42)
train_end = int(.6 * len(shuffled))
validation_end = int(.8 * len(shuffled))

train = shuffled.iloc[:train_end]
validation = shuffled.iloc[train_end:validation_end]
test = shuffled.iloc[validation_end:]
train.head()

Unnamed: 0,Home_Team,Away_Team,Home_Score,Away_Score,Home_Win_Streak,Away_Win_Streak,Home_Score_Streak,Away_Score_Streak,Home_Elo,Away_Elo
105164,50,26,1,0,2,2,9,8,83.0,82.0
114373,5,13,2,1,0,0,69,42,84.0,80.0
107366,40,32,0,1,0,0,27,42,81.0,86.0
114942,47,30,0,3,0,1,9,8,69.0,85.0
113716,42,18,1,1,0,0,11,12,67.0,74.0


In [5]:
# Both score columns are predicted together; they are passed as the y-frame
# when the models are fitted.
target = ['Home_Score', 'Away_Score']
# Every remaining column of ``dataset`` is used as a model input.
features = dataset.columns.drop(target)

In [6]:
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.base import clone

def cross_validation(models: list, k=4, test_size=0.3, random_state=None, data=None):
    """Score each model by its mean absolute error over ``k`` random hold-out splits.

    Despite the name, this is repeated random sub-sampling rather than k-fold:
    every repeat draws a fresh train/validation split with
    ``train_test_split``, fits a clone of the model on the training part and
    measures ``mean_absolute_error`` on the held-out part.

    Parameters
    ----------
    models : list
        Estimators to compare. Each one is cloned for every repeat, so the
        objects passed in are never fitted themselves.
    k : int, default 4
        Number of random splits evaluated per model.
    test_size : float, default 0.3
        Share of rows held out for validation in each split.
    random_state : int or None, default None
        ``None`` keeps the splits unseeded. With an integer, repeat ``r`` is
        split with seed ``random_state + r``, so results are reproducible and
        every model is scored on the same ``k`` splits.
    data : DataFrame or None, default None
        Frame to draw the splits from; it must contain the module-level
        ``features`` and ``target`` columns. ``None`` falls back to the
        module-level ``dataset``.
        NOTE(review): ``dataset`` is the full frame, so the default also
        scores on rows that the earlier 60/20/20 split assigned to ``test``;
        pass a subset here if those rows should stay unseen.

    Returns
    -------
    dict
        Maps each model's position in ``models`` to the mean of its ``k``
        MAE scores (lower is better).
    """
    frame = dataset if data is None else data
    x_all, y_all = frame[features], frame[target]

    mean_errors = {}
    for position, prototype in enumerate(models):
        errors = []
        for repeat in range(k):
            # Fresh, unfitted copy so one repeat cannot influence the next.
            model = clone(prototype)
            seed = None if random_state is None else random_state + repeat
            x_train, x_val, y_train, y_val = train_test_split(
                x_all, y_all, test_size=test_size, random_state=seed
            )
            model.fit(x_train, y_train)
            errors.append(mean_absolute_error(y_val, model.predict(x_val)))
        mean_errors[position] = np.mean(errors)
    return mean_errors

In [7]:
from sklearn.linear_model import MultiTaskLasso, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor

# Compare three regressors on the two score targets. The keys of the returned
# dict are the positions in this list (0 = random forest, 1 = multi-task
# lasso, 2 = per-target ridge); the values are mean absolute errors.
cross_validation(
    models=[
        RandomForestRegressor(),
        MultiTaskLasso(alpha=0.7),
        MultiOutputRegressor(Ridge()),
    ],
)

{0: 0.943789597315436, 1: 0.914579096013103, 2: 0.9079840698280326}