In [1]:
from data_loader import DataLoader
from feature_factory import build_streaks, build_elo

# Build the working frame: construct the loader from the match CSV and the
# pickled Elo dictionary, then hand it the two feature builders.
dataset = (
    DataLoader('dataset.csv', 'elo_dict.pkl')
    .load([build_streaks, build_elo])
)
dataset.head()

Unnamed: 0,Home_Team,Away_Team,Result,Link,Season,Round,League,Home_Score,Away_Score,Home_Win_Streak,Away_Win_Streak,Home_Score_Streak,Away_Score_Streak,Home_Elo,Away_Elo
0,Watford,Middlesbrough,0,https://www.besoccer.com/match/watford-fc/midd...,2021,1,championship,1,0,0,0,0,0,65.0,60.0
1,Birmingham City,Brentford,0,https://www.besoccer.com/match/birmingham-city...,2021,1,championship,1,0,0,0,0,0,52.0,59.0
2,Wycombe Wanderers,Rotherham United,1,https://www.besoccer.com/match/wycombe-wandere...,2021,1,championship,0,1,0,0,0,0,41.0,48.0
3,AFC Bournemouth,Blackburn Rovers,0,https://www.besoccer.com/match/afc-bournemouth...,2021,1,championship,3,2,0,0,0,0,63.0,57.0
4,Barnsley,Luton Town,1,https://www.besoccer.com/match/barnsley-fc/lut...,2021,1,championship,0,1,0,0,0,0,47.0,50.0


In [9]:
# Take a subset of the data for now: keep only Premier League fixtures, then
# discard the identifier/metadata columns so that only the two score columns
# and the streak/Elo feature columns remain.
#
# Written as a single chained expression that returns a new frame: calling
# drop(..., inplace=True) on a boolean-filtered slice can raise
# SettingWithCopyWarning.
#
# NOTE: this rebinds `dataset`, so re-running this cell without re-running the
# load cell first fails with a KeyError on 'League'.
dataset = (
    dataset
    .loc[dataset['League'] == 'premier_league']
    .drop(columns=['League', 'Season', 'Link', 'Round', 'Result', 'Home_Team', 'Away_Team'])
)
dataset.head()

Unnamed: 0,Home_Score,Away_Score,Home_Win_Streak,Away_Win_Streak,Home_Score_Streak,Away_Score_Streak,Home_Elo,Away_Elo
103177,2,1,0,0,0,0,70.0,79.0
103178,0,1,0,0,0,0,68.0,77.0
103179,0,1,0,0,0,0,74.0,74.0
103180,1,1,0,0,0,0,84.0,88.0
103181,1,1,0,0,0,0,70.0,79.0


In [10]:
import numpy as np

# Shuffle once with a fixed seed so the 60% / 20% / 20% split is reproducible
# across kernel restarts, then cut the shuffled frame by position.
# Positional slicing replaces np.split(DataFrame, ...), which emits a
# deprecation FutureWarning on recent pandas versions.
SPLIT_SEED = 42
shuffled = dataset.sample(frac=1, random_state=SPLIT_SEED)
n_rows = len(shuffled)
train_end, validation_end = int(.6 * n_rows), int(.8 * n_rows)

train = shuffled.iloc[:train_end]
validation = shuffled.iloc[train_end:validation_end]
test = shuffled.iloc[validation_end:]
train.head()

Unnamed: 0,Home_Score,Away_Score,Home_Win_Streak,Away_Win_Streak,Home_Score_Streak,Away_Score_Streak,Home_Elo,Away_Elo
105793,2,2,0,0,21,27,64.0,77.0
110564,2,2,5,1,17,14,98.0,71.0
110473,2,2,0,0,44,44,76.0,85.0
113287,0,3,0,1,4,3,62.0,85.0
114790,2,3,0,2,27,60,78.0,92.0


In [11]:
# The two score columns are what the models predict; every other column that
# is still in `dataset` is used as an input feature.
target = ['Home_Score', 'Away_Score']
features = dataset.drop(columns=target).columns

In [23]:
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.base import clone

def cross_validation(models: list, k=4, data=None, test_size=0.3, random_state=None):
    """Compare regressors by mean absolute error over repeated random hold-out splits.

    The rows of ``data`` are randomly partitioned ``k`` times into a fitting
    part and a validation part. Every model is cloned, fitted and scored on
    the *same* ``k`` partitions, so differences between models are not caused
    by one model having drawn easier splits than another.

    Args:
        models: Unfitted scikit-learn estimators. Each one is cloned before
            fitting, so the objects passed in are left untouched.
        k: Number of random partitions to evaluate on (must be >= 1).
        data: Frame containing the notebook-level ``features`` and ``target``
            columns. Defaults to the notebook-level ``dataset``.
        test_size: Share of rows held out for validation in each partition.
        random_state: Seed (or ``None``) for drawing the partitions. This only
            controls the splits; estimators with their own randomness need
            their own ``random_state`` for fully reproducible scores.

    Returns:
        dict mapping each model's position in ``models`` to its mean absolute
        error averaged over the ``k`` partitions (lower is better).

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k!r}")
    if data is None:
        # NOTE(review): the full `dataset` also contains the rows of the
        # earlier `test` split. If that split is meant to stay unseen, pass
        # e.g. data=dataset.drop(test.index) instead -- confirm intent.
        data = dataset

    # Draw the k partitions once, as row positions, and reuse them for every
    # model. A shared RandomState makes successive draws differ from each
    # other while staying reproducible for a given seed.
    rng = np.random.RandomState(random_state)
    row_positions = np.arange(len(data))
    splits = [
        train_test_split(row_positions, test_size=test_size, random_state=rng)
        for _ in range(k)
    ]

    x_all, y_all = data[features], data[target]
    mean_error = {}
    for i, prototype in enumerate(models):
        errors = []
        for fit_rows, val_rows in splits:
            model = clone(prototype)
            model.fit(x_all.iloc[fit_rows], y_all.iloc[fit_rows])
            val_pred = model.predict(x_all.iloc[val_rows])
            errors.append(mean_absolute_error(y_all.iloc[val_rows], val_pred))
        mean_error[i] = np.mean(errors)
    return mean_error

In [24]:
from sklearn.linear_model import MultiTaskLasso, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor

# Score three candidate regressors for the two score targets. The keys of the
# returned dict are positions in this list:
#   0 -> random forest, 1 -> multi-task lasso, 2 -> ridge wrapped per output.
cross_validation(
    models=[
        RandomForestRegressor(),
        MultiTaskLasso(alpha=0.7),
        MultiOutputRegressor(Ridge()),
    ],
)

{0: 0.955415115670124, 1: 0.9205869053661297, 2: 0.9141737899225548}