In [1]:
from data_loader import DataLoader
from feature_factory import build_streaks, build_elo

# Construct a DataLoader from 'dataset.csv' and 'elo_dict.pkl' and call load()
# with the two imported feature builders.
# NOTE(review): what `load` does with the builder list is not visible here —
# presumably it applies each builder to the loaded table; confirm in data_loader.
dataset = (
    DataLoader('dataset.csv', 'elo_dict.pkl')
    .load([build_streaks, build_elo])
)
dataset.head()

Unnamed: 0,Home_Team,Away_Team,Result,Link,Season,Round,League,Home_Score,Away_Score,Home_Win_Streak,Away_Win_Streak,Home_Score_Streak,Away_Score_Streak,Home_Elo,Away_Elo
0,Watford,Middlesbrough,0,https://www.besoccer.com/match/watford-fc/midd...,2021,1,championship,1,0,0,0,0,0,65.0,60.0
1,Birmingham City,Brentford,0,https://www.besoccer.com/match/birmingham-city...,2021,1,championship,1,0,0,0,0,0,52.0,59.0
2,Wycombe Wanderers,Rotherham United,1,https://www.besoccer.com/match/wycombe-wandere...,2021,1,championship,0,1,0,0,0,0,41.0,48.0
3,AFC Bournemouth,Blackburn Rovers,0,https://www.besoccer.com/match/afc-bournemouth...,2021,1,championship,3,2,0,0,0,0,63.0,57.0
4,Barnsley,Luton Town,1,https://www.besoccer.com/match/barnsley-fc/lut...,2021,1,championship,0,1,0,0,0,0,47.0,50.0


In [2]:
# Remove columns that none of the cells shown here use afterwards. The result is
# rebound to `dataset` rather than mutating the frame with inplace=True, and
# errors='ignore' keeps the cell re-runnable: the in-place version raised
# KeyError on a second execution because the columns were already gone.
dataset = dataset.drop(
    columns=['League', 'Season', 'Link', 'Round', 'Result'],
    errors='ignore',
)
dataset.head()

Unnamed: 0,Home_Team,Away_Team,Home_Score,Away_Score,Home_Win_Streak,Away_Win_Streak,Home_Score_Streak,Away_Score_Streak,Home_Elo,Away_Elo
0,Watford,Middlesbrough,1,0,0,0,0,0,65.0,60.0
1,Birmingham City,Brentford,1,0,0,0,0,0,52.0,59.0
2,Wycombe Wanderers,Rotherham United,0,1,0,0,0,0,41.0,48.0
3,AFC Bournemouth,Blackburn Rovers,3,2,0,0,0,0,63.0,57.0
4,Barnsley,Luton Town,0,1,0,0,0,0,47.0,50.0


In [4]:
from feature_factory import build_encoded_teams

# Pass the frame through build_encoded_teams; in the recorded head() output the
# Home_Team / Away_Team columns hold integer codes after this call.
# NOTE(review): whether 'teams_encoding.json' is read, written, or both is not
# visible here — confirm in feature_factory.
dataset = build_encoded_teams(
    dataset,
    encoding_path='teams_encoding.json',
)
dataset.head()

Unnamed: 0,Home_Team,Away_Team,Home_Score,Away_Score,Home_Win_Streak,Away_Win_Streak,Home_Score_Streak,Away_Score_Streak,Home_Elo,Away_Elo
0,530,298,1,0,0,0,0,0,65.0,60.0
1,65,77,1,0,0,0,0,0,52.0,59.0
2,542,413,0,1,0,0,0,0,41.0,48.0
3,5,66,3,2,0,0,0,0,63.0,57.0
4,53,282,0,1,0,0,0,0,47.0,50.0


In [5]:
# Persist the engineered frame; index=False leaves the row index out of the file.
dataset.to_csv(path_or_buf='featured_dataset.csv', index=False)

In [6]:
import numpy as np

# Shuffle once with a fixed seed so the 60/20/20 split (and every score derived
# from it) is reproducible across kernel restarts.
SPLIT_SEED = 42
shuffled = dataset.sample(frac=1, random_state=SPLIT_SEED)

# Cut points at 60% and 80% of the rows. Positional slicing replaces
# np.split(DataFrame, ...), which relies on the deprecated DataFrame.swapaxes.
train_end = int(.6 * len(shuffled))
validation_end = int(.8 * len(shuffled))

train = shuffled.iloc[:train_end]
validation = shuffled.iloc[train_end:validation_end]
test = shuffled.iloc[validation_end:]
train.head()

Unnamed: 0,Home_Team,Away_Team,Home_Score,Away_Score,Home_Win_Streak,Away_Win_Streak,Home_Score_Streak,Away_Score_Streak,Home_Elo,Away_Elo
138540,35,199,0,1,0,0,28,24,53.0,58.0
62664,253,243,0,0,3,1,6,5,84.0,80.0
97546,203,458,1,2,0,1,20,44,53.0,60.0
14964,211,49,0,1,0,37,0,37,0.0,0.0
2226,460,118,3,2,1,2,6,10,0.0,0.0


In [7]:
# The two score columns are predicted jointly; every remaining column of
# `dataset` is used as a model input.
target = [
    'Home_Score',
    'Away_Score',
]
features = dataset.columns.drop(labels=target)

In [8]:
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.base import clone

def cross_validation(models: list, k=4, data=None, test_size=0.3, random_state=None):
    """Compare regressors by mean absolute error over repeated random hold-out splits.

    Each estimator in ``models`` is cloned, fitted and scored ``k`` times; every
    repetition draws a fresh ``train_test_split`` of the evaluation frame. Note
    that this is repeated random sub-sampling, not a partition into k folds.

    Args:
        models: Unfitted scikit-learn style estimators (must work with ``clone``).
        k: Number of random splits per estimator; must be at least 1.
        data: Frame holding the ``features`` and ``target`` columns. Defaults to
            the notebook-level ``train`` split so that rows reserved in ``test``
            never influence model selection. (The earlier version split the
            full ``dataset``, which still contained the test rows.)
        test_size: Share of ``data`` held out for scoring in each repetition.
        random_state: Optional integer seed. When given, repetition ``r`` uses
            ``random_state + r``, so runs are reproducible and all estimators
            are scored on the same splits.

    Returns:
        dict mapping each estimator's position in ``models`` to its mean MAE
        (lower is better).

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        # np.mean of an empty list would silently produce NaN for every model.
        raise ValueError("k must be at least 1")

    frame = train if data is None else data
    x_all, y_all = frame[features], frame[target]

    mean_errors = {}
    for position, prototype in enumerate(models):
        errors = []
        for repeat in range(k):
            # Derive one seed per repetition; None keeps the split unseeded.
            split_seed = None if random_state is None else random_state + repeat
            x_train, x_val, y_train, y_val = train_test_split(
                x_all, y_all, test_size=test_size, random_state=split_seed
            )
            # Clone so the caller's estimator instance is never fitted.
            model = clone(prototype)
            model.fit(x_train, y_train)
            errors.append(mean_absolute_error(y_val, model.predict(x_val)))
        mean_errors[position] = np.mean(errors)
    return mean_errors

In [9]:
from sklearn.linear_model import MultiTaskLasso, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor

# Candidates are scored in list order; the returned dict is keyed by that position
# (0 = random forest, 1 = multi-task lasso, 2 = per-output ridge).
candidate_models = [
    RandomForestRegressor(),
    MultiTaskLasso(alpha=0.7),
    MultiOutputRegressor(Ridge()),
]
cross_validation(candidate_models)

{0: 0.8228207339757455, 1: 0.8625171550614792, 2: 0.8564145258435485}

In [11]:
# Fit the final model on the training split only. random_state is pinned because
# the forest is randomized; without it every run yields a different model and score.
model = RandomForestRegressor(random_state=42)
model.fit(train[features], train[target])

RandomForestRegressor()

In [12]:
# Score the fitted model on the held-out test split; the cell displays the mean
# absolute error between the true and predicted score columns.
predicted = model.predict(test[features])
mean_absolute_error(y_true=test[target], y_pred=predicted)

0.8180695790671213