In [12]:
import numpy as np
import pandas as pd

# ignore warning 
import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.feature_selection import SelectPercentile

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline

In [2]:
def clean_data(df):
    data = df[['HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR', 'HS', 'AS', 'HST', 'AST']]
    data['HSGR'] = data['FTHG']/data['HS']
    data['ASGR'] = data['FTAG']/data['AS']
    data = data.replace([np.inf, -np.inf], 0)
    bet_home = df[['B365H','BWH','IWH','PSH','VCH','WHH']].mean(axis=1)
    bet_draw = df[['B365D','BWD','IWD','PSD','VCD','WHD']].mean(axis=1)
    bet_away = df[['B365A','BWA','IWA','PSA','VCA','WHA']].mean(axis=1)
    data['odd_home'] = bet_home
    data['odd_draw'] = bet_draw
    data['odd_away'] = bet_away
    data.dropna()
    
    return data

In [114]:
# Clean all the data
data_1415 = clean_data(pd.read_csv('./data/1415.csv'))
data_1415.drop(data_1415.tail(1).index,inplace=True)
data_1516 = clean_data(pd.read_csv('./data/1516.csv'))
data_1617 = clean_data(pd.read_csv('./data/1617.csv'))
data_1718 = clean_data(pd.read_csv('./data/1718.csv'))
data_1819 = clean_data(pd.read_csv('./data/1819.csv'))

In [115]:
data_1415.head()

Unnamed: 0,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HS,AS,HST,AST,HSGR,ASGR,odd_home,odd_draw,odd_away
0,Arsenal,Crystal Palace,2.0,1.0,H,14.0,4.0,6.0,2.0,0.142857,0.25,1.26,5.866667,12.085
1,Leicester,Everton,2.0,2.0,D,11.0,13.0,3.0,3.0,0.181818,0.153846,3.073333,3.296667,2.393333
2,Man United,Swansea,1.0,2.0,A,14.0,5.0,5.0,4.0,0.071429,0.4,1.363333,4.925,9.6
3,QPR,Hull,0.0,1.0,A,19.0,11.0,6.0,4.0,0.0,0.090909,2.488333,3.193333,3.015
4,Stoke,Aston Villa,0.0,1.0,A,12.0,7.0,2.0,2.0,0.0,0.142857,1.958333,3.345,4.25


In [118]:
def restructure(df):
    df = df.copy()
    df['goals_h_a'] = df['FTHG'] - df['FTAG']
    df['total_h_a'] = df['HS'] - df['AS']

    H_GT = [] # home games total so far
    H_W = [] # home wins so far
    H_WR = [] # home win rate so far
    H_avg_diff = [] # home avg goals diff

    A_GT = [] # away games total so far
    A_W = [] # away wins so far
    A_WR = [] #away win rate so far
    A_avg_diff = [] # away avg goals diff

    for i in range(len(df)):
        home = df.loc[i, 'HomeTeam']
        away = df.loc[i, 'AwayTeam']
        
        home_h_games = df[df['HomeTeam'] == home].loc[:i]['goals_h_a']
        home_a_games = df[df['AwayTeam'] == home].loc[:i]['goals_h_a'] * -1
        home_games = home_h_games.append(home_a_games)
        
        away_h_games = df[df['HomeTeam'] == away].loc[:i]['goals_h_a']
        away_a_games = df[df['AwayTeam'] == away].loc[:i]['goals_h_a'] * -1
        away_games = away_h_games.append(away_a_games)
        
        H_GT.append(len(home_games))
        A_GT.append(len(away_games))
        
        H_W.append((home_games > 0).sum() + (home_games == 0).sum() * 0.5)
        A_W.append((away_games > 0).sum() + (away_games == 0).sum() * 0.5)
        
        H_WR.append(H_W[i] / H_GT[i])
        A_WR.append(A_W[i] / A_GT[i])
        
        H_avg_diff.append(home_games.mean())
        A_avg_diff.append(away_games.mean())
    
    df['H_GT'] = H_GT
    df['H_W'] = H_W
    df['H_WR'] = H_WR
    df['H_avg_diff'] = H_avg_diff
    df['A_GT'] = A_GT
    df['A_W'] = A_W
    df['A_WR'] = A_WR
    df['A_avg_diff'] = A_avg_diff
    return df

new_cols = ['HomeTeam', 'AwayTeam', 
            'H_GT', 'H_W', 'H_WR', 'H_avg_diff', 
            'A_GT', 'A_W', 'A_WR', 'A_avg_diff', 
            'odd_home', 'odd_draw', 'odd_away']
df_1415 = restructure(data_1415)
df_1415[new_cols].head()

Unnamed: 0,HomeTeam,AwayTeam,H_GT,H_W,H_WR,H_avg_diff,A_GT,A_W,A_WR,A_avg_diff
0,Arsenal,Crystal Palace,1,1.0,1.0,1.0,1,0.0,0.0,-1.0
1,Leicester,Everton,1,0.5,0.5,0.0,1,0.5,0.5,0.0
2,Man United,Swansea,1,0.0,0.0,-1.0,1,1.0,1.0,1.0
3,QPR,Hull,1,0.0,0.0,-1.0,1,1.0,1.0,1.0
4,Stoke,Aston Villa,1,0.0,0.0,-1.0,1,1.0,1.0,1.0


In [119]:
scaler = MinMaxScaler()
knn_reg = KNeighborsRegressor()
selector = SelectPercentile()

columns_to_use = ['H_GT', 'H_W', 'H_WR', 'H_avg_diff', 'A_GT', 'A_W', 'A_WR', 'A_avg_diff']

In [124]:
train_features, test_features, train_outcome, test_outcome = train_test_split(
    df_1415[columns_to_use], 
    df_1415['odd_away'], 
    test_size=0.3
)

pipe = make_pipeline(scaler, selector, knn_reg)
param_grid = {
    'selectpercentile__percentile':range(30, 60, 10),
    'kneighborsregressor__n_neighbors':range(1, 50, 2), 
    'kneighborsregressor__weights':['uniform', 'distance']
}
grid = GridSearchCV(pipe, param_grid, cv=10, scoring="neg_mean_absolute_error")
grid.fit(train_features, train_outcome)
grid.score(test_features, test_outcome)

-1.3975929005064915

In [122]:
data_1415.describe()

Unnamed: 0,FTHG,FTAG,HS,AS,HST,AST,HSGR,ASGR,odd_home,odd_draw,odd_away
count,380.0,380.0,380.0,380.0,380.0,380.0,380.0,380.0,380.0,380.0,380.0
mean,1.473684,1.092105,14.623684,11.286842,4.723684,3.681579,0.109941,0.102512,2.676769,3.847167,4.556294
std,1.263175,1.069826,5.51926,4.594288,2.496429,2.042084,0.098125,0.106748,1.487845,0.832764,3.258092
min,0.0,0.0,3.0,1.0,0.0,0.0,0.0,0.0,1.168333,3.066667,1.368333
25%,1.0,0.0,11.0,8.0,3.0,2.0,0.041667,0.0,1.724167,3.308333,2.3925
50%,1.0,1.0,14.0,11.0,4.0,3.0,0.090909,0.090909,2.221667,3.519167,3.413333
75%,2.0,2.0,18.0,14.0,6.0,5.0,0.166667,0.166667,3.044583,3.989583,5.100417
max,8.0,6.0,43.0,27.0,16.0,11.0,0.454545,0.666667,9.566667,7.425,18.5
