In [1]:
import pandas as pd
import datetime as dt

matches = pd.read_csv("matches.csv", index_col=0)

### Feature Engineering

In [2]:
matches.head()

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,match report,notes,sh,sot,dist,fk,pk,pkatt,season,team
1,15/08/2021,16:30,Premier League,Matchweek 1,Sun,Away,L,0,1,Tottenham,...,Match Report,,18,4,16.9,1,0,0,2022,Manchester City
2,21/08/2021,15:00,Premier League,Matchweek 2,Sat,Home,W,5,0,Norwich City,...,Match Report,,16,4,17.3,1,0,0,2022,Manchester City
3,28/08/2021,12:30,Premier League,Matchweek 3,Sat,Home,W,5,0,Arsenal,...,Match Report,,25,10,14.3,0,0,0,2022,Manchester City
4,11/09/2021,15:00,Premier League,Matchweek 4,Sat,Away,W,1,0,Leicester City,...,Match Report,,25,8,14.0,0,0,0,2022,Manchester City
6,18/09/2021,15:00,Premier League,Matchweek 5,Sat,Home,D,0,0,Southampton,...,Match Report,,16,1,15.7,1,0,0,2022,Manchester City


In [3]:
matches.columns

Index(['date', 'time', 'comp', 'round', 'day', 'venue', 'result', 'gf', 'ga',
       'opponent', 'xg', 'xga', 'poss', 'attendance', 'captain', 'formation',
       'referee', 'match report', 'notes', 'sh', 'sot', 'dist', 'fk', 'pk',
       'pkatt', 'season', 'team'],
      dtype='object')

In [4]:
matches["team"].value_counts()

Manchester United           72
Brighton and Hove Albion    72
West Ham United             72
Southampton                 72
Newcastle United            72
Arsenal                     71
Crystal Palace              71
Burnley                     71
Leeds United                71
Manchester City             71
Wolverhampton Wanderers     71
Tottenham Hotspur           71
Leicester City              70
Everton                     70
Chelsea                     70
Aston Villa                 70
West Bromwich Albion        38
Liverpool                   38
Sheffield United            38
Fulham                      38
Brentford                   34
Watford                     33
Norwich City                33
Name: team, dtype: int64

In [5]:
matches["date"] = pd.to_datetime(matches["date"])

In [6]:
matches["venue_code"] = matches["venue"].astype("category").cat.codes

In [7]:
matches["team_code"] = matches["team"].astype("category").cat.codes

In [8]:
matches["opp_code"] = matches["opponent"].astype("category").cat.codes

In [9]:
matches["hour"] = matches["time"].str.replace(":.+", "", regex=True).astype(int)

In [10]:
matches["day_code"] = matches["date"].dt.dayofweek

In [11]:
class MissingDict(dict):
    __missing__ = lambda self, key: key
    
map_values = {  
    "Brighton and Hove Albion": "Brighton",
    "Manchester United": "Manchester Utd",
    "Newcastle United": "Newcastle Utd",
    "Sheffield United": "Sheffield Utd",
    "Tottenham Hotspur": "Tottenham",
    "West Bromwich Albion": "West Brom",
    "West Ham United": "West Ham",
    "Wolverhampton Wanderers": "Wolves",    
}
mapping = MissingDict(**map_values)

In [12]:
matches['team'] = matches['team'].map(mapping)

In [13]:
matches.head()

Unnamed: 0,date,time,comp,round,day,venue,result,gf,ga,opponent,...,fk,pk,pkatt,season,team,venue_code,team_code,opp_code,hour,day_code
1,2021-08-15,16:30,Premier League,Matchweek 1,Sun,Away,L,0,1,Tottenham,...,1,0,0,2022,Manchester City,0,12,18,16,6
2,2021-08-21,15:00,Premier League,Matchweek 2,Sat,Home,W,5,0,Norwich City,...,1,0,0,2022,Manchester City,1,12,15,15,5
3,2021-08-28,12:30,Premier League,Matchweek 3,Sat,Home,W,5,0,Arsenal,...,0,0,0,2022,Manchester City,1,12,0,12,5
4,2021-11-09,15:00,Premier League,Matchweek 4,Sat,Away,W,1,0,Leicester City,...,0,0,0,2022,Manchester City,0,12,10,15,1
6,2021-09-18,15:00,Premier League,Matchweek 5,Sat,Home,D,0,0,Southampton,...,1,0,0,2022,Manchester City,1,12,17,15,5


In [14]:
#calculate points for use in last 4 game rolling average points for representation of form later 
matches["points"] = matches["result"].apply(lambda row: 3 if row=="W" else 1 if row=="D" else 0)

In [15]:
#convert win/draw/loss to dummy variable
target = pd.get_dummies(matches["result"])
matches = pd.concat([matches, target], axis=1)

In [16]:
matches.drop(["result", "round", "comp", "season", "attendance"], axis=1).head()

Unnamed: 0,date,time,day,venue,gf,ga,opponent,xg,xga,poss,...,team,venue_code,team_code,opp_code,hour,day_code,points,D,L,W
1,2021-08-15,16:30,Sun,Away,0,1,Tottenham,1.9,1.3,64,...,Manchester City,0,12,18,16,6,0,0,1,0
2,2021-08-21,15:00,Sat,Home,5,0,Norwich City,2.7,0.1,67,...,Manchester City,1,12,15,15,5,3,0,0,1
3,2021-08-28,12:30,Sat,Home,5,0,Arsenal,3.8,0.1,80,...,Manchester City,1,12,0,12,5,3,0,0,1
4,2021-11-09,15:00,Sat,Away,1,0,Leicester City,2.9,0.8,61,...,Manchester City,0,12,10,15,1,3,0,0,1
6,2021-09-18,15:00,Sat,Home,0,0,Southampton,1.1,0.4,63,...,Manchester City,1,12,17,15,5,1,1,0,0


In [17]:
matches.sort_values('date', inplace=True)

In [18]:
cols = ['points', 'gf', 'ga', 'sh', 'sot', 'dist', 'fk', 'pk', 'pkatt']
new_cols = [f"{c}_rolling" for c in cols]

In [19]:
matches[new_cols] = matches.groupby('team')[cols].transform(lambda x: x.rolling(4).mean())

In [20]:
grp_matches = matches.groupby("team").apply(lambda a: a[:]).drop('team', axis=1).droplevel(1)

In [21]:
grp_matches.reset_index(inplace=True)

## Machine Learning Model

In [22]:
train = grp_matches[grp_matches["date"] < "2022-01-01"]

In [23]:
test = grp_matches[grp_matches["date"] > "2022-01-01"]

In [26]:
train = train[['venue_code', 'opp_code', 'team_code', 'hour', 'day_code', 'points_rolling', 'gf_rolling', 'ga_rolling', 'sh_rolling', 'sot_rolling', 'dist_rolling', 'fk_rolling', 'pk_rolling', 'pkatt_rolling', 'W', 'D', 'L']].dropna()

In [27]:
test = test[['venue_code', 'opp_code', 'team_code', 'hour', 'day_code', 'points_rolling', 'gf_rolling', 'ga_rolling', 'sh_rolling', 'sot_rolling', 'dist_rolling', 'fk_rolling', 'pk_rolling', 'pkatt_rolling', 'W', 'D', 'L']].dropna()

In [28]:
train[train["team_code"]==0].head()

Unnamed: 0,venue_code,opp_code,team_code,hour,day_code,points_rolling,gf_rolling,ga_rolling,sh_rolling,sot_rolling,dist_rolling,fk_rolling,pk_rolling,pkatt_rolling,W,D,L
3,1,1,0,19,1,1.5,0.75,1.5,9.25,2.5,17.525,0.25,0.25,0.25,0,0,1
4,1,21,0,20,5,1.5,1.0,1.75,9.25,3.0,15.675,0.25,0.0,0.0,1,0,0
5,0,11,0,20,0,0.75,0.75,2.25,8.75,2.5,15.325,0.25,0.0,0.0,0,0,1
6,0,12,0,17,5,0.75,0.75,2.0,8.5,2.75,16.1,0.5,0.0,0.0,0,0,1
7,1,10,0,19,6,0.75,0.75,1.5,8.25,3.25,15.5,0.75,0.0,0.0,0,0,1


In [37]:
X_train = train.iloc[:,:-3]
y_train = train.iloc[:,-3:]

In [38]:
y_train

Unnamed: 0,W,D,L
3,0,0,1
4,1,0,0
5,0,0,1
6,0,0,1
7,0,0,1
...,...,...,...
1369,1,0,0
1370,0,1,0
1371,0,0,1
1372,1,0,0


In [39]:
X_test = test.iloc[:,:-3]
y_test = test.iloc[:,-3:]

In [40]:
from scipy import interp
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

n_classes = 3

fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test[:,i], y_pred[:,i], )
    print(fpr[i], tpr[i], _)
    roc_auc[i] = auc(fpr[i], tpr[i])

TypeError: '(slice(None, None, None), 0)' is an invalid key

In [None]:
from hyperopt import fmin, hp, tpe, Trials, space_eval, STATUS_OK

### RandomForest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

In [None]:
def objective_rf(space):
    model = RandomForestClassifier(
        **space,
        random_state=42
    )
    
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_test)
    score = accuracy_score(y_test, y_pred)
    
    loss = 1 - score
    
    return {'loss': loss, 'status': STATUS_OK, 'model': model}

In [None]:
space = {
    'n_estimators':hp.randint('n_estimators',200,1000),
    'max_depth': hp.randint('max_depth',10,200),
    'min_samples_split':hp.uniform('min_samples_split',0,1),
    'min_samples_leaf':hp.randint('min_samples_leaf',1,10),
    'criterion':hp.choice('criterion', ['gini','entropy']),
    'max_features':hp.choice('max_features', ['sqrt','log2'])
}

In [None]:
rf_trials = Trials()

In [None]:
best_params_rf = fmin(
    fn=objective_rf,
    space=space,
    algo=tpe.suggest,
    trials=rf_trials,
    max_evals=50)

In [None]:
pd.DataFrame(trials).iloc[0]['result']

In [None]:
import numpy as np

def getBestModelfromTrials(trials):
    valid_trial_list = [trial for trial in trials if STATUS_OK == trial['result']['status']]
    losses = [float(trial['result']['loss']) for trial in valid_trial_list]
    index_having_minumum_loss = np.argmin(losses)
    best_trial_obj = valid_trial_list[index_having_minumum_loss]
    return best_trial_obj['result']['model']

In [None]:
rf_model = getBestModelfromTrials(trials)

In [None]:
rf_model.predict

### XGBoost Classifier

In [None]:
import xgboost as xgb

In [None]:
def objective_xgb(space):
    
    model = xgb.XGBClassifier(objective="multi:softmax", 
                              max_depth=int(space['max_depth']),
                              min_child_weight=int(space['min_child_weight']),
                              n_estimators=int(space['n_estimators']),
                              eval_metric="auc",
                              early_stopping_rounds=10,
                              random_state=42
                         )
    
    evaluation=[(X_test, y_test)]
    
    model.fit(X_train, y_train, eval_set=evaluation, verbose=False)
    
    y_pred = model.predict(X_test)
    score = accuracy_score(y_test, y_pred)
    
    loss = 1 - score
    
    return {'loss': loss, 'status': STATUS_OK, 'model': model}

In [None]:
space={'max_depth': hp.quniform("max_depth", 3, 18, 1),
       'min_child_weight' : hp.quniform('min_child_weight', 0, 1000, 1),
       'n_estimators': hp.quniform("n_estimators", 1, 1000, 1)
      }

In [None]:
xgb_trials = Trials()

In [None]:
best_params_xgb = fmin(
    fn=objective_xgb,
    space=space,
    algo=tpe.suggest,
    trials=xgb_trials,
    max_evals=100)

In [None]:
print(best_params_xgb)

In [None]:
xgb_model = getBestModelfromTrials(xgb_trials)

In [None]:
pred = xgb_model.predict(X_test)