### Imports

In [None]:
import pandas as pd
import datetime as dt

# Load the match data; the first CSV column is used as the row index.
matches = pd.read_csv("matches.csv", index_col=0)

### Feature Engineering

In [None]:
# Preview the first rows of the raw data.
matches.head()

In [None]:
# List the available column names.
matches.columns

In [None]:
# Count how many rows each team contributes.
matches["team"].value_counts()

In [None]:
# Parse "date" into datetimes so it can be sorted, compared and accessed via .dt.
matches["date"] = pd.to_datetime(matches["date"])

In [None]:
# Encode "venue" as the integer code of its category.
matches["venue_code"] = matches["venue"].astype("category").cat.codes

In [None]:
# Encode "team" as the integer code of its category.
matches["team_code"] = matches["team"].astype("category").cat.codes

In [None]:
# Encode "opponent" as the integer code of its category.
# NOTE(review): these codes come from a separate categorical than team_code, so a
# club is not guaranteed to get the same number in both columns.
matches["opp_code"] = matches["opponent"].astype("category").cat.codes

In [None]:
# Keep only the kick-off hour: everything from the first ":" onward is stripped
# before casting to int (presumably "time" is formatted "HH:MM" — confirm).
matches["hour"] = matches["time"].str.replace(":.+", "", regex=True).astype(int)

In [None]:
# Day of the week as an integer (pandas convention: Monday=0 ... Sunday=6).
matches["day_code"] = matches["date"].dt.dayofweek

In [None]:
class MissingDict(dict):
    """A dict whose lookups fall back to the key itself when the key is absent."""

    def __missing__(self, key):
        # Called by dict.__getitem__ on a miss; the key is returned unchanged
        # and is NOT stored in the dict.
        return key
    
# Long club names and the shortened spelling each one is replaced with.
map_values = {
    "Brighton and Hove Albion": "Brighton",
    "Manchester United": "Manchester Utd",
    "Newcastle United": "Newcastle Utd",
    "Sheffield United": "Sheffield Utd",
    "Tottenham Hotspur": "Tottenham",
    "West Bromwich Albion": "West Brom",
    "West Ham United": "West Ham",
    "Wolverhampton Wanderers": "Wolves",
}
# Wrapped in MissingDict so that names without an entry map to themselves.
mapping = MissingDict(map_values)

In [None]:
# Replace long team names with their short form; names absent from the mapping
# are returned unchanged by MissingDict's fallback.
matches['team'] = matches['team'].map(mapping)

In [None]:
# Preview the data after the name normalisation.
matches.head()

In [None]:
# Points earned per match (win = 3, draw = 1, anything else = 0); used further
# down as one of the inputs to the rolling averages that represent recent form.
matches["points"] = matches["result"].apply(
    lambda outcome: {"W": 3, "D": 1}.get(outcome, 0)
)

In [None]:
# One-hot encode the match result and append the indicator columns to the frame.
target = pd.get_dummies(matches["result"])
matches = pd.concat((matches, target), axis="columns")

In [None]:
# Display-only preview without a few columns; the result is not assigned, so
# `matches` itself is left unchanged.
matches.drop(["result", "round", "comp", "season", "attendance"], axis=1).head()

In [None]:
# Sort chronologically (in place) so the per-team rolling windows computed
# further down run in date order.
matches.sort_values('date', inplace=True)

In [None]:
# Per-match statistics to average over recent games, and the names of the
# derived rolling-average columns (same order, "_rolling" suffix).
cols = [
    'points',
    'gf', 'ga',
    'sh', 'sot', 'dist',
    'fk', 'pk', 'pkatt',
]
new_cols = [f"{column}_rolling" for column in cols]

In [None]:
# Per-team form features: the mean of each statistic over the four matches
# played BEFORE the current one. The shift(1) keeps the current row out of its
# own window; without it the goals and points of the match being predicted
# would leak into its features. A team's first four matches therefore have NaN
# here.
matches[new_cols] = matches.groupby('team')[cols].transform(
    lambda x: x.shift(1).rolling(4).mean()
)

In [None]:
# Cluster the rows by team, keeping their current relative order inside each
# team, and move "team" from the columns into the index. A stable sort is used
# rather than an identity groupby-apply because the shape of an apply result
# (whether group keys are added to the index and whether the grouping column is
# passed to the function) differs between pandas versions.
grp_matches = matches.sort_values("team", kind="mergesort").set_index("team")

In [None]:
# Move the current index back into the columns (in place) and replace it with a
# default integer index.
grp_matches.reset_index(inplace=True)

## Machine Learning Model

In [None]:
# Training set: matches dated strictly before 2022-01-01.
train = grp_matches[grp_matches["date"] < "2022-01-01"]

In [None]:
# Test set: matches dated on or after 2022-01-01. The bound is inclusive so that
# matches played exactly on the cut-off date are not dropped from both sets
# (the training set uses a strict "<").
test = grp_matches[grp_matches["date"] >= "2022-01-01"]

In [None]:
# Keep only the model inputs followed by the three one-hot target columns, and
# discard every row that has a missing value in any of them.
train = train[
    [
        'venue_code', 'opp_code', 'team_code', 'hour', 'day_code',
        'points_rolling', 'gf_rolling', 'ga_rolling', 'sh_rolling', 'sot_rolling',
        'dist_rolling', 'fk_rolling', 'pk_rolling', 'pkatt_rolling',
        'W', 'D', 'L',
    ]
].dropna()

In [None]:
# Keep only the model inputs followed by the three one-hot target columns, and
# discard every row that has a missing value in any of them.
test = test[
    [
        'venue_code', 'opp_code', 'team_code', 'hour', 'day_code',
        'points_rolling', 'gf_rolling', 'ga_rolling', 'sh_rolling', 'sot_rolling',
        'dist_rolling', 'fk_rolling', 'pk_rolling', 'pkatt_rolling',
        'W', 'D', 'L',
    ]
].dropna()

In [None]:
# Spot-check the training rows of the team whose team_code is 0.
train[train["team_code"]==0].head()

In [None]:
# Everything except the last three columns is a feature; the last three columns
# are the targets.
X_train, y_train = train.iloc[:, :-3], train.iloc[:, -3:]

In [None]:
# Display the training targets.
y_train

In [None]:
# Everything except the last three columns is a feature; the last three columns
# are the targets.
X_test, y_test = test.iloc[:, :-3], test.iloc[:, -3:]

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline: a random forest with default hyperparameters and a fixed seed,
# trained on the training split and used to predict the test split.
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_pred = model.predict(X_test)

In [None]:
# Display the baseline predictions.
y_pred

In [None]:
from scipy import interp
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

n_classes = 3

# ROC curve and AUC for each target column separately, keyed by the column's
# position.
fpr = {}
tpr = {}
roc_auc = {}
for i in range(n_classes):
    # y_test is a DataFrame, so a column has to be selected by position with
    # .iloc (plain `y_test[:, i]` is not valid DataFrame indexing); y_pred is the
    # array returned by model.predict and can be sliced directly.
    fpr[i], tpr[i], thresholds = roc_curve(y_test.iloc[:, i], y_pred[:, i])
    print(fpr[i], tpr[i], thresholds)
    roc_auc[i] = auc(fpr[i], tpr[i])

In [None]:
from hyperopt import fmin, hp, tpe, Trials, space_eval, STATUS_OK

### RandomForest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

In [None]:
def objective_rf(space):
    """Hyperopt objective for the random forest.

    Fits a RandomForestClassifier (seed 42) on X_train / y_train using the
    sampled keyword arguments in ``space`` and scores it on X_test / y_test.

    Returns a dict with 'loss' (1 - accuracy), 'status' (STATUS_OK) and the
    fitted 'model'.
    """
    forest = RandomForestClassifier(random_state=42, **space)
    forest.fit(X_train, y_train)

    accuracy = accuracy_score(y_test, forest.predict(X_test))

    # Hyperopt minimises the loss, so higher accuracy must give a lower value.
    return {'loss': 1 - accuracy, 'status': STATUS_OK, 'model': forest}

In [None]:
# Hyperopt search space for the random forest. objective_rf unpacks this dict
# directly into RandomForestClassifier, so every key is a constructor argument.
space = {
    'n_estimators': hp.randint('n_estimators', 200, 1000),
    'max_depth': hp.randint('max_depth', 10, 200),
    'min_samples_split': hp.uniform('min_samples_split', 0, 1),
    'min_samples_leaf': hp.randint('min_samples_leaf', 1, 10),
    'criterion': hp.choice('criterion', ['gini', 'entropy']),
    'max_features': hp.choice('max_features', ['sqrt', 'log2']),
}

In [None]:
# Trials object handed to fmin for the random-forest search.
rf_trials = Trials()

In [None]:
# Run 50 evaluations of objective_rf over `space` using the TPE algorithm,
# recording them in rf_trials.
best_params_rf = fmin(
    fn=objective_rf,
    space=space,
    algo=tpe.suggest,
    trials=rf_trials,
    max_evals=50)

In [None]:
# Inspect the 'result' entry of the first random-forest trial. The trials
# object created for this search is named rf_trials; no variable called
# `trials` is defined anywhere in the notebook.
pd.DataFrame(rf_trials).iloc[0]['result']

In [None]:
import numpy as np

def getBestModelfromTrials(trials):
    """Return the model stored in the successful trial with the lowest loss.

    ``trials`` is iterated; each item must expose ``item['result']`` with the
    keys 'status', 'loss' and 'model'. Only items whose status equals
    STATUS_OK are considered. If several share the minimum loss, the first
    one wins. Raises ValueError (from np.argmin) when no item qualifies.
    """
    succeeded = []
    for entry in trials:
        if entry['result']['status'] == STATUS_OK:
            succeeded.append(entry)

    loss_values = [float(entry['result']['loss']) for entry in succeeded]
    winner = succeeded[np.argmin(loss_values)]
    return winner['result']['model']

In [None]:
# Pull the fitted forest with the lowest loss out of the random-forest search.
# The search history lives in rf_trials; no variable called `trials` is
# defined anywhere in the notebook.
rf_model = getBestModelfromTrials(rf_trials)

In [None]:
# Predict the test split with the best forest. The method is actually called
# here; the bare attribute `rf_model.predict` only displays the bound method.
rf_model.predict(X_test)

### XGBoost Classifier

In [None]:
import xgboost as xgb

In [None]:
def objective_xgb(space):
    """Hyperopt objective for the XGBoost classifier.

    Parameters
    ----------
    space : dict
        Sampled hyperparameters with the keys 'max_depth', 'min_child_weight'
        and 'n_estimators'; each value is cast to int before use.

    Returns
    -------
    dict
        'loss' (1 - accuracy on X_test), 'status' (STATUS_OK) and the fitted
        'model'.
    """
    # y_train / y_test hold one indicator column per outcome, but a multi-class
    # softmax objective expects a single vector of class ids. The position of
    # the set indicator in each row is used as the class id.
    train_labels = y_train.to_numpy().argmax(axis=1)
    test_labels = y_test.to_numpy().argmax(axis=1)

    model = xgb.XGBClassifier(objective="multi:softmax",
                              max_depth=int(space['max_depth']),
                              min_child_weight=int(space['min_child_weight']),
                              n_estimators=int(space['n_estimators']),
                              eval_metric="auc",
                              early_stopping_rounds=10,
                              random_state=42
                         )

    # NOTE(review): early stopping and the reported loss both use the test
    # split, so the selected hyperparameters are tuned on the evaluation data.
    evaluation = [(X_test, test_labels)]

    model.fit(X_train, train_labels, eval_set=evaluation, verbose=False)

    y_pred = model.predict(X_test)
    score = accuracy_score(test_labels, y_pred)

    # Hyperopt minimises the loss, so higher accuracy must give a lower value.
    loss = 1 - score

    return {'loss': loss, 'status': STATUS_OK, 'model': model}

In [None]:
# Hyperopt search space for XGBoost; objective_xgb reads these three keys and
# casts each sampled value to int.
space = {
    'max_depth': hp.quniform("max_depth", 3, 18, 1),
    'min_child_weight': hp.quniform('min_child_weight', 0, 1000, 1),
    'n_estimators': hp.quniform("n_estimators", 1, 1000, 1),
}

In [None]:
# Trials object handed to fmin for the XGBoost search.
xgb_trials = Trials()

In [None]:
# Run 100 evaluations of objective_xgb over `space` using the TPE algorithm,
# recording them in xgb_trials.
best_params_xgb = fmin(
    fn=objective_xgb,
    space=space,
    algo=tpe.suggest,
    trials=xgb_trials,
    max_evals=100)

In [None]:
# Show the value fmin returned for the XGBoost search.
print(best_params_xgb)

In [None]:
# Pull the fitted model with the lowest loss out of the XGBoost search.
xgb_model = getBestModelfromTrials(xgb_trials)

In [None]:
# Predict the test split with the best XGBoost model.
pred = xgb_model.predict(X_test)