# Imports

In [57]:
import pandas as pd
import seaborn as sns
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from helper_functions import *

# (1) Load Data

In [92]:
# Load in full Data CSV
filename = './matchup_data_all.csv'
df = pd.read_csv(filename)

# Use helper function to create new win column that describes if home_team won, loss or draw
# This will be our target column
df['target'] = df[['home_score', 'away_score']].apply(add_win_column, axis = 1)
df.sort_values('date_value', inplace = True, ascending=False)
df = df.drop(columns = ['url', 'date', 'home_record', 'away_record'])
df.head()


# Make dataframe that is just the features inorder for feature variance analysis
df_feat = df.drop(columns = ['target', 'date_value', 'home_team', 'away_team', 'home_score', 'away_score', 'week'])
print(len(df_feat.columns))
list(df_feat)



32


['home_possession',
 'away_possession',
 'home_pass_acc',
 'away_pass_acc',
 'home_sot',
 'away_sot',
 'home_saves',
 'away_saves',
 'home_fouls',
 'away_fouls',
 'home_corners',
 'away_corners',
 'home_crosses',
 'away_crosses',
 'home_touches',
 'away_touches',
 'home_tackles',
 'away_tackles',
 'home_ints',
 'away_ints',
 'home_aerials',
 'away_aerials',
 'home_clearances',
 'away_clearances',
 'home_offsides',
 'away_offsides',
 'home_goal_kicks',
 'away_goal_kicks',
 'home_throwins',
 'away_throwins',
 'home_longballs',
 'away_longballs']

# Feature Variance Analysis

In [93]:
vifs = []
for i, feature in enumerate(list(df_feat)):
    vif_tup = (feature, variance_inflation_factor(df_feat.values, i))
    vifs.append( vif_tup )
vifs


[('home_possession', 680.701498080627),
 ('away_possession', 600.6859549597091),
 ('home_pass_acc', 996.1016985703549),
 ('away_pass_acc', 888.3849048955051),
 ('home_sot', 7.270767218536324),
 ('away_sot', 6.172409065556261),
 ('home_saves', 6.287660708873908),
 ('away_saves', 7.090955484260247),
 ('home_fouls', 13.111950083367153),
 ('away_fouls', 12.455623805545246),
 ('home_corners', 8.729434785612003),
 ('away_corners', 7.439265382377548),
 ('home_crosses', 16.31750629097872),
 ('away_crosses', 13.822444975990956),
 ('home_touches', 550.9832120077979),
 ('away_touches', 542.4138177235843),
 ('home_tackles', 15.097304097777911),
 ('away_tackles', 14.525367070179678),
 ('home_ints', 8.091690249647515),
 ('away_ints', 8.415804689950393),
 ('home_aerials', 10.666689852224783),
 ('away_aerials', 10.51729974145111),
 ('home_clearances', 12.55513971486953),
 ('away_clearances', 15.420994119467558),
 ('home_offsides', 2.7662337903569836),
 ('away_offsides', 2.5140735757033563),
 ('home_go

# Model Training and Testing on Already Played Games

In [94]:
clfs = [LogisticRegression(), RandomForestClassifier()]
X = df_feat.values

y = df['target'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state = 42)
print(type(X_test))

<class 'numpy.ndarray'>


In [53]:
print(X_test)
for clf in clfs:
    clf.fit(X_train, y_train)
    print(type(clf))
    print("score = ", clf.score(X_test, y_test), "\n")

[[65 35 83 ... 26 59 79]
 [55 45 82 ... 15 46 74]
 [48 52 78 ... 27 61 70]
 ...
 [29 71 55 ... 20 79 46]
 [58 42 76 ... 34 70 61]
 [70 30 87 ... 10 61 83]]
<class 'sklearn.linear_model.logistic.LogisticRegression'>
score =  0.75 

<class 'sklearn.ensemble.forest.RandomForestClassifier'>
score =  0.7149122807017544 





# Modelling on Games Yet to be Played (ie without stats)

In [77]:
def create_game(df, home, away, date_val, window):
    # Return a df of averaged stats from previous 5 games that represents a predicted game

    new_game = {'date_value': date_val, 'home_team': home, 'away_team': away, 'home_possession': 0, 'away_possession': 0, 'home_pass_acc': 0, 'away_pass_acc': 0, 'home_sot': 0, 'away_sot': 0, 'home_saves': 0,
                 'away_saves': 0, 'home_fouls': 0, 'away_fouls': 0, 'home_corners': 0, 'away_corners': 0, 'home_crosses': 0, 'away_crosses': 0, 'home_touches': 0, 'away_touches': 0, 'home_tackles': 0,
                  'away_tackles': 0, 'home_ints': 0, 'away_ints': 0, 'home_aerials': 0, 'away_aerials': 0, 'home_clearances': 0, 'away_clearances': 0, 'home_offsides': 0, 'away_offsides': 0, 'home_goal_kicks': 0, 'away_goal_kicks': 0,
                  'home_throwins': 0, 'away_throwins': 0, 'home_longballs': 0, 'away_longballs': 0,}

    # url	date	week	home_team	away_team	home_record	away_record	home_score	away_score	home_score_xg	away_score_xg	home_possession	away_possession	home_pass_acc	away_pass_acc	home_sot	away_sot	home_saves	away_saves	home_fouls	away_fouls	home_corners	away_corners	home_crosses	away_crosses	home_touches	away_touches	home_tackles	away_tackles	home_ints	away_ints	home_aerials	away_aerials	home_clearances	away_clearances	home_offsides	away_offsides	home_goal_kicks	away_goal_kicks	home_throwins	away_throwins	home_longballs	away_longballs	date_timestamp
    #
    # Extract and sum up previous 5 games from Home team
    num_found = 0
    for index, data in df.iterrows():
        if int(data['date_value']) >= int(date_val): continue  # Skip until we get to the point in time we are predicting for
        if num_found >= window: break

        if data['home_team'] == home:
            new_game['home_possession'] += data['home_possession']
            new_game['home_pass_acc'] += data['home_pass_acc']
            new_game['home_sot'] += data['home_sot']
            new_game['home_saves'] += data['home_saves']
            new_game['home_fouls'] += data['home_fouls']
            new_game['home_corners'] += data['home_corners']
            new_game['home_crosses'] += data['home_crosses']
            new_game['home_touches'] += data['home_touches']
            new_game['home_tackles'] += data['home_tackles']
            new_game['home_ints'] += data['home_ints']
            new_game['home_aerials'] += data['home_aerials']
            new_game['home_clearances'] += data['home_clearances']
            new_game['home_offsides'] += data['home_offsides']
            new_game['home_goal_kicks'] += data['home_goal_kicks']
            new_game['home_throwins'] += data['home_throwins']
            new_game['home_longballs'] += data['home_longballs']
        elif data['away_team'] == home:
            new_game['home_possession'] += data['away_possession']
            new_game['home_pass_acc'] += data['away_pass_acc']
            new_game['home_sot'] += data['away_sot']
            new_game['home_saves'] += data['away_saves']
            new_game['home_fouls'] += data['away_fouls']
            new_game['home_corners'] += data['away_corners']
            new_game['home_crosses'] += data['away_crosses']
            new_game['home_touches'] += data['away_touches']
            new_game['home_tackles'] += data['away_tackles']
            new_game['home_ints'] += data['away_ints']
            new_game['home_aerials'] += data['away_aerials']
            new_game['home_clearances'] += data['away_clearances']
            new_game['home_offsides'] += data['away_offsides']
            new_game['home_goal_kicks'] += data['away_goal_kicks']
            new_game['home_throwins'] += data['away_throwins']
            new_game['home_longballs'] += data['away_longballs']
        else: continue

        num_found += 1

    # Do the same for the Away Team
    num_found = 0
    for index, data in df.iterrows():
        if int(data['date_value']) >= int(date_val): continue  # Skip until we get to the point in time we are predicting for
        if num_found >= window: break

        if data['home_team'] == away:
            new_game['away_possession'] += data['home_possession']
            new_game['away_pass_acc'] += data['home_pass_acc']
            new_game['away_sot'] += data['home_sot']
            new_game['away_saves'] += data['home_saves']
            new_game['away_fouls'] += data['home_fouls']
            new_game['away_corners'] += data['home_corners']
            new_game['away_crosses'] += data['home_crosses']
            new_game['away_touches'] += data['home_touches']
            new_game['away_tackles'] += data['home_tackles']
            new_game['away_ints'] += data['home_ints']
            new_game['away_aerials'] += data['home_aerials']
            new_game['away_clearances'] += data['home_clearances']
            new_game['away_offsides'] += data['home_offsides']
            new_game['away_goal_kicks'] += data['home_goal_kicks']
            new_game['away_throwins'] += data['home_throwins']
            new_game['away_longballs'] += data['home_longballs']
        elif data['away_team'] == away:
            new_game['away_possession'] += data['away_possession']
            new_game['away_pass_acc'] += data['away_pass_acc']
            new_game['away_sot'] += data['away_sot']
            new_game['away_saves'] += data['away_saves']
            new_game['away_fouls'] += data['away_fouls']
            new_game['away_corners'] += data['away_corners']
            new_game['away_crosses'] += data['away_crosses']
            new_game['away_touches'] += data['away_touches']
            new_game['away_tackles'] += data['away_tackles']
            new_game['away_ints'] += data['away_ints']
            new_game['away_aerials'] += data['away_aerials']
            new_game['away_clearances'] += data['away_clearances']
            new_game['away_offsides'] += data['away_offsides']
            new_game['away_goal_kicks'] += data['away_goal_kicks']
            new_game['away_throwins'] += data['away_throwins']
            new_game['away_longballs'] += data['away_longballs']
        else: continue

        num_found += 1


    # Average out new game according to window size
    not_ints = ["date_value", "home_team", "away_team"]
    for key, val in new_game.items():
        if key in not_ints: continue
        new_game[key] = val // window 
    
    new_df = pd.DataFrame.from_records([new_game])
    return new_df

In [88]:
# Given home_team_name and away_team_name -> Return who will win
# To do this our model must take in a row of these two teams 5 game averages in each stat and then predict

home_team_name = "Chelsea"
away_team_name = "Arsenal"
matchID = 20190512

# Make a array of these teams previous 5 games averages of all stats found in the column labels of df
fict_game = create_game(df, home_team_name, away_team_name, matchID, window=5)
fict_game_test = fict_game.drop(columns = ['date_value', 'home_team', 'away_team'])

fict_game_test = fict_game_test.to_numpy()


print(fict_game_test)

for clf in clfs:
    #clf.fit(X_train, y_train)
    print(type(clf))
    print(clf.predict(fict_game_test))
    #print("score = ", clf.score(X_test, y_test), "\n")

[[ 55  62  84  83  40  28  72  55   9  10   6   7  21  22 713 763  20  11
   10  10  14  16  18  13   2   2   7   6  16  20  43  45]]
<class 'sklearn.linear_model.logistic.LogisticRegression'>
[1]
<class 'sklearn.ensemble.forest.RandomForestClassifier'>
[1]
