In [53]:
# Setting up
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import itertools
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression

# ignore warning 
import warnings
warnings.filterwarnings("ignore")

In [4]:
# Clean data function from Kangwoo's notebook
def clean_data(df):
    data = df[['HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR', 'HS', 'AS', 'HST', 'AST']]
    data['HSGR'] = data['FTHG']/data['HS']
    data['ASGR'] = data['FTAG']/data['AS']
    data = data.replace([np.inf, -np.inf], 0)
    bet_home = df[['B365H','BWH','IWH','PSH','VCH','WHH']].mean(axis=1)
    bet_draw = df[['B365D','BWD','IWD','PSD','VCD','WHD']].mean(axis=1)
    bet_away = df[['B365A','BWA','IWA','PSA','VCA','WHA']].mean(axis=1)
    
    
    data['odd_home'] = bet_home
    data['odd_draw'] = bet_draw
    data['odd_away'] = bet_away
    
    data.dropna()
    
    return data

In [5]:
# Clean all the data
data_1415 = clean_data(pd.read_csv('./data/1415.csv'))
data_1415.drop(data_1415.tail(1).index,inplace=True)
data_1516 = clean_data(pd.read_csv('./data/1516.csv'))
data_1617 = clean_data(pd.read_csv('./data/1617.csv'))
data_1718 = clean_data(pd.read_csv('./data/1718.csv'))
data_1819 = clean_data(pd.read_csv('./data/1819.csv'))

In [146]:
# create label encoder to convert H, A, or D into factors for who wins (Home, Away, or Draw)
encoder_1415 = LabelEncoder()
encoder_1415.fit(data_1415["FTR"])
data_1415["FTR"] = encoder_1415.transform(data_1415["FTR"])

# split into training and test data
train_1415, valid_1415, train_labels_1415, valid_labels_1415 = train_test_split(
    data_1415.drop("FTR", axis=1),
    data_1415["FTR"],
    test_size=0.2,
    random_state=123
)

# Combine the training data and labels to fit into generalized linear model
train_1415_all = train_1415
train_1415_all["FTR"] = train_labels_1415

# Use Poisson to calculate
m = smf.glm(formula="FTR ~ HomeTeam + AwayTeam + FTHG + FTAG + HS + AS+ HST + AST + HSGR + ASGR", 
            data=train_1415_all, family=sm.families.Poisson()).fit()

# Predict and find accuracy
pred = np.round(m.predict(valid_1415))
a1 = accuracy_score(valid_labels_1415, pred)

# Use Poisson to calculate
m = smf.glm(formula="FTR ~ FTHG + FTAG + HS + AS+ HST + AST + HSGR + ASGR", 
            data=train_1415_all, family=sm.families.Poisson()).fit()

# Predict and find accuracy
pred = np.round(m.predict(valid_1415))
a2 = accuracy_score(valid_labels_1415, pred)
a1, a2

(0.47368421052631576, 0.5263157894736842)

In [147]:
# create label encoder to convert H, A, or D into factors for who wins (Home, Away, or Draw)
encoder_1516 = LabelEncoder()
encoder_1516.fit(data_1516["FTR"])
data_1516["FTR"] = encoder_1516.transform(data_1516["FTR"])

# split into training and test data
train_1516, valid_1516, train_labels_1516, valid_labels_1516 = train_test_split(
    data_1516.drop("FTR", axis=1),
    data_1516["FTR"],
    test_size=0.2,
    random_state=123
)

# Combine the training data and labels to fit into generalized linear model
train_1516_all = train_1516
train_1516_all["FTR"] = train_labels_1516

# Use Poisson to calculate
m = smf.glm(formula="FTR ~ HomeTeam + AwayTeam + FTHG + FTAG + HS + AS+ HST + AST + HSGR + ASGR", 
            data=train_1516_all, family=sm.families.Poisson()).fit()

# Predict and find accuracy
pred = np.round(m.predict(valid_1516))
a1 = accuracy_score(valid_labels_1516, pred)

# Use Poisson to calculate
m = smf.glm(formula="FTR ~ FTHG + FTAG + HS + AS+ HST + AST + HSGR + ASGR", 
            data=train_1516_all, family=sm.families.Poisson()).fit()

# Predict and find accuracy
pred = np.round(m.predict(valid_1516))
a2 = accuracy_score(valid_labels_1516, pred)
a1, a2

(0.6710526315789473, 0.631578947368421)

In [148]:
# create label encoder to convert H, A, or D into factors for who wins (Home, Away, or Draw)
encoder_1617 = LabelEncoder()
encoder_1617.fit(data_1617["FTR"])
data_1617["FTR"] = encoder_1617.transform(data_1617["FTR"])

# split into training and test data
train_1617, valid_1617, train_labels_1617, valid_labels_1617 = train_test_split(
    data_1617.drop("FTR", axis=1),
    data_1617["FTR"],
    test_size=0.2,
    random_state=123
)

# Combine the training data and labels to fit into generalized linear model
train_1617_all = train_1617
train_1617_all["FTR"] = train_labels_1617

# Use Poisson to calculate
m = smf.glm(formula="FTR ~ HomeTeam + AwayTeam + FTHG + FTAG + HS + AS+ HST + AST + HSGR + ASGR", 
            data=train_1617_all, family=sm.families.Poisson()).fit()

# Predict and find accuracy
pred = np.round(m.predict(valid_1617))
a1 = accuracy_score(valid_labels_1617, pred)

# Use Poisson to calculate
m = smf.glm(formula="FTR ~ FTHG + FTAG + HS + AS+ HST + AST + HSGR + ASGR", 
            data=train_1617_all, family=sm.families.Poisson()).fit()

# Predict and find accuracy
pred = np.round(m.predict(valid_1617))
a2 = accuracy_score(valid_labels_1617, pred)
a1, a2

(0.5789473684210527, 0.6052631578947368)

In [149]:
# create label encoder to convert H, A, or D into factors for who wins (Home, Away, or Draw)
encoder_1718 = LabelEncoder()
encoder_1718.fit(data_1718["FTR"])
data_1718["FTR"] = encoder_1718.transform(data_1718["FTR"])

# split into training and test data
train_1718, valid_1718, train_labels_1718, valid_labels_1718 = train_test_split(
    data_1718.drop("FTR", axis=1),
    data_1718["FTR"],
    test_size=0.2,
    random_state=123
)

# Combine the training data and labels to fit into generalized linear model
train_1718_all = train_1718
train_1718_all["FTR"] = train_labels_1718

# Use Poisson to calculate
m = smf.glm(formula="FTR ~ HomeTeam + AwayTeam + FTHG + FTAG + HS + AS+ HST + AST + HSGR + ASGR", 
            data=train_1718_all, family=sm.families.Poisson()).fit()

# Predict and find accuracy
pred = np.round(m.predict(valid_1718))
a1 = accuracy_score(valid_labels_1718, pred)

# Use Poisson to calculate
m = smf.glm(formula="FTR ~ FTHG + FTAG + HS + AS+ HST + AST + HSGR + ASGR", 
            data=train_1718_all, family=sm.families.Poisson()).fit()

# Predict and find accuracy
pred = np.round(m.predict(valid_1718))
a2 = accuracy_score(valid_labels_1718, pred)
a1, a2

(0.618421052631579, 0.6578947368421053)

In [150]:
# create label encoder to convert H, A, or D into factors for who wins (Home, Away, or Draw)
encoder_1819 = LabelEncoder()
encoder_1819.fit(data_1819["FTR"])
data_1819["FTR"] = encoder_1819.transform(data_1819["FTR"])

# split into training and test data
train_1819, valid_1819, train_labels_1819, valid_labels_1819 = train_test_split(
    data_1819.drop("FTR", axis=1),
    data_1819["FTR"],
    test_size=0.2,
    random_state=123
)

# Combine the training data and labels to fit into generalized linear model
train_1819_all = train_1819
train_1819_all["FTR"] = train_labels_1819

# Use Poisson to calculate
m = smf.glm(formula="FTR ~ HomeTeam + AwayTeam + FTHG + FTAG + HS + AS+ HST + AST + HSGR + ASGR", 
            data=train_1819_all, family=sm.families.Poisson()).fit()

# Predict and find accuracy
pred = np.round(m.predict(valid_1819))
a1 = accuracy_score(valid_labels_1819, pred)

# Use Poisson to calculate
m = smf.glm(formula="FTR ~ FTHG + FTAG + HS + AS+ HST + AST + HSGR + ASGR", 
            data=train_1819_all, family=sm.families.Poisson()).fit()

# Predict and find accuracy
pred = np.round(m.predict(valid_1819))
a2 = accuracy_score(valid_labels_1819, pred)
a1, a2

(0.5862068965517241, 0.5862068965517241)

In [124]:
# concatenate all data
all_data = pd.concat([data_1415, data_1516, data_1617, data_1718, data_1819], axis=0, ignore_index=True)

In [152]:
encoder_all = LabelEncoder()
encoder_all.fit(all_data["FTR"])
all_data["FTR"] = encoder_all.transform(all_data["FTR"])
train_all, valid_all, train_labels_all, valid_labels_all = train_test_split(
    all_data.drop("FTR", axis=1),
    all_data["FTR"],
    test_size=0.2,
    random_state=123
)

train_all_allcols = train_all
train_all_allcols["FTR"] = train_labels_all
m1 = smf.glm(formula="FTR ~ HomeTeam + AwayTeam + FTHG + FTAG + HS + AS+ HST + AST + HSGR + ASGR", 
            data=train_all_allcols, family=sm.families.Poisson()).fit()
pred = np.round(m1.predict(valid_all))
pred[np.isnan(pred)] = 1
a1 = accuracy_score(valid_labels_all, pred)

m2 = smf.glm(formula="FTR ~ FTHG + FTAG + HS + AS+ HST + AST + HSGR + ASGR", 
            data=train_all_allcols, family=sm.families.Poisson()).fit()

pred = np.round(m2.predict(valid_all))
pred[np.isnan(pred)] = 1
a2 = accuracy_score(valid_labels_all, pred)
a1, a2

(0.6519337016574586, 0.6104972375690608)

# Using Poisson

We attempted to run a poisson model on the data to produce a guess for if the Home team would win, Away team, or if it would end up in a draw. We encoded the values using `sklearn`'s `LabelEncoder` function. Then we split the data into test and training data. We then plugged it into a general linear model with a Poisson family using the following formulas: `FTR ~ HomeTeam + AwayTeam FTHG + FTAG + HS + AS + HST + AST + HSGR + ASGR` and `FTR ~ FTHG + FTAG + HS + AS + HST + AST + HSGR + ASGR`.

Here are a list of the values we used in these models:
* `HomeTeam` - Name of the home team
* `AwayTeam` - Name of the away team
* `FTR` - Full time result (Home win, Away win, Draw)
* `FTHG` - Full time home team goals
* `FTAG` - Full time away team goals
* `HS` - Home team shots
* `AS` - Away team shots
* `HST` - Home team shots on target
* `AST` - Away team shots on target
* `HSGR` - Home team shots goal ratio (calculated)
* `ASGR` - Away team shots goal ratio (calculated)

## For each year

First we ran this model on the separate datasets. At first, we also included the Home team and Away team names factors in the model and received these accuracy scores:

* 14-15: 0.474
* 15-16: 0.671
* 16-17: 0.579
* 17-18: 0.618
* 18-19: 0.586

Our very simple model can predict correctly more than half the times per year except for 2014-2015.

We decided to remove the names from the equation and only focus on the statistics of the team and got better accuracies for most of them (all were over 50% accurate now!).

* 14-15: 0.526
* 15-16: 0.632
* 16-17: 0.605
* 17-18: 0.658
* 18-19: 0.586

## Combined

We also ran the model on all the combined datasets. For the model that included the Home team and away team names as factors, we received an accuracy of 0.652. For the one without, we received a lower accuracy of 0.611. 

If we take a look at the summary of the model with the team names as factors though, we see that none of the p values for the team name factors are significant. To contrast [David Sheehan's study](https://dashee87.github.io/football/python/predicting-football-results-with-statistical-modelling/), we didn't receive any significant p values because we did not combine `HomeTeam` and `AwayTeam` into a single `team` variable for this specific analysis. 

In [154]:
m1.summary()

0,1,2,3
Dep. Variable:,FTR,No. Observations:,1446
Model:,GLM,Df Residuals:,1381
Model Family:,Poisson,Df Model:,64
Link Function:,log,Scale:,1.0000
Method:,IRLS,Log-Likelihood:,-1494.8
Date:,"Tue, 12 Mar 2019",Deviance:,569.74
Time:,00:29:11,Pearson chi2:,436.
No. Iterations:,5,Covariance Type:,nonrobust

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-0.2010,0.213,-0.944,0.345,-0.618,0.216
HomeTeam[T.Aston Villa],-0.1945,0.253,-0.768,0.443,-0.691,0.302
HomeTeam[T.Bournemouth],-0.0082,0.160,-0.051,0.959,-0.323,0.306
HomeTeam[T.Brighton],0.0623,0.200,0.312,0.755,-0.329,0.454
HomeTeam[T.Burnley],-0.1241,0.172,-0.720,0.472,-0.462,0.214
HomeTeam[T.Cardiff],-0.0024,0.336,-0.007,0.994,-0.661,0.656
HomeTeam[T.Chelsea],0.0151,0.133,0.113,0.910,-0.246,0.276
HomeTeam[T.Crystal Palace],-0.1758,0.161,-1.094,0.274,-0.491,0.139
HomeTeam[T.Everton],-0.1237,0.145,-0.853,0.393,-0.408,0.160


These non significant p values indicate that the model with team names as factors is overfitting the data and should not be part of the model. If we take a look at the model without the team names as factors, we can see what variables have a definite effect on the outcome of the game.

We see `FTHG`, `FTAG`, `HS`, `HSGR`, and `ASGR` are all very significant in the model (p < 0.06) in predicting the outcome of the game. 

## Poisson Discussion

insert stuff here later because it's too late right now

In [155]:
m2.summary()

0,1,2,3
Dep. Variable:,FTR,No. Observations:,1446
Model:,GLM,Df Residuals:,1437
Model Family:,Poisson,Df Model:,8
Link Function:,log,Scale:,1.0000
Method:,IRLS,Log-Likelihood:,-1505.3
Date:,"Tue, 12 Mar 2019",Deviance:,590.65
Time:,00:31:07,Pearson chi2:,447.
No. Iterations:,5,Covariance Type:,nonrobust

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-0.3320,0.134,-2.474,0.013,-0.595,-0.069
FTHG,0.1594,0.038,4.190,0.000,0.085,0.234
FTAG,-0.6423,0.061,-10.520,0.000,-0.762,-0.523
HS,0.0215,0.008,2.864,0.004,0.007,0.036
AS,0.0124,0.008,1.573,0.116,-0.003,0.028
HST,-0.0071,0.015,-0.476,0.634,-0.036,0.022
AST,-0.0012,0.017,-0.071,0.944,-0.034,0.032
HSGR,1.8988,0.468,4.059,0.000,0.982,2.816
ASGR,0.9209,0.482,1.909,0.056,-0.024,1.866
