In [21]:
from __future__ import division
import pandas as pd
import numpy as np 
import json, os, sys
import statsmodels.api as sm
from scipy.stats import poisson

In [42]:
raw_df = pd.read_clipboard()

In [49]:
df = raw_df
df['team'] = df['team'].astype('category')
df['opponent_team'] = df['opponent_team'].astype('category')


print df.info()

df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 760 entries, 0 to 759
Data columns (total 6 columns):
date                    760 non-null object
time                    760 non-null object
team                    760 non-null category
opponent_team           760 non-null category
goals_scored_by_team    760 non-null int64
home_flag               760 non-null int64
dtypes: category(2), int64(2), object(2)
memory usage: 25.6+ KB
None


Unnamed: 0,date,time,team,opponent_team,goals_scored_by_team,home_flag
0,17/5/2016,20:00,Man Utd,Bournemouth,3,0
1,15/5/2016,15:00,Arsenal,Aston Villa,4,0
2,16/5/2016,15:00,Chelsea,Leicester,1,0
3,17/5/2016,15:00,Everton,Norwich,3,0
4,18/5/2016,15:00,Newcastle,Spurs,5,0




## $ Pr (Home Goals) \sim Poisson(\lambda) $

## $ Pr(Away Goals) \sim Poisson (\mu) $



## $ \lambda \sim \gamma \tau \alpha_{i} \beta_{j} $

## $ \mu \sim \gamma \alpha_{j} \beta_{i} $

##### We can initialize $\gamma $ as half the mean of total goals divided by 2. (To give avg number of goals per team, as each game contains 2 teams

##### We can estimate the params for $ \alpha$ and $ \beta $ using GLM from statsmodels

In [90]:
def get_expected_goals(home_team, away_team, coef_df, model):
    home_attack_score = np.exp(coef_df.loc['team[T.{0}]'.format(home_team)][0])
    home_defence_score = np.exp(coef_df.loc['team[T.{0}]'.format(home_team)][1])
    
    away_attack_score = np.exp(coef_df.loc['team[T.{0}]'.format(away_team)][0])
    away_defence_score = np.exp(coef_df.loc['team[T.{0}]'.format(home_team)][1])
    
    attack_outcome = model.predict()
    
    poisson()

In [251]:
m = smf.glm(formula='goals_scored_by_team ~ home_flag + team + opponent_team - 1',
            data=df, family=sm.families.Poisson())
res = m.fit()

res.summary()

coeff_table = res.summary().tables[1]
coeff_df = pd.read_html(coeff_table.as_html(), header=0)[0]

col_names = ['param', 'coef', 'std_err', 'z', 'P', '95%']
coeff_df.columns = col_names

coeff_df.index = coeff_df['param']
del coeff_df['param']

### We have now calculated $ \alpha $ and $ \beta $ for every team.

#### The next step now is to move to calculate $ \rho $. This will then leave us with only $ \tau $ to calculate, and then we can move onto $ \mu $ and $ \lambda $, then we'll be ablt to develop a model. 

### If we assume that $ \rho $ can be estimated as 0.9.

In [214]:
rho = 0.9

### Let's also use the (unfounded) assumption that $ \gamma $ can be modelled as the average of goals per game.

In [209]:
gamma = np.mean(df['goals_scored_by_team'] / 2)

## Let's start with $ i = $ Arsenal

## $ j = $ Aston Villa

# $ \lambda_{ij} \sim \rho * \gamma * \alpha_{i} * \beta_{j}$
# $ \mu_{ij} \sim \gamma * \alpha_{j} * \beta_{i} $

In [269]:
home_team = 'West Ham'
away_team = 'Aston Villa'


alpha_i = coeff_df.loc['team[{0}]'.format(home_team)]['coef']
beta_i = coeff_df.loc['opponent_team[T.{0}]'.format(home_team)]['coef']
alpha_j = coeff_df.loc['team[{0}]'.format(away_team)]['coef']
beta_j = coeff_df.loc['opponent_team[T.{0}]'.format(away_team)]['coef']

lambda_var = rho * gamma * np.exp(alpha_i) * np.exp(beta_j)
mu_var = gamma * np.exp(alpha_j) * np.exp(beta_i)

In [273]:
repetitions = 10000
result_list = []
for x in xrange(repetitions):
    if x % 1000 == 0:
        print x
    result = {}
    home_goals = poisson(lambda_var).rvs(1)
    away_goals = poisson(mu_var).rvs(1)
    result['home'] = home_goals
    result['away'] = away_goals
    result_list.append(result)

0
1000
2000
3000
4000
5000
6000
7000
8000
9000


In [278]:
def unpack_results(results_dict):
    if results_dict['home'] > results_dict['away']:
        return 'home'
    
    if results_dict['away'] > results_dict['home']:
        return 'away'
    
    if results_dict['home'] == results_dict['away']:
        return 'draw'
    

In [279]:
outcome_list = []
for game in result_list:
    outcome_list.append(unpack_results(game))

In [282]:
a = np.array(outcome_list)

In [285]:
b = pd.DataFrame({'outcome': outcome_list})
b['outcome'] = b['outcome'].astype('category')

In [291]:
df[(df['team'] == 'West Ham') & (df['opponent_team'] == 'Liverpool')]

Unnamed: 0,date,time,team,opponent_team,goals_scored_by_team,home_flag
189,2/1/2016,12:45,West Ham,Liverpool,2,0
569,2/1/2016,12:45,West Ham,Liverpool,0,1
