In [21]:
from __future__ import division
import pandas as pd
import numpy as np 
import json, os, sys
import statsmodels.api as sm
from scipy.stats import poisson

In [42]:
raw_df = pd.read_clipboard()

In [49]:
df = raw_df
df['team'] = df['team'].astype('category')
df['opponent_team'] = df['opponent_team'].astype('category')


print df.info()

df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 760 entries, 0 to 759
Data columns (total 6 columns):
date                    760 non-null object
time                    760 non-null object
team                    760 non-null category
opponent_team           760 non-null category
goals_scored_by_team    760 non-null int64
home_flag               760 non-null int64
dtypes: category(2), int64(2), object(2)
memory usage: 25.6+ KB
None


Unnamed: 0,date,time,team,opponent_team,goals_scored_by_team,home_flag
0,17/5/2016,20:00,Man Utd,Bournemouth,3,0
1,15/5/2016,15:00,Arsenal,Aston Villa,4,0
2,16/5/2016,15:00,Chelsea,Leicester,1,0
3,17/5/2016,15:00,Everton,Norwich,3,0
4,18/5/2016,15:00,Newcastle,Spurs,5,0




## $ Pr (Home Goals) \sim Poisson(\lambda) $

## $ Pr(Away Goals) \sim Poisson (\mu) $



## $ \lambda \sim \gamma \tau \alpha_{i} \beta_{j} $

## $ \mu \sim \gamma \alpha_{j} \beta_{i} $

##### We can initialize $\gamma $ as half the mean of total goals divided by 2. (To give avg number of goals per team, as each game contains 2 teams

In [20]:
gamma = np.mean(df['home_score'] + df['away_score']) / 2

##### We can estimate the params for $ \alpha$ and $ \beta $ using GLM from statsmodels

In [206]:
m = smf.glm(formula='goals_scored_by_team ~ home_flag + team + opponent_team', data=df, family=sm.families.Poisson())
res = m.fit()

res.summary()

coeff_table = res.summary().tables[1]
coeff_df = pd.read_html(coeff_table.as_html(), header=0)[0]

In [90]:
def get_expected_goals(home_team, away_team, coef_df, model):
    home_attack_score = np.exp(coef_df.loc['team[T.{0}]'.format(home_team)][0])
    home_defence_score = np.exp(coef_df.loc['team[T.{0}]'.format(home_team)][1])
    
    away_attack_score = np.exp(coef_df.loc['team[T.{0}]'.format(away_team)][0])
    away_defence_score = np.exp(coef_df.loc['team[T.{0}]'.format(home_team)][1])
    
    attack_outcome = model.predict()
    
    poisson()

In [155]:
res.cov_params().head()

Unnamed: 0,Intercept,team[T.Aston Villa],team[T.Bournemouth],team[T.Chelsea],team[T.Crystal Palace],team[T.Everton],team[T.Leicester],team[T.Liverpool],team[T.Man City],team[T.Man Utd],...,opponent_team[T.Norwich],opponent_team[T.Southampton],opponent_team[T.Spurs],opponent_team[T.Stoke],opponent_team[T.Sunderland],opponent_team[T.Swansea],opponent_team[T.Watford],opponent_team[T.West Brom],opponent_team[T.West Ham],home_flag
Intercept,0.042629,-0.024894,-0.024896,-0.024891,-0.024886,-0.02489,-0.024891,-0.024899,-0.024885,-0.024886,...,-0.018035,-0.018039,-0.018029,-0.018024,-0.018022,-0.018018,-0.018016,-0.018022,-0.01804,-0.001763668
team[T.Aston Villa],-0.024894,0.044337,0.023872,0.023872,0.023872,0.023872,0.023872,0.023872,0.023872,0.023872,...,0.001027,0.001027,0.001027,0.001027,0.001027,0.001027,0.001027,0.001027,0.001027,1.991967e-16
team[T.Bournemouth],-0.024896,0.023872,0.041475,0.023872,0.023872,0.023872,0.023872,0.023872,0.023872,0.023872,...,0.001027,0.001027,0.001027,0.001027,0.001027,0.001027,0.001027,0.001027,0.001027,2.026272e-16
team[T.Chelsea],-0.024891,0.023872,0.023872,0.040054,0.023871,0.023872,0.023872,0.023872,0.023871,0.023871,...,0.001027,0.001027,0.001027,0.001027,0.001027,0.001027,0.001027,0.001027,0.001027,1.980837e-16
team[T.Crystal Palace],-0.024886,0.023872,0.023872,0.023871,0.04773,0.023871,0.023871,0.023872,0.023871,0.023871,...,0.001027,0.001027,0.001027,0.001027,0.001027,0.001027,0.001027,0.001027,0.001027,2.015193e-16


In [163]:
df[(df['team'] == 'Aston Villa') & (df['opponent_team'] == 'Watford')]

Unnamed: 0,date,time,team,opponent_team,goals_scored_by_team,home_flag
245,28/11/2016,15:00,Aston Villa,Watford,2,0
625,28/11/2016,15:00,Aston Villa,Watford,3,1


### We have now calculated $ \alpha $ and $ \beta $ for every team.

#### The next step now is to move to calculate $ \rho $. This will then leave us with only $ \tau $ to calculate, and then we can move onto $ \mu $ and $ \lambda $, then we'll be ablt to develop a model. 

If we assume that $\rho $

In [205]:
poisson(np.exp(m.predict(-0.263269, 0.701311))).rvs(10)

array([1, 2, 2, 2, 5, 1, 1, 1, 3, 3])

In [None]:
poisson(np.exp(m.predict(0.701311)))