In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [2]:
from afl.data import afltables

# Load the fixtures table; the tail shown below has one row per match, indexed by `matchid`.
df = afltables.get_fixtures()

def sigmoid(x, width=100):
    """Logistic squashing function mapping any real value into (0, 1).

    Parameters
    ----------
    x : scalar, np.ndarray or pd.Series
        Value(s) to squash; 0 maps to exactly 0.5.
    width : float
        Scale of the curve. Larger values flatten it, so a given `x` ends up closer to 0.5.

    Returns
    -------
    Same kind of object as `x`, holding 1 / (1 + exp(-x / width)).
    """
    scaled = -x / width
    denominator = 1 + np.exp(scaled)
    return 1.0 / denominator

# A width of 100 (the sigmoid default) returns a similar-looking distribution to home_points_ratio.
# It's possible we want to use the same width as the expected_home_squashed_margin,
# so that the ratings diff predicts the score margin?
df['home_squashed_margin'] = sigmoid(df['home_margin'])
df.tail()

Unnamed: 0_level_0,date,round,home_team,home_score,away_team,away_score,venue,season,home_points,away_points,home_points_ratio,home_margin,winner,home_win_draw_loss,round_type,round_number,home_state,away_state,is_interstate,home_squashed_margin
matchid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
15592,2019-08-17,R22,North Melbourne,22.12.144,Port Adelaide,8.10.58,Docklands,2019,144,58,0.712871,86,home,1.0,regular,22,VIC,SA,True,0.702661
15593,2019-08-17,R22,Fremantle,7.13.55,Essendon,13.9.87,Perth Stadium,2019,55,87,0.387324,-32,away,0.0,regular,22,WA,VIC,True,0.420676
15594,2019-08-18,R22,Richmond,13.10.88,West Coast,13.4.82,M.C.G.,2019,88,82,0.517647,6,home,1.0,regular,22,VIC,WA,True,0.514996
15595,2019-08-18,R22,GWS,9.11.65,Footscray,19.12.126,Sydney Showground,2019,65,126,0.340314,-61,away,0.0,regular,22,NSW,VIC,True,0.352059
15596,2019-08-18,R22,Hawthorn,18.10.118,Gold Coast,7.6.48,Docklands,2019,118,48,0.710843,70,home,1.0,regular,22,VIC,QLD,True,0.668188


# afl/model

In [3]:
# Optimise initial ratings:
# find the ratings which minimise the loss on a set of historical match results

In [4]:
from collections import defaultdict
from math import exp, log

In [5]:
class Elo:
    """Elo rating model for home-vs-away fixtures.

    Each team carries one rating. The expected home result is a logistic function of the
    rating difference (plus home and interstate bonuses); after every match the home side
    gains, and the away side loses, ``k * k_decay**round_number * (actual - expected)``.

    Parameters
    ----------
    k : float
        Base update step.
    home_advantage : float
        Rating points added to the home side in every match.
    interstate_advantage : float
        Extra rating points added to the home side when the record's ``is_interstate`` is truthy.
    width : float
        Scale of the logistic curve. The default ``400/log(10)`` makes the curve equal to
        ``1 / (1 + 10 ** (-diff / 400))``.
    carryover : float
        In round 1, a team's rating becomes ``carryover*rating + (1-carryover)*mean_rating``.
    k_decay : float
        Per-round multiplicative decay applied to ``k``.
    initial_ratings : dict or None
        Mapping team -> starting rating. Teams not listed start at ``mean_rating``.
    mean_rating : float
        Rating given to unseen teams and the value ratings are regressed towards in round 1.
    target : str
        Key of the fixture record holding the actual home result
        (home_win_draw_loss, home_points_ratio or home_squashed_margin).

    Attributes
    ----------
    current_ratings_ : defaultdict
        Latest rating per team; created (and reset) by every ``iterate_fixtures`` call.
    """
    def __init__(self, k=30, home_advantage=20, interstate_advantage=5, width=400/log(10), carryover=0.75, k_decay=0.95, 
                 initial_ratings=None, mean_rating=1500, target='home_win_draw_loss'):
        """Store the hyper-parameters; no ratings are computed here."""
        self.k = k
        self.home_advantage = home_advantage
        self.interstate_advantage = interstate_advantage
        self.width = width
        self.carryover = carryover
        self.k_decay = k_decay

        self.mean_rating = mean_rating
        self.initial_ratings = initial_ratings or {}
        self.target = target  # home_win_draw_loss, home_points_ratio, home_squashed_margin

    def iterate_fixtures(self, fixtures, as_dataframe=True):
        """Run the rating system over an ordered sequence of fixtures.

        Parameters
        ----------
        fixtures : list of dict or pd.DataFrame
            Must be ordered. Each record (row) must have (columns): home_team, away_team, round_number, is_interstate, <self.target>
            Prefer a list of records as it's much faster.
            A DataFrame is converted with ``reset_index().to_dict('records')``.
            A list of records is updated IN PLACE: the keys ``home_rating_pre``,
            ``away_rating_pre`` and ``home_expected_result`` are written into each dict.
        as_dataframe : bool
            If True, return a DataFrame indexed by ``matchid`` (so every record must carry a
            ``matchid`` key); otherwise return the list of records.

        Returns
        -------
        pd.DataFrame or list of dict
            The fixtures with the pre-match ratings and the expected home result added.

        Notes
        -----
        We use the python stdlib math.exp which seems faster in single computation than numpy's version and therefore speeds up parameter fitting

        Profile code with lprun:
        %load_ext line_profiler
        elo = Elo()
        %lprun -f elo.iterate_fixtures elo.iterate_fixtures(fxtrain, as_dataframe=True)
        """
        # Ratings restart from initial_ratings on every call; unseen teams get mean_rating
        self.current_ratings_ = defaultdict(lambda: self.mean_rating, self.initial_ratings)

        if isinstance(fixtures, pd.DataFrame):
            # A list of records is faster and less prone to errors on update than a DataFrame
            fixtures = fixtures.reset_index().to_dict('records')

        for fx in fixtures:
            home_team = fx['home_team']
            away_team = fx['away_team']
            home_actual_result = fx[self.target]
            round_number = fx['round_number']
            is_interstate = fx['is_interstate']

            # --- predict (inlined rather than calling a helper, to keep the loop fast) ---
            home_rating_pre = self.current_ratings_[home_team]
            away_rating_pre = self.current_ratings_[away_team]

            if round_number == 1:
                # TODO: don't crunch the first round of training
                # Crunch the start of the season: regress both ratings towards the mean.
                # Warning: the crunched value is what gets updated and stored below, so the
                # end-of-season rating is overwritten.
                home_rating_pre = self.carryover*home_rating_pre + (1-self.carryover)*self.mean_rating
                away_rating_pre = self.carryover*away_rating_pre + (1-self.carryover)*self.mean_rating

            ratings_diff = home_rating_pre - away_rating_pre + self.home_advantage + self.interstate_advantage*is_interstate
            home_expected_result = 1.0 / (1 + exp(-ratings_diff/self.width))

            # --- update: zero-sum transfer between the two sides ---
            change_in_home_elo = self.k*self.k_decay**round_number*(home_actual_result - home_expected_result)

            self.current_ratings_[home_team] = home_rating_pre + change_in_home_elo
            self.current_ratings_[away_team] = away_rating_pre - change_in_home_elo

            fx['home_rating_pre'] = home_rating_pre
            fx['away_rating_pre'] = away_rating_pre
            fx['home_expected_result'] = home_expected_result  # proba

        if as_dataframe:
            return pd.DataFrame(fixtures).set_index('matchid')

        return fixtures

    def fit(self, X):
        """Placeholder: returns ``X`` unchanged and does not touch any ratings.

        The only thing we really need to store is the *latest* rating (the system is
        memoryless); ``iterate_fixtures`` is what currently populates ``current_ratings_``.
        """
        return X

    def predict_proba(self, home_team=None, away_team=None, is_interstate=False):
        """Expected home result for a single match-up, from the latest stored ratings.

        Uses ``current_ratings_`` when ``iterate_fixtures`` has been run, otherwise
        ``initial_ratings``. A team that is unknown (or left as None) is treated as having
        ``mean_rating``; the lookup never adds teams to the stored ratings. No round-1
        carryover adjustment is applied here.

        Parameters
        ----------
        home_team, away_team : hashable, optional
            Team identifiers as used in the fixtures.
        is_interstate : bool
            Whether to add ``interstate_advantage`` to the home side.

        Returns
        -------
        float
            Value in (0, 1) on the same scale as ``home_expected_result``.
        """
        ratings = getattr(self, 'current_ratings_', self.initial_ratings)
        # .get() bypasses the defaultdict factory, so predicting never mutates the ratings
        home_rating = ratings.get(home_team, self.mean_rating)
        away_rating = ratings.get(away_team, self.mean_rating)

        ratings_diff = home_rating - away_rating + self.home_advantage + self.interstate_advantage*is_interstate
        return 1.0 / (1 + exp(-ratings_diff/self.width))

    def predict(self, home_team=None, away_team=None, is_interstate=False):
        """Binary prediction: 1 if the expected home result exceeds 0.5, else 0.

        Takes the same arguments as ``predict_proba``.
        """
        return int(self.predict_proba(home_team, away_team, is_interstate) > 0.5)

In [6]:
# Team names in the order Python's sorted() produces (uppercase sorts first, hence 'GWS' before 'Geelong').
# evaluate_accuracy_initial_ratings zips this list against a rating vector, so the order matters.
TEAMS = ['Adelaide', 'Brisbane Lions', 'Carlton', 'Collingwood', 'Essendon', 'Footscray', 'Fremantle', 'GWS', 'Geelong', 'Gold Coast', 'Hawthorn', 'Melbourne',
         'North Melbourne', 'Port Adelaide', 'Richmond', 'St Kilda', 'Sydney', 'West Coast']

In [7]:
# Seasons 2010-2018 are run through the rating system; accuracy is scored on 2016-2018 only.
dftrain = df.loc[df['season'].between(2010, 2018)]
test_msk = dftrain['season'].between(2016, 2018)
ytest = dftrain.loc[test_msk, 'home_win_draw_loss']
# List-of-records form of the same fixtures (much faster to iterate than the DataFrame)
fxtrain = dftrain.reset_index().to_dict(orient='records')

# Default Elo hyper-parameters, updating on the squashed margin
elo = Elo(target='home_squashed_margin')
fx = elo.iterate_fixtures(dftrain)
# Tip the home side whenever its expected result is above 0.5
ypred = (fx.loc[test_msk, 'home_expected_result'] > 0.5).astype(int)
accuracy = (ytest == ypred).mean()
print(accuracy)

0.6650563607085346


In [8]:
# Fixtures from the 2017 season only
df17 = df.loc[df['season'] == 2017]

In [13]:
from scipy.optimize import minimize
from sklearn.metrics import log_loss, mean_squared_error

def loss_function(x, df, team_list, r0, width, reg_strength):
    """Total log loss of a static set of team ratings (objective for the optimiser).

    Parameters
    ----------
    x : array-like
        x[:-1] are the team ratings, in the same order as `team_list`; x[-1] is the home-team advantage.
    df : pd.DataFrame
        Matches with columns home_team, away_team and home_win_draw_loss.
    team_list : list of str
        Team names paired positionally with x[:-1] (callers pass an alphabetically sorted list).
    r0 : float
        Accepted for signature compatibility; not used in the calculation.
    width : float
        Scale passed to `sigmoid` when converting the rating difference to a probability.
    reg_strength : float
        Accepted for signature compatibility; not used in the calculation.

    Returns
    -------
    float
        Summed (not averaged) log loss over the rows of `df`.
    """
    rating_of = dict(zip(team_list, x[:-1]))

    ratings_diff = df['home_team'].map(rating_of) - df['away_team'].map(rating_of) + x[-1]
    home_expected_result = sigmoid(ratings_diff, width=width)

    # sklearn's `log_loss` is the per-sample average; the optimiser needs the total,
    # so scale it back up by the number of matches.
    mean_loss = log_loss(df['home_win_draw_loss'], home_expected_result)
    return mean_loss * len(df)

def rating_constraint(x, r0):
    """Equality constraint: zero exactly when the team ratings average to `r0`.

    The last element of `x` (the home advantage) is excluded from the sum.
    """
    deviations = x[:-1] - r0
    return sum(deviations)

In [14]:
def _optimize_initial_ratings(df, r0=1500, width=400/log(10), reg_strength=0):
    """Optimize initial ratings and the home_team_advantage.

    Take a subset of match results as a training set and find the rating values which
    minimize `loss_function` on this set. This is a static optimization - ratings are NOT
    updated match by match, hence the result is independent of k. The team ratings are
    constrained (via `rating_constraint`) to average `r0`.

    Parameters
    ----------
    df : pd.DataFrame
        Matches with columns home_team, away_team and home_win_draw_loss.
    r0 : float
        Mean rating; also the starting value for every team.
    width : float
        Logistic scale forwarded to `loss_function`.
    reg_strength : float
        Forwarded to `loss_function` unchanged.

    Returns
    -------
    ratings : dict
        Mapping team -> optimized rating.
    home_advantage : float
        Optimized home advantage, in rating points.

    Warns
    -----
    RuntimeWarning
        If the optimizer reports that it did not converge; the last iterate is still returned.
    """
    import warnings

    team_list = sorted(set(df['home_team']) | set(df['away_team']))

    # Parameter vector: one rating per team (all starting at r0), then the home advantage (starting at 0)
    x0 = np.array([r0]*len(team_list) + [0], dtype=float)
    constraints = {'type': 'eq', 'fun': rating_constraint, 'args': [r0]}

    result = minimize(loss_function, x0,
                      args=(df, team_list, r0, width, reg_strength),
                      constraints=constraints)

    if not result.success:
        # Surface a failed fit instead of silently returning unconverged ratings
        warnings.warn('Initial ratings optimisation did not converge: %s' % result.message,
                      RuntimeWarning)

    ratings = dict(zip(team_list, result.x[:-1]))
    home_advantage = result.x[-1]

    return ratings, home_advantage

In [15]:
# NOTE: this re-binds `dftrain` (previously the 2010-2018 frame) to the 2017 season only.
# Draws (0.5) are removed, presumably so the target is strictly 0/1 for the log-loss fit.
dftrain = df.loc[(df['season'] == 2017) & (df['home_win_draw_loss'] != 0.5)].copy()

In [16]:
# Fit static ratings and the home advantage on the current `dftrain`.
ratings, home_advantage = _optimize_initial_ratings(dftrain)
# Re-bind `ratings` from a dict to a Series sorted from highest to lowest rating.
ratings = pd.Series(ratings).sort_values(ascending=False)
home_advantage

88.75482720719378

In [17]:
# from statsmodels.discrete.discrete_model import Logit
from statsmodels.api import Logit, OLS
from sklearn.linear_model import LogisticRegression, LinearRegression

In [18]:
# Logistic scale for which 1 / (1 + exp(-d / width)) equals 1 / (1 + 10 ** (-d / 400));
# the same value as the default `width` of the Elo class.
width = 400 / log(10)

In [19]:
# Home losses (0.0), draws (0.5) and home wins (1.0) per season, for the most recent seasons
(
    df.groupby('season')['home_win_draw_loss']
    .value_counts()
    .unstack(fill_value=0)
    .tail()
)

home_win_draw_loss,0.0,0.5,1.0
season,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2015,91,2,113
2016,78,0,129
2017,80,3,124
2018,90,1,116
2019,28,0,26


In [20]:
# 2015 season without draws, so the response is strictly 0/1
dftrain = df.loc[(df['season'] == 2015) & (df['home_win_draw_loss'] != 0.5)].copy()
# One column per team: +1 when the team is at home, -1 when away, 0 when not playing
home_indicator = pd.get_dummies(dftrain['home_team']).astype(float)
X = home_indicator.sub(pd.get_dummies(dftrain['away_team']).astype(float))
y = dftrain['home_win_draw_loss']

In [21]:
# Logit on the home-minus-away indicator matrix: one coefficient per team.
# X has no constant column, so no intercept (home advantage) term is fitted here.
model = Logit(y, X)
fit_model = model.fit()
coefs = pd.Series(fit_model.params).sort_values(ascending=False)
# Sum and spread of the team coefficients (the sum comes out at ~0 in the output below)
coefs.sum(), coefs.std()

Optimization terminated successfully.
         Current function value: 0.509911
         Iterations 6


(-1.0137355688222272e-07, 1.1562192942042049)

In [22]:
# Rescale the logit coefficients to Elo points: multiply by `width` and centre on 1500.
coefs*width + 1500

West Coast         1764.753267
Fremantle          1762.634414
Hawthorn           1742.551697
Sydney             1670.004547
Richmond           1636.053158
North Melbourne    1616.823988
Adelaide           1607.066097
Footscray          1590.876672
Port Adelaide      1586.299341
Geelong            1582.858548
GWS                1467.170635
Collingwood        1445.558960
Melbourne          1346.951377
Essendon           1309.876665
St Kilda           1304.241143
Gold Coast         1199.859853
Carlton            1190.460395
Brisbane Lions     1175.959225
dtype: float64

In [23]:
# Same 2015 no-draw design matrix as for the statsmodels fit, rebuilt so this cell stands alone
dftrain = df.loc[(df['season'] == 2015) & (df['home_win_draw_loss'] != 0.5)].copy()
home_indicator = pd.get_dummies(dftrain['home_team']).astype(float)
X = home_indicator.sub(pd.get_dummies(dftrain['away_team']).astype(float))
y = dftrain['home_win_draw_loss']

# C=1e12 makes the penalty negligible; note `model` is re-bound from the statsmodels object
model = LogisticRegression(solver='liblinear', C=1e12).fit(X, y)
coefs = pd.Series(model.coef_[0], index=X.columns).sort_values(ascending=False)
coefs.sum(), coefs.std()

(2.220446049250313e-15, 1.1777289469064713)

In [24]:
# Rescale the sklearn coefficients to Elo points: multiply by `width` and centre on 1500.
coefs*width + 1500

Fremantle          1771.575670
West Coast         1765.912320
Hawthorn           1743.443092
Sydney             1677.252536
Richmond           1641.253757
North Melbourne    1612.620429
Adelaide           1610.166859
Footscray          1598.961052
Geelong            1583.013258
Port Adelaide      1582.952637
GWS                1467.255811
Collingwood        1443.042167
Melbourne          1345.891534
St Kilda           1308.294152
Essendon           1304.759421
Gold Coast         1194.165340
Carlton            1177.759110
Brisbane Lions     1171.680857
dtype: float64

In [86]:
# Hand-built toy problem: a 4x5 design matrix whose first column is all ones, and a binary response.
x = np.array([[1, 1.,-1., 0, 1], [1, -1.,1., 0, 1], [1, 1.,-1., 0, 0], [1, -1.,0, 1, 0]])
y = np.array([1, 0, 1, 1])

In [87]:
# Display the toy design matrix
x

array([[ 1.,  1., -1.,  0.,  1.],
       [ 1., -1.,  1.,  0.,  1.],
       [ 1.,  1., -1.,  0.,  0.],
       [ 1., -1.,  0.,  1.,  0.]])

In [88]:
# Least squares via the pseudo-inverse: pinv(X'X) X' y.
# NOTE: despite the names, `hat` is pinv(X'X) X' and `yhat` is the coefficient vector
# (one entry per column of x), not the fitted values.
xtx = x.T.dot(x)
invxtx = np.linalg.pinv(xtx)
hat = invxtx.dot(x.T)

yhat = hat.dot(y)
# Sum of every coefficient except the first (the all-ones column)
print(yhat[1:].sum())
yhat

0.0


array([ 5.00000000e-01, -2.22044605e-16, -5.00000000e-01,  5.00000000e-01,
        2.22044605e-16])

In [89]:
# Logistic regression on the toy data with no separate intercept and C=1e12 (negligible penalty);
# print the sum of all coefficients, then show them.
lr = LogisticRegression(fit_intercept=False, solver='liblinear', C=1e12).fit(x, y)
print(lr.coef_.sum())
lr.coef_

-0.03858814316734094


array([[  4.58841784,   1.39196545, -10.23410897,   8.84214353,
         -4.62700598]])

In [90]:
# OLS on the same toy data, for comparison with the pseudo-inverse solution
OLS(y, x).fit().params

array([ 5.00000000e-01,  1.66533454e-16, -5.00000000e-01,  5.00000000e-01,
       -1.26690821e-16])

In [72]:
# NOTE(review): `adf` is undefined, so this cell raises NameError (see the output below) -
# presumably a deliberate stop so that "Run All" halts before the unfinished cells. TODO confirm.
raise adf

NameError: name 'adf' is not defined

In [None]:
def evaluate_accuracy_initial_ratings(x):
    """Negative test accuracy of a default Elo model started from the ratings in `x`.

    `x` must hold one rating per team, in the same order as the global TEAMS list.
    Relies on the globals `fxtrain`, `test_msk` and `ytest`.

    Note the value is a fraction of correct tips, so it only changes in discrete steps as
    `x` varies.
    """
    starting_ratings = dict(zip(TEAMS, x))
    model = Elo(initial_ratings=starting_ratings)
    results = model.iterate_fixtures(fxtrain)
    tipped_home = (results.loc[test_msk, 'home_expected_result'] > 0.5).astype(int)
    # Negated so that a minimiser maximises accuracy
    return -(ytest == tipped_home).mean()

def rating_constraint(x, r0):
    """Equality constraint: zero exactly when ALL entries of `x` average to `r0`.

    NOTE: this re-defines (and shadows) the earlier `rating_constraint`, which excluded
    the last element of `x`; here every element is a team rating.
    """
    deviations = x - r0
    return sum(deviations)

# Keep the mean team rating pinned at 1500 while searching over starting ratings
constraints = dict(type='eq', fun=rating_constraint, args=[1500])

# Every team starts the search at 1500
res = minimize(evaluate_accuracy_initial_ratings, [1500.]*len(TEAMS), constraints=constraints)
res.x

In [None]:
# Baseline accuracy: share of test-period matches won by the home side.
# `dftrain` has been re-bound to single-season frames since `test_msk` was built, so masking
# it would select no rows; look the test matches up in `df` by match id instead.
df.loc[test_msk[test_msk].index, 'winner'].eq('home').mean()

In [None]:
import seaborn as sns

In [None]:
# Reshape the fixtures to long form: one row per (match, side) with that side's team and pre-match rating.
# The two melts produce rows in the same order (all home rows, then all away rows) on a fresh
# RangeIndex, so they can be joined side by side.
id_vars = ['date', 'season', 'round_number']
long = pd.concat([
    pd.melt(fx, id_vars=id_vars, value_vars=['home_team', 'away_team'], var_name='home_or_away', value_name='team'),
    pd.melt(fx, id_vars=None, value_vars=['home_rating_pre', 'away_rating_pre'], var_name='home_or_away2', value_name='rating'),
], axis='columns')  # `axis` must be passed by keyword: pandas 2.x rejects it positionally

In [None]:
import altair as alt
# Step plot of each team's pre-match rating through the 2013 season
(
    alt.Chart(long.loc[long['season'] == 2013])
    .mark_line(interpolate='step-after')
    .encode(
        x='date',
        y=alt.Y('rating', scale=alt.Scale(zero=False)),
        color='team',
        tooltip=['round_number', 'home_or_away', 'date', 'team', 'rating'],
    )
)

In [None]:
# sns.scatterplot(x='date', y='rating', hue='team', data=long)

In [None]:
# need to melt fixtures so they can be plotted

In [None]:
from scipy.optimize import minimize, basinhopping, differential_evolution, dual_annealing, shgo

In [None]:
def evaluate_accuracy(x):
    """Negative test accuracy of an Elo model built from the positional parameters in `x`.

    `x` is unpacked straight into `Elo(*x)`, so its entries follow the order of the Elo
    constructor arguments. `ytest`, `fxtrain` and `test_msk` are defined outside this function.
    """
    model = Elo(*x)
    results = model.iterate_fixtures(fxtrain)
    tipped_home = (results.loc[test_msk, 'home_expected_result'] > 0.5).astype(int)
    # Negated so that a minimiser maximises accuracy
    return -(ytest == tipped_home).mean()

In [None]:
# see http://opisthokonta.net/?p=1412

In [None]:
# initial_ratings = dict(zip(TEAMS, res.x))
# initial_ratings

In [None]:
# Negative test accuracy obtained with the optimised starting ratings
evaluate_accuracy_initial_ratings(res.x)

In [None]:
# minimize?

In [None]:
def optimise_parameters(fxtrain, ytest, test_msk, as_dataframe=True, target='home_squashed_margin'):
    """Grid-search the Elo hyper-parameters and score each combination by test accuracy.

    Five values are tried for each of k_decay, carryover, k, interstate_advantage and
    home_advantage (5**5 = 3125 Elo runs in total).

    Parameters
    ----------
    fxtrain : list of dict or pd.DataFrame
        Ordered fixtures, passed to `Elo.iterate_fixtures`.
    ytest : pd.Series
        Actual 0/1 home results for the test matches, indexed like the rows `test_msk` selects.
    test_msk : pd.Series of bool
        Mask selecting the test matches from the frame returned by `iterate_fixtures`.
    as_dataframe : bool
        If True, return a DataFrame sorted by accuracy (best first); otherwise the raw list
        of score dicts in grid order.
    target : str
        Fixture column the Elo model is updated on, e.g. 'home_squashed_margin',
        'home_points_ratio' or 'home_win_draw_loss'.

    Returns
    -------
    pd.DataFrame or list of dict
        One entry per parameter combination, with the parameters and the 'accuracy' achieved.
    """
    from itertools import product

    # Same iteration order as nesting the loops in this order (last axis varies fastest)
    grid = product(
        np.linspace(0.8, 1, 5),   # k_decay
        np.linspace(0.6, 1, 5),   # carryover
        np.linspace(0, 100, 5),   # k
        np.linspace(0, 30, 5),    # interstate_advantage
        np.linspace(0, 30, 5),    # home_advantage
    )

    scores = []
    for k_decay, carryover, k, interstate_advantage, home_advantage in grid:
        elo = Elo(target=target, k_decay=k_decay, k=k,
                  home_advantage=home_advantage, interstate_advantage=interstate_advantage, carryover=carryover)
        fx = elo.iterate_fixtures(fxtrain)
        # Tip the home side whenever its expected result is above 0.5
        ypred = fx.loc[test_msk, 'home_expected_result'].gt(0.5).astype(int)

        accuracy = (ytest == ypred).mean()

        scores.append({'target': target, 'carryover': carryover, 'k': k, 'k_decay': k_decay,
                       'home_advantage': home_advantage, 'interstate_advantage': interstate_advantage,
                       'accuracy': accuracy})

    if as_dataframe:
        return pd.DataFrame(scores).sort_values('accuracy', ascending=False)

    return scores

In [None]:
# Run the grid search and show the best-scoring parameter sets (the frame is sorted by accuracy, best first)
scores = optimise_parameters(fxtrain, ytest, test_msk)
scores.head()

In [None]:
# sns.scatterplot(x='k', y='accuracy', data=scores[(scores.home_advantage==15) & (scores.interstate_advantage==7.5) & (scores.carryover==0.7) & (scores.k_decay==0.9)])

In [None]:
# cv_results = pd.DataFrame(scores).sort_values('accuracy', ascending=False)

In [None]:
# best_params = cv_results.iloc[0].to_dict()

In [None]:
# Hand-picked parameters with the larger bonus on home advantage (22.5) than on interstate (7.5)
elo = Elo(k=100, home_advantage=22.5, interstate_advantage=7.5, width=250,
          carryover=0.6, k_decay=0.9, target='home_squashed_margin')
fx = elo.iterate_fixtures(fxtrain)
ypred = (fx.loc[test_msk, 'home_expected_result'] > 0.5).astype(int)

accuracy = (ytest == ypred).mean()
accuracy

In [None]:
# Same run with the two bonuses swapped: home advantage 7.5, interstate advantage 22.5
elo = Elo(k=100, home_advantage=7.5, interstate_advantage=22.5, width=250,
          carryover=0.6, k_decay=0.9, target='home_squashed_margin')
fx = elo.iterate_fixtures(fxtrain)
ypred = (fx.loc[test_msk, 'home_expected_result'] > 0.5).astype(int)

accuracy = (ytest == ypred).mean()
accuracy

In [None]:
# `best_params` was only ever assigned in a commented-out cell, so take the top row of the
# accuracy-sorted `scores` frame returned by optimise_parameters instead.
best_params = scores.iloc[0].to_dict()
best_params

In [None]:
# A tip counts as correct when "expected result >= 0.5" matches the 0/1 outcome; draws (0.5) never match.
fx['correct'] = (fx['home_expected_result'] >= 0.5) == fx['home_win_draw_loss']

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Share of correct tips per season
fx.groupby('season')['correct'].mean()

In [None]:
# `cv_results` was only ever assigned in a commented-out cell; `scores` already is the
# accuracy-sorted grid-search frame, so show its top ten rows.
scores.head(10)

In [None]:
# NOTE(review): `Stop` is not defined anywhere in view, so this would raise NameError -
# presumably a deliberate marker to halt "Run All" at the end of the notebook. TODO confirm.
raise Stop