In [229]:
import pandas as pd
import numpy as np

results = pd.read_csv("1_bradley_terry_model.csv")
results.head(10)

Unnamed: 0,Date,Away Team,Away Pts,Home Team,Home Pts
0,Mar 22 (Thu 7:25pm),Carlton,95,Richmond,121
1,Mar 23 (Fri 7:50pm),Adelaide,87,Essendon,99
2,Mar 24 (Sat 3:35pm),Brisbane Lions,82,St Kilda,107
3,Mar 24 (Sat 4:35pm),Fremantle,60,Port Adelaide,110
4,Mar 24 (Sat 7:25pm),North Melbourne,39,Gold Coast,55
5,Mar 24 (Sat 7:25pm),Collingwood,67,Hawthorn,101
6,Mar 25 (Sun 1:10pm),Western Bulldogs,51,GWS Giants,133
7,Mar 25 (Sun 3:20pm),Geelong,97,Melbourne,94
8,Mar 25 (Sun 7:20pm),Sydney,115,West Coast,86
9,Mar 29 (Thu 7:50pm),Richmond,82,Adelaide,118


Add a few basic housekeeping columns: 
- game total
- mov
- game_result ("did the home team win?")
- home field advantage

In [230]:
results['game_total'] = results['Home Pts'] + results['Away Pts']
results['home_mov'] = results['Home Pts'] - results['Away Pts']
results['game_result'] = (results['Home Pts'] > results['Away Pts']).astype(int)
results['hfa'] = 1

To be able to run a regression based on pairwise data, we need to arrive at the following structure:
- Need a column for HFA that is also set to 1.
    - After running the regression, the value for that column will also represent the amount that HFA matters.
    
- a column for each team.
- a 1 in that row if that team was present in that game.
- some way to designate which of the two teams with a "1" were playing in that particular game.
    - After running the regression, the values for these columns represent the logistic_rating for each team 


Add a column for each team and set that column to 1 if that team played in that game (should be two "1"s per row).

In [231]:
teams = set(results['Away Team'].unique().tolist() + results['Home Team'].unique().tolist())
teams

{'Adelaide',
 'Brisbane Lions',
 'Carlton',
 'Collingwood',
 'Essendon',
 'Fremantle',
 'GWS Giants',
 'Geelong',
 'Gold Coast',
 'Hawthorn',
 'Melbourne',
 'North Melbourne',
 'Port Adelaide',
 'Richmond',
 'St Kilda',
 'Sydney',
 'West Coast',
 'Western Bulldogs'}

Add a column for each team

In [232]:
for team in teams:
    col_name = f"{team}"
    results[col_name] = 0
results.head(5)

Unnamed: 0,Date,Away Team,Away Pts,Home Team,Home Pts,game_total,home_mov,game_result,hfa,Richmond,...,Port Adelaide,Melbourne,Adelaide,Essendon,Carlton,Geelong,Fremantle,Western Bulldogs,Brisbane Lions,Collingwood
0,Mar 22 (Thu 7:25pm),Carlton,95,Richmond,121,216,26,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,Mar 23 (Fri 7:50pm),Adelaide,87,Essendon,99,186,12,1,1,0,...,0,0,0,0,0,0,0,0,0,0
2,Mar 24 (Sat 3:35pm),Brisbane Lions,82,St Kilda,107,189,25,1,1,0,...,0,0,0,0,0,0,0,0,0,0
3,Mar 24 (Sat 4:35pm),Fremantle,60,Port Adelaide,110,170,50,1,1,0,...,0,0,0,0,0,0,0,0,0,0
4,Mar 24 (Sat 7:25pm),North Melbourne,39,Gold Coast,55,94,16,1,1,0,...,0,0,0,0,0,0,0,0,0,0


In [233]:
for row in results.iterrows():
    asdict = row[1].to_dict()
    results.loc[row[0], asdict['Home Team']] = 1
    results.loc[row[0], asdict['Away Team']] = -1
results.head(10)

Unnamed: 0,Date,Away Team,Away Pts,Home Team,Home Pts,game_total,home_mov,game_result,hfa,Richmond,...,Port Adelaide,Melbourne,Adelaide,Essendon,Carlton,Geelong,Fremantle,Western Bulldogs,Brisbane Lions,Collingwood
0,Mar 22 (Thu 7:25pm),Carlton,95,Richmond,121,216,26,1,1,1,...,0,0,0,0,-1,0,0,0,0,0
1,Mar 23 (Fri 7:50pm),Adelaide,87,Essendon,99,186,12,1,1,0,...,0,0,-1,1,0,0,0,0,0,0
2,Mar 24 (Sat 3:35pm),Brisbane Lions,82,St Kilda,107,189,25,1,1,0,...,0,0,0,0,0,0,0,0,-1,0
3,Mar 24 (Sat 4:35pm),Fremantle,60,Port Adelaide,110,170,50,1,1,0,...,1,0,0,0,0,0,-1,0,0,0
4,Mar 24 (Sat 7:25pm),North Melbourne,39,Gold Coast,55,94,16,1,1,0,...,0,0,0,0,0,0,0,0,0,0
5,Mar 24 (Sat 7:25pm),Collingwood,67,Hawthorn,101,168,34,1,1,0,...,0,0,0,0,0,0,0,0,0,-1
6,Mar 25 (Sun 1:10pm),Western Bulldogs,51,GWS Giants,133,184,82,1,1,0,...,0,0,0,0,0,0,0,-1,0,0
7,Mar 25 (Sun 3:20pm),Geelong,97,Melbourne,94,191,-3,0,1,0,...,0,1,0,0,0,-1,0,0,0,0
8,Mar 25 (Sun 7:20pm),Sydney,115,West Coast,86,201,-29,0,1,0,...,0,0,0,0,0,0,0,0,0,0
9,Mar 29 (Thu 7:50pm),Richmond,82,Adelaide,118,200,36,1,1,-1,...,0,0,1,0,0,0,0,0,0,0


## Run the Logistic Regression With Scikit Learn

In [234]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
X_cols = list(teams) + ['hfa']
X = results[X_cols]
y = results[['game_result']]
model.fit(X, y)
print(model.coef_)
print(model.intercept_)

[[ 1.28042479 -1.14388266  0.61769174  0.51593326  0.66342965  0.06734911
  -1.33190497  0.87835988  0.19494668  0.43559295  0.22111071  0.26134314
  -1.73973762  0.43584334 -0.48682218 -0.49502591 -1.01990007  0.64524816
   0.10315611]]
[0.10315611]


  y = column_or_1d(y, warn=True)


## Sort Teams By Strength

In [235]:
ranked_teams = pd.DataFrame(list(zip(model.coef_[0].tolist(), X_cols)), columns=['Strength', 'Team'])
ranked_teams.sort_values(by="Strength", ascending=False)

Unnamed: 0,Strength,Team
0,1.280425,Richmond
7,0.87836,West Coast
4,0.66343,Hawthorn
17,0.645248,Collingwood
2,0.617692,Sydney
3,0.515933,GWS Giants
13,0.435843,Geelong
9,0.435593,Melbourne
11,0.261343,Essendon
10,0.221111,Adelaide
