In [1]:
import pandas as pd
import numpy as np
from superugby import cleanup
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [2]:
# Load the odds/results export, discard the play-off flag column, and keep only complete rows.
df = (
    pd.read_csv("super_rugby_oddsportal.csv")
    .drop(columns=['Play-off Game?'])
    .dropna()
)

In [3]:
# Keep a second handle on the frame so team names can be looked up later.
# NOTE(review): plain assignment does not copy — `spare` and `df` refer to the
# same DataFrame object here; use df.copy() if an independent snapshot is intended.
spare = df

Take a look at the upcoming fixtures (for the weekend of 15 March 2019)

In [4]:
# Preview the first six rows: the upcoming fixtures, whose scores are still 0.0.
df.head(6)

Unnamed: 0,Date,Home Team,Away Team,Home Score,Away Score,Home Odds,Draw Odds,Away Odds,Bookmakers Surveyed
0,15-Mar-19,Chiefs,Hurricanes,0.0,0.0,3.26,22.29,1.36,6.0
1,15-Mar-19,Brumbies,Waratahs,0.0,0.0,1.87,20.25,1.99,6.0
2,15-Mar-19,Stormers,Jaguares,0.0,0.0,1.4,22.13,3.05,6.0
3,16-Mar-19,Sunwolves,Reds,0.0,0.0,1.69,21.25,2.23,6.0
4,16-Mar-19,Highlanders,Crusaders,0.0,0.0,3.38,22.83,1.34,6.0
5,16-Mar-19,Lions,Rebels,0.0,0.0,1.36,22.83,3.19,6.0


## Engineer features which capture form
The functions below will work row-by-row, returning the win (or loss) streak for both home and away teams.

### Home and away streak
Home streak is the active win (or loss) streak for the team playing at home, in their most recent home games.   
Away streak is the same, but for the visiting team in their most recent away fixtures.   

### Home and away margins
Home average margin is the average margin (relative to home team) for the home team in their last n games at home. For example, a value of 5 means that the home side in this fixture is averaging a winning margin of 5 in their last n games at home.   

Away average margin is the same, but for the visiting team in their last n away games. A positive number means that the away side has scored more points away from home than they have conceded in their last n away games.   

## Encode all team names and nationalities
All team names have been one-hot encoded into separate columns, with additional columns for the (generalized) nationalities of the teams.

In [5]:
# Build the model-ready frame (form features plus one-hot encodings) via the project helper.
# NOTE(review): cleanup() is defined in superugby, not in this notebook; whether it
# also modifies the frame passed in (which `spare` still references) can't be told from here.
df = cleanup(df)
df.head(15)

Unnamed: 0,home_margin,home_win,home_streak,home_avg_marg,away_streak,away_avg_marg,home_country_AUS,home_country_NZ,home_country_SA,away_country_AUS,...,away_team_Hurricanes,away_team_Jaguares,away_team_Lions,away_team_Rebels,away_team_Reds,away_team_Sharks,away_team_Stormers,away_team_Sunwolves,away_team_Waratahs,home_odds
0,0.0,False,-2,-4.6,-1,-9.8,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0.294372
1,0.0,False,1,9.8,1,-2.6,1,0,0,1,...,0,0,0,0,0,0,0,0,1,0.515544
2,0.0,False,2,3.2,-4,-8.0,0,0,1,0,...,0,1,0,0,0,0,0,0,0,0.685393
3,0.0,False,-2,1.4,-8,-15.6,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0.568878
4,0.0,False,2,3.2,8,15.8,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0.283898
5,0.0,False,1,10.2,1,-0.0,0,0,1,1,...,0,0,0,1,0,0,0,0,0,0.701099
6,3.0,True,11,12.4,-1,-5.8,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0.740443
7,3.0,True,1,-4.2,-1,-6.8,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0.610422
8,29.0,True,21,24.0,-2,-2.6,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0.826807
9,8.0,True,-1,-3.6,1,-19.0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0.818466


## Model fitting

In [31]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

In [16]:
# Standardise the continuous feature columns with scikit-learn's StandardScaler.
# NOTE(review): the scaler is fitted on every row of df. That happens before the
# train/test split further down and includes the leading rows that are later set
# aside as upcoming fixtures, so held-out rows influence the scaling statistics.
scaler = StandardScaler()

numeric = ['home_streak', 'home_avg_marg', 'away_streak', 'away_avg_marg', 'home_odds']
df[numeric] = scaler.fit_transform(df[numeric].astype('float64'))

In [17]:
# Re-inspect the frame after scaling the numeric feature columns.
df.head()

Unnamed: 0,home_margin,home_win,home_streak,home_avg_marg,away_streak,away_avg_marg,home_country_AUS,home_country_NZ,home_country_SA,away_country_AUS,...,away_team_Hurricanes,away_team_Jaguares,away_team_Lions,away_team_Rebels,away_team_Reds,away_team_Sharks,away_team_Stormers,away_team_Sunwolves,away_team_Waratahs,home_odds
0,0.0,0.0,-0.837236,-0.893591,0.044243,-0.528144,0.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.388831
1,0.0,0.0,-0.049946,0.526639,0.627873,0.199323,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,-0.361497
2,0.0,0.0,0.212484,-0.1243,-0.831203,-0.346277,0.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.427446
3,0.0,0.0,-0.837236,-0.301828,-1.998463,-1.114159,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,-0.113765
4,0.0,0.0,0.212484,-0.1243,2.670579,2.058406,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.437482


In [18]:
# Historical matches only: everything after the six upcoming fixtures at the top of the frame.
# Both target-related columns are removed from the features; home_margin is the regression target.
X = df.iloc[6:].drop(columns=['home_margin', 'home_win']).to_numpy().astype(np.float32)
y = df.iloc[6:]['home_margin'].to_numpy().astype(np.float32)

In [19]:
# Set aside the first six rows (the upcoming fixtures) for the prediction step later on.
X_temp = df.iloc[:6].drop(columns=['home_margin', 'home_win']).to_numpy().astype(np.float32)
y_temp = df.iloc[:6]['home_margin'].to_numpy().astype(np.float32)

In [20]:
# Reshape the target into a single-column 2-D array of shape (n_samples, 1).
# NOTE(review): presumably done because the skorch regressor expects a 2-D target — confirm.
y = y.reshape(-1, 1)
y.shape

(1234, 1)

In [21]:
# Feature matrix dimensions: (n_samples, n_features).
X.shape

(1234, 41)

In [65]:
from torch import nn
from torch import optim
import torch.nn.functional as F
from skorch import NeuralNetRegressor

In [93]:
class RegressorModule(nn.Module):
    """Small fully connected network that regresses one value per sample.

    Layer sizes are ``in_features -> num_units -> 10 -> 1``.

    Parameters
    ----------
    num_units : int
        Width of the first hidden layer.
    nonlin : callable
        Activation applied after the first hidden layer. The second hidden
        layer always uses ReLU.
    in_features : int, optional
        Number of input columns. When omitted, it falls back to ``X.shape[1]``
        of the notebook-level feature matrix ``X``, which is what the module
        used before this parameter existed.
    """

    def __init__(
            self,
            num_units=24,
            nonlin=F.relu,
            in_features=None,
    ):
        super().__init__()
        if in_features is None:
            # Backward-compatible default: size the input layer from the
            # notebook's global feature matrix.
            in_features = X.shape[1]

        self.in_features = in_features
        self.num_units = num_units
        self.nonlin = nonlin

        self.dense0 = nn.Linear(in_features, num_units)
        self.dense1 = nn.Linear(num_units, 10)
        self.output = nn.Linear(10, 1)

    def forward(self, X, **kwargs):
        """Map a batch of shape (batch, in_features) to predictions of shape (batch, 1)."""
        X = self.nonlin(self.dense0(X))
        X = F.relu(self.dense1(X))
        X = self.output(X)
        return X
    
# Wrap the module in a skorch regressor so it can be used through fit/predict.
net = NeuralNetRegressor(
    module=RegressorModule,
    criterion=nn.modules.loss.L1Loss,  # L1 loss (absolute error)
    optimizer=optim.Adam,
    lr=0.005,
    max_epochs=20,
    batch_size=64,
    # Shuffling is switched OFF: training batches keep the same order every epoch.
    # NOTE(review): the previous comment here said the data is shuffled each epoch,
    # which contradicts the value below — set this to True if shuffling was intended.
    iterator_train__shuffle=False,
)

In [94]:
# Train on all historical matches.
# NOTE(review): the valid_loss column in the log is produced by skorch itself,
# presumably from its default internal hold-out split — confirm.
net.fit(X, y)

  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1       13.1454       14.9347  0.0339
      2       12.9579       14.4732  0.0359
      3       12.3584       13.2978  0.0329
      4       11.4221       12.2443  0.0329
      5       11.0225       12.2381  0.0339
      6       10.8610       12.3161  0.0339
      7       10.7822       12.3351  0.0329
      8       10.7120       12.3680  0.0329
      9       10.6485       12.3643  0.0319
     10       10.5825       12.3910  0.0329
     11       10.5332       12.3754  0.0329
     12       10.4802       12.4115  0.0329
     13       10.4416       12.4097  0.0319
     14       10.3843       12.4262  0.0319
     15       10.3476       12.4249  0.0319
     16       10.3002       12.4174  0.0329
     17       10.2675       12.4312  0.0329
     18       10.2208       12.4993  0.0329
     19       10.1942       12.5024  0.0319
     20       10.1400       12.4860  0.0319


<class 'skorch.regressor.NeuralNetRegressor'>[initialized](
  module_=RegressorModule(
    (dense0): Linear(in_features=41, out_features=24, bias=True)
    (dense1): Linear(in_features=24, out_features=10, bias=True)
    (output): Linear(in_features=10, out_features=1, bias=True)
  ),
)

In [95]:
# Mean absolute error on the same X/y that was passed to fit, i.e. an in-sample
# figure rather than an estimate of performance on unseen matches.
mean_absolute_error(y, net.predict(X))

10.5555725

In [96]:
# Hold out 20% of the matches for testing; random_state pins the split so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [97]:
# Refit on the training portion only. The log shows skorch re-initialising the
# module and optimizer, so this starts from fresh weights rather than continuing
# the earlier fit.
net.fit(X_train, y_train)

Re-initializing module because the following parameters were re-set: .
Re-initializing optimizer because the following parameters were re-set: .
  epoch    train_loss    valid_loss     dur
-------  ------------  ------------  ------
      1       13.4930       12.3499  0.0319
      2       13.1879       11.8267  0.0290
      3       12.5290       10.9560  0.0270
      4       11.9445       10.7093  0.0260
      5       11.7853       10.5718  0.0270
      6       11.5940       10.4719  0.0279
      7       11.4496       10.4417  0.0269
      8       11.3064       10.4731  0.0280
      9       11.1911       10.5331  0.0250
     10       11.1170       10.5486  0.0249
     11       11.0275       10.5512  0.0260
     12       10.9540       10.5453  0.0250
     13       10.8877       10.5488  0.0270
     14       10.8350       10.5140  0.0250
     15       10.7899       10.5025  0.0279
     16       10.7646       10.4952  0.0289
     17       10.7006       10.4753  0.0250
     18       10.67

<class 'skorch.regressor.NeuralNetRegressor'>[initialized](
  module_=RegressorModule(
    (dense0): Linear(in_features=41, out_features=24, bias=True)
    (dense1): Linear(in_features=24, out_features=10, bias=True)
    (output): Linear(in_features=10, out_features=1, bias=True)
  ),
)

In [98]:
# Mean absolute error on the held-out test portion.
mean_absolute_error(y_test, net.predict(X_test))

11.710151

---

## Make predictions for upcoming week

In [99]:
# Pair each predicted margin with the fixture's home and away team names.
# The number of fixtures is taken from X_temp instead of repeating a literal 6,
# so this cell keeps working if the number of upcoming fixtures changes.
# NOTE(review): the raw CSV preview shows 'Home Team' / 'Away Team' with spaces;
# the underscore names used here presumably exist because cleanup() renamed the
# columns on the frame that `spare` also references — confirm.
n_upcoming = len(X_temp)
temp = np.vstack((
    net.predict(X_temp).reshape(n_upcoming),
    spare['Home_Team'][:n_upcoming],
    spare['Away_Team'][:n_upcoming],
)).T

In [100]:
# Label the three stacked columns: predicted home margin, home side, away side.
preds_df = pd.DataFrame(temp, columns=["Home_Margin", "Home_Team", "Away_Team"])

In [101]:
# Present each predicted margin as a whole number of points.
preds_df["Home_Margin"] = [
    int(np.round(margin, 0)) for margin in preds_df["Home_Margin"]
]
preds_df

Unnamed: 0,Home_Margin,Home_Team,Away_Team
0,-3,Chiefs,Hurricanes
1,4,Brumbies,Waratahs
2,14,Stormers,Jaguares
3,-1,Sunwolves,Reds
4,-4,Highlanders,Crusaders
5,13,Lions,Rebels
