In [1]:
import pandas as pd
import numpy as np
from superugby import cleanup
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [2]:
# Load the raw results/odds table, drop the play-off flag column and discard
# any row that has a missing value.
DATA_URL = "https://raw.githubusercontent.com/kieranbd/superrugby-predictor/master/super_rugby_oddsportal.csv"
results = (
    pd.read_csv(DATA_URL)
    .drop(columns='Play-off Game?')
    .dropna()
)

In [3]:
# Number of leading rows set aside as upcoming fixtures: later cells fit on
# df[games:] and predict for df[:games].
games = 7

Take a look at the upcoming fixtures (for the weekend of 10 May 2019)

In [4]:
# Show exactly the rows treated as upcoming fixtures (kept in sync with `games`).
results.head(games)

Unnamed: 0,Date,Home Team,Away Team,Home Score,Away Score,Home Odds,Draw Odds,Away Odds
0,10-May-19,Blues,Hurricanes,0,0,2.87,22.18,1.43
1,10-May-19,Rebels,Reds,0,0,1.26,23.76,3.91
2,10-May-19,Bulls,Crusaders,0,0,3.81,24.48,1.27
3,10-May-19,Highlanders,Jaguares,0,0,1.16,30.18,5.25
4,10-May-19,Chiefs,Sharks,0,0,1.56,21.62,2.46
5,10-May-19,Lions,Waratahs,0,0,1.35,22.92,3.22
6,10-May-19,Brumbies,Sunwolves,0,0,1.15,30.38,5.28


## Engineer features which capture form
The functions below will work row-by-row, returning the win (or loss) streak for both home and away teams.

### Home and away streak
Home streak is the active win (or loss) streak for the team playing at home, in their most recent home games.   
Away streak is the same, but for the visiting team in their most recent away fixtures.   

### Home and away margins
Home average margin is the average margin (relative to home team) for the home team in their last n games at home. For example, a value of 5 means that the home side in this fixture is averaging a winning margin of 5 in their last n games at home.   

Away average margin is the same, but for the visiting team in their last n away games. A positive number means that the away side has scored more points away from home than they have conceded in their last n away games.   

## Encode all team names and nationalities
All team names have been one-hot encoded into separate columns, with additional columns for the (generalized) nationalities of the teams.

In [5]:
# `cleanup` is imported from the project-local `superugby` module; its body is
# not visible here. NOTE(review): later cells read columns such as 'Home_Team'
# and 'home_margin' from `results`, so cleanup presumably also modifies
# `results` in place — confirm.
df = cleanup(results)

In [7]:
# Show the engineered rows for the upcoming fixtures (kept in sync with `games`).
df.head(games)

Unnamed: 0,Date,home_margin,home_win,home_streak,home_avg_marg,away_streak,away_avg_marg,home_win_prob,home_country_AUS,home_country_NZ,...,away_team_Highlanders,away_team_Hurricanes,away_team_Jaguares,away_team_Lions,away_team_Rebels,away_team_Reds,away_team_Sharks,away_team_Stormers,away_team_Sunwolves,away_team_Waratahs
0,2019-05-10,0,False,2,6.4,3,-1.2,0.332558,0,1,...,0,1,0,0,0,0,0,0,0,0
4,2019-05-10,0,False,-1,-4.0,3,11.4,0.61194,0,1,...,0,0,0,0,0,0,1,0,0,0
6,2019-05-10,0,False,4,10.4,-2,-7.8,0.821151,1,0,...,0,0,0,0,0,0,0,0,1,0
5,2019-05-10,0,False,-1,-5.2,-3,-6.6,0.704595,0,0,...,0,0,0,0,0,0,0,0,0,1
1,2019-05-10,0,False,-1,2.6,1,-4.2,0.756286,1,0,...,0,0,0,0,0,1,0,0,0,0
3,2019-05-10,0,False,-1,4.0,2,-3.2,0.819033,0,1,...,0,0,1,0,0,0,0,0,0,0
2,2019-05-10,0,False,2,-10.4,2,3.6,0.25,0,0,...,0,0,0,0,0,0,0,0,0,0


## Model fitting

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

In [None]:
# Standardise the continuous form/odds columns; the one-hot columns are left as-is.
# NOTE(review): the scaler is fitted on every row of df, including rows that
# are later held out for testing — confirm this is acceptable.
numeric = ['home_streak', 'home_avg_marg', 'away_streak', 'away_avg_marg', 'home_win_prob']

scaler = StandardScaler()
numeric_as_float = df[numeric].astype('float64')
df[numeric] = scaler.fit_transform(numeric_as_float)

In [None]:
# Feature matrix and target (home margin) for every row after the first `games`.
non_feature_cols = ['Date', 'home_margin', 'home_win']
fit_rows = df.iloc[games:]
X = fit_rows.drop(columns=non_feature_cols).values.astype(np.float64)
y = fit_rows['home_margin'].values.astype(np.float64)

In [None]:
# The first `games` rows (the upcoming fixtures); used later for the final predictions.
non_feature_cols = ['Date', 'home_margin', 'home_win']
upcoming_rows = df.iloc[:games]
X_temp = upcoming_rows.drop(columns=non_feature_cols).values.astype(np.float64)
y_temp = upcoming_rows['home_margin'].values.astype(np.float64)

In [None]:
# Sanity check: number of target values available for fitting.
y.shape

In [None]:
# Sanity check: (rows, features) of the design matrix; rows should match y.
X.shape

## Build TPOT model

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.feature_selection import SelectFwe, f_regression
from sklearn.linear_model import ElasticNetCV, LassoLarsCV
from sklearn.pipeline import make_pipeline, make_union
from tpot.builtins import StackingEstimator, ZeroCount

In [None]:
# Reproducible random hold-out: 20% of the rows in X/y are kept for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Average CV score on the training set was:-11.188556355715761
#
# Stacked regression pipeline. Each StackingEstimator fits its wrapped model
# and passes that model's predictions on as an extra feature; SelectFwe then
# keeps only features that pass a family-wise-error F-test before the final
# LassoLarsCV regressor.
pipeline = make_pipeline(
    ZeroCount(),
    StackingEstimator(estimator=GradientBoostingRegressor(alpha=0.99,
                                                          learning_rate=0.01,
                                                          loss="quantile",
                                                          max_depth=4,
                                                          max_features=0.95,
                                                          min_samples_leaf=17,
                                                          min_samples_split=8,
                                                          n_estimators=100,
                                                          subsample=0.35,
                                                          # subsample/max_features < 1 make boosting
                                                          # stochastic; pin the seed so refits and
                                                          # reported scores are reproducible.
                                                          random_state=42)),
    StackingEstimator(estimator=ElasticNetCV(l1_ratio=0.05, tol=0.1, cv=5)),
    SelectFwe(score_func=f_regression, alpha=0.031),
    # NOTE(review): `normalize` is deprecated/removed in newer scikit-learn
    # releases — confirm against the pinned version before upgrading.
    LassoLarsCV(normalize=True, cv=5)
)

pipeline.fit(X_train, y_train)

In [None]:
# Root-mean-squared error on the random hold-out set.
# Arguments follow the scikit-learn (y_true, y_pred) convention, matching the
# MAE cell; the value is unchanged because squared error is symmetric.
np.sqrt(mean_squared_error(y_test, pipeline.predict(X_test)))

In [None]:
# Mean absolute error (in points of home margin) on the hold-out set.
holdout_pred = pipeline.predict(X_test)
mean_absolute_error(y_test, holdout_pred)

### Try 2018 season as a test run (TESTING ONLY)

In [None]:
# Hold out the 2018 season (calendar year 2018) as the test set.
# Half-open bounds [2018-01-01, 2019-01-01): complementary to the
# `< 2018-01-01` training mask and excluding anything dated 1 Jan 2019.
season_start = pd.to_datetime('2018-01-01')
season_end = pd.to_datetime('2019-01-01')
mask_2018 = (df['Date'] >= season_start) & (df['Date'] < season_end)
df_2018 = df[mask_2018]

X_test = df_2018.drop(['Date', 'home_margin', 'home_win'], axis=1).values.astype(np.float64)
y_test = df_2018.home_margin.values.astype(np.float64)

In [None]:
# Training set: every match dated strictly before 1 Jan 2018.
cutoff = pd.to_datetime('2018-01-01')
mask_pre_2018 = df['Date'] < cutoff
df_pre_2018 = df.loc[mask_pre_2018]

non_feature_cols = ['Date', 'home_margin', 'home_win']
X_train = df_pre_2018.drop(columns=non_feature_cols).values.astype(np.float64)
y_train = df_pre_2018['home_margin'].values.astype(np.float64)

In [None]:
# Human-readable view of the 2018 hold-out matches.
# Rows are selected by df_2018's index labels so they come out in the same
# order as X_test / y_test; predictions are later assigned positionally, and a
# boolean mask built on `df` does not guarantee that order on `results`.
# .copy() makes this an independent frame so the column assignments below do
# not hit a slice of `results`.
summary_cols = ['Home_Team', 'Away_Team', 'home_margin', 'Home_Odds', 'Away_Odds']
results_2018 = results.loc[df_2018.index, summary_cols].copy()

# Bookmaker-implied home win share: the shorter the home odds, the larger the value.
results_2018['home_win_prob'] = results_2018.Away_Odds / (results_2018.Home_Odds + results_2018.Away_Odds)
results_2018 = results_2018.drop(columns=['Home_Odds', 'Away_Odds'])

In [None]:
# Refit the same pipeline on whatever X_train / y_train currently hold
# (the pre-2018 matches when the cells are run top to bottom).
pipeline.fit(X_train, y_train)

In [None]:
# Mean absolute error (in points of home margin) on the current X_test / y_test.
season_pred = pipeline.predict(X_test)
mean_absolute_error(y_test, season_pred)

In [None]:
# Attach the predicted home margin, rounded to whole points.
predicted_margin = pipeline.predict(X_test)
results_2018['prediction'] = np.round(predicted_margin, decimals=0)

In [None]:
# Per-match scoring:
#   error        - absolute difference between actual and predicted margin
#   win_point    - 1 when actual and predicted margins share the same non-zero sign
#   margin_point - 1 when the prediction is within 5 points of the actual margin
actual_margin = results_2018['home_margin']
predicted_margin = results_2018['prediction']

abs_error = (actual_margin - predicted_margin).abs()
same_sign = (actual_margin * predicted_margin) > 0

results_2018['error'] = abs_error
results_2018['win_point'] = same_sign.astype('int')
results_2018['margin_point'] = (abs_error <= 5).astype('int')

In [None]:
# Preview the first few rows of the scored 2018 table.
results_2018.head()

In [None]:
# Share of 2018 matches where the predicted winner was correct.
results_2018['win_point'].mean()

In [None]:
# Share of 2018 matches where the predicted margin was within 5 points.
results_2018['margin_point'].mean()

In [None]:
# Benchmark against the bookmakers: first, flag the matches the home side actually won.
home_wins = results_2018['home_margin'].gt(0).values

In [None]:
# Matches where the odds-implied home win share exceeds one half.
home_favourites = results_2018['home_win_prob'].gt(0.5).values

In [None]:
# Fraction of matches where "home side was favourite" agrees with "home side won".
(home_wins == home_favourites).mean()

---

## Make predictions for upcoming week

In [None]:
# fit to full data set: X / y were built from df[games:], i.e. every row
# except the first `games` rows reserved for prediction
pipeline.fit(X, y)

In [None]:
# One row per upcoming fixture: (predicted home margin, home team, away team).
# X_temp was built from df[:games], so team names are looked up by those same
# index labels; taking results[:games] positionally can pair a prediction with
# the wrong fixture when df and results are ordered differently.
upcoming_idx = df.index[:games]
temp = np.vstack((
    pipeline.predict(X_temp).reshape(games),
    results.loc[upcoming_idx, 'Home_Team'],
    results.loc[upcoming_idx, 'Away_Team'],
)).T

In [None]:
# Wrap the stacked predictions/team names in a labelled table.
prediction_columns = ["Home_Margin", "Home_Team", "Away_Team"]
preds_df = pd.DataFrame(data=temp, columns=prediction_columns)

In [None]:
# Express the predicted home margin as a whole number of points, then display the table.
def _to_whole_points(margin):
    """Round a single predicted margin to the nearest integer."""
    return int(np.round(margin, 0))

preds_df["Home_Margin"] = preds_df["Home_Margin"].map(_to_whole_points)
preds_df