In [None]:
import pandas as pd
pd.set_option('display.max_columns', 100)
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error
from xgboost import XGBRegressor
import shap
shap.initjs()

In [None]:
# Location of the raw fixtures / odds export used by this notebook.
DATA_PATH = "./data/super_rugby_oddsportal.csv"

# Modelling frame: "Date" is parsed as a datetime and the two columns that
# are not used further down are discarded straight away.
df = pd.read_csv(DATA_PATH, parse_dates=["Date"]).drop(
    columns=['Play-off Game?', 'Bookmakers Surveyed']
)

# Second, untransformed read (original column names, rows containing any
# missing value removed); the prediction cells use it to look up team names.
# ** is this necessary ?
spare = pd.read_csv(DATA_PATH).drop(columns='Play-off Game?').dropna()

# number of upcoming fixtures: rows where both scores are recorded as 0
is_unplayed = (df['Home Score'] == 0) & (df['Away Score'] == 0)
n_matches = int(is_unplayed.sum())

df.head(n_matches)

---
## 1. Feature engineering
### Numeric columns
  - `home_margin` = `home_score` - `away_score`: score differential w.r.t. home team
  - `home_win` = `home_score` > `away_score`: did the home team win (1/0)?
    - We will use this in the next block to create a moving window of home / away team win rates
  - `home_odds` = `away_odds` / (`home_odds` + `away_odds`): bookmakers' home and away odds transformed into the form pr(home team win), ignoring the draw odds

In [None]:
# standardise naming format: lower-case, with underscores instead of spaces
df = df.rename(columns=lambda name: name.lower().replace(' ', '_'))

# Derive the numeric columns in one non-mutating step. Every expression
# below reads from the frame as it is *before* the assignment, so
# `home_odds` on the right-hand side is still the raw bookmaker home odds.
df = df.assign(
    # calendar year of the fixture (vectorised accessor, no per-row lambda)
    year=df['date'].dt.year,
    # score differential w.r.t. the home team
    home_margin=df['home_score'] - df['away_score'],
    # 1 if the home team scored strictly more than the away team, else 0
    home_win=(df['home_score'] > df['away_score']).astype('int'),
    # aggregate home/away odds into a single home-win probability variable
    # (draw odds are not part of the formula and are dropped below)
    home_odds=df['away_odds'] / (df['home_odds'] + df['away_odds']),
).drop(columns=['draw_odds', 'away_odds'])

df.head(15)

### Window functions
  - avg points scored by home side in last n games
  - avg points conceded by home side in last n games
  - avg points scored by away side in last n games
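  - avg points conceded by away side in last n games

> **Note**: the cell below currently computes rolling averages of `home_odds`, `home_win` (win rate) and `home_margin` over windows of 5, 10 and 20 games, rather than points scored / conceded.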

In [None]:
def _prev_n_mean(frame, team_col, value_col, n):
    """Mean of `value_col` over the `n` rows that follow each row in its `team_col` group.

    The rolling mean and the shift are both evaluated inside each group, so a
    team's rows never receive a value computed from another team's fixtures.
    Rows with fewer than `n` following rows in their group get NaN.

    NOTE(review): "the n rows that follow" are the team's n *previous*
    fixtures only if the frame is ordered newest-first — confirm against the
    CSV ordering.

    Returns a Series aligned with `frame.index`.
    """
    return frame.groupby(team_col)[value_col].transform(
        lambda s: s.rolling(n).mean().shift(-n)
    )


# window lengths (number of earlier fixtures) used for every rolling feature
WINDOWS = (5, 10, 20)

# (column prefix, grouping column, source column, take 1 - value?)
# `home_odds` and `home_win` are expressed from the home side's point of
# view, so the away-team features use the complement.
ROLLING_SPECS = [
    # average odds for home_team in previous n home fixtures
    ('avg_hm_odd', 'home_team', 'home_odds', False),
    # average odds for away team in previous n away fixtures
    ('avg_aw_odd', 'away_team', 'home_odds', True),
    # home team win rate previous n (mean of the 0/1 flag == wins / n)
    ('hm_wr', 'home_team', 'home_win', False),
    # away team win rate previous n
    ('aw_wr', 'away_team', 'home_win', True),
    # average margin by home_team in previous n home fixtures
    ('avg_hm_marg', 'home_team', 'home_margin', False),
]

# Built in the same order as before (spec by spec, window by window) so the
# resulting column order is unchanged.
rolling_features = {}
for prefix, team_col, value_col, complement in ROLLING_SPECS:
    for n in WINDOWS:
        feature = _prev_n_mean(df, team_col, value_col, n)
        rolling_features[f'{prefix}_{n}'] = 1 - feature if complement else feature

df = (
    df.assign(**rolling_features)
    # raw columns that were only needed to build the features above
    .drop(columns=['date', 'home_score', 'away_score', 'home_win'])
    # rows without a full window of history have NaN features
    .dropna()
    .reset_index(drop=True)
)

df.head(n_matches)

### Categorical columns
  - home team / country
  - away team / country
  - year
  
Drop `country = Argentina` and `country = Japan` and all teams not currently in competition.
  
> **Test**: One-hot encoding vs label encoder

In [None]:
# Teams grouped by country; flattened below into a team -> country lookup.
teams_by_country = {
    'NZ': ['Crusaders', 'Chiefs', 'Blues', 'Hurricanes', 'Highlanders'],
    'SA': ['Bulls', 'Cheetahs', 'Kings', 'Lions', 'Sharks', 'Stormers'],
    'AUS': ['Brumbies', 'Force', 'Rebels', 'Reds', 'Waratahs'],
    'ARG': ['Jaguares'],
    'JPN': ['Sunwolves'],
}
countries = {
    team: country
    for country, teams in teams_by_country.items()
    for team in teams
}

# add nationalities (team names missing from the lookup are left unchanged)
df = df.assign(
    home_country=df['home_team'].replace(countries),
    away_country=df['away_team'].replace(countries),
)

# convert to pandas category dtypes
categorical_cols = ['home_team', 'away_team', 'home_country', 'away_country']
df = df.astype({col: 'category' for col in categorical_cols})

# one-hot encode nationalities first, then team names; each column is
# encoded in its own call, with the column name as the dummy prefix
for col in ('home_country', 'away_country', 'home_team', 'away_team'):
    df = pd.get_dummies(df, prefix=col, columns=[col])

# drop irrelevant columns: indicator columns for ARG / JPN and for the
# Cheetahs, Kings and Force (the rows themselves are kept)
excluded_countries = ('ARG', 'JPN')
excluded_teams = ('Cheetahs', 'Kings', 'Force')
irrelevant_cols = [
    f'{side}_country_{country}'
    for side in ('home', 'away')
    for country in excluded_countries
] + [
    f'{side}_team_{team}'
    for team in excluded_teams
    for side in ('home', 'away')
]
df = df.drop(columns=irrelevant_cols)

df.head(n_matches)

---

In [None]:
# Every row after the first `n_matches` is used for modelling:
# features in X, home margin as the target y.
historical = df.iloc[n_matches:]
X = historical.drop(columns='home_margin')
y = historical['home_margin'].to_numpy()

In [None]:
# upcoming week's fixtures: the first `n_matches` rows of the frame
upcoming = df.iloc[:n_matches]
X_temp = upcoming.drop(columns='home_margin')
y_temp = upcoming['home_margin'].to_numpy()

In [None]:
# sanity check: number of target values available for modelling
y.shape

In [None]:
# sanity check: (rows, feature columns) — the row count should match y above
X.shape

---
## 2. Train model
  - To do: train neural network using embeddings for teams instead of one-hot encoding

In [None]:
# Hold out the last 10% of rows as the test set. shuffle=False keeps the
# existing row order, so the split is by position rather than random.
# NOTE(review): if the rows are ordered newest-first, the held-out rows are
# the *oldest* matches — confirm this is the intended evaluation split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.1, shuffle=False)

In [None]:
# quick look at the first rows of the training features
X_train.head()

### Evaluate performance

In [None]:
# Boosted regressor made of depth-1 trees; hyper-parameters are collected in
# one place so they are easy to read and tweak. random_state=0 fixes the seed.
xgb_params = dict(
    max_depth=1,
    learning_rate=.1,
    n_estimators=100,
    min_child_weight=5,
    subsample=.45,
    n_jobs=-1,
    random_state=0,
)
model = XGBRegressor(**xgb_params)
model.fit(X_train, y_train)

In [None]:
# training data: predict once, then report both error metrics
train_pred = model.predict(X_train)
train_rmse = np.sqrt(mean_squared_error(y_train, train_pred))
train_mae = mean_absolute_error(y_train, train_pred)
print(f'Train RMSE: {train_rmse:.3f}')
print(f'Train MAE: {train_mae:.3f}')

In [None]:
# test data: predict once, then report both error metrics
test_pred = model.predict(X_test)
test_rmse = np.sqrt(mean_squared_error(y_test, test_pred))
test_mae = mean_absolute_error(y_test, test_pred)
print(f'Test RMSE: {test_rmse:.3f}')
print(f'Test MAE: {test_mae:.3f}')

### Train on all data (if not overfitting)

In [None]:
# include feature standardizer for numeric columns
# NOTE(review): this whole cell is a disabled draft — nothing below runs.
# NOTE(review): ColumnTransformer defaults to remainder='drop', so as written
# only the scaled columns would reach the regressor; pass
# remainder='passthrough' if this draft is ever enabled — confirm intent.

# from sklearn.preprocessing import StandardScaler
# from sklearn.compose import ColumnTransformer
# from sklearn.pipeline import Pipeline

# # standardize the columns at positions 0, 1 and -1
# ct = ColumnTransformer(
#     [('scaler', StandardScaler(), [0, 1, -1])]
# )

# xgb = XGBRegressor(max_depth=1, learning_rate=.1, n_estimators=100, n_jobs=-1, min_child_weight=5, subsample=.45, random_state=0)

# model = Pipeline(
#     [
#         ('transformer', ct),
#         ('regressor', xgb)        
#     ]
# )

In [None]:
# Refit the same model object on all of X / y (train and test rows together).
# NOTE(review): the next cell repeats this exact fit before printing metrics,
# so this cell looks redundant.
model.fit(X, y)

In [None]:
# Refit on all of X / y, then report the in-sample errors of that fit.
model.fit(X, y)

full_pred = model.predict(X)
full_rmse = np.sqrt(mean_squared_error(y, full_pred))
full_mae = mean_absolute_error(y, full_pred)
print(f'RMSE: {full_rmse:.3f}')
print(f'MAE: {full_mae:.3f}')

### SHAP values
[Here is a nice notebook tutorial](https://slundberg.github.io/shap/notebooks/Census%20income%20classification%20with%20XGBoost.html)  for working with SHAP values.

In [None]:
# Build a tree explainer on the fitted model, compute SHAP values for every
# row of X, and show the summary plot. `explainer` and `shap_values` are
# reused by the cells below.
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X)
shap.summary_plot(shap_values, X)

In [None]:
# same SHAP values as above, drawn as a bar-type summary plot
shap.summary_plot(shap_values, X, plot_type="bar")

In [None]:
# Force plot over at most the first 1000 rows of X (the slices simply return
# fewer rows if X is shorter).
shap.force_plot(explainer.expected_value, shap_values[:1000,:], X.iloc[:1000,:])

## 3. Make predictions for upcoming week

In [None]:
# predict for upcoming week: one row per fixture -> [predicted home margin, home team, away team]
# NOTE(review): team names come from the first n_matches rows of `spare`,
# features from the first n_matches rows of the engineered frame. Both frames
# had rows removed by separate dropna() calls, so this pairing assumes the
# same leading rows survived in each — confirm.
# NOTE(review): the next cell evaluates the same expression again and stores it in `temp`.
np.vstack((model.predict(X_temp), spare['Home Team'][:n_matches], spare['Away Team'][:n_matches])).T

In [None]:
# Assemble the prediction table column by column:
# predicted home margin, home team name, away team name.
upcoming_margin = model.predict(X_temp)
upcoming_home = spare['Home Team'][:n_matches]
upcoming_away = spare['Away Team'][:n_matches]
temp = np.column_stack((upcoming_margin, upcoming_home, upcoming_away))

In [None]:
# wrap the stacked predictions in a labelled frame
prediction_columns = ["Home_Margin", "Home_Team", "Away_Team"]
preds_df = pd.DataFrame(data=temp, columns=prediction_columns)

In [None]:
# report the predicted margin as a whole number of points
rounded_margin = preds_df["Home_Margin"].astype(float).round().astype("int64")
preds_df["Home_Margin"] = rounded_margin
preds_df

### Explain single prediction

In [None]:
# Row position (0-based) of the upcoming fixture to explain; it must be a
# valid position in both preds_df and X_temp.
MATCH_NO = 6

print(f'Explaining result of  {preds_df.iloc[MATCH_NO,1]} vs {preds_df.iloc[MATCH_NO,2]}:\n')
# Stored under its own name so the SHAP values computed on X stay intact
# and the earlier summary / force plot cells can still be re-run.
shap_values_upcoming = explainer.shap_values(X_temp)
shap.force_plot(explainer.expected_value, shap_values_upcoming[MATCH_NO,:], X_temp.iloc[MATCH_NO,:])