In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

In [2]:
op_data = pd.read_csv("super_rugby_oddsportal.csv")

## Data Cleaning and Pre-Processing

Steps to carry out:

- Column names *DONE*
- Change categorical to numerical using one-hot encoding: Home team, away team  *DONE*
- Drop date column *DONE*
- Create response variable: home team margin (positive for home win, negative for away win)  *DONE*
- Remove home score and away score after creating response *DONE*

#### Function which takes in CSV and spits out pandas dataframe with dummy variables

In [3]:
# function to clean data

def prep_data (raw_csv):
  # read in data
  temp = pd.read_csv(raw_csv, names=["date", "home_team","away_team","home_score",
                                    "away_score","playoff","home_odds","draw_odds",
                                    "away_odds","num_bookies"])
  
  # drop incorrect first row
  temp = temp.drop(index=[0])
  
  # drop date column
  temp = temp.drop("date", axis=1)
  
  # playoffs replace NaN with 0
  temp.playoff.fillna(0, inplace=True)
  
  # playoffs replace Y with 1
  temp.playoff.replace("Y", 1, inplace=True)
  
  # drop any rows containing at least one NaN
  temp = temp.dropna(axis=0, how="any")
  
  # one hot encoding: find team names and append a prefix
  temp['home_team'] = 'home_' + temp['home_team'].astype(str)
  temp['away_team'] = 'away_' + temp['away_team'].astype(str)
  
  # create one hot df for home teams
  one_hot_home = pd.get_dummies(temp.home_team)
  one_hot_away = pd.get_dummies(temp.away_team)
  
  # append the one hot DFs to the primary df
  
  # Drop columns as they are now encoded
  temp = temp.drop(labels=["home_team", "away_team"], axis = 1)

  # Join the encoded df
  temp = temp.join(one_hot_home)
  temp = temp.join(one_hot_away)

  # convert score columns to numeric
  temp["home_score"] = temp["home_score"].astype(dtype=np.int64)
  temp["away_score"] = temp["away_score"].astype(dtype=np.int64)
  
  # create response variable for regression
  temp['home_margin'] = temp.home_score - temp.away_score

  # drop score columns
  temp = temp.drop(["home_score", "away_score"], axis=1)
  
  return temp

In [None]:
new_data = prep_data("new_data_wk_2.csv")

## Model Preparation: Random Forest Regressor

To do:

- Train/test split
- Set up random forest
- Train
- Check test accuracy
- Predict with week 2 data

In [None]:
# set up train/test split with 80/20 proportion

features = new_data[7:].drop(["home_margin"], axis = 1)
response = new_data[7:].home_margin

In [None]:
# train test split from sklearn

X_train, X_test, y_train, y_test = train_test_split(features, response, test_size=0.1)

In [None]:
# random forest regressor

# Instantiate model with 1000 decision trees
rf = RandomForestRegressor(n_estimators = 1000, random_state = 101)

In [None]:
# Train the model on training data
rf.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=None,
           oob_score=False, random_state=101, verbose=0, warm_start=False)

#### Use week 2 from 2019 as test case

In [None]:
X_test = new_data[:7].drop(["home_margin"], axis = 1)

In [None]:
# Use the forest's predict method on the test data
predictions = rf.predict(X_test)

In [None]:
# Calculate the absolute errors
errors = abs(predictions - y_test)

In [None]:
# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors), 2), 'points.')

Mean Absolute Error: 15.33 points.


In [None]:
# dataframe on preds
predictions_rf_df = pd.DataFrame(data=predictions, columns=["predictions"], index=X_test.index)

In [None]:
# join test features with predictions
result = X_test.join(round(predictions_rf_df))

In [None]:
result

Unnamed: 0,playoff,home_odds,draw_odds,away_odds,num_bookies,home_Blues,home_Brumbies,home_Bulls,home_Cheetahs,home_Chiefs,...,away_Jaguares,away_Kings,away_Lions,away_Rebels,away_Reds,away_Sharks,away_Stormers,away_Sunwolves,away_Waratahs,predictions
1,0,1.08,38.19,7.65,11,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,22.0
2,0,7.92,40.2,1.07,10,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,-16.0
3,0,1.36,23.63,3.21,10,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,8.0
4,0,3.29,23.97,1.34,10,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,-6.0
5,0,2.14,21.71,1.74,10,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1.0
6,0,3.29,25.27,1.34,10,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,-4.0
7,0,2.02,21.48,1.82,10,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2.0


In [None]:
result.to_csv("week_2_preds.csv")