In [2]:
import numpy as np
import pandas as pd 

from rugby_prediction.constants import DATA_FOLDER

In [3]:
# Load the raw per-match table from the project data folder and preview the first rows.
raw_match_data_path = DATA_FOLDER / 'match_data.csv'
raw_match_data = pd.read_csv(filepath_or_buffer=raw_match_data_path)
raw_match_data.head()

Unnamed: 0,match_id,unique_id,match_date,venue,city,state,neutral_site,indoor,team_1_id,team_1_name,...,team_1_score,team_1_winner,team_2_id,team_2_name,team_2_abbreviation,team_2_home_away,team_2_score,team_2_winner,competition,season
0,24843,s:300~l:8323~e:24843~c:24843,2005-02-05T14:00Z,Stade de France,Saint-Denis,Reunion,False,False,9,France,...,16,True,2,Scotland,SCOT,away,9,False,Six Nations,2005
1,24844,s:300~l:8323~e:24844~c:24844,2005-02-05T17:30Z,Principality Stadium,Cardiff,Wales,False,False,4,Wales,...,11,True,1,England,ENG,away,9,False,Six Nations,2005
2,128030,s:300~l:8007~e:128030~c:128030,2005-02-05T14:00Z,Twickenham Stoop,London,England,False,False,25901,Harlequins,...,38,True,25900,Gloucester Rugby,GLOUC,away,9,False,Gallagher Premiership,2005
3,128028,s:300~l:8007~e:128028~c:128028,2005-02-05T15:00Z,cinch Stadium at Franklin's Gardens,Northampton,England,False,False,25907,Northampton Saints,...,20,False,25909,Saracens,SARAC,away,21,True,Gallagher Premiership,2005
4,128029,s:300~l:8007~e:128029~c:128029,2005-02-05T19:45Z,Recreation Ground,Bath,England,False,False,25898,Bath Rugby,...,6,False,25903,Leicester Tigers,LEICS,away,6,False,Gallagher Premiership,2005


In [4]:
# Points conceded by one side are exactly the points scored by the other side.
for side, opponent in (('team_1', 'team_2'), ('team_2', 'team_1')):
    raw_match_data[f'{side}_conceded'] = raw_match_data[f'{opponent}_score']

In [5]:
# Lag each side's score and conceded points by one match, in match_date order.
# NOTE(review): the lag is taken within groups keyed on team_1_id (or team_2_id) alone,
# so "previous" means the team's previous match in that same slot - confirm this is intended.
matches_by_date = raw_match_data.sort_values(by='match_date')
for side in ('team_1', 'team_2'):
    side_history = matches_by_date.groupby(f'{side}_id')
    for stat in ('score', 'conceded'):
        # shift(1) keeps the original row labels, so the assignment aligns back
        # onto the (unsorted) raw frame by index.
        raw_match_data[f'{side}_prev_{stat}'] = side_history[f'{side}_{stat}'].shift(1)

In [6]:
# Five-match rolling mean of the lagged score and lagged conceded columns, per side.
matches_by_date = raw_match_data.sort_values(by='match_date')
for side in ('team_1', 'team_2'):
    team_key = f'{side}_id'
    side_history = matches_by_date.groupby(team_key)
    for stat in ('score', 'conceded'):
        rolling_mean = side_history[f'{side}_prev_{stat}'].rolling(5).mean()
        # groupby-rolling prepends the team id as an extra index level; drop it so
        # the values align with the raw frame's own row index on assignment.
        raw_match_data[f'{side}_rolling_{stat}'] = rolling_mean.reset_index(level=team_key, drop=True)


In [7]:
# Median of every recorded score (both sides pooled); this value is what
# missing feature values are filled with further down.
all_scores = raw_match_data['team_1_score'].tolist()
all_scores.extend(raw_match_data['team_2_score'].tolist())
median_score = np.median(all_scores)
median_score

21.0

In [8]:
from rugby_prediction.preprocess import drop_competitions

# Pass the match table through the project's drop_competitions helper
# (the exact filtering criteria are defined in rugby_prediction.preprocess).
raw_match_data = raw_match_data.pipe(drop_competitions)

In [9]:
# Reshape to one row per team per match, carrying the conceded / rolling columns along.
from rugby_prediction.preprocess import TEAM_COLUMNS, CORE_COLUMNS, transform_raw_data_to_team_level

team_columns_with_conceded = [*TEAM_COLUMNS, 'conceded', 'rolling_score', 'rolling_conceded']
team_level_data = transform_raw_data_to_team_level(raw_match_data, team_columns_with_conceded)
team_level_data.head()

Unnamed: 0,match_id,unique_id,match_date,venue,city,state,neutral_site,indoor,competition,season,...,conceded,rolling_score,rolling_conceded,id_opposition,home_away_opposition,score_opposition,winner_opposition,conceded_opposition,rolling_score_opposition,rolling_conceded_opposition
0,24843,s:300~l:8323~e:24843~c:24843,2005-02-05T14:00Z,Stade de France,Saint-Denis,Reunion,False,False,Six Nations,2005,...,9,,,2,away,9,False,16,,
1,24843,s:300~l:8323~e:24843~c:24843,2005-02-05T14:00Z,Stade de France,Saint-Denis,Reunion,False,False,Six Nations,2005,...,16,,,9,home,16,True,9,,
2,24844,s:300~l:8323~e:24844~c:24844,2005-02-05T17:30Z,Principality Stadium,Cardiff,Wales,False,False,Six Nations,2005,...,9,,,1,away,9,False,11,,
3,24844,s:300~l:8323~e:24844~c:24844,2005-02-05T17:30Z,Principality Stadium,Cardiff,Wales,False,False,Six Nations,2005,...,11,,,4,home,11,True,9,,
4,128030,s:300~l:8007~e:128030~c:128030,2005-02-05T14:00Z,Twickenham Stoop,London,England,False,False,Gallagher Premiership,2005,...,9,,,25900,away,9,False,38,,


In [10]:
from rugby_prediction.preprocess import map_competitions

team_level_data = team_level_data.pipe(map_competitions)
# Boolean flag: True when this row's team is the home side.
is_home_side = team_level_data['home_away'] == 'home'
team_level_data['home'] = is_home_side.to_numpy()
team_level_data.head()

Unnamed: 0,match_id,unique_id,match_date,venue,city,state,neutral_site,indoor,season,id,...,home_away_opposition,score_opposition,winner_opposition,conceded_opposition,rolling_score_opposition,rolling_conceded_opposition,international_competition,club_competition,unknown_competition,home
0,24843,s:300~l:8323~e:24843~c:24843,2005-02-05T14:00Z,Stade de France,Saint-Denis,Reunion,False,False,2005,9,...,away,9,False,16,,,1,0,0,True
1,24843,s:300~l:8323~e:24843~c:24843,2005-02-05T14:00Z,Stade de France,Saint-Denis,Reunion,False,False,2005,2,...,home,16,True,9,,,1,0,0,False
2,24844,s:300~l:8323~e:24844~c:24844,2005-02-05T17:30Z,Principality Stadium,Cardiff,Wales,False,False,2005,4,...,away,9,False,11,,,1,0,0,True
3,24844,s:300~l:8323~e:24844~c:24844,2005-02-05T17:30Z,Principality Stadium,Cardiff,Wales,False,False,2005,1,...,home,11,True,9,,,1,0,0,False
4,128030,s:300~l:8007~e:128030~c:128030,2005-02-05T14:00Z,Twickenham Stoop,London,England,False,False,2005,25901,...,away,9,False,38,,,0,1,0,True


In [11]:
# Model inputs, grouped by what they describe; the concatenation order is the
# column order of the feature matrix.
match_context_features = ['neutral_site', 'season', 'home']
recent_form_features = ['rolling_score', 'rolling_conceded_opposition']
competition_type_flags = ['international_competition', 'club_competition', 'unknown_competition']
training_columns = match_context_features + recent_form_features + competition_type_flags

# Kept as a one-element list so that selecting it yields a single-column frame.
target_column = ['score']

In [12]:
# Feature matrix and target; the target stays a one-column frame because
# target_column is a list.
X = team_level_data.loc[:, training_columns]
y = team_level_data.loc[:, target_column]
X.head()

Unnamed: 0,neutral_site,season,home,rolling_score,rolling_conceded_opposition,international_competition,club_competition,unknown_competition
0,False,2005,True,,,1,0,0
1,False,2005,False,,,1,0,0
2,False,2005,True,,,1,0,0
3,False,2005,False,,,1,0,0
4,False,2005,True,,,0,1,0


In [13]:
# Impute the median score only into the rolling-form features, which are the
# columns that are empty until a team has enough match history. A blanket
# X.fillna(median_score) would also write a score value into any other feature
# column (flags, season, booleans) that happened to contain a missing value.
rolling_feature_columns = ['rolling_score', 'rolling_conceded_opposition']
X = X.fillna({column: median_score for column in rolling_feature_columns})
X.head()

Unnamed: 0,neutral_site,season,home,rolling_score,rolling_conceded_opposition,international_competition,club_competition,unknown_competition
0,False,2005,True,21.0,21.0,1,0,0
1,False,2005,False,21.0,21.0,1,0,0
2,False,2005,True,21.0,21.0,1,0,0
3,False,2005,False,21.0,21.0,1,0,0
4,False,2005,True,21.0,21.0,0,1,0


In [14]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. A fixed random_state makes the shuffle,
# and so every metric computed from this split, repeatable across kernel
# restarts; without it each run produces a different split and a different MAE.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score

# 10-fold cross-validated mean absolute error of a default-parameter XGBoost
# regressor, computed on the training split only.
xgbr = XGBRegressor()
scores = cross_val_score(
    estimator=xgbr,
    X=X_train,
    y=y_train,
    scoring='neg_mean_absolute_error',
    cv=10,
)
# The scorer returns negated MAE values, so flip the sign before averaging.
np.mean(-scores)

9.524835762254124

With a cross-validated MAE of about 9.52, this model doesn't beat the MAE of 9.34 achieved by the linear regression model that we built in our first notebook. Because of this, we're going to experiment with other window sizes for the rolling averages to see whether that gives us a better score.