In [21]:
import numpy as np 
import pandas as pd

from rugby_prediction.constants import DATA_FOLDER

In [22]:
raw_match_data_path = DATA_FOLDER.joinpath('match_data.csv')
raw_match_data = pd.read_csv(raw_match_data_path)
raw_match_data.head()

Unnamed: 0,match_id,unique_id,match_date,venue,city,state,neutral_site,indoor,team_1_id,team_1_name,...,team_1_score,team_1_winner,team_2_id,team_2_name,team_2_abbreviation,team_2_home_away,team_2_score,team_2_winner,competition,season
0,24843,s:300~l:8323~e:24843~c:24843,2005-02-05T14:00Z,Stade de France,Saint-Denis,Reunion,False,False,9,France,...,16,True,2,Scotland,SCOT,away,9,False,Six Nations,2005
1,24844,s:300~l:8323~e:24844~c:24844,2005-02-05T17:30Z,Principality Stadium,Cardiff,Wales,False,False,4,Wales,...,11,True,1,England,ENG,away,9,False,Six Nations,2005
2,128030,s:300~l:8007~e:128030~c:128030,2005-02-05T14:00Z,Twickenham Stoop,London,England,False,False,25901,Harlequins,...,38,True,25900,Gloucester Rugby,GLOUC,away,9,False,Gallagher Premiership,2005
3,128028,s:300~l:8007~e:128028~c:128028,2005-02-05T15:00Z,cinch Stadium at Franklin's Gardens,Northampton,England,False,False,25907,Northampton Saints,...,20,False,25909,Saracens,SARAC,away,21,True,Gallagher Premiership,2005
4,128029,s:300~l:8007~e:128029~c:128029,2005-02-05T19:45Z,Recreation Ground,Bath,England,False,False,25898,Bath Rugby,...,6,False,25903,Leicester Tigers,LEICS,away,6,False,Gallagher Premiership,2005


In [23]:
from rugby_prediction.preprocessing import drop_nill_draws 

raw_match_data = drop_nill_draws(raw_match_data)

- Create previous conceded + scored columns
- Create previous result column
- Create rolling average columns 
- Test all initially with your default values (last conceded + last scored - last result + rolling 5 average)
- See what the score is 
- If good, make the pipelines
- If not, reassess on making some loops to test this

In [24]:
# creating a result column
team_1_conditions = [
    raw_match_data['team_1_score'] > raw_match_data['team_2_score'],
    raw_match_data['team_1_score'] == raw_match_data['team_2_score'],
    raw_match_data['team_1_score'] < raw_match_data['team_2_score'],
]
team_2_conditions = [
    raw_match_data['team_1_score'] < raw_match_data['team_2_score'],
    raw_match_data['team_1_score'] == raw_match_data['team_2_score'],
    raw_match_data['team_1_score'] > raw_match_data['team_2_score'],
]
choices = ['win', 'draw', 'loss']

raw_match_data['team_1_result'] = np.select(team_1_conditions, choices, default=np.nan)
raw_match_data['team_2_result'] = np.select(team_2_conditions, choices, default=np.nan)
raw_match_data[['team_1_score', 'team_2_score', 'team_1_result', 'team_2_result']].head(10)

Unnamed: 0,team_1_score,team_2_score,team_1_result,team_2_result
0,16,9,win,loss
1,11,9,win,loss
2,38,9,win,loss
3,20,21,loss,win
4,6,6,draw,draw
5,17,28,loss,win
6,29,28,win,loss
7,6,9,loss,win
8,8,38,loss,win
9,13,40,loss,win


In [25]:
# create score against columns
raw_match_data['team_1_score_against'] = raw_match_data['team_2_score']
raw_match_data['team_2_score_against'] = raw_match_data['team_1_score']

In [26]:
# create previous score and score against columns
raw_match_data['team_1_prev_score'] = raw_match_data.sort_values(by='match_date').groupby('team_1_id')['team_1_score'].shift(1)
raw_match_data['team_1_prev_score_against'] = raw_match_data.sort_values(by='match_date').groupby('team_1_id')['team_1_score_against'].shift(1)
raw_match_data['team_2_prev_score'] = raw_match_data.sort_values(by='match_date').groupby('team_2_id')['team_2_score'].shift(1)
raw_match_data['team_2_prev_score_against'] = raw_match_data.sort_values(by='match_date').groupby('team_2_id')['team_2_score_against'].shift(1)

In [27]:
# previous result column
raw_match_data['team_1_prev_result'] = raw_match_data.sort_values(by='match_date').groupby('team_1_id')['team_1_result'].shift(1)
raw_match_data['team_2_prev_result'] = raw_match_data.sort_values(by='match_date').groupby('team_2_id')['team_2_result'].shift(1)


In [28]:
# rolling avg columns
raw_match_data['team_1_rolling_score'] = raw_match_data.sort_values(by='match_date').groupby('team_1_id')['team_1_prev_score'].rolling(5).mean().reset_index(level='team_1_id', drop=True)
raw_match_data['team_1_rolling_score_against'] = raw_match_data.sort_values(by='match_date').groupby('team_1_id')['team_1_prev_score_against'].rolling(5).mean().reset_index(level='team_1_id', drop=True)
raw_match_data['team_2_rolling_score'] = raw_match_data.sort_values(by='match_date').groupby('team_2_id')['team_2_prev_score'].rolling(5).mean().reset_index(level='team_2_id', drop=True)
raw_match_data['team_2_rolling_score_against'] = raw_match_data.sort_values(by='match_date').groupby('team_2_id')['team_2_prev_score_against'].rolling(5).mean().reset_index(level='team_2_id', drop=True)


In [29]:
# imputing missing values - with median score
all_scores = raw_match_data['team_1_score'].tolist() + raw_match_data['team_2_score'].tolist()
median_score = np.median(all_scores)
median_score

21.0

In [30]:
from rugby_prediction.preprocessing import drop_competitions

raw_match_data = drop_competitions(raw_match_data)

In [31]:
# create row per team, per match
from rugby_prediction.preprocessing import TEAM_COLUMNS, CORE_COLUMNS, transform_raw_data_to_team_level

team_columns_with_conceded = TEAM_COLUMNS + ['score_against', 'rolling_score', 'rolling_score_against', 'prev_result', 'prev_score', 'prev_score_against']
team_level_data = transform_raw_data_to_team_level(raw_match_data, team_columns_with_conceded)
team_level_data.head()

Unnamed: 0,match_id,unique_id,match_date,venue,city,state,neutral_site,indoor,competition,season,...,id_opposition,home_away_opposition,score_opposition,winner_opposition,score_against_opposition,rolling_score_opposition,rolling_score_against_opposition,prev_result_opposition,prev_score_opposition,prev_score_against_opposition
0,24843,s:300~l:8323~e:24843~c:24843,2005-02-05T14:00Z,Stade de France,Saint-Denis,Reunion,False,False,Six Nations,2005,...,2,away,9,False,16,,,,,
1,24843,s:300~l:8323~e:24843~c:24843,2005-02-05T14:00Z,Stade de France,Saint-Denis,Reunion,False,False,Six Nations,2005,...,9,home,16,True,9,,,,,
2,24844,s:300~l:8323~e:24844~c:24844,2005-02-05T17:30Z,Principality Stadium,Cardiff,Wales,False,False,Six Nations,2005,...,1,away,9,False,11,,,,,
3,24844,s:300~l:8323~e:24844~c:24844,2005-02-05T17:30Z,Principality Stadium,Cardiff,Wales,False,False,Six Nations,2005,...,4,home,11,True,9,,,,,
4,128030,s:300~l:8007~e:128030~c:128030,2005-02-05T14:00Z,Twickenham Stoop,London,England,False,False,Gallagher Premiership,2005,...,25900,away,9,False,38,,,,,


In [32]:
from rugby_prediction.preprocessing import map_competitions

team_level_data = map_competitions(team_level_data)
team_level_data['home'] = np.where(team_level_data['home_away'] == 'home', True, False)
team_level_data.head()

Unnamed: 0,match_id,unique_id,match_date,venue,city,state,neutral_site,indoor,season,id,...,score_against_opposition,rolling_score_opposition,rolling_score_against_opposition,prev_result_opposition,prev_score_opposition,prev_score_against_opposition,international_competition,club_competition,unknown_competition,home
0,24843,s:300~l:8323~e:24843~c:24843,2005-02-05T14:00Z,Stade de France,Saint-Denis,Reunion,False,False,2005,9,...,16,,,,,,1,0,0,True
1,24843,s:300~l:8323~e:24843~c:24843,2005-02-05T14:00Z,Stade de France,Saint-Denis,Reunion,False,False,2005,2,...,9,,,,,,1,0,0,False
2,24844,s:300~l:8323~e:24844~c:24844,2005-02-05T17:30Z,Principality Stadium,Cardiff,Wales,False,False,2005,4,...,11,,,,,,1,0,0,True
3,24844,s:300~l:8323~e:24844~c:24844,2005-02-05T17:30Z,Principality Stadium,Cardiff,Wales,False,False,2005,1,...,9,,,,,,1,0,0,False
4,128030,s:300~l:8007~e:128030~c:128030,2005-02-05T14:00Z,Twickenham Stoop,London,England,False,False,2005,25901,...,38,,,,,,0,1,0,True


In [33]:
team_level_data.columns

Index(['match_id', 'unique_id', 'match_date', 'venue', 'city', 'state',
       'neutral_site', 'indoor', 'season', 'id', 'home_away', 'score',
       'winner', 'score_against', 'rolling_score', 'rolling_score_against',
       'prev_result', 'prev_score', 'prev_score_against', 'id_opposition',
       'home_away_opposition', 'score_opposition', 'winner_opposition',
       'score_against_opposition', 'rolling_score_opposition',
       'rolling_score_against_opposition', 'prev_result_opposition',
       'prev_score_opposition', 'prev_score_against_opposition',
       'international_competition', 'club_competition', 'unknown_competition',
       'home'],
      dtype='object')

In [34]:
training_columns = [
    'neutral_site',
    'season',
    'home',
    'rolling_score',
    'rolling_score_against',
    'prev_result',
    'prev_score',
    'prev_score_against',
    'rolling_score_opposition',
    'rolling_score_against_opposition',
    'prev_result_opposition',
    'prev_score_opposition',
    'prev_score_against_opposition',
    'international_competition',
    'club_competition',
    'unknown_competition',
]
target_column = ['score']

In [35]:
X = team_level_data[training_columns]
y = team_level_data[target_column]
X.head()

Unnamed: 0,neutral_site,season,home,rolling_score,rolling_score_against,prev_result,prev_score,prev_score_against,rolling_score_opposition,rolling_score_against_opposition,prev_result_opposition,prev_score_opposition,prev_score_against_opposition,international_competition,club_competition,unknown_competition
0,False,2005,True,,,,,,,,,,,1,0,0
1,False,2005,False,,,,,,,,,,,1,0,0
2,False,2005,True,,,,,,,,,,,1,0,0
3,False,2005,False,,,,,,,,,,,1,0,0
4,False,2005,True,,,,,,,,,,,0,1,0


In [36]:
null_value_replacements = {
    'rolling_score': median_score,
    'rolling_score_against': median_score,
    'prev_result': 'Unknown',
    'prev_score': median_score,
    'prev_score_against': median_score,
    'rolling_score_opposition': median_score,
    'rolling_score_against_opposition': median_score,
    'prev_result_opposition': 'Unknown',
    'prev_score_opposition': median_score,
    'prev_score_against_opposition': median_score
}
X = X.fillna(value=null_value_replacements)

X.isna().sum()


neutral_site                        0
season                              0
home                                0
rolling_score                       0
rolling_score_against               0
prev_result                         0
prev_score                          0
prev_score_against                  0
rolling_score_opposition            0
rolling_score_against_opposition    0
prev_result_opposition              0
prev_score_opposition               0
prev_score_against_opposition       0
international_competition           0
club_competition                    0
unknown_competition                 0
dtype: int64

In [42]:
# one hot encode the result column
cat_columns = ['prev_result', 'prev_result_opposition']
X_cat = X[cat_columns]
X_num = X.drop(columns=cat_columns)

X_cat = pd.get_dummies(X_cat)

In [44]:
# merge data back together
X = pd.merge(X_num, X_cat, left_index=True, right_index=True)
X.head()

Unnamed: 0,neutral_site,season,home,rolling_score,rolling_score_against,prev_score,prev_score_against,rolling_score_opposition,rolling_score_against_opposition,prev_score_opposition,...,club_competition,unknown_competition,prev_result_Unknown,prev_result_draw,prev_result_loss,prev_result_win,prev_result_opposition_Unknown,prev_result_opposition_draw,prev_result_opposition_loss,prev_result_opposition_win
0,False,2005,True,21.0,21.0,21.0,21.0,21.0,21.0,21.0,...,0,0,1,0,0,0,1,0,0,0
1,False,2005,False,21.0,21.0,21.0,21.0,21.0,21.0,21.0,...,0,0,1,0,0,0,1,0,0,0
2,False,2005,True,21.0,21.0,21.0,21.0,21.0,21.0,21.0,...,0,0,1,0,0,0,1,0,0,0
3,False,2005,False,21.0,21.0,21.0,21.0,21.0,21.0,21.0,...,0,0,1,0,0,0,1,0,0,0
4,False,2005,True,21.0,21.0,21.0,21.0,21.0,21.0,21.0,...,1,0,1,0,0,0,1,0,0,0


In [45]:
# train model
from sklearn.model_selection import train_test_split 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [46]:
from xgboost import XGBRegressor
from sklearn.model_selection import cross_val_score

xgbr = XGBRegressor()
scores = cross_val_score(xgbr, X_train, y_train, cv=10, scoring='neg_mean_absolute_error')
np.mean(-scores)

9.236109423383299

So, this model - *just* beats the baseline I got of 9.35 mean absolute error. This is not a good model, but this repository is focused on me practicising productionising, so for now, I'm going to use this (bang average) model.