## Take data from repository and add more features

In [99]:
import pandas as pd
import numpy as np
import math
from sklearn.model_selection import train_test_split

In [56]:
# Load the raw match table; index_col=False stops pandas from using the first column as the index.
match_csv_path = './Data_Raw//Match.csv'
match = pd.read_csv(match_csv_path, index_col=False)

## Add Features

In [57]:
## Add win/draw/loss labels (1,0,-1) to match, will be deleted later on

# The sign of the goal difference (home minus away) is the label:
# 1 = home win, 0 = draw, -1 = away win.
goal_dif = np.sign(match['home_team_goal'] - match['away_team_goal'])
match['target_clf'] = goal_dif

In [58]:
## Add Scoring/Defensive form

alpha = 0.5 ##parameter for the decay of exp weighted mean


def _lagged_ewm(values, prior, alpha):
    """Exponentially weighted mean of all *earlier* entries of ``values``.

    Entry i of the result is the EWM (decay ``alpha``) of entries 0..i-1, so a
    match never sees its own outcome. The first entry has no history and is
    set to ``prior``. The result keeps the index of ``values``; an empty input
    gives an empty result.
    """
    lagged = values.ewm(alpha=alpha).mean().shift(1)
    if len(lagged) > 0:
        lagged.iloc[0] = prior
    return lagged


#Sort by earliest date first (the lagged averages below rely on chronological order)
match = match.sort_values(by=['date'])


##loop over every season and every team
for season in match['season'].unique():
    season_matches = match[match['season'] == season]
    for team in season_matches['home_team_api_id'].unique():
        is_home = season_matches['home_team_api_id'] == team
        is_away = season_matches['away_team_api_id'] == team
        df = season_matches[is_home | is_away]
        df_home = season_matches[is_home]
        df_away = season_matches[is_away]

        played_home = df['home_team_api_id'] == team

        ##results from the team's own point of view (1 win, 0 draw, -1 loss).
        ##target_clf is stored from the home side's perspective, so it has to
        ##be negated for the games the team played away.
        win_loss = df['target_clf'].where(played_home, -df['target_clf'])
        win_loss_home = df_home['target_clf']
        win_loss_away = -df_away['target_clf']

        ##goals scored/conceded by the team in each of its games this season
        goals_scored = df['home_team_goal'].where(played_home, df['away_team_goal'])
        goals_conceded = df['away_team_goal'].where(played_home, df['home_team_goal'])

        ##exp weighted averages over previous games only; the second argument
        ##is the value used for the first game of the season
        form = _lagged_ewm(win_loss, 0, alpha)
        form_home = _lagged_ewm(win_loss_home, 0, alpha)
        form_away = _lagged_ewm(win_loss_away, 0, alpha)

        ewm_scored = _lagged_ewm(goals_scored, 1, alpha)
        ewm_conceded = _lagged_ewm(goals_conceded, 1, alpha)

        ewm_scored_home = _lagged_ewm(df_home['home_team_goal'], 1, alpha)
        ewm_conceded_home = _lagged_ewm(df_home['away_team_goal'], 0.5, alpha)

        ewm_scored_away = _lagged_ewm(df_away['away_team_goal'], 0.5, alpha)
        ewm_conceded_away = _lagged_ewm(df_away['home_team_goal'], 1, alpha)

        ##Add to match dataframe: all-games form goes to the home_* column on
        ##the team's home rows and to the away_* column on its away rows
        match.loc[df_home.index, 'home_team_form'] = form.loc[df_home.index]
        match.loc[df_away.index, 'away_team_form'] = form.loc[df_away.index]

        match.loc[df_home.index, 'home_team_home_form'] = form_home
        match.loc[df_away.index, 'away_team_away_form'] = form_away

        match.loc[df_home.index, 'home_team_scoring_form'] = ewm_scored.loc[df_home.index]
        match.loc[df_away.index, 'away_team_scoring_form'] = ewm_scored.loc[df_away.index]
        match.loc[df_home.index, 'home_team_defensive_form'] = ewm_conceded.loc[df_home.index]
        match.loc[df_away.index, 'away_team_defensive_form'] = ewm_conceded.loc[df_away.index]

        match.loc[df_home.index, 'home_team_home_scoring_form'] = ewm_scored_home
        match.loc[df_home.index, 'home_team_home_defensive_form'] = ewm_conceded_home

        match.loc[df_away.index, 'away_team_away_scoring_form'] = ewm_scored_away
        match.loc[df_away.index, 'away_team_away_defensive_form'] = ewm_conceded_away

#Undo sort
match = match.sort_index()
#Drop the helper label so the match outcome cannot leak into the feature table
match = match.drop(columns='target_clf')

In [557]:
# ## Rebalance proportion of home wins/draws/home losses
# indices = y_train[y_train['result'] == 1].sample(frac = 0.35).index.values
# mask_y= ~ y_train.index.isin(indices)
# mask_x= ~ x_train.index.isin(indices)
# y_train = y_train[mask_y]
# x_train = x_train[mask_x]
# assert len(x_train) == len(y_train)
# assert (x_train.index == y_train.index)
# assert (x_train.match_api_id == y_train.match_api_id)

(726, 35)

In [59]:
## Add normalised bookie odds

bookie_list = ['B365','BW','IW','LB','PS','WH','SJ','VC','BS'] #bookies we want to keep
for bookie in bookie_list:
    odds_cols = [bookie + 'H', bookie + 'D', bookie + 'A']
    # Invert the odds, then rescale each row so the three values sum to 1.
    inverse_odds = 1 / match[odds_cols]
    row_totals = inverse_odds.sum(axis=1)
    match.loc[:, odds_cols] = inverse_odds.div(row_totals, axis=0)

# Form new data

In [134]:
##Read in full attribute data and merge with new columns
# Left merge: every row of x_full is kept and gains the match columns that share its match_api_id.
x_full = pd.read_csv('./Data with correct player_atts/x_full.csv')
a = x_full.merge(match, how='left', on='match_api_id')
b = pd.read_csv('./Data with correct player_atts/y_full.csv')

## Data Selection

In [142]:
## Select time period/leagues

selected_leagues = [1729, 21518, 7809, 10257, 4769]
in_selected_leagues = a['league_id'].isin(selected_leagues)
a = a.loc[in_selected_leagues]

# Keep matches dated after the cutoff (the comparison is against the string '2010-08-01').
after_cutoff = a['date'] > '2010-08-01'
a = a.loc[after_cutoff]

# Keep only the label rows whose index label survived the feature filtering.
b = b.loc[b.index.isin(a.index)]


In [143]:
##Form train/val/test splits
RANDOM_SEED = 42 ##fixed seed so the splits written to disk can be regenerated exactly
holdout_size = len(a) // 10 ##test and val sets are each 10 percent of total data

fifteen_sixteen = a[a['season'] == '2015/2016'] ##Want to use latest season as test set
x_test = fifteen_sixteen.sample(n=holdout_size, random_state=RANDOM_SEED).sort_index()
y_test = b[b.index.isin(x_test.index)]

##everything not sampled for the test set is shared between train and val
x_remainder, y_remainder = a[~a.index.isin(x_test.index)], b[~b.index.isin(x_test.index)]
x_train, x_val, y_train, y_val = train_test_split(
    x_remainder, y_remainder, test_size=holdout_size, random_state=RANDOM_SEED
)

## Select Features

In [144]:
#keep certain columns
# Order: match id, home players 1-11, away players 1-11, then H/D/A odds per bookie.
_rating_cols = [
    side + '_player_' + str(slot) + '_overall_rating'
    for side in ('home', 'away')
    for slot in range(1, 12)
]
_odds_cols = [
    bookie + outcome
    for bookie in ('B365', 'BW', 'IW', 'LB', 'WH', 'VC')
    for outcome in ('H', 'D', 'A')
]
keep = ['match_api_id'] + _rating_cols + _odds_cols



In [145]:
## Decide which columns you want

# Restrict every split to the columns listed in `keep`.
x_train = x_train[keep]
x_val = x_val[keep]
x_test = x_test[keep]

In [146]:
# Discard every row that has a missing value in any kept column.
x_train = x_train.dropna(axis=0, how='any')
x_val = x_val.dropna(axis=0, how='any')
x_test = x_test.dropna(axis=0, how='any')

# Keep only the label rows whose feature row survived the drop.
y_train = y_train[y_train.index.isin(x_train.index)]
y_val = y_val[y_val.index.isin(x_val.index)]
y_test = y_test[y_test.index.isin(x_test.index)]

remaining_nans = [frame.isna().sum().sum() for frame in (x_train, x_val, x_test)]
assert all(count == 0 for count in remaining_nans)
x_train.shape, x_val.shape, x_test.shape

((8342, 41), (1038, 41), (1046, 41))

In [147]:
#Data quality check

# Features and labels must name the same match on every row of each split.
for features, labels in ((x_train, y_train), (x_val, y_val), (x_test, y_test)):
    ids_agree = features.match_api_id == labels.match_api_id
    assert ids_agree.sum() == len(features)

## Save new data

In [148]:
##Output new data
# Each split is written without its index column.
outputs = [
    (x_train, './Data with correct player_atts/x_train.csv'),
    (x_val, './Data with correct player_atts/x_val.csv'),
    (x_test, './Data with correct player_atts/x_test.csv'),
    (y_train, './Data with correct player_atts/y_train.csv'),
    (y_val, './Data with correct player_atts/y_val.csv'),
    (y_test, './Data with correct player_atts/y_test.csv'),
]
for frame, csv_path in outputs:
    frame.to_csv(csv_path, index=False)