## Take data from repository and add more features

In [104]:
import pandas as pd
import numpy as np

In [105]:
# Load the match table. Only this table is read here; the loaders for the
# other tables of the data set are kept below, commented out.
match_csv = './Data/Match.csv'
match = pd.read_csv(match_csv, index_col=False)
#country = pd.read_csv('./Data/Country.csv', index_col=False)
#team = pd.read_csv('./Data/Team.csv', index_col=False)
#player = pd.read_csv('./Data/Player.csv', index_col=False)
#p_att = pd.read_csv('./Data/Player_Attributes.csv', index_col=False)
#league = pd.read_csv('./Data/League.csv', index_col=False)
#t_att = pd.read_csv('./Data/Team_Attributes.csv', index_col=False)


In [106]:
## Add win/draw/loss labels (1,0,-1)
## The label is the sign of (home goals - away goals), i.e. it is expressed
## from the home side's point of view: 1 = home win, 0 = draw, -1 = home loss.

goal_dif = np.sign(match['home_team_goal'] - match['away_team_goal'])
match['target_clf'] = goal_dif

## Add Form Features

In [107]:
## Add Scoring/Defensive form
##
## For every season and every team, each match receives exponentially weighted
## averages of the team's results, goals scored and goals conceded in its
## EARLIER matches of that season (overall, home-only and away-only). The
## first match of each series has no history and gets a fixed prior instead.

alpha = 0.5 ##parameter for the decay of exp weighted mean


def _lagged_ewm(values, prior, alpha):
    """Exponentially weighted mean of the entries *before* each position.

    Parameters
    ----------
    values : pd.Series
        Observations in chronological order.
    prior : float
        Value used at the first position, where no history exists.
    alpha : float
        Smoothing factor passed to ``Series.ewm``.

    Returns
    -------
    pd.Series
        Same index as ``values``; entry ``i`` is the weighted mean of
        ``values[:i]``. An empty input gives an empty result.
    """
    # ewm().mean() at position i only uses values[:i + 1], so shifting by one
    # removes the current match from its own feature. This replaces the
    # Series.append based construction (Series.append no longer exists in
    # pandas 2.x) and also works for empty or single-element series.
    return values.ewm(alpha=alpha).mean().shift(1, fill_value=prior)


#Sort by earliest date first (the lagged averages rely on chronological order).
#A stable sort keeps the file order for rows sharing the same date.
match = match.sort_values(by=['date'], kind='mergesort')

# Create the feature columns up front so they exist (as NaN) even for rows
# that are never assigned below.
form_columns = [
    'home_team_form', 'away_team_form',
    'home_team_home_form', 'away_team_away_form',
    'home_team_scoring_form', 'away_team_scoring_form',
    'home_team_defensive_form', 'away_team_defensive_form',
    'home_team_home_scoring_form', 'home_team_home_defensive_form',
    'away_team_away_scoring_form', 'away_team_away_defensive_form',
]
for column in form_columns:
    match[column] = np.nan


##loop over every season and every team
for season in match['season'].unique():
    season_matches = match[match['season'] == season]

    # Consider every team that appears in the season, home or away.
    season_teams = pd.unique(
        season_matches[['home_team_api_id', 'away_team_api_id']].values.ravel()
    )

    for team in season_teams:
        is_home = season_matches['home_team_api_id'] == team
        is_away = season_matches['away_team_api_id'] == team

        df = season_matches[is_home | is_away]
        df_home = season_matches[is_home]
        df_away = season_matches[is_away]

        played_home = df['home_team_api_id'] == team
        played_away = df['away_team_api_id'] == team

        ##win loss form of a specific team
        # target_clf is +1 for a home win and -1 for a home loss, so it has
        # to be negated for the matches this team played away.
        result = df['target_clf'].where(played_home, -df['target_clf'])
        result_home = df_home['target_clf']
        result_away = -df_away['target_clf']

        ##goals score/conceded in each stage of the season by the team
        goals_scored = df['home_team_goal'].where(played_home, df['away_team_goal'])
        goals_conceded = df['away_team_goal'].where(played_home, df['home_team_goal'])

        ##take exp weighted average of the previous matches
        # Priors for the first match: neutral result (0); goal priors are the
        # fixed starting values 1 or 0.5 depending on the series.
        form = _lagged_ewm(result, 0, alpha)
        form_home = _lagged_ewm(result_home, 0, alpha)
        form_away = _lagged_ewm(result_away, 0, alpha)

        ewm_scored = _lagged_ewm(goals_scored, 1, alpha)
        ewm_conceded = _lagged_ewm(goals_conceded, 1, alpha)

        ewm_scored_home = _lagged_ewm(df_home['home_team_goal'], 1, alpha)
        ewm_conceded_home = _lagged_ewm(df_home['away_team_goal'], 0.5, alpha)

        ewm_scored_away = _lagged_ewm(df_away['away_team_goal'], 0.5, alpha)
        ewm_conceded_away = _lagged_ewm(df_away['home_team_goal'], 1, alpha)

        ##Add to match dataframe
        # The overall series cover home and away matches of the team; each
        # value goes to the home_* or away_* column of the row it belongs to
        # (.loc aligns the assigned Series on the index).
        match.loc[df_home.index, 'home_team_form'] = form[played_home]
        match.loc[df_away.index, 'away_team_form'] = form[played_away]

        match.loc[df_home.index, 'home_team_home_form'] = form_home
        match.loc[df_away.index, 'away_team_away_form'] = form_away

        match.loc[df_home.index, 'home_team_scoring_form'] = ewm_scored[played_home]
        match.loc[df_away.index, 'away_team_scoring_form'] = ewm_scored[played_away]
        match.loc[df_home.index, 'home_team_defensive_form'] = ewm_conceded[played_home]
        match.loc[df_away.index, 'away_team_defensive_form'] = ewm_conceded[played_away]

        match.loc[df_home.index, 'home_team_home_scoring_form'] = ewm_scored_home
        match.loc[df_home.index, 'home_team_home_defensive_form'] = ewm_conceded_home

        match.loc[df_away.index, 'away_team_away_scoring_form'] = ewm_scored_away
        match.loc[df_away.index, 'away_team_away_defensive_form'] = ewm_conceded_away

#Undo sort
match = match.sort_index()


In [108]:
# Reduce the match table to its id plus the twelve form features, in the
# order in which they should appear in the output files.
form_feature_names = [
    'home_team_form',
    'away_team_form',
    'home_team_home_form',
    'away_team_away_form',
    'home_team_scoring_form',
    'away_team_scoring_form',
    'home_team_defensive_form',
    'away_team_defensive_form',
    'home_team_home_scoring_form',
    'home_team_home_defensive_form',
    'away_team_away_scoring_form',
    'away_team_away_defensive_form',
]
match = match[['match_api_id'] + form_feature_names]

In [557]:
# ## Rebalance proportion of home wins/draws/home losses
# indices = y_train[y_train['result'] == 1].sample(frac = 0.35).index.values
# mask_y= ~ y_train.index.isin(indices)
# mask_x= ~ x_train.index.isin(indices)
# y_train = y_train[mask_y]
# x_train = x_train[mask_x]
# assert len(x_train) == len(y_train)
# assert (x_train.index == y_train.index).sum()
# assert (x_train.match_api_id == y_train.match_api_id).sum()

(726, 35)

## Output new data

In [111]:
#keep certain columns
# The match id, followed by the overall-rating columns of players 1-11 of the
# home side and then of players 1-11 of the away side.
keep = ['match_api_id'] + [
    '{}_player_{}_overall_rating'.format(side, slot)
    for side in ('home', 'away')
    for slot in range(1, 12)
]



In [112]:
#Output new full data
# Read the train / validation / test feature tables first, then restrict each
# of them to the columns listed in `keep`.

a, b, c = (
    pd.read_csv(csv_path)
    for csv_path in (
        './Data with correct player_atts/x_train.csv',
        './Data with correct player_atts/x_val.csv',
        './Data with correct player_atts/x_test.csv',
    )
)

a, b, c = (split[keep] for split in (a, b, c))

In [113]:
# Left-join the form features onto each split by match id and write the
# result to a CSV file in the working directory (without the index column).
for split_frame, out_file in (
    (a, 'x_train_full.csv'),
    (b, 'x_val_full.csv'),
    (c, 'x_test_full.csv'),
):
    merged = split_frame.merge(match, how='left', on='match_api_id')
    merged.to_csv(out_file, index=False)