# Capstone 2: Preprocessing (Match Winner Predictor)

## Importing Packages

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

## Individual Player Preprocessing

### Reading in the Individual Player File

In [2]:
ind_df = pd.read_csv('C:/Users/justi/SpringboardDS/Capstone II/1.Match Winner Predictor/df_individual.csv')
ind_df.drop(columns = 'Unnamed: 0', axis = 1, inplace = True)
ind_df.head()

Unnamed: 0,date,player_name,team,opponent,country,player_id,match_id,event_id,event_name,map,...,deaths_ct,kddiff_ct,adr_ct,kast_ct,kills_t,deaths_t,kddiff_t,adr_t,kast_t,match_outcome
0,2020-02-27,Andersin,Thunder Logic,Station7,United States,14038,2339816,5151,ESEA MDL Season 33 North America,Overpass,...,2,1,145.7,100.0,18,8,10,118.1,86.7,Win
1,2020-02-27,FrostayK,Station7,Thunder Logic,United States,12090,2339816,5151,ESEA MDL Season 33 North America,Overpass,...,13,-5,62.5,60.0,1,3,-2,12.3,33.3,Loss
2,2020-02-27,Inseaniac,Thunder Logic,Station7,Canada,18623,2339816,5151,ESEA MDL Season 33 North America,Overpass,...,0,5,192.3,100.0,15,7,8,80.8,86.7,Win
3,2020-02-27,PureR,Thunder Logic,Station7,United States,10622,2339816,5151,ESEA MDL Season 33 North America,Overpass,...,0,4,63.7,100.0,14,9,5,96.5,86.7,Win
4,2020-02-27,Sharkie,Thunder Logic,Station7,United States,19476,2339816,5151,ESEA MDL Season 33 North America,Overpass,...,0,3,58.3,100.0,10,7,3,77.9,86.7,Win


### Splitting Into Train/Test Sets

In [3]:
# Split features into X and y dataframes.
X = ind_df.drop(columns=['match_outcome', 'player_name', 'team', 'opponent', 'player_id', 'match_id', 'event_id', 'event_name', 'map'])
y = ind_df['match_outcome']

# Split into train/test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

The variable 'player_name' could be useful but since we are interested in how individual performance predicts match outcomes we don't want performance from other matches impacting our predictions, so we will drop this feature. Since we are interested in predicting whether a match will result in a win or a loss from individual player data, the variables 'team' and 'opponent' should not be included in our modeling (we are not interested in how the team a player is on or facing affects the outcome of a match). Since there are teams with players from different countries we will keep the 'country' variable. The 'event_name' and 'map' variables will also not be included since they don't provide much for our goal of predicting match outcomes. It is also worth noting that we dropped our 3 numeric id columns since they are of little value moving forward (we don't want them in our analysis and we can merge our numeric and categorical dataframes by using the index).

### Scaling Our Numeric Features

In [4]:
# We first need to split our data into two dataframes of numerical and categorical data.
X_train_numeric = X_train.drop(columns=['date', 'country'])
X_test_numeric = X_test.drop(columns=['date', 'country'])
X_train_categorical = X_train.drop(X_train.columns.difference(['country']), 1) # This code keeps the columns listed. (We don't need the date column)
X_test_categorical = X_test.drop(X_test.columns.difference(['country']), 1)

# Save the column names
X_col = X_train_numeric.columns

  X_train_categorical = X_train.drop(X_train.columns.difference(['country']), 1) # This code keeps the columns listed. (We don't need the date column)
  X_test_categorical = X_test.drop(X_test.columns.difference(['country']), 1)


In [5]:
# Initiate, fit, and transform the scaler on our numeric features.
scaler = StandardScaler()
scaler.fit(X_train_numeric)
X_train_scaled = scaler.transform(X_train_numeric)
X_test_scaled = scaler.transform(X_test_numeric)

In [6]:
X_train_scaled = pd.DataFrame(X_train_scaled, index = X_train_numeric.index, columns = X_col)
X_test_scaled = pd.DataFrame(X_test_scaled, index = X_test_numeric.index, columns = X_col)

In [7]:
X_train_scaled.head()

Unnamed: 0,kills,assists,deaths,hs,flash_assists,kast,kddiff,adr,fkdiff,kills_ct,deaths_ct,kddiff_ct,adr_ct,kast_ct,kills_t,deaths_t,kddiff_t,adr_t,kast_t
23335,-0.435906,-0.464604,0.064459,-0.009613,-0.899852,-0.32261,-0.42495,-0.761278,-0.41413,-1.364995,0.774766,-1.773084,-1.152674,-1.130213,0.83026,-0.630169,1.180915,0.085498,0.661377
112712,3.227906,-0.873878,0.271812,3.608984,-0.011025,1.303165,2.642961,3.01815,0.816729,2.525977,1.073904,1.611957,2.257036,0.513792,1.990442,-0.630169,2.18899,1.93522,1.43811
113696,0.519871,-1.283152,-0.764951,-0.268085,-0.011025,0.461393,0.969555,0.896553,2.457873,-1.14883,-0.421782,-0.777484,-0.702834,-0.305132,1.990442,-0.630169,2.18899,1.46527,0.744386
2299,-0.754498,2.400315,0.89387,-0.526556,0.877802,-0.32261,-1.261653,-0.86256,0.406443,-0.284169,1.073904,-0.976604,-0.237873,0.101251,-0.793995,0.199315,-0.835234,-1.034862,-0.613415
30170,0.201279,-0.464604,-0.142893,2.058157,-0.011025,-0.347368,0.272303,0.795271,0.406443,0.148161,-0.720919,0.616357,1.746713,0.975591,0.134151,0.47581,-0.230389,-0.02353,-0.838728


In [8]:
X_test_scaled.head()

Unnamed: 0,kills,assists,deaths,hs,flash_assists,kast,kddiff,adr,fkdiff,kills_ct,deaths_ct,kddiff_ct,adr_ct,kast_ct,kills_t,deaths_t,kddiff_t,adr_t,kast_t
44195,-0.117313,0.353944,0.271812,0.248858,-0.011025,0.114781,-0.285499,0.560722,0.816729,0.364326,0.774766,-0.180124,0.007837,-0.305132,-0.561958,-0.353674,-0.230389,0.938926,0.613942
81783,-0.27661,-0.464604,-2.216421,-0.268085,-0.899852,1.146364,1.248456,0.688658,0.406443,-1.797325,-1.917468,-0.379244,-1.224497,1.745256,1.526369,-1.183159,2.18899,0.912609,0.744386
63824,0.679167,-0.05533,-0.350246,0.507329,-0.899852,1.534239,0.830105,1.147093,0.406443,0.580491,-1.020057,1.213717,2.170092,1.129524,0.366187,0.47581,-0.028774,-0.064886,1.141647
72738,-0.595202,0.353944,-0.142893,0.507329,0.877802,-0.4464,-0.42495,-0.86256,-0.41413,-1.14883,1.373041,-1.972204,-1.387044,-0.717672,0.366187,-1.459654,1.38253,0.465217,0.15146
43538,0.041983,-0.05533,0.89387,0.248858,-0.899852,0.874026,-0.5644,-0.313503,-0.41413,-0.068004,1.073904,-0.777484,-0.411761,0.926332,0.134151,0.199315,-0.028774,-0.049848,0.347126


### Encoding Dummy Variables

In [9]:
X_train_dummies = pd.get_dummies(X_train_categorical['country'])
X_test_dummies = pd.get_dummies(X_test_categorical['country'])
X_train_dummies.head()

Unnamed: 0,Albania,Algeria,Argentina,Australia,Austria,Azerbaijan,Bangladesh,Belarus,Belgium,Bosnia and Herzegovina,...,Tunisia,Turkey,Ukraine,United Arab Emirates,United Kingdom,United States,Uruguay,Uzbekistan,Venezuela,Vietnam
23335,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
112712,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
113696,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2299,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
30170,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [10]:
X_test_dummies.head()

Unnamed: 0,Albania,Algeria,Argentina,Australia,Austria,Azerbaijan,Bangladesh,Belarus,Belgium,Bosnia and Herzegovina,...,Thailand,Tunisia,Turkey,Ukraine,United Arab Emirates,United Kingdom,United States,Uruguay,Uzbekistan,Vietnam
44195,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
81783,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
63824,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
72738,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
43538,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


We can see that there are countries in the training set that are not in the test set. This means that we need to add these 5 country's dummy features to the test set which will all be set to zeros across the entire columns. The countries are: Cambodia, Iceland, Iran, Montenegro, and Venezuela.

In [11]:
X_test_dummies['Cambodia'] = 0
X_test_dummies['Iceland'] = 0
X_test_dummies['Iran'] = 0
X_test_dummies['Montenegro'] = 0
X_test_dummies['Venezuela'] = 0
X_test_dummies.shape

(26101, 82)

### Merging Features Back Together

In [12]:
# Merge the numerical and categorical (dummy) dataframes for the training data and the testing data, respectively.
ind_X_train_final = X_train_scaled.merge(X_train_dummies, how = 'inner', left_index = True, right_index = True)
ind_X_train_final.head()

Unnamed: 0,kills,assists,deaths,hs,flash_assists,kast,kddiff,adr,fkdiff,kills_ct,...,Tunisia,Turkey,Ukraine,United Arab Emirates,United Kingdom,United States,Uruguay,Uzbekistan,Venezuela,Vietnam
23335,-0.435906,-0.464604,0.064459,-0.009613,-0.899852,-0.32261,-0.42495,-0.761278,-0.41413,-1.364995,...,0,0,0,0,0,0,0,0,0,0
112712,3.227906,-0.873878,0.271812,3.608984,-0.011025,1.303165,2.642961,3.01815,0.816729,2.525977,...,0,0,0,0,0,0,0,0,0,0
113696,0.519871,-1.283152,-0.764951,-0.268085,-0.011025,0.461393,0.969555,0.896553,2.457873,-1.14883,...,0,0,0,0,0,0,0,0,0,0
2299,-0.754498,2.400315,0.89387,-0.526556,0.877802,-0.32261,-1.261653,-0.86256,0.406443,-0.284169,...,0,1,0,0,0,0,0,0,0,0
30170,0.201279,-0.464604,-0.142893,2.058157,-0.011025,-0.347368,0.272303,0.795271,0.406443,0.148161,...,0,1,0,0,0,0,0,0,0,0


In [13]:
ind_X_test_final = X_test_scaled.merge(X_test_dummies, how = 'inner', left_index = True, right_index = True)
ind_X_test_final.head()

Unnamed: 0,kills,assists,deaths,hs,flash_assists,kast,kddiff,adr,fkdiff,kills_ct,...,United Kingdom,United States,Uruguay,Uzbekistan,Vietnam,Cambodia,Iceland,Iran,Montenegro,Venezuela
44195,-0.117313,0.353944,0.271812,0.248858,-0.011025,0.114781,-0.285499,0.560722,0.816729,0.364326,...,0,0,0,0,0,0,0,0,0,0
81783,-0.27661,-0.464604,-2.216421,-0.268085,-0.899852,1.146364,1.248456,0.688658,0.406443,-1.797325,...,0,0,0,0,0,0,0,0,0,0
63824,0.679167,-0.05533,-0.350246,0.507329,-0.899852,1.534239,0.830105,1.147093,0.406443,0.580491,...,0,0,0,0,0,0,0,0,0,0
72738,-0.595202,0.353944,-0.142893,0.507329,0.877802,-0.4464,-0.42495,-0.86256,-0.41413,-1.14883,...,0,1,0,0,0,0,0,0,0,0
43538,0.041983,-0.05533,0.89387,0.248858,-0.899852,0.874026,-0.5644,-0.313503,-0.41413,-0.068004,...,0,1,0,0,0,0,0,0,0,0


### Saving our train and test sets

In [14]:
ind_X_train_final.to_csv('C:/Users/justi/SpringboardDS/Capstone II/1.Match Winner Predictor/Individual Data/X_train_ind.csv')

In [15]:
y_train.to_csv('C:/Users/justi/SpringboardDS/Capstone II/1.Match Winner Predictor/Individual Data/y_train_ind.csv')

In [16]:
ind_X_test_final.to_csv('C:/Users/justi/SpringboardDS/Capstone II/1.Match Winner Predictor/Individual Data/X_test_ind.csv')

In [17]:
y_test.to_csv('C:/Users/justi/SpringboardDS/Capstone II/1.Match Winner Predictor/Individual Data/y_test_ind.csv')

## Team Players Preprocessing

### Reading in the Team Player File

In [18]:
team_df = pd.read_csv('df_team.csv')
team_df.head()

Unnamed: 0,match_id,team,date,opponent,event_name,map,kills,assists,deaths,hs,...,deaths_ct,kddiff_ct,adr_ct,kast_ct,kills_t,deaths_t,kddiff_t,adr_t,kast_t,match_outcome
0,2299001,Dignitas,2015-11-03,NiP,ESL ESEA Pro League Season 2,Train,10.2,2.0,17.8,3.8,...,8.6,-4.0,78.133799,71.432949,4.8,8.6,-4.0,71.627147,67.267077,Loss
1,2299001,NiP,2015-11-03,Dignitas,ESL ESEA Pro League Season 2,Train,17.8,3.4,10.2,7.4,...,4.8,3.6,78.133799,71.432949,8.8,4.8,3.6,71.627147,67.267077,Win
2,2299003,Envy,2015-11-03,NiP,ESL ESEA Pro League Season 2,Cobblestone,16.4,4.2,19.2,6.6,...,9.4,-1.6,78.133799,71.432949,8.0,9.4,-1.6,71.627147,67.267077,Loss
3,2299003,NiP,2015-11-03,Envy,ESL ESEA Pro League Season 2,Cobblestone,19.2,2.2,16.4,9.8,...,7.8,1.2,78.133799,71.432949,9.4,7.8,1.2,71.627147,67.267077,Win
4,2299011,CLG,2015-11-04,Liquid,ESL ESEA Pro League Season 2,Inferno,20.4,5.4,16.8,5.2,...,8.2,1.6,78.133799,71.432949,10.0,8.2,1.6,71.627147,67.267077,Win


### Splitting Into Train/Test Sets

In [19]:
# Split features into X and y dataframes.
X = team_df.drop(columns=['match_outcome', 'match_id', 'team', 'opponent', 'event_name', 'map'])
y = team_df['match_outcome']

# Split into train/test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

The variables 'team' and 'opponent' should not be included in our modeling. We don't want teams' performances from other matches impacting our predictions, but rather how a teams performance in a given match affects the match outcome, so we will drop this feature. The 'event_name' and 'map' variables will also not be included since they don't provide much for our goal of predicting match outcomes. It is also worth noting that we dropped our numeric id column 'match_id' since it is of little value moving forward (we don't want it in our analysis and we can merge our numeric and categorical dataframes by using the index).

### Scaling Our Numeric Features

In [20]:
# We have no categorical features in our X train/test sets, so we can keep one dataframe.
X_train = X_train.drop(columns=['date'])
X_test = X_test.drop(columns=['date'])

# Save the column names
X_col = X_train.columns

In [21]:
# Initiate, fit, and transform the scaler on our features.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [22]:
X_train_scaled = pd.DataFrame(X_train_scaled, index = X_train.index, columns = X_col)
X_test_scaled = pd.DataFrame(X_test_scaled, index = X_test.index, columns = X_col)

In [23]:
X_train_scaled.head()

Unnamed: 0,kills,assists,deaths,hs,flash_assists,kast,kddiff,adr,fkdiff,kills_ct,deaths_ct,kddiff_ct,adr_ct,kast_ct,kills_t,deaths_t,kddiff_t,adr_t,kast_t
25203,-0.413578,-1.028041,-1.825789,-0.085709,-0.351181,1.091136,1.264046,1.605045,0.330691,-1.903453,-2.440854,0.278748,1.309783,1.224913,1.503915,-0.200328,1.533933,1.371396,0.878266
17068,-1.764283,-1.437843,-0.110721,-1.031753,-1.696651,-0.499401,-1.48499,-1.676886,-0.488226,-2.257971,-1.452188,-0.929629,-2.253484,-1.721457,-0.059716,1.167508,-1.188843,-0.883055,0.0611
11817,0.081681,-0.345037,0.385746,0.387312,-0.014814,0.198982,-0.27218,-0.097159,1.477175,0.046395,1.184252,-0.987171,-0.694447,-0.167098,0.070587,-0.557155,0.60703,0.505724,0.468916
22347,-0.278507,-0.208436,-2.186856,-0.653336,-0.351181,1.212242,1.708743,2.325367,0.822041,1.523552,-0.397612,1.832375,1.586525,0.689648,-2.079406,-2.519703,0.60703,2.193862,1.747558
24194,-0.18846,0.337966,0.70168,-0.653336,-1.023916,-0.323796,-0.797731,-0.599516,-2.453628,0.105481,0.986519,-0.757004,-0.685853,-0.274794,-0.385472,0.037556,-0.377804,-0.349146,-0.296123


In [24]:
X_test_scaled.head()

Unnamed: 0,kills,assists,deaths,hs,flash_assists,kast,kddiff,adr,fkdiff,kills_ct,deaths_ct,kddiff_ct,adr_ct,kast_ct,kills_t,deaths_t,kddiff_t,adr_t,kast_t
2501,0.666986,0.201366,-0.471788,0.576521,-0.014814,0.394771,1.021484,0.778853,0.658258,0.991776,-0.068057,1.026791,0.725359,0.581952,-0.124867,-0.557155,0.433236,0.340614,-0.135143
21040,1.522433,1.703973,1.468947,0.671126,0.657921,0.138429,0.051236,0.100048,0.985825,1.523552,0.788786,0.796623,0.778645,0.421212,0.526646,1.226979,-0.725392,-0.599127,-0.210267
5357,-2.574706,-2.257447,-0.291254,-2.26161,-0.014814,-2.707582,-2.050968,-3.046952,-1.470927,-2.317057,-1.320366,-1.159796,-2.537101,-2.28887,-1.232439,0.75121,-1.88402,-2.069689,-1.912057
20854,-1.08893,-1.711044,0.340613,-1.126358,-1.696651,-0.648766,-1.282855,-1.224349,-1.307144,0.164568,1.250163,-0.929629,-0.644599,-0.274794,-1.753649,-0.676098,-0.899186,-1.98019,-1.209877
8449,-1.809307,-0.618238,-0.110721,-1.315567,-0.687549,-1.63175,-1.525417,-1.490059,-0.979577,-0.958072,1.447896,-2.195548,-2.043779,-2.095982,-1.558195,-1.449223,0.027716,0.46869,0.674357


### Encoding Dummy Variables

There is no categorical data present so we do not need to encode any duimmy variables.

### Merging Features Back Together

We have only one dataframe due to no categorical variables being present, so no merge is needed.

### Saving our train and test sets

In [25]:
X_train_scaled.to_csv('C:/Users/justi/SpringboardDS/Capstone II/1.Match Winner Predictor/Team Data/X_train_team.csv')

In [26]:
y_train.to_csv('C:/Users/justi/SpringboardDS/Capstone II/1.Match Winner Predictor/Team Data/y_train_team.csv')

In [27]:
X_test_scaled.to_csv('C:/Users/justi/SpringboardDS/Capstone II/1.Match Winner Predictor/Team Data/X_test_team.csv')

In [28]:
y_test.to_csv('C:/Users/justi/SpringboardDS/Capstone II/1.Match Winner Predictor/Team Data/y_test_team.csv')