# Capstone 2: Preprocessing (Match Winner Predictor)

## Importing Packages

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

## Individual Player Preprocessing

### Reading in the Individual Player File

In [2]:
ind_df = pd.read_csv('C:/Users/justi/SpringboardDS/Capstone II/1.Match Winner Predictor/df_individual.csv')
ind_df.drop(columns = 'Unnamed: 0', axis = 1, inplace = True)
ind_df.head()

Unnamed: 0,date,player_name,team,opponent,country,player_id,match_id,event_id,event_name,map,...,deaths_ct,kddiff_ct,adr_ct,kast_ct,kills_t,deaths_t,kddiff_t,adr_t,kast_t,match_outcome
0,2020-02-27,Andersin,Thunder Logic,Station7,United States,14038,2339816,5151,ESEA MDL Season 33 North America,Overpass,...,2,1,145.7,100.0,18,8,10,118.1,86.7,Win
1,2020-02-27,FrostayK,Station7,Thunder Logic,United States,12090,2339816,5151,ESEA MDL Season 33 North America,Overpass,...,13,-5,62.5,60.0,1,3,-2,12.3,33.3,Loss
2,2020-02-27,Inseaniac,Thunder Logic,Station7,Canada,18623,2339816,5151,ESEA MDL Season 33 North America,Overpass,...,0,5,192.3,100.0,15,7,8,80.8,86.7,Win
3,2020-02-27,PureR,Thunder Logic,Station7,United States,10622,2339816,5151,ESEA MDL Season 33 North America,Overpass,...,0,4,63.7,100.0,14,9,5,96.5,86.7,Win
4,2020-02-27,Sharkie,Thunder Logic,Station7,United States,19476,2339816,5151,ESEA MDL Season 33 North America,Overpass,...,0,3,58.3,100.0,10,7,3,77.9,86.7,Win


### Splitting Into Train/Test Sets

In [3]:
# Split features into X and y dataframes.
X = ind_df.drop(columns=['match_outcome', 'player_name', 'team', 'opponent', 'player_id', 'match_id', 'event_id', 'event_name', 'map'])
y = ind_df['match_outcome']

# Split into train/test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

The variable 'player_name' could be useful but since we are interested in how individual performance predicts match outcomes we don't want performance from other matches impacting our predictions, so we will drop this feature. Since we are interested in predicting whether a match will result in a win or a loss from individual player data, the variables 'team' and 'opponent' should not be included in our modeling (we are not interested in how the team a player is on or facing affects the outcome of a match). Since there are teams with players from different countries we will keep the 'country' variable. The 'event_name' and 'map' variables will also not be included since they don't provide much for our goal of predicting match outcomes. It is also worth noting that we dropped our 3 numeric id columns since they are of little value moving forward (we don't want them in our analysis and we can merge our numeric and categorical dataframes by using the index).

### Scaling Our Numeric Features

In [4]:
# We first need to split our data into two dataframes of numerical and categorical data.
X_train_numeric = X_train.drop(columns=['date', 'country'])
X_test_numeric = X_test.drop(columns=['date', 'country'])
X_train_categorical = X_train.drop(X_train.columns.difference(['country']), 1) # This code keeps the columns listed. (We don't need the date column)
X_test_categorical = X_test.drop(X_test.columns.difference(['country']), 1)

# Save the column names
X_col = X_train_numeric.columns

  X_train_categorical = X_train.drop(X_train.columns.difference(['country']), 1) # This code keeps the columns listed. (We don't need the date column)
  X_test_categorical = X_test.drop(X_test.columns.difference(['country']), 1)


In [5]:
# Initiate, fit, and transform the scaler on our numeric features.
scaler = StandardScaler()
scaler.fit(X_train_numeric)
X_train_scaled = scaler.transform(X_train_numeric)
X_test_scaled = scaler.transform(X_test_numeric)

In [6]:
X_train_scaled = pd.DataFrame(X_train_scaled, index = X_train_numeric.index, columns = X_col)
X_test_scaled = pd.DataFrame(X_test_scaled, index = X_test_numeric.index, columns = X_col)

In [7]:
X_train_scaled.head()

Unnamed: 0,kills,assists,deaths,hs,flash_assists,kast,kddiff,adr,fkdiff,kills_ct,deaths_ct,kddiff_ct,adr_ct,kast_ct,kills_t,deaths_t,kddiff_t,adr_t,kast_t
83945,-1.38873,-0.873434,0.062903,-1.555218,-0.900975,-2.008421,-1.258302,-0.353612,-0.003805,-0.499369,1.37224,-1.372674,-0.283038,-1.129047,-1.489892,-1.18381,-0.429894,-0.434357,-2.81149
64433,0.521517,-0.873434,0.893582,-0.264349,-0.900975,-0.871018,-0.143129,-0.401609,0.815253,0.365351,-0.124026,0.418935,-0.490894,0.381275,0.368435,1.303971,-0.631603,-0.088758,-1.77375
88034,0.043956,0.355714,-2.013794,-0.006175,0.878442,2.063154,1.390235,1.422246,1.63431,1.230072,-0.722533,1.61334,1.017011,1.336785,-1.257601,-2.013071,0.376942,0.410858,1.932467
78957,-0.592793,-0.463718,0.270573,0.251999,-0.011266,-1.225426,-0.700715,0.158347,-2.460978,-0.499369,-1.32104,0.418935,1.852217,-0.013258,-0.328438,1.580391,-1.438438,-0.644722,-1.228194
82528,-0.911168,0.355714,0.478242,-1.555218,-0.011266,-0.846292,-1.118905,-0.977563,-0.413334,-0.715549,-0.42328,-0.377336,-0.086519,-0.894793,-0.560728,1.027551,-1.23673,-0.918947,-0.439512


In [8]:
X_test_scaled.head()

Unnamed: 0,kills,assists,deaths,hs,flash_assists,kast,kddiff,adr,fkdiff,kills_ct,deaths_ct,kddiff_ct,adr_ct,kast_ct,kills_t,deaths_t,kddiff_t,adr_t,kast_t
38524,0.203143,-0.054002,-0.560106,-0.522523,-0.011266,1.255433,0.553855,-0.433606,-0.003805,-0.067009,-0.722533,0.418935,-0.32083,0.066881,0.368435,-0.07813,0.376942,-0.276584,1.535161
36291,0.36233,-0.054002,0.270573,1.801042,-0.900975,0.999929,0.135665,-0.044304,-0.003805,0.797712,0.47448,0.418935,0.54461,0.929922,-0.328438,-0.07813,-0.228185,-0.798739,0.44998
58593,-0.115232,2.404294,0.478242,1.026521,2.65786,-0.030328,-0.421922,-0.145629,0.405724,-0.067009,0.175227,-0.178268,-0.456882,0.103869,-0.096147,0.47471,-0.429894,0.245571,-0.184524
15465,-0.911168,-0.054002,-0.352437,-0.264349,-0.011266,0.373533,-0.561319,-0.337614,-0.003805,-1.36409,-0.722533,-0.775471,-0.9444,-0.561906,0.136144,0.19829,-0.026476,0.177954,0.746478
53369,1.158266,1.994578,2.1396,0.768347,1.768151,-0.302316,-0.421922,0.510319,-0.413334,0.581532,1.970747,-0.775471,0.801596,-0.426285,1.065308,1.027551,0.175233,-0.069976,-0.042205


### Encoding Dummy Variables

In [9]:
X_train_dummies = pd.get_dummies(X_train_categorical['country'])
X_test_dummies = pd.get_dummies(X_test_categorical['country'])
X_train_dummies.head()

Unnamed: 0,Albania,Algeria,Argentina,Australia,Austria,Azerbaijan,Bangladesh,Belarus,Belgium,Bosnia and Herzegovina,...,Tunisia,Turkey,Ukraine,United Arab Emirates,United Kingdom,United States,Uruguay,Uzbekistan,Venezuela,Vietnam
83945,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
64433,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
88034,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
78957,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
82528,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
X_test_dummies.head()

Unnamed: 0,Albania,Algeria,Argentina,Australia,Austria,Azerbaijan,Bangladesh,Belarus,Belgium,Bosnia and Herzegovina,...,Thailand,Tunisia,Turkey,Ukraine,United Arab Emirates,United Kingdom,United States,Uruguay,Uzbekistan,Vietnam
38524,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
36291,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
58593,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
15465,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
53369,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


We can see that there are countries in the training set that are not in the test set. This means that we need to add these 5 country's dummy features to the test set which will all be set to zeros across the entire columns. The countries are: Cambodia, Iceland, Iran, Montenegro, and Venezuela.

In [11]:
X_test_dummies['Cambodia'] = 0
X_test_dummies['Iceland'] = 0
X_test_dummies['Iran'] = 0
X_test_dummies['Montenegro'] = 0
X_test_dummies['Venezuela'] = 0
X_test_dummies.shape

(26100, 82)

### Merging Features Back Together

In [12]:
# Merge the numerical and categorical (dummy) dataframes for the training data and the testing data, respectively.
ind_X_train_final = X_train_scaled.merge(X_train_dummies, how = 'inner', left_index = True, right_index = True)
ind_X_train_final.head()

Unnamed: 0,kills,assists,deaths,hs,flash_assists,kast,kddiff,adr,fkdiff,kills_ct,...,Tunisia,Turkey,Ukraine,United Arab Emirates,United Kingdom,United States,Uruguay,Uzbekistan,Venezuela,Vietnam
83945,-1.38873,-0.873434,0.062903,-1.555218,-0.900975,-2.008421,-1.258302,-0.353612,-0.003805,-0.499369,...,0,0,0,0,0,0,0,0,0,0
64433,0.521517,-0.873434,0.893582,-0.264349,-0.900975,-0.871018,-0.143129,-0.401609,0.815253,0.365351,...,0,0,0,0,0,1,0,0,0,0
88034,0.043956,0.355714,-2.013794,-0.006175,0.878442,2.063154,1.390235,1.422246,1.63431,1.230072,...,0,0,0,0,0,1,0,0,0,0
78957,-0.592793,-0.463718,0.270573,0.251999,-0.011266,-1.225426,-0.700715,0.158347,-2.460978,-0.499369,...,0,0,0,0,0,0,0,0,0,0
82528,-0.911168,0.355714,0.478242,-1.555218,-0.011266,-0.846292,-1.118905,-0.977563,-0.413334,-0.715549,...,0,0,0,0,0,0,0,0,0,0


In [13]:
ind_X_test_final = X_test_scaled.merge(X_test_dummies, how = 'inner', left_index = True, right_index = True)
ind_X_test_final.head()

Unnamed: 0,kills,assists,deaths,hs,flash_assists,kast,kddiff,adr,fkdiff,kills_ct,...,United Arab Emirates,United Kingdom,United States,Uruguay,Uzbekistan,Vietnam,Cambodia,Iceland,Montenegro,Venezuela
38524,0.203143,-0.054002,-0.560106,-0.522523,-0.011266,1.255433,0.553855,-0.433606,-0.003805,-0.067009,...,0,0,0,0,0,0,0,0,0,0
36291,0.36233,-0.054002,0.270573,1.801042,-0.900975,0.999929,0.135665,-0.044304,-0.003805,0.797712,...,0,0,1,0,0,0,0,0,0,0
58593,-0.115232,2.404294,0.478242,1.026521,2.65786,-0.030328,-0.421922,-0.145629,0.405724,-0.067009,...,0,0,1,0,0,0,0,0,0,0
15465,-0.911168,-0.054002,-0.352437,-0.264349,-0.011266,0.373533,-0.561319,-0.337614,-0.003805,-1.36409,...,0,1,0,0,0,0,0,0,0,0
53369,1.158266,1.994578,2.1396,0.768347,1.768151,-0.302316,-0.421922,0.510319,-0.413334,0.581532,...,0,0,0,0,0,0,0,0,0,0


### Saving our train and test sets

In [14]:
ind_X_train_final.to_csv('C:/Users/justi/SpringboardDS/Capstone II/1.Match Winner Predictor/Individual Data/X_train_ind.csv')

In [15]:
y_train.to_csv('C:/Users/justi/SpringboardDS/Capstone II/1.Match Winner Predictor/Individual Data/y_train_ind.csv')

In [16]:
ind_X_test_final.to_csv('C:/Users/justi/SpringboardDS/Capstone II/1.Match Winner Predictor/Individual Data/X_test_ind.csv')

In [17]:
y_test.to_csv('C:/Users/justi/SpringboardDS/Capstone II/1.Match Winner Predictor/Individual Data/y_test_ind.csv')

## Team Players Preprocessing

### Reading in the Team Player File

In [18]:
team_df = pd.read_csv('df_team.csv')
team_df.head()

Unnamed: 0,match_id,team,date,opponent,event_name,map,kills,assists,deaths,hs,...,deaths_ct,kddiff_ct,adr_ct,kast_ct,kills_t,deaths_t,kddiff_t,adr_t,kast_t,match_outcome
0,2300412,Natus Vincere,2016-01-22,SK,DreamHack ZOWIE Open Leipzig 2016,Mirage,15.0,3.2,9.2,5.8,...,2.2,1.2,89.26,70.0,11.6,7.0,4.6,81.66,76.0,Win
1,2300412,SK,2016-01-22,Natus Vincere,DreamHack ZOWIE Open Leipzig 2016,Mirage,9.2,2.6,15.0,4.6,...,11.6,-4.6,59.9,58.68,2.2,3.4,-1.2,67.88,55.0,Loss
2,2300413,FaZe,2016-01-22,Luminosity,DreamHack ZOWIE Open Leipzig 2016,Inferno,19.8,4.0,14.6,9.0,...,6.6,0.6,79.4,66.0,12.6,8.0,4.6,88.3,74.66,Win
3,2300413,Luminosity,2016-01-22,FaZe,DreamHack ZOWIE Open Leipzig 2016,Inferno,14.6,3.6,20.0,5.8,...,12.6,-4.6,69.12,52.0,6.6,7.4,-0.8,74.7,72.0,Loss
4,2300414,Astralis,2016-01-22,Dignitas,DreamHack ZOWIE Open Leipzig 2016,Overpass,18.2,2.4,13.0,7.2,...,6.6,-0.2,77.38,66.0,11.8,6.4,5.4,86.86,78.66,Win


### Splitting Into Train/Test Sets

In [19]:
# Split features into X and y dataframes.
X = team_df.drop(columns=['match_outcome', 'match_id', 'team', 'opponent', 'event_name', 'map'])
y = team_df['match_outcome']

# Split into train/test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

The variables 'team' and 'opponent' should not be included in our modeling. We don't want teams' performances from other matches impacting our predictions, but rather how a teams performance in a given match affects the match outcome, so we will drop this feature. The 'event_name' and 'map' variables will also not be included since they don't provide much for our goal of predicting match outcomes. It is also worth noting that we dropped our numeric id column 'match_id' since it is of little value moving forward (we don't want it in our analysis and we can merge our numeric and categorical dataframes by using the index).

### Scaling Our Numeric Features

In [20]:
# We have no categorical features in our X train/test sets, so we can keep one dataframe.
X_train = X_train.drop(columns=['date'])
X_test = X_test.drop(columns=['date'])

# Save the column names
X_col = X_train.columns

In [21]:
# Initiate, fit, and transform the scaler on our features.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [22]:
X_train_scaled = pd.DataFrame(X_train_scaled, index = X_train.index, columns = X_col)
X_test_scaled = pd.DataFrame(X_test_scaled, index = X_test.index, columns = X_col)

In [23]:
X_train_scaled.head()

Unnamed: 0,kills,assists,deaths,hs,flash_assists,kast,kddiff,adr,fkdiff,kills_ct,deaths_ct,kddiff_ct,adr_ct,kast_ct,kills_t,deaths_t,kddiff_t,adr_t,kast_t
14840,-0.15127,0.324869,-2.094226,-0.193273,-0.351136,1.651112,1.748676,2.135729,0.003075,-1.837573,-2.326457,0.268893,1.542559,2.276912,1.816033,-0.612419,2.207708,1.777187,1.164817
10992,0.120339,-0.355654,-0.652432,0.091721,-0.018476,0.825801,0.694739,0.765437,0.493183,-0.957541,-1.098648,0.039077,0.168072,0.478128,1.23188,0.14732,0.944805,0.864574,0.762055
5245,-0.106002,-0.627863,0.293745,-0.003277,-0.018476,0.341159,-0.359197,-0.478589,0.493183,-0.01884,-0.452432,0.383802,0.350778,1.344742,-0.131143,0.790176,-0.892145,-0.877136,-0.557182
11262,-0.558685,0.05266,-2.094226,-0.193273,-0.018476,1.448332,1.383852,1.850079,1.473397,-2.130917,-2.4557,0.096531,1.820809,-0.121467,1.55641,-0.495536,1.86328,1.511451,1.369998
2814,1.70473,0.460974,1.690483,0.376716,-1.681774,0.07349,0.005627,0.289354,-0.650401,0.919861,1.033863,-0.018377,-0.204045,-0.121467,1.426598,1.257708,0.02633,0.568468,0.202747


In [24]:
X_test_scaled.head()

Unnamed: 0,kills,assists,deaths,hs,flash_assists,kast,kddiff,adr,fkdiff,kills_ct,deaths_ct,kddiff_ct,adr_ct,kast_ct,kills_t,deaths_t,kddiff_t,adr_t,kast_t
8298,0.754096,0.460974,-0.066703,1.231699,-0.351136,0.365492,0.735275,0.3825,0.003075,1.389211,-0.646297,1.935063,1.302862,0.997777,-0.455673,0.497969,-0.892145,-0.846766,-0.558702
15875,1.478389,-0.219549,1.284979,1.231699,-1.681774,-0.242845,0.167771,0.067872,0.166444,0.567848,1.227727,-0.535464,-0.393456,-0.639516,1.491504,0.55641,0.772591,0.402952,0.202747
10984,0.708827,-0.491759,-0.652432,-0.003277,-0.018476,1.332748,1.221707,1.429884,0.819921,1.095867,0.06454,1.015797,1.038022,1.103305,-0.196049,-0.904626,0.715186,1.031608,0.81069
9279,-1.056636,-0.763968,0.47397,-0.478267,0.979502,-1.380436,-1.372598,-0.687652,-2.7742,-1.837573,-0.258568,-1.569638,-1.799791,-2.519845,0.517916,0.848617,-0.375503,0.182771,-0.25321
345,-0.15127,-0.763968,0.69925,-0.193273,-0.018476,-0.93635,-0.764558,-0.762169,-0.487032,-0.194846,0.839998,-0.937643,-0.911403,-1.346239,-0.001331,0.14732,-0.145884,-0.335034,-0.122503


### Encoding Dummy Variables

There is no categorical data present so we do not need to encode any duimmy variables.

### Merging Features Back Together

We have only one dataframe due to no categorical variables being present, so no merge is needed.

### Saving our train and test sets

In [25]:
X_train_scaled.to_csv('C:/Users/justi/SpringboardDS/Capstone II/1.Match Winner Predictor/Team Data/X_train_team.csv')

In [26]:
y_train.to_csv('C:/Users/justi/SpringboardDS/Capstone II/1.Match Winner Predictor/Team Data/y_train_team.csv')

In [27]:
X_test_scaled.to_csv('C:/Users/justi/SpringboardDS/Capstone II/1.Match Winner Predictor/Team Data/X_test_team.csv')

In [28]:
y_test.to_csv('C:/Users/justi/SpringboardDS/Capstone II/1.Match Winner Predictor/Team Data/y_test_team.csv')