In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
import os
%matplotlib inline

In [2]:
datadir = '../data/raw'
train = pd.read_csv(os.path.join(datadir, 'train_V2.csv'))
test = pd.read_csv(os.path.join(datadir, 'test_V2.csv'))

In [3]:
# Combine train and test data
print("Train: ",train.shape)
print("-"*20)
print("Test:", test.shape)
print("-"*20)
print("Train Columns: ",train.columns)
print("-"*20)
print("Test Columns:", test.columns)

Train:  (4446966, 29)
--------------------
Test: (1934174, 28)
--------------------
Train Columns:  Index(['Id', 'groupId', 'matchId', 'assists', 'boosts', 'damageDealt', 'DBNOs',
       'headshotKills', 'heals', 'killPlace', 'killPoints', 'kills',
       'killStreaks', 'longestKill', 'matchDuration', 'matchType', 'maxPlace',
       'numGroups', 'rankPoints', 'revives', 'rideDistance', 'roadKills',
       'swimDistance', 'teamKills', 'vehicleDestroys', 'walkDistance',
       'weaponsAcquired', 'winPoints', 'winPlacePerc'],
      dtype='object')
--------------------
Test Columns: Index(['Id', 'groupId', 'matchId', 'assists', 'boosts', 'damageDealt', 'DBNOs',
       'headshotKills', 'heals', 'killPlace', 'killPoints', 'kills',
       'killStreaks', 'longestKill', 'matchDuration', 'matchType', 'maxPlace',
       'numGroups', 'rankPoints', 'revives', 'rideDistance', 'roadKills',
       'swimDistance', 'teamKills', 'vehicleDestroys', 'walkDistance',
       'weaponsAcquired', 'winPoints'],
 

In [4]:
data = pd.concat([train, test], axis=0)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [5]:
# save for future usage
data.to_csv('../data/processed/data.csv')

In [6]:
# number of group per matches
group_per_match = data.groupby(['matchId'], as_index=False)['groupId'].agg('count')
group_per_match.columns = ['matchId', 'group_cnt']
group_per_match

Unnamed: 0,matchId,group_cnt
0,0000a43bce5eec,95
1,0000eb01ea6cdd,98
2,0002912fe5ed71,95
3,0003b92987589e,100
4,0006eb8c17708d,93
5,00077604e50a63,98
6,00086c74bb4efc,98
7,00086e740a5804,98
8,0008c31a9be4a7,98
9,000b598b79aa5e,93


In [7]:
data = pd.merge(data, group_per_match, how='left', on='matchId')
data.head(5)

Unnamed: 0,DBNOs,Id,assists,boosts,damageDealt,groupId,headshotKills,heals,killPlace,killPoints,...,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPlacePerc,winPoints,group_cnt
0,0,7f96b2f878858a,0,0,0.0,4d4b580de459be,0,0,60,1241,...,0.0,0,0.0,0,0,244.8,1,0.4444,1466,96
1,0,eef90569b9d03c,0,0,91.47,684d5656442f9e,0,0,57,0,...,0.0045,0,11.04,0,0,1434.0,5,0.64,0,91
2,0,1eaf90ac73de72,1,0,68.0,6a4a42c3245a74,0,0,47,0,...,0.0,0,0.0,0,0,161.8,2,0.7755,0,98
3,0,4616d365dd2853,0,0,32.9,a930a9c79cd721,0,0,75,0,...,0.0,0,0.0,0,0,202.7,3,0.1667,0,91
4,0,315c96c26c9aac,0,0,100.0,de04010b3458dd,0,0,45,0,...,0.0,0,0.0,0,0,49.75,2,0.1875,0,97


In [8]:
data.columns

Index(['DBNOs', 'Id', 'assists', 'boosts', 'damageDealt', 'groupId',
       'headshotKills', 'heals', 'killPlace', 'killPoints', 'killStreaks',
       'kills', 'longestKill', 'matchDuration', 'matchId', 'matchType',
       'maxPlace', 'numGroups', 'rankPoints', 'revives', 'rideDistance',
       'roadKills', 'swimDistance', 'teamKills', 'vehicleDestroys',
       'walkDistance', 'weaponsAcquired', 'winPlacePerc', 'winPoints',
       'group_cnt'],
      dtype='object')

In [12]:
# should we drop these cols?
data[['numGroups','maxPlace']]

Unnamed: 0,numGroups,maxPlace
0,26,28
1,25,26
2,47,50
3,30,31
4,95,97
5,28,28
6,28,28
7,92,96
8,27,28
9,27,29


In [20]:
train_cols = data.drop(['Id', 'numGroups', 'maxPlace', 'winPlacePerc', 'groupId'],axis=1).columns.values
#train_cols = train_cols.delete(['Id', 'numGroups', 'maxPlace'])
train_cols 

array(['DBNOs', 'assists', 'boosts', 'damageDealt', 'headshotKills',
       'heals', 'killPlace', 'killPoints', 'killStreaks', 'kills',
       'longestKill', 'matchDuration', 'matchId', 'matchType',
       'rankPoints', 'revives', 'rideDistance', 'roadKills',
       'swimDistance', 'teamKills', 'vehicleDestroys', 'walkDistance',
       'weaponsAcquired', 'winPoints', 'group_cnt'], dtype=object)

In [23]:
data.select_dtypes(exclude=(['object'])).columns

Index(['DBNOs', 'assists', 'boosts', 'damageDealt', 'headshotKills', 'heals',
       'killPlace', 'killPoints', 'killStreaks', 'kills', 'longestKill',
       'matchDuration', 'maxPlace', 'numGroups', 'rankPoints', 'revives',
       'rideDistance', 'roadKills', 'swimDistance', 'teamKills',
       'vehicleDestroys', 'walkDistance', 'weaponsAcquired', 'winPlacePerc',
       'winPoints', 'group_cnt'],
      dtype='object')