In [94]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
import matplotlib.pyplot as plt
import seaborn as sns 
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import warnings
warnings.filterwarnings("ignore")

In [95]:
# Load the raw training table from the local data directory.
train_csv_path = './data/train_V2.csv'
train = pd.read_csv(train_csv_path)

In [56]:
# Lift the column limit so wide frames are rendered without truncation.
pd.set_option('display.max_columns', None)
# NOTE(review): the saved output of this cell lists columns (e.g. `has_cheater`,
# `Unnamed: 0`) that no visible cell creates, so it appears to come from an
# earlier session -- re-run the notebook top to bottom to refresh it.
train.head(5)

Unnamed: 0,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,killPoints,kills,killStreaks,longestKill,matchDuration,matchType,maxPlace,numGroups,rankPoints,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,winPlacePerc,numJoined,totalDistance,headshotPerc,killsNorm,damageDealtNorm,DBNOsNorm,teamContribution,healsAndBoosts,boostsPerWalkDistance,healsPerWalkDistance,healsAndBoostsPerWalkDistance,has_cheater
0,0,0,0.0,0,0,0,60,1241,0,0,0.0,1306,squad-fpp,28,26,-1,0,0.0,0,0.0,0,0,244.8,1,1466,0.4444,96,244.8,0.0,0.0,0.0,0.0,-0.603399,0,0.0,0.0,0.0,0
1,0,0,91.47,0,0,0,57,0,0,0,0.0,1777,squad-fpp,26,25,1484,0,0.0045,0,11.04,0,0,1434.0,5,0,0.64,91,1445.0445,0.0,0.0,99.7023,0.0,-0.603399,0,0.0,0.0,0.0,0
2,1,0,68.0,0,0,0,47,0,0,0,0.0,1318,duo,50,47,1491,0,0.0,0,0.0,0,0,161.8,2,0,0.7755,98,161.8,0.0,0.0,69.36,0.0,1.095625,0,0.0,0.0,0.0,0
3,0,0,32.9,0,0,0,75,0,0,0,0.0,1436,squad-fpp,31,30,1408,0,0.0,0,0.0,0,0,202.7,3,0,0.1667,91,202.7,0.0,0.0,35.861,0.0,-0.603399,0,0.0,0.0,0.0,0
4,0,0,100.0,0,0,0,45,0,1,1,58.53,1424,solo-fpp,97,95,1560,0,0.0,0,0.0,0,0,49.75,2,0,0.1875,97,49.75,0.0,1.03,103.0,0.0,0.0,0,0.0,0.0,0.0,0


In [96]:
def clean_data(train):
    """Engineer features on ``train`` in place and return its ``Id`` column.

    The frame is modified in place: new feature columns are appended, the
    identifier columns ``Id``, ``groupId`` and ``matchId`` are dropped, and
    every remaining missing value (in any column) is replaced with 0.
    Because the identifier columns are removed, calling this function twice
    on the same frame raises ``KeyError``.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain ``Id``, ``groupId``, ``matchId``, ``matchType``,
        ``kills``, ``headshotKills``, ``damageDealt``, ``DBNOs``, ``assists``,
        ``revives``, ``teamKills``, ``heals``, ``boosts``, ``rideDistance``,
        ``swimDistance`` and ``walkDistance``.

    Returns
    -------
    pandas.Series
        The ``Id`` column as it was before being dropped from ``train``.
    """
    # Row counts sharing the same match / group identifier.
    train['numJoined'] = train.groupby('matchId')['matchId'].transform('count')
    train['teamNum'] = train.groupby('groupId')['groupId'].transform('count')
    train['totalDistance'] = train['rideDistance'] + train['swimDistance'] + train['walkDistance']
    # The +1 keeps the ratio finite for players without kills.
    # Assign the filled result back instead of calling fillna(inplace=True)
    # on the column accessor, which acts on a temporary under Copy-on-Write.
    train['headshotPerc'] = (train['headshotKills'] / (train['kills'] + 1)).fillna(0)

    # Matches differ in player count, so scale combat stats by a factor that
    # is 1.0 for a 100-row match and grows as fewer rows share the match.
    normalize = (100 - train['numJoined']) / 100 + 1
    train['killsNorm'] = train['kills'] * normalize
    train['damageDealtNorm'] = train['damageDealt'] * normalize
    train['DBNOsNorm'] = train['DBNOs'] * normalize
    train['killsPerDistance'] = train['killsNorm'] / (train['totalDistance'] + 1)

    # Teamwork score: standardized assists and revives, minus standardized team kills.
    train['teamContribution'] = (
        preprocessing.scale(train['assists'])
        + preprocessing.scale(train['revives'])
        - preprocessing.scale(train['teamKills'])
    )
    # Solo players have no teamwork. Use a single .loc[rows, column] write:
    # the chained form `train[col].loc[mask] = ...` may write to a temporary
    # Series and be silently lost. The NaN becomes 0 in the final fillna below.
    is_solo = train['matchType'].isin(['solo', 'solo-fpp'])
    train.loc[is_solo, 'teamContribution'] = np.nan

    # Heals and boosts let a player stay outside the safe zone longer.
    train['healsAndBoosts'] = train['heals'] + train['boosts']
    walk_denominator = train['walkDistance'] + 1
    train['boostsPerWalkDistance'] = train['boosts'] / walk_denominator
    train['healsPerWalkDistance'] = train['heals'] / walk_denominator
    train['healsAndBoostsPerWalkDistance'] = train['healsAndBoosts'] / walk_denominator

    # Flag suspected cheaters: kills-per-distance at or above mean + 3 std.
    # The threshold is computed from the frame passed in, so it differs
    # between data sets.
    outlier = train['killsPerDistance'].mean() + 3 * train['killsPerDistance'].std()
    is_outlier = train['killsPerDistance'] >= outlier
    cheat_match = train.loc[is_outlier, 'matchId'].to_numpy()
    cheat_player = train.loc[is_outlier, 'Id'].to_numpy()
    # cheat_match marks every row of a match that contains a flagged player.
    train['cheat_match'] = train['matchId'].isin(cheat_match).astype(int)
    train['cheat_player'] = train['Id'].isin(cheat_player).astype(int)

    # Keep the ids for the caller, then remove all identifier columns.
    id_lst = train['Id']
    train.drop(columns=['Id', 'groupId', 'matchId'], inplace=True)
    # Fills every column, including a target column if one is present.
    train.fillna(0, inplace=True)

    return id_lst

In [97]:
# clean_data() mutates `train` in place (adds features, drops the id columns)
# and hands back the original Id column.
id_train = clean_data(train)

# Separate the regression target from the feature columns.
label = train['winPlacePerc']
feature_columns = [column for column in train.columns if column != 'winPlacePerc']
data = train[feature_columns]

In [98]:
# Encode the categorical column, scale the features and train the booster.
#
# `data` is only read in this cell; intermediate results get their own names
# (`data_encoded`, `data_scaled`). Previously `data` was dropped in place and
# then rebound to a NumPy array, so re-running the cell failed on
# `select_dtypes`.
data_object = data.select_dtypes(include=[object])

# Map category strings to integer codes, then one-hot encode the codes.
le = preprocessing.LabelEncoder()
data_object2 = data_object.apply(le.fit_transform)

enc = preprocessing.OneHotEncoder()
enc.fit(data_object2)
# Carry over the feature frame's index so the concat below aligns row by row
# even if that index is not the default 0..n-1 range.
onehotlabels = pd.DataFrame(enc.transform(data_object2).toarray(), index=data.index)

data_encoded = pd.concat([data.drop(columns='matchType'), onehotlabels], axis=1, sort=False)

# Standardize every column using statistics of this (training) matrix.
data_scaled = preprocessing.scale(data_encoded)

dtrain = xgb.DMatrix(data_scaled, label=label)

# NOTE(review): newer XGBoost releases name this objective
# 'reg:squarederror'; 'reg:linear' is kept for the version used here.
param = {
    'max_depth': 6,
    'eta': 0.3,
    'objective': 'reg:linear',
    'eval_metric': 'mae',
}
# No num_boost_round is passed, so xgb.train uses its default round count.
bst = xgb.train(param, dtrain)

[23:12:18] Tree method is automatically selected to be 'approx' for faster speed. To use old behavior (exact greedy algorithm on single machine), set tree_method to 'exact'.
[23:13:13] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 126 extra nodes, 0 pruned nodes, max_depth=6
[23:13:55] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 126 extra nodes, 0 pruned nodes, max_depth=6
[23:14:41] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 126 extra nodes, 0 pruned nodes, max_depth=6
[23:15:27] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 126 extra nodes, 0 pruned nodes, max_depth=6
[23:16:12] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 126 extra nodes, 0 pruned nodes, max_depth=6
[23:16:56] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 126 extra nodes, 0 pruned nodes, max_depth=6
[23:17:35] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 126 extra nodes, 0 pruned nodes, max_depth=6
[23:18:15] src/tree/updater_prune.c

In [99]:
# Build the test matrix with the same feature pipeline as the training data.
test = pd.read_csv('./data/test_V2.csv')
id_test = clean_data(test)  # mutates `test` in place, returns the Id column

test_object = test.select_dtypes(include=[object])

# Reuse the encoders fitted on the training data (`le`, `enc`) instead of
# fitting new ones on the test set. Re-fitting derives the category -> column
# mapping from the test set alone, so one-hot columns could differ in meaning
# or count from the ones the model was trained on. With the shared encoders a
# category unseen during training raises an error instead of silently
# shifting columns.
test_object2 = test_object.apply(le.transform)
onehotlabels = pd.DataFrame(enc.transform(test_object2).toarray(), index=test.index)

test = pd.concat([test.drop(columns='matchType'), onehotlabels], axis=1, sort=False)

# NOTE(review): preprocessing.scale standardizes with the *test* set's own
# mean/std, whereas the model was trained on features standardized with the
# training set's statistics. Consider fitting one scaler on the training
# matrix and applying it here (or dropping scaling for the tree model).
dtest = xgb.DMatrix(preprocessing.scale(test))

In [100]:
# Score the prepared test matrix with the trained booster `bst`.
ypred = bst.predict(dtest)

In [101]:
# Display the predictions as the cell output.
ypred

array([0.24637657, 0.87198544, 0.5693787 , ..., 0.88123417, 0.81118274,
       0.0561392 ], dtype=float32)

In [102]:
# Pair each test Id with its predicted placement percentile.
submission_columns = {
    'Id': id_test,
    'winPlacePerc': ypred,
}
submission = pd.DataFrame(submission_columns)

In [103]:
# Clamp negative predictions to 0. A single .loc[rows, column] assignment
# writes to `submission` itself; the chained form
# `submission[col].loc[mask] = 0` may modify a temporary Series and leave
# the frame unchanged.
submission.loc[submission['winPlacePerc'] < 0, 'winPlacePerc'] = 0

In [104]:
# Clamp predictions above 1 to 1. A single .loc[rows, column] assignment
# writes to `submission` itself; the chained form
# `submission[col].loc[mask] = 1` may modify a temporary Series and leave
# the frame unchanged.
submission.loc[submission['winPlacePerc'] > 1, 'winPlacePerc'] = 1

In [105]:
# Write only the Id and winPlacePerc columns. Without index=False the
# DataFrame index is written as an extra unnamed first column.
submission.to_csv('submission7.csv', index=False)