In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

#Imports
import itertools
import os
import pickle

from sklearn import preprocessing
from sklearn.svm import NuSVR
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
import seaborn as sns
import gc, sys
gc.enable()

In [None]:
# Directory that holds train_V2.csv and test_V2.csv.
# Set the PUBG_INPUT_DIR environment variable to point somewhere else; when it
# is unset the original local path is used, so existing runs are unaffected.
INPUT_DIR = os.environ.get('PUBG_INPUT_DIR', '/home/lgriffin/Documents/Jupyter/Kaggle/Data/PUBG/')

# 1) Preprocess

In [None]:
def _safe_ratio(numerator, denominator):
    """Divide two Series element-wise, mapping undefined results to 0.

    pandas yields +/-inf for x/0 and NaN for 0/0; both are replaced with 0 so
    every row carries a finite value into the group aggregations.
    """
    ratio = numerator / denominator
    return ratio.replace([np.inf, -np.inf], np.nan).fillna(0)


def preprocess_data(dataframe, target = 'winPlacePerc'):
    """Build one row of aggregated features per (matchId, groupId).

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Per-player rows. Must contain matchId, groupId, walkDistance, kills,
        matchDuration, headshotKills, killStreaks and assists (and `target`
        when it is not None). The frame is NOT modified.
    target : str or None
        Name of the label column. When given, its per-group mean is appended
        as the last column; pass None for unlabeled data.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: matchId, groupId, then for each stat in
        (mean, max, min) the block ``<feature>_<stat>`` followed by the block
        ``<feature>_<stat>_rank`` (percentile rank of the group inside its
        match), then the match-level means under the bare feature names,
        then ``match_size`` and finally `target` (if requested).
    """
    key_cols = ['matchId', 'groupId']
    derived = ['avgVelocity', 'killStreakRate', 'headshotRate', 'kills_assists']

    # Identifier columns and the raw inputs of the derived features are not
    # aggregated themselves. Derived names are excluded as well so that a frame
    # which already carries them does not produce duplicated feature columns.
    excluded = {"Id", "matchId", "groupId", "matchType", "walkDistance", "kills"}
    excluded.update(derived)
    if target is not None:
        excluded.add(target)
    features = [col for col in dataframe.columns if col not in excluded]

    # Work on a private copy limited to the columns that are actually needed,
    # so the caller's frame is left untouched.
    needed = key_cols + features
    if target is not None:
        needed.append(target)
    work = dataframe[needed].copy()

    # Derived features. Ratios with a zero denominator are defined as 0.
    kills = dataframe['kills']
    work['avgVelocity'] = _safe_ratio(dataframe['walkDistance'], dataframe['matchDuration'])
    work['killStreakRate'] = _safe_ratio(dataframe['killStreaks'], kills)
    # Share of kills that were headshots (headshotKills / kills).
    work['headshotRate'] = _safe_ratio(dataframe['headshotKills'], kills)
    work['kills_assists'] = dataframe['assists'] + kills
    features = features + derived

    # Per-group statistics plus their percentile rank within the match.
    # All frames share the same (matchId, groupId) index, so they can be
    # placed side by side without re-merging on the keys.
    grouped = work.groupby(key_cols)
    pieces = []
    for stat in ('mean', 'max', 'min'):
        print("Get group %s feature" % stat)
        stat_values = grouped[features].agg(stat)
        stat_ranks = stat_values.groupby('matchId')[features].rank(pct=True)
        pieces.append(stat_values.add_suffix('_' + stat))
        pieces.append(stat_ranks.add_suffix('_' + stat + '_rank'))
    df_out = pd.concat(pieces, axis=1).reset_index()

    # Match-level context. The means keep the bare feature names.
    by_match = work.groupby('matchId')

    print("Get match mean feature")
    match_means = by_match[features].agg('mean').reset_index()
    df_out = df_out.merge(match_means, how='left', on='matchId')

    print("get match size feature")
    match_sizes = by_match.size().reset_index(name='match_size')
    df_out = df_out.merge(match_sizes, how='left', on='matchId')

    # Attach the per-group mean of the label as the last column.
    if target is not None:
        group_target = grouped[target].agg('mean').reset_index()
        df_out = df_out.merge(group_target, how='left', on=key_cols)

    # Release the large intermediates before returning.
    del work, pieces, match_means, match_sizes
    gc.collect()

    return df_out

In [None]:
# Base feature names whose aggregated columns get standardised. Each name is
# combined with every suffix in _SCALED_SUFFIXES; the empty suffix addresses
# the bare-named columns.
_SCALED_BASE_FEATURES = (
    'assists', 'boosts', 'damageDealt', 'DBNOs', 'headshotKills', 'heals',
    'killPlace', 'killPoints', 'killStreaks', 'longestKill', 'matchDuration',
    'maxPlace', 'numGroups', 'rankPoints', 'revives', 'rideDistance',
    'roadKills', 'swimDistance', 'teamKills', 'vehicleDestroys',
    'weaponsAcquired', 'winPoints', 'avgVelocity', 'killStreakRate',
    'headshotRate', 'kills_assists',
)
_SCALED_SUFFIXES = ('_mean', '_max', '_min', '')


#scale down the features according to number of deviations away from the mean
def scale_features(dataframe):
    """Standardise the aggregated feature columns of `dataframe` in place.

    Every ``<base><suffix>`` column (26 base names x 4 suffixes) is replaced
    by ``(x - mean) / std`` computed over the rows of `dataframe`. Columns not
    in that set (for example the ``*_rank`` columns) are left untouched.

    A column with zero standard deviation is mapped to 0 instead of NaN.

    Raises
    ------
    KeyError
        If any of the expected columns is missing from `dataframe`.
    """
    features = [base + suffix
                for suffix in _SCALED_SUFFIXES
                for base in _SCALED_BASE_FEATURES]

    values = dataframe[features]
    center = values.mean()
    spread = values.std()
    # A constant column has std == 0; dividing by 1 turns it into all zeros
    # rather than 0/0 = NaN.
    spread = spread.mask(spread == 0, 1.0)

    dataframe[features] = (values - center) / spread


In [None]:
def prep_train(directory, target='winPlacePerc'):
    """Load train_V2.csv and turn it into model-ready arrays.

    Parameters
    ----------
    directory : str
        Folder containing ``train_V2.csv`` (a trailing slash is optional).
    target : str
        Name of the label column.

    Returns
    -------
    (X, y, features)
        ``X`` is the 2-D feature array, ``y`` the per-group target array and
        ``features`` the list of column names matching the columns of ``X``.
    """
    train = pd.read_csv(os.path.join(directory, 'train_V2.csv'))

    # The original code dropped index 2744604 by hand - presumably the record
    # with a missing target (TODO confirm). Keep dropping it when present, but
    # do not fail on files/samples that lack that index, and remove every row
    # without a label so it cannot distort a group's target mean.
    train = train.drop(2744604, errors='ignore')
    train = train.dropna(subset=[target])

    train = preprocess_data(train, target)
    scale_features(train)  # standardises in place
    train = train.fillna(0)

    non_features = ("matchId", "groupId", target)
    features = [col for col in train.columns if col not in non_features]

    gc.collect()

    return train[features].values, train[target].values, features

In [None]:
def prep_test(directory, features=None):
    """Load test_V2.csv and turn it into a model-ready feature array.

    Parameters
    ----------
    directory : str
        Folder containing ``test_V2.csv`` (a trailing slash is optional).
    features : list of str, optional
        Column names to select, in order. Pass the list returned by
        ``prep_train`` to guarantee the same column layout as the training
        matrix. When omitted, every column except the matchId/groupId keys
        is used.

    Returns
    -------
    numpy.ndarray
        2-D feature array with one row per (matchId, groupId).
    """
    test = pd.read_csv(os.path.join(directory, 'test_V2.csv'))
    test = preprocess_data(test, None)
    # NOTE(review): scaling uses the test set's own mean/std, not the training
    # statistics - confirm this is intended.
    scale_features(test)
    test = test.fillna(0)

    # The test file has no label, so only the key columns are excluded.
    if features is None:
        features = [col for col in test.columns if col not in ("matchId", "groupId")]

    gc.collect()

    return test[list(features)].values

In [None]:
# Build the training feature matrix, the target vector and the ordered list of
# feature names from the CSV found in INPUT_DIR.
X, y, features = prep_train(INPUT_DIR)

# 2) Train

In [None]:
#----- Model
model = NuSVR(kernel='rbf', C=0.9, nu=1)

# Fixed seed so the hold-out split, and therefore the reported metrics, are
# the same on every Restart & Run All.
SPLIT_SEED = 42

#Use a simple 10% hold out for validation
x_train, x_valid, y_train, y_valid = train_test_split(X,
                                                      y,
                                                      test_size=0.10,
                                                      random_state=SPLIT_SEED)

# The full arrays are no longer needed once the split exists.
del X, y
gc.collect()

# fit() returns the estimator itself, so `fitted` and `model` are the same object.
fitted = model.fit(x_train, y_train)
y_valid_predict = fitted.predict(x_valid)
print('R2: ', fitted.score(x_valid, y_valid))
print('MSE: ', mean_squared_error(y_valid, y_valid_predict))
print('MAE: ', mean_absolute_error(y_valid, y_valid_predict))

In [None]:
# Persist the trained model to disk.
pkl_name = "svr_model.pkl"
with open(pkl_name, 'wb') as file:
    # `pickle_model` was never defined; the serialiser lives in the pickle module.
    pickle.dump(model, file)

# 3) Prep Test Set

In [None]:
# Build the test feature matrix; this rebinds X, which was deleted after the
# train/validation split.
X = prep_test(INPUT_DIR)

# 4) Predict on Test Set

In [None]:
# Score the test matrix with the fitted model (one prediction per row of X).
predictions = fitted.predict(X)