## Setup and Import Libraries

In [None]:
###NOTE: Only run if you do not have fastai library installed ###
!pip3 install fastai

In [4]:
from fastai.tabular import *

In [5]:
path = "./"

## Data Preprocessing

In [6]:
import pandas as pd
import random


def data_loader(filename='./train_V2.csv', test_size=.10):


    data = pd.read_csv(filename)
    train_size = 1-test_size
    #Remove attributes that are presumed to be irrelevant [Id, groupId]
    data.drop(['Id', 'groupId'], axis=1, inplace=True)

    #one hot_encode match type
    data_dummies = pd.get_dummies(data['matchType'], prefix='matchType')
    data = pd.concat([data, data_dummies], axis=1)
    data.drop(['matchType'], axis=1, inplace=True)
    # print(data.isna().any())
    print("before drop NaNs")
    print(data.shape)
    data = data.dropna()
    print("after drop NaNs")
    print(data.shape)
    #Split into training and testing sets
    #split based on matchID
    #want all data from a given match to be together (in test or train)
    #Don't want dependent data in train and test
    matches = data.matchId.unique().tolist()
    num_training = int(len(matches)*train_size)
    train_matches = random.sample(matches, num_training)
    train_data = data[data.matchId.isin(train_matches)]
    test_data = data[~data.matchId.isin(train_matches)]

    #No need for matcId anymore
    train_data.drop(['matchId'], axis=1, inplace=True)
    test_data.drop(['matchId'], axis=1, inplace=True)



    y_train = train_data['winPlacePerc']
    x_train = train_data.drop(['winPlacePerc'], axis=1)

    y_test = test_data['winPlacePerc']
    x_test = test_data.drop(['winPlacePerc'], axis=1)




    return x_train, y_train, x_test, y_test

x_train, y_train, x_test, y_test = data_loader()

before drop NaNs
(4446966, 42)
after drop NaNs
(4446965, 42)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [4]:
x_train['winPlacePerc'] = y_train

## Setup Model

In [5]:
dep_var = 'winPlacePerc'
cont_var_names = ['assists', 'boosts', 'damageDealt', 'DBNOs', 'headshotKills', 'heals',
       'killPlace', 'killPoints', 'kills', 'killStreaks', 'longestKill',
       'matchDuration', 'maxPlace', 'numGroups', 'rankPoints', 'revives',
       'rideDistance', 'roadKills', 'swimDistance', 'teamKills',
       'vehicleDestroys', 'walkDistance', 'weaponsAcquired', 'winPoints',
       'matchType_crashfpp', 'matchType_crashtpp', 'matchType_duo',
       'matchType_duo-fpp', 'matchType_flarefpp', 'matchType_flaretpp',
       'matchType_normal-duo', 'matchType_normal-duo-fpp',
       'matchType_normal-solo', 'matchType_normal-solo-fpp',
       'matchType_normal-squad', 'matchType_normal-squad-fpp',
       'matchType_solo', 'matchType_solo-fpp', 'matchType_squad',
       'matchType_squad-fpp']
# cat_var_names = ['matchType']
# procs = [Normalize, Categorify]

In [6]:
data = (TabularList.from_df(x_train, path=path, cont_names = cont_var_names)
            .split_by_idx(list(range(len(x_train) - 400234, len(x_train))))
            .label_from_df(cols=dep_var)
            .add_test(x_test)
            .databunch())

In [7]:
data.show_batch(rows=10)

assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,killPoints,kills,killStreaks,longestKill,matchDuration,maxPlace,numGroups,rankPoints,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints,matchType_crashfpp,matchType_crashtpp,matchType_duo,matchType_duo-fpp,matchType_flarefpp,matchType_flaretpp,matchType_normal-duo,matchType_normal-duo-fpp,matchType_normal-solo,matchType_normal-solo-fpp,matchType_normal-squad,matchType_normal-squad-fpp,matchType_solo,matchType_solo-fpp,matchType_squad,matchType_squad-fpp,target
0.0,2.0,181.8,0.0,0.0,0.0,29.0,0.0,1.0,1.0,141.3,1377.0,92.0,88.0,1510.0,0.0,1611.0,0.0,11.83,0.0,0.0,875.4,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.5714
0.0,3.0,278.2,0.0,1.0,0.0,20.0,0.0,2.0,1.0,142.6,1876.0,99.0,97.0,1514.0,0.0,673.5,0.0,0.0,0.0,0.0,2071.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.6429
0.0,1.0,72.93,0.0,0.0,0.0,86.0,1174.0,0.0,0.0,0.0,1313.0,26.0,26.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,96.15,1.0,1451.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.08
0.0,6.0,1262.0,7.0,2.0,4.0,1.0,1732.0,15.0,2.0,173.2,1460.0,45.0,41.0,-1.0,2.0,722.8,0.0,0.0,0.0,0.0,3050.0,5.0,1474.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
0.0,1.0,130.4,1.0,0.0,0.0,52.0,0.0,0.0,0.0,0.0,1268.0,28.0,28.0,1190.0,0.0,0.0,0.0,0.0,1.0,0.0,2094.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.7037
0.0,1.0,0.0,0.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,1383.0,27.0,25.0,1376.0,0.0,0.0,0.0,0.0,0.0,0.0,795.4,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.3077
0.0,0.0,0.0,0.0,0.0,0.0,85.0,1161.0,0.0,0.0,0.0,1885.0,49.0,44.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,125.2,0.0,1448.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1875
1.0,0.0,0.0,0.0,0.0,0.0,56.0,1461.0,0.0,0.0,0.0,1464.0,29.0,29.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,89.07,1.0,1524.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.7143
0.0,0.0,23.22,0.0,0.0,0.0,65.0,1274.0,0.0,0.0,0.0,1419.0,31.0,31.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,131.3,1.0,1454.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.3333
0.0,3.0,144.3,1.0,1.0,12.0,25.0,0.0,1.0,1.0,9.603,1320.0,29.0,28.0,1517.0,0.0,609.7,0.0,13.37,0.0,0.0,3118.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.7143


## Train Model on Preprocessed Data

In [8]:
learn = tabular_learner(data, layers=[14, 7, 3, 1], metrics=rmse)

In [9]:
learn.fit(1, lr=1e-2)

epoch,train_loss,valid_loss,root_mean_squared_error,time
0,0.009727,0.009101,0.093965,10:33


## Obtain Predictions

In [10]:
y_preds = learn.get_preds()

In [None]:
# save predictions locally to read later
import pickle

with open('y_preds_raw', 'wb') as fp:
    pickle.dump(y_preds, fp)

In [None]:
# read saved predictions
with open ('y_preds_raw', 'rb') as fp:
    y_preds = pickle.load(fp)

In [11]:
# extract float value of prediction from tensor
y_preds_extracted = []
for y in y_preds[0]:
    y_preds_extracted.append(y.data[0])

## Measure Performance

In [12]:
from sklearn.metrics import mean_absolute_error as mae

# Measure RMSE error
rmse = np.sqrt(np.mean((np.array(y_preds_extracted)-y_test.values[:len(y_preds_extracted)])**2))
print("RMSE Error: ", rmse)
print()

# Measure MAE error (metric used in Kaggle)
print("MAE Error", mae(np.array(y_preds_extracted),y_test.values[:len(y_preds_extracted)]))

RMSE Error:  0.407788716913769

MAE Error 0.33352112093440056


# Applying PCA to Data for Dimension Reduction

In [7]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

def perform_pca():
    X_train, y_train, X_test, y_test = data_loader()
    scaler = StandardScaler()
    x_train_scaled = scaler.fit_transform(X_train)
    X_train = pd.DataFrame(x_train_scaled, index=X_train.index, columns=X_train.columns)
    X_test = pd.DataFrame(scaler.fit_transform(X_test), index=X_test.index, columns=X_test.columns)
    pca = PCA(0.95)
    pca.fit(X_train)
    columns = ['pca_%i' % i for i in range(pca.components_.shape[0])]
    X_train = pd.DataFrame(pca.transform(X_train), columns=columns, index=X_train.index)
    X_test = pd.DataFrame(pca.transform(X_test), columns=columns, index=X_test.index)
    return X_train, X_test, y_train, y_test

x_train_pca, x_test_pca, y_train_pca, y_test_pca = perform_pca()

before drop NaNs
(4446966, 42)
after drop NaNs
(4446965, 42)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [8]:
x_train_pca['winPlacePerc'] = y_train_pca

## Retrain Model on Dimension Reduced Data produced by PCA

In [9]:
dep_var = 'winPlacePerc'
cont_var_names_pca = x_test_pca.columns

In [10]:
data_pca = (TabularList.from_df(x_train_pca, path=path, cont_names = cont_var_names_pca)
            .split_by_idx(list(range(len(x_train_pca) - 400234, len(x_train_pca))))
            .label_from_df(cols=dep_var)
            .databunch())

In [11]:
learn_pca = tabular_learner(data_pca, layers=[14, 7, 3, 1], metrics=rmse)

In [12]:
learn_pca.fit(1, 1e-2)

epoch,train_loss,valid_loss,root_mean_squared_error,time
0,0.015221,0.012584,0.110938,11:36


## Obtain Predictions

In [13]:
y_preds_pca = learn_pca.get_preds()

In [None]:
for i in range(len(x_test_pca)):
    y_preds_pca.append(learn_pca.predict(x_test_pca.iloc[i]))

In [None]:
# save predictions locally
import pickle

with open('y_preds_pca', 'wb') as fp:
    pickle.dump(y_preds_pca, fp)

In [None]:
# read saved predictions
with open ('y_preds_pca', 'rb') as fp:
    y_preds_pca = pickle.load(fp)

In [14]:
y_preds_pca_extracted = []
for y in y_preds_pca[0]:
    y_preds_pca_extracted.append(y.data[0])

## Measure Performance

In [15]:
from sklearn.metrics import mean_absolute_error as mae

# Measure RMSE error
rmse = np.sqrt(np.mean((np.array(y_preds_pca_extracted)-y_test_pca.values[:len(y_preds_pca_extracted)])**2))
print("RMSE Error: ", rmse)
print()

# Measure MAE error (metric used in Kaggle)
print("MAE Error", mae(np.array(y_preds_pca_extracted),y_test_pca.values[:len(y_preds_pca_extracted)]))

RMSE Error:  0.40755381438899435

MAE Error 0.3333712585652546
