In [2]:
# Import necessary everyday os libs
import sys
import gc

# Import the usual suspects
import numpy as np
import pandas as pd
def df_footprint_reduce(df, skip_obj=False, skip_int=False, skip_float=False, print_comparison=True):
    '''
    :param df              : Pandas Dataframe to shrink in memory footprint size
    :param skip_obj        : If not desired string columns can be skipped during shrink operation
    :param skip_int        : If not desired integer columns can be skipped during shrink operation
    :param skip_float      : If not desired float columns can be skipped during shrink operation
    :param print_comparison: Beware! Printing comparison needs calculation of each columns datasize
                             so if you need speed turn this off. It's just here to show you info                            
    :return                : Pandas Dataframe of exactly the same data and dtypes but in less memory footprint    
    '''
    if print_comparison:
        print(f"Dataframe size before shrinking column types into smallest possible: {round((sys.getsizeof(df)/1024/1024),4)} MB")
    for column in df.columns:
        if (skip_obj is False) and (str(df[column].dtype)[:6] == 'object'):
            num_unique_values = len(df[column].unique())
            num_total_values = len(df[column])
            if num_unique_values / num_total_values < 0.5:
                df.loc[:,column] = df[column].astype('category')
            else:
                df.loc[:,column] = df[column]
        elif (skip_int is False) and (str(df[column].dtype)[:3] == 'int'):
            if df[column].min() > np.iinfo(np.int8).min and df[column].max() < np.iinfo(np.int8).max:
                df[column] = df[column].astype(np.int8)
            elif df[column].min() > np.iinfo(np.int16).min and df[column].max() < np.iinfo(np.int16).max:
                df[column] = df[column].astype(np.int16)
            elif df[column].min() > np.iinfo(np.int32).min and df[column].max() < np.iinfo(np.int32).max:
                df[column] = df[column].astype(np.int32)
        elif (skip_float is False) and (str(df[column].dtype)[:5] == 'float'):
            if df[column].min() > np.finfo(np.float16).min and df[column].max() < np.finfo(np.float16).max:
                df[column] = df[column].astype(np.float16)
            elif df[column].min() > np.finfo(np.float32).min and df[column].max() < np.finfo(np.float32).max:
                df[column] = df[column].astype(np.float32)
    if print_comparison:
        print(f"Dataframe size after shrinking column types into smallest possible: {round((sys.getsizeof(df)/1024/1024),4)} MB")
    return df
def df_null_cleaner(df, fill_with=None, drop_na=False, axis=0):
    df[(df == np.NINF)] = np.NaN
    df[(df == np.Inf)] = np.NaN
    if drop_na:
        df.dropna(axis=axis,inplace=True)
    if ~fill_with:
        df.fillna(fill_with, inplace=True)
    return df
def feature_engineering(df,is_train=True):
    if is_train:          
        df = df[df['maxPlace'] > 1].copy()

    target = 'winPlacePerc'
    print('Grouping similar match types together')
    df.loc[(df['matchType'] == 'solo'), 'matchType'] = 1
    df.loc[(df['matchType'] == 'normal-solo'), 'matchType'] = 1
    df.loc[(df['matchType'] == 'solo-fpp'), 'matchType'] = 1
    df.loc[(df['matchType'] == 'normal-solo-fpp'), 'matchType'] = 1

    df.loc[(df['matchType'] == 'duo'), 'matchType'] = 2
    df.loc[(df['matchType'] == 'normal-duo'), 'matchType'] = 2
    df.loc[(df['matchType'] == 'duo-fpp'), 'matchType'] = 2    
    df.loc[(df['matchType'] == 'normal-duo-fpp'), 'matchType'] = 2

    df.loc[(df['matchType'] == 'squad'), 'matchType'] = 3
    df.loc[(df['matchType'] == 'normal-squad'), 'matchType'] = 3    
    df.loc[(df['matchType'] == 'squad-fpp'), 'matchType'] = 3
    df.loc[(df['matchType'] == 'normal-squad-fpp'), 'matchType'] = 3
    
    df.loc[(df['matchType'] == 'flaretpp'), 'matchType'] = 0
    df.loc[(df['matchType'] == 'flarefpp'), 'matchType'] = 0
    df.loc[(df['matchType'] == 'crashtpp'), 'matchType'] = 0
    df.loc[(df['matchType'] == 'crashfpp'), 'matchType'] = 0
    df.loc[(df['rankPoints'] < 0), 'rankPoints'] = 0
     print('Adding new features using existing ones')
    df['headshotrate'] = df['kills']/df['headshotKills']
    df['killStreakrate'] = df['killStreaks']/df['kills']
    df['healthitems'] = df['heals'] + df['boosts']
    df['totalDistance'] = df['rideDistance'] + df["walkDistance"] + df["swimDistance"]
    df['killPlace_over_maxPlace'] = df['killPlace'] / df['maxPlace']
    df['headshotKills_over_kills'] = df['headshotKills'] / df['kills']
    df['distance_over_weapons'] = df['totalDistance'] / df['weaponsAcquired']
    df['walkDistance_over_heals'] = df['walkDistance'] / df['heals']
    df['walkDistance_over_kills'] = df['walkDistance'] / df['kills']
    df['killsPerWalkDistance'] = df['kills'] / df['walkDistance']
    df['skill'] = df['headshotKills'] + df['roadKills']
    
    print('Adding normalized features')
    df['playersJoined'] = df.groupby('matchId')['matchId'].transform('count')
    gc.collect()
    df['killsNorm'] = df['kills']*((100-df['playersJoined'])/100 + 1)
    df['damageDealtNorm'] = df['damageDealt']*((100-df['playersJoined'])/100 + 1)
    df['maxPlaceNorm'] = df['maxPlace']*((100-df['playersJoined'])/100 + 1)
    df['matchDurationNorm'] = df['matchDuration']*((100-df['playersJoined'])/100 + 1)
    df['headshotKillsNorm'] = df['headshotKills']*((100-df['playersJoined'])/100 + 1)
    df['killPlaceNorm'] = df['killPlace']*((100-df['playersJoined'])/100 + 1)
    df['killPointsNorm'] = df['killPoints']*((100-df['playersJoined'])/100 + 1)
    df['killStreaksNorm'] = df['killStreaks']*((100-df['playersJoined'])/100 + 1)
    df['longestKillNorm'] = df['longestKill']*((100-df['playersJoined'])/100 + 1)
    df['roadKillsNorm'] = df['roadKills']*((100-df['playersJoined'])/100 + 1)
    df['teamKillsNorm'] = df['teamKills']*((100-df['playersJoined'])/100 + 1)
    df['damageDealtNorm'] = df['damageDealt']*((100-df['playersJoined'])/100 + 1)
    df['DBNOsNorm'] = df['DBNOs']*((100-df['playersJoined'])/100 + 1)
    df['revivesNorm'] = df['revives']*((100-df['playersJoined'])/100 + 1)    
    
    # Clean null values from dataframe
    df = df_null_cleaner(df,fill_with=0)

    features = list(df.columns)
    features.remove("Id")
    features.remove("matchId")
    features.remove("groupId")
    features.remove("matchType")  
     y = pd.DataFrame()
    if is_train: 
        print('Preparing target variable')
        y = df.groupby(['matchId','groupId'])[target].agg('mean')
        gc.collect()
        features.remove(target)
        
    print('Aggregating means')
    means_features = list(df.columns)
    means_features.remove("Id")
    means_features.remove("matchId")
    means_features.remove("groupId")
    means_features.remove("matchType")  
    
    if is_train:
        means_features.remove(target)
    
    agg = df.groupby(['matchId','groupId'])[means_features].agg('mean')
    gc.collect()
    agg_rank = agg.groupby('matchId')[means_features].rank(pct=True).reset_index()
    gc.collect()
    
    if is_train: 
        X = agg.reset_index()[['matchId','groupId']]
    else: 
        X = df[['matchId','groupId']]

    X = X.merge(agg.reset_index(), suffixes=["", ""], how='left', on=['matchId', 'groupId'])
    X = X.merge(agg_rank, suffixes=["_mean", "_mean_rank"], how='left', on=['matchId', 'groupId'])
    del agg, agg_rank
    gc.collect()
    
    print('Aggregating maxes')
    maxes_features = list(df.columns) 
    maxes_features.remove("Id")
    maxes_features.remove("matchId")
    maxes_features.remove("groupId")
    maxes_features.remove("matchType")  

    if is_train:
        maxes_features.remove(target)
    
    agg = df.groupby(['matchId','groupId'])[maxes_features].agg('max')
    gc.collect()
    agg_rank = agg.groupby('matchId')[maxes_features].rank(pct=True).reset_index()
    gc.collect()
    X = X.merge(agg.reset_index(), suffixes=["", ""], how='left', on=['matchId', 'groupId'])
    X = X.merge(agg_rank, suffixes=["_max", "_max_rank"], how='left', on=['matchId', 'groupId'])
    del agg, agg_rank
    gc.collect()
    
    print('Aggregating mins')
    mins_features = list(df.columns) 
    mins_features.remove("Id")
    mins_features.remove("matchId")
    mins_features.remove("groupId")
    mins_features.remove("matchType")  
    
    if is_train:
        mins_features.remove(target)
    
    agg = df.groupby(['matchId','groupId'])[mins_features].agg('min')
    gc.collect()
    agg_rank = agg.groupby('matchId')[mins_features].rank(pct=True).reset_index()
    gc.collect()
    X = X.merge(agg.reset_index(), suffixes=["", ""], how='left', on=['matchId', 'groupId'])
    X = X.merge(agg_rank, suffixes=["_min", "_min_rank"], how='left', on=['matchId', 'groupId'])
    del agg, agg_rank
    gc.collect()
    
    print('Aggregating group sizes')
    grpsize_features = list(df.columns) 
    grpsize_features.remove("Id")
    grpsize_features.remove("matchId")
    grpsize_features.remove("groupId")
    grpsize_features.remove("matchType")  
    grpsize_features.remove("DBNOsNorm")
    grpsize_features.remove("damageDealtNorm")
    grpsize_features.remove("headshotKillsNorm")
    grpsize_features.remove("killPlaceNorm")
    grpsize_features.remove("killPlace_over_maxPlace")
    grpsize_features.remove("killPointsNorm")
    grpsize_features.remove("killStreaksNorm")
    grpsize_features.remove("killsNorm")
    grpsize_features.remove("longestKillNorm")
    grpsize_features.remove("matchDurationNorm")
    grpsize_features.remove("matchDuration")
    grpsize_features.remove("maxPlaceNorm")
    grpsize_features.remove("maxPlace")
    grpsize_features.remove("numGroups")
    grpsize_features.remove("playersJoined")
    grpsize_features.remove("revivesNorm")
    grpsize_features.remove("roadKillsNorm")
    grpsize_features.remove("teamKillsNorm")    
    agg = df.groupby(['matchId','groupId'])[grpsize_features].size().reset_index(name='group_size')
    gc.collect()
    X = X.merge(agg, how='left', on=['matchId', 'groupId'])
    
    print('Aggregating match means')
    mmeans_features = list(df.columns) 
    mmeans_features.remove("Id")
    mmeans_features.remove("matchId")
    mmeans_features.remove("groupId")
    mmeans_features.remove("DBNOsNorm")
    mmeans_features.remove("damageDealtNorm")
    mmeans_features.remove("headshotKillsNorm")
    mmeans_features.remove("killPlace_over_maxPlace")
    mmeans_features.remove("killPointsNorm")
    mmeans_features.remove("killStreaksNorm")
     mmeans_features.remove("longestKillNorm")
    mmeans_features.remove("matchDurationNorm")
    mmeans_features.remove("matchDuration")
    mmeans_features.remove("maxPlaceNorm")
    mmeans_features.remove("numGroups")
    mmeans_features.remove("revivesNorm")
    mmeans_features.remove("roadKillsNorm")
    mmeans_features.remove("teamKillsNorm")      
    agg = df.groupby(['matchId'])[mmeans_features].agg('mean').reset_index()
    gc.collect()
    X = X.merge(agg, suffixes=["", "_match_mean"], how='left', on=['matchId'])
    
    print('Aggregating match sizes')
    msizes_features = list(df.columns) 
    msizes_features.remove("Id")
    msizes_features.remove("matchId")
    msizes_features.remove("groupId")
    msizes_features.remove("DBNOsNorm")
    msizes_features.remove("damageDealtNorm")
    msizes_features.remove("headshotKillsNorm")
    msizes_features.remove("killPlace_over_maxPlace")
    msizes_features.remove("killPointsNorm")
    msizes_features.remove("killStreaksNorm")
    msizes_features.remove("longestKillNorm")
    msizes_features.remove("matchDurationNorm")
    msizes_features.remove("matchDuration")
    msizes_features.remove("maxPlaceNorm")
    msizes_features.remove("numGroups")
    msizes_features.remove("revivesNorm")
    msizes_features.remove("roadKillsNorm")
    msizes_features.remove("teamKillsNorm")      
    agg = df.groupby(['matchId']).size().reset_index(name='match_size')
    gc.collect()
    X = X.merge(agg, how='left', on=['matchId'])
    del df, agg
    gc.collect()
 X.drop(columns = ['matchId', 
                      'groupId'
                     ], axis=1, inplace=True)  
    gc.collect()
    if is_train:
        return X, y
    
    return X

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 261)

In [None]:
X_train = pd.read_csv('../input/train_V2.csv', engine='c')

In [None]:
X_train = df_footprint_reduce(X_train, skip_obj=True)
gc.collect()

In [None]:
import matplotlib.pyplot as plt

In [None]:
ax = xgb.plot_importance(model)
fig = ax.figure
fig.set_size_inches(20, 50)

In [None]:
# Clean memory and load test set
del X_train, X_validation, y_train, y_validation 
gc.collect()



In [None]:
test_set = pd.read_csv('../input/test_V2.csv', engine='c')

In [None]:
pred_test = pred_test.reshape(-1)
pred_test = (pred_test + 1) / 2
for i in range(len(test_set)):
    winPlacePerc = pred_test[i]
    maxPlace = int(test_set.iloc[i]['maxPlace'])
    if maxPlace == 0:
        winPlacePerc = 0.0
    elif maxPlace == 1:
        winPlacePerc = 1.0
    else:
        gap = 1.0 / (maxPlace - 1)
        winPlacePerc = round(winPlacePerc / gap) * gap
    
    if winPlacePerc < 0: winPlacePerc = 0.0
    if winPlacePerc > 1: winPlacePerc = 1.0    
    pred_test[i] = winPlacePerc

    if (i + 1) % 100000 == 0:
        print(i, flush=True, end=" ")

test_set['winPlacePerc'] = pred_test

submission = test_set[['Id', 'winPlacePerc']]
submission.to_csv('submission.csv', index=False)