In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the raw training table from the working directory.
df_train = pd.read_csv(filepath_or_buffer="train.csv")

In [3]:
def reduce_mem_usage(df, use_float16=True):
    """Downcast numeric columns to the narrowest dtype that can hold their value range.

    Every column backed by a plain NumPy integer or float dtype is inspected:

    * signed integers are narrowed to the smallest of int8/int16/int32/int64
      whose range contains the column's min and max (bounds inclusive);
    * unsigned integers are narrowed the same way within uint8..uint64;
    * floats are narrowed to float16 (optional) or float32, otherwise float64.

    All other columns (object/string, bool, datetime, categorical, nullable
    extension dtypes, ...) are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    use_float16 : bool, default True
        Whether float16 is an allowed target. The choice is made on value
        *range* only; float16 carries an 11-bit significand, so values are
        rounded to roughly three significant decimal digits. Pass ``False``
        to make float32 the narrowest float type.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, returned for call chaining.
    """
    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float16, np.float32) if use_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        # Restrict the work to NumPy int/uint/float columns. Checking the dtype
        # kind (instead of the dtype's string prefix) keeps unsigned and boolean
        # columns out of the float branch and skips dtypes min()/astype() cannot
        # handle numerically.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind in 'iu':
            # An empty integer column has NaN as min/max; nothing to decide.
            if pd.isna(c_min) or pd.isna(c_max):
                continue
            lo, hi = int(c_min), int(c_max)
            candidates = signed_types if col_type.kind == 'i' else unsigned_types
            for candidate in candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the type's min/max still fits.
                if limits.min <= lo and hi <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_types:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Nothing narrower fits (includes all-NaN, empty or infinite
                # columns, where the comparisons above are False).
                df[col] = df[col].astype(np.float64)

    return df
# Downcast the training frame (reduce_mem_usage modifies it in place and returns it).
df_train = reduce_mem_usage(df_train)
# No earlier cell creates `df_test`, so on a fresh kernel the original line raised
# NameError. Load the test table here before downcasting it.
df_test = reduce_mem_usage(pd.read_csv("test.csv"))

In [3]:
# Reference: https://www.kaggle.com/anycode/simple-nn-baseline/code
def FeatureEngineering(df):
    """Append per-group and per-match aggregate features and drop identifier columns.

    For every row the following are left-joined onto the input, in this order:

    * ``group_size`` - number of rows sharing the row's (matchId, groupId);
    * ``<col>_mean``, ``<col>_sum``, ``<col>_max``, ``<col>_min`` - aggregates of
      each input column over the row's (matchId, groupId);
    * ``<col>_match_mean`` - mean of each input column over the row's matchId.

    Each aggregate frame is built immediately before it is merged and released
    right after, so only one of them is alive at a time (building all six up
    front keeps them all in memory at once).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Id``, ``matchId`` and ``groupId``; the aggregations are
        applied to every non-key column, so those need to support
        mean/sum/max/min.

    Returns
    -------
    pandas.DataFrame
        Input columns plus the aggregate columns, without ``Id``, ``matchId``,
        ``groupId`` and the ``Id_*`` aggregates. Row order follows the input.

    Raises
    ------
    ValueError
        If one of the identifier columns listed at the end is missing from the
        merged frame.
    """
    group_keys = ['matchId', 'groupId']
    # Aggregates are always computed from the untouched input, never from the
    # progressively widened merge result.
    source = df

    group_size = source.groupby(group_keys).size().reset_index(name='group_size')
    df = pd.merge(df, group_size, how='left', on=group_keys)
    del group_size

    for stat in ('mean', 'sum', 'max', 'min'):
        aggregated = getattr(source.groupby(group_keys), stat)().reset_index()
        # Overlapping (non-key) columns from the right side get the "_<stat>" suffix.
        df = pd.merge(df, aggregated, suffixes=["", "_" + stat], how='left', on=group_keys)
        del aggregated

    match_mean = source.groupby(['matchId']).mean().reset_index()
    df = pd.merge(df, match_mean, suffixes=["", "_match_mean"], how='left', on=['matchId'])
    del match_mean

    # NOTE(review): grouping by matchId alone also averages groupId, so a
    # `groupId_match_mean` column is produced and is NOT in the removal list
    # below. It is kept to preserve the existing feature set - confirm it is wanted.
    columns = list(df.columns)
    for identifier in ("Id", "matchId", "groupId",
                       "Id_mean", "Id_sum", "Id_max", "Id_min", "Id_match_mean"):
        columns.remove(identifier)

    return df[columns]

In [4]:
# MAE (mean absolute error)
def MAE(y_estimate, y_true):
    """Return the mean absolute error between predictions and targets.

    Both inputs are converted to NumPy float arrays and compared element by
    element *by position*. A pandas index on either argument is ignored, so a
    shuffled target Series (e.g. from a random train/test split) is compared
    against predictions in row order instead of being re-aligned by label.

    Parameters
    ----------
    y_estimate : array-like
        Predicted values.
    y_true : array-like
        Ground-truth values, same shape as ``y_estimate``.

    Returns
    -------
    float
        ``mean(|y_estimate - y_true|)``.

    Raises
    ------
    ValueError
        If the inputs differ in shape (which would otherwise broadcast
        silently) or are empty.
    """
    estimate = np.asarray(y_estimate, dtype=float)
    truth = np.asarray(y_true, dtype=float)

    if estimate.shape != truth.shape:
        raise ValueError(
            'y_estimate and y_true must have the same shape, got {} and {}'.format(
                estimate.shape, truth.shape))
    if estimate.size == 0:
        raise ValueError('MAE is undefined for empty input')

    # Vectorised reduction; the builtin sum() iterates element by element in Python.
    return float(np.mean(np.abs(estimate - truth)))

In [5]:
from sklearn.model_selection import train_test_split

In [6]:
# Separate the target from the predictors, then add the aggregate features.
X = df_train.drop(columns=['winPlacePerc'])
y = df_train['winPlacePerc']
X = FeatureEngineering(X)
# Fixed seed: without it every kernel restart yields a different split, so the
# hold-out scores printed below could not be reproduced or compared between runs.
# NOTE(review): the split is row-wise, so rows sharing a matchId/groupId (and
# therefore the same aggregate features) can land on both sides; the hold-out
# score may be optimistic - consider a group-aware split.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

print(('Training set size: {train:d}, test set size: {test:d}').format(train=len(X_train), test=len(X_test)))

Training set size: 3268002, test set size: 1089334


# RandomForest

In [7]:
def RandomForestModel(n_estimators=10, submission_path='submission_rfr.csv', random_state=None):
    """Fit a random forest, report its hold-out MAE and write a submission file.

    Relies on notebook-level names: ``X_train``, ``y_train``, ``X_test`` and
    ``y_test`` (the split), plus ``FeatureEngineering`` and ``MAE``. Reads
    ``test.csv`` and ``sample_submission.csv`` from the working directory.

    Parameters
    ----------
    n_estimators : int, default 10
        Number of trees in the forest.
    submission_path : str, default 'submission_rfr.csv'
        Where the submission CSV is written.
    random_state : int or None, default None
        Seed forwarded to ``RandomForestRegressor``; ``None`` keeps the
        previous unseeded behaviour.

    Returns
    -------
    None
        Progress and the score are printed; the predictions go to
        ``submission_path``.
    """
    print('\nCreating and training random forest regressor')
    from sklearn.ensemble import RandomForestRegressor
    rfr = RandomForestRegressor(n_jobs=4, n_estimators=n_estimators, verbose=True,
                                random_state=random_state)
    rfr.fit(X_train, y_train)

    y_rfr = rfr.predict(X_test)
    score_rfr = MAE(y_rfr, y_test)
    print(('Random Forest training testset score: {s:.3f}').format(s=score_rfr))

    # Read the test set data and make predictions
    X_submit = pd.read_csv('test.csv')
    X_submit = FeatureEngineering(X_submit)
    y_submit = rfr.predict(X_submit)
    # The wide feature frame is no longer needed; release it before loading more data.
    del X_submit

    # Predictions are attached positionally, so this assumes sample_submission.csv
    # lists rows in the same order as test.csv - TODO confirm.
    submission = pd.read_csv('sample_submission.csv')
    submission['winPlacePerc'] = y_submit

    submission.to_csv(submission_path, index=False)
    print('Random Forest submission file made\n')

In [8]:
# Train the forest defined in the previous cell, print its hold-out MAE and
# write the submission CSV (the recorded output follows).
RandomForestModel()


Creating and training random forest regressor


[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed: 26.3min finished
[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    4.2s finished


Random Forest training testset score: 0.019


[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:    7.6s finished


Random Forest submission file made



In [9]:
def RandomForestModel(n_estimators=30, submission_path='submission_rfr_1017.csv', random_state=None):
    """Fit a random forest, report its hold-out MAE and write a submission file.

    This definition replaces any earlier ``RandomForestModel`` in the kernel.
    Its defaults are 30 trees and ``submission_rfr_1017.csv``; pass other values
    instead of redefining the function for each experiment.

    Relies on notebook-level names: ``X_train``, ``y_train``, ``X_test`` and
    ``y_test`` (the split), plus ``FeatureEngineering`` and ``MAE``. Reads
    ``test.csv`` and ``sample_submission.csv`` from the working directory.

    Parameters
    ----------
    n_estimators : int, default 30
        Number of trees in the forest.
    submission_path : str, default 'submission_rfr_1017.csv'
        Where the submission CSV is written.
    random_state : int or None, default None
        Seed forwarded to ``RandomForestRegressor``; ``None`` keeps the
        previous unseeded behaviour.

    Returns
    -------
    None
        Progress and the score are printed; the predictions go to
        ``submission_path``.
    """
    print('\nCreating and training random forest regressor')
    from sklearn.ensemble import RandomForestRegressor
    rfr = RandomForestRegressor(n_jobs=4, n_estimators=n_estimators, verbose=True,
                                random_state=random_state)
    rfr.fit(X_train, y_train)

    y_rfr = rfr.predict(X_test)
    score_rfr = MAE(y_rfr, y_test)
    print(('Random Forest training testset score: {s:.3f}').format(s=score_rfr))

    # Read the test set data and make predictions
    X_submit = pd.read_csv('test.csv')
    X_submit = FeatureEngineering(X_submit)
    y_submit = rfr.predict(X_submit)
    # The wide feature frame is no longer needed; release it before loading more data.
    del X_submit

    # Predictions are attached positionally, so this assumes sample_submission.csv
    # lists rows in the same order as test.csv - TODO confirm.
    submission = pd.read_csv('sample_submission.csv')
    submission['winPlacePerc'] = y_submit

    submission.to_csv(submission_path, index=False)
    print('Random Forest submission file made\n')

In [10]:
# Re-run with the redefined function from the previous cell. In the recorded
# output below, the run printed the hold-out score and then ended in MemoryError.
RandomForestModel()


Creating and training random forest regressor


[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed: 76.5min finished
[Parallel(n_jobs=4)]: Done  30 out of  30 | elapsed:   19.4s finished


Random Forest training testset score: 0.018


MemoryError: 