In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import matplotlib.pyplot as plt # Collection of functions for scientific and publication-ready visualization
import seaborn as sns  # Visualization library based on matplotlib, provides interface for drawing attractive statistical graphics

# The competition CSV files are expected in the "../input/" directory.
# Listing it up front makes the file names used by the loading cells easy to verify.

import os
input_dir = "../input"
print(os.listdir(input_dir))

def _smallest_fitting_dtype(c_min, c_max, candidates, limits):
    """Return the first dtype in `candidates` whose range contains [c_min, c_max].

    `limits` is `np.iinfo` or `np.finfo`. Returns None when nothing fits, which
    includes the case where c_min/c_max are NaN (empty or all-NaN column),
    because every comparison with NaN is False.
    """
    for candidate in candidates:
        bounds = limits(candidate)
        # Inclusive bounds: a value equal to the type's min/max is representable.
        if bounds.min <= c_min and c_max <= bounds.max:
            return candidate
    return None


def reduce_mem_usage(df, use_float16=True):
    """Downcast the numeric columns of a dataframe to reduce memory usage.

    The dataframe is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are downcast.
    use_float16 : bool, default True
        When True, float columns whose value range fits are stored as float16.
        The check is on range only: float16 keeps roughly three significant
        decimal digits, so values are rounded. Pass False to use float32 as
        the smallest float type.

    Returns
    -------
    pandas.DataFrame
        The same object as `df`.

    Notes
    -----
    * Signed integers are downcast among int8/16/32/64, unsigned integers
      among uint8/16/32/64, floats among float16/32/64.
    * A column is only ever cast to a strictly smaller type; it is never
      up-cast.
    * Bool, object, datetime and pandas extension dtypes are left untouched.
    """
    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float16, np.float32, np.float64) if use_float16 else (np.float32, np.float64)

    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype

        # Extension dtypes (category, nullable ints, strings, ...) are not plain
        # NumPy dtypes and cannot be range-checked with iinfo/finfo.
        if not isinstance(col_type, np.dtype):
            continue

        if col_type.kind == 'i':
            candidates, limits = signed_types, np.iinfo
        elif col_type.kind == 'u':
            candidates, limits = unsigned_types, np.iinfo
        elif col_type.kind == 'f':
            candidates, limits = float_types, np.finfo
        else:
            # bool, object, datetime, timedelta, complex: nothing to downcast.
            continue

        target = _smallest_fitting_dtype(df[col].min(), df[col].max(), candidates, limits)
        if target is not None and np.dtype(target).itemsize < col_type.itemsize:
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard against a zero-byte frame so the report never divides by zero.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df

In [None]:
# Load the training data and discard every row that contains a missing value.
training_set = pd.read_csv("../input/train_V2.csv").dropna()

In [None]:
# Label-encode matchType, the only string column that is written back as a feature.
# The identifier columns (Id, groupId, matchId) are deliberately not encoded:
# their encoded values were never used, and encoding them only costs time and memory.
le = LabelEncoder()
training_set["matchType"] = le.fit_transform(training_set["matchType"])
# If the identifier columns ever need encoding, use a separate LabelEncoder per
# column so that `le` stays fitted on matchType.

In [None]:
# Downcast the numeric columns to shrink the frame's memory footprint
# (reduce_mem_usage modifies the frame it is given and returns it).
training_set = reduce_mem_usage(training_set)

# Preview the first rows; rows with missing values were dropped when the data was loaded.
training_set.head()

In [None]:
# Pearson correlation heat map over columns 3..28 (everything after the three ID columns).
colormap = plt.cm.RdBu
correlations = training_set.iloc[:, 3:29].astype(float).corr()

plt.figure(figsize=(14, 12))
plt.title('Pearson Correlation of Features', y=1.05, size=15)
sns.heatmap(
    correlations,
    annot=True,
    cmap=colormap,
    linecolor='white',
    linewidths=0.6,
    square=True,
    vmax=1.0,
)

Columns under consideration for removal because their correlation with winPlacePerc is close to 0:
* killPoints: 0.013
* matchDuration: 0.0052
* matchType: 0.029
* maxPlace: 0.0370
* numGroups: 0.040
* rankPoints: 0.014
* roadKills: 0.035
* teamKills: 0.016
* vehicleDestroys: 0.073
* winPoints: 0.007

In [None]:
# TODO: check whether dropping these low-correlation columns changes the model score
# training_set = training_set.drop(columns = ['killPoints', 'matchDuration', 'matchType', 'maxPlace', 'numGroups', 'rankPoints', 'roadKills',
#                                            'teamKills', 'vehicleDestroys', 'winPoints'])

In [None]:
# Preview the training frame again before splitting it into features and target.
training_set.head()

In [None]:
# Select the feature columns by name instead of by position: a positional slice
# silently shifts when columns are dropped or reordered, which could pull the
# target into the feature matrix.
non_feature_cols = ["Id", "groupId", "matchId", "winPlacePerc"]
feature_cols = [col for col in training_set.columns if col not in non_feature_cols]
X_train = training_set[feature_cols]

In [None]:
# Preview the feature matrix to confirm that the expected columns were selected.
X_train.head()

In [None]:
# Select the target by name so it stays correct if columns are dropped or reordered.
y_train = training_set["winPlacePerc"]

In [None]:
# Preview the target values to confirm that the expected column was selected.
y_train.head()

In [None]:
# Second look at the feature matrix right before it is split.
X_train.head()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out half of the labelled data for evaluation; the seed keeps the split reproducible.
holdout_split = train_test_split(X_train, y_train, test_size=0.50, random_state=42)
X_train_train, X_train_test, y_train_train, y_train_test = holdout_split

### Supervised Learning

In [None]:
# Baseline model: a plain linear regression, used as a reference point for later models.
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()

In [None]:
# Fit the baseline on the training half of the hold-out split.
regressor.fit(X_train_train, y_train_train)

In [None]:
# LinearRegression.score returns the coefficient of determination (R^2),
# not a classification accuracy, so label the numbers accordingly.
print("R^2 on training set: ", regressor.score(X_train_train, y_train_train))
print("R^2 on test set: ", regressor.score(X_train_test, y_train_test))

In [None]:
# TODO: evaluate whether a neural network (MLPRegressor on standardized features) beats the linear baseline
# from sklearn.preprocessing import StandardScaler
# scaler = StandardScaler()
# sc_X_train = scaler.fit_transform(X_train)

In [None]:
# X_train_train_scaled, X_train_test_scaled, y_train_train_scaled, y_train_test_scaled = train_test_split(sc_X_train, y_train, test_size=1/3, random_state=42)

In [None]:
# from sklearn.neural_network import MLPRegressor
# regr = MLPRegressor(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=1, max_iter = 200)
# regr.fit(X_train_train_scaled, y_train_train_scaled)
# print("Score on Training Set: ", regr.score(X_train_train_scaled, y_train_train_scaled))
# print("Score on Test Set: ", regr.score(X_train_test_scaled, y_train_test_scaled))

### Test File Submission

In [26]:
# Importing the test set
test_set = pd.read_csv("../input/test_V2.csv")

# Reuse the encoder that was fitted on the training matchType column so that
# every match type maps to the same integer code in both data sets. Fitting a
# new encoder here would only agree with training if both files happened to
# contain exactly the same set of match types; transform() instead raises a
# ValueError on a label it has not seen.
test_set["matchType"] = le.transform(test_set["matchType"])
test_set = reduce_mem_usage(test_set)

test_set.head()

Memory usage of dataframe is 413.18 MB
Memory usage after optimization is: 108.83 MB
Decreased by 73.7%


Unnamed: 0,Id,groupId,matchId,assists,boosts,damageDealt,DBNOs,headshotKills,heals,killPlace,killPoints,kills,killStreaks,longestKill,matchDuration,matchType,maxPlace,numGroups,rankPoints,revives,rideDistance,roadKills,swimDistance,teamKills,vehicleDestroys,walkDistance,weaponsAcquired,winPoints
0,9329eb41e215eb,676b23c24e70d6,45b576ab7daa7f,0,0,51.46875,0,0,0,73,0,0,0,0.0,1884,15,28,28,1500,0,0.0,0,0.0,0,0,588.0,1,0
1,639bd0dcd7bda8,430933124148dd,42a9a0b906c928,0,4,179.125,0,0,2,11,0,2,1,362.0,1811,3,48,47,1503,2,4668.0,0,0.0,0,0,2017.0,6,0
2,63d5c8ef8dfe91,0b45f5db20ba99,87e7e4477a048e,1,0,23.40625,0,0,4,49,0,0,0,0.0,1793,15,28,27,1565,0,0.0,0,0.0,0,0,788.0,4,0
3,cf5b81422591d1,b7497dbdc77f4a,1b9a94f1af67f1,0,0,65.5,0,0,0,54,0,0,0,0.0,1834,3,45,44,1465,0,0.0,0,0.0,0,0,1812.0,3,0
4,ee6a295187ba21,6604ce20a1d230,40754a93016066,0,4,330.25,1,2,1,7,0,3,1,60.0625,1326,15,28,27,1480,1,0.0,0,0.0,0,0,2964.0,4,0


In [None]:
# Take exactly the columns the model is trained on, in the same order, rather
# than relying on a positional slice lining up with the training frame.
X_test = test_set[X_train.columns]

In [None]:
# Preview the test feature matrix to confirm it has the same columns as X_train.
X_test.head()

In [None]:
# random forest model
# Seeded so that repeated runs build the same forest and produce the same submission.
from sklearn.ensemble import RandomForestRegressor
test_regressor = RandomForestRegressor(random_state=42)

# Alternative linear model. The target is continuous, so a regressor (LinearSVR)
# is required here; LinearSVC is a classifier.
# from sklearn.svm import LinearSVR
# test_regressor = LinearSVR(random_state=42)

In [None]:
# Fit the selected model on the full training data (X_train / y_train, not the hold-out split).
test_regressor.fit(X_train, y_train)

In [None]:
# Predict the target for every row of the test feature matrix.
y_pred = test_regressor.predict(X_test)

In [27]:
# Assemble the submission: one row per test Id with its predicted winPlacePerc.
# y_pred must already exist, i.e. the prediction cell has to be run before this one.
submission_columns = {'Id': test_set['Id'], 'winPlacePerc': y_pred}
data_to_submit = pd.DataFrame(submission_columns)
data_to_submit.to_csv('csv_to_submit.csv', index=False)

NameError: name 'y_pred' is not defined

In [None]:
# Count the non-null entries per column of the submission frame as a sanity check.
data_to_submit.count()