In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Show the names of the entries found in the ../input directory.
print(os.listdir(path="../input"))

def reduce_mem_usage(df, use_float16=True):
    """Downcast numeric columns of ``df`` to the smallest dtype that fits them.

    Signed-integer, unsigned-integer and float columns backed by NumPy dtypes
    are narrowed within their own family, based on each column's observed
    minimum and maximum (bounds are inclusive). Every other column (object or
    string, bool, datetime, categorical, nullable extension dtypes, ...) is
    left untouched, as are empty or all-NaN columns. A column is never widened.
    Memory usage before and after is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    use_float16 : bool, default True
        Allow float columns to shrink to ``float16``. float16 keeps only about
        three significant decimal digits, so stored values get rounded; pass
        ``False`` to stop at ``float32``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidate dtypes per NumPy dtype kind, ordered from narrowest to widest.
    int_candidates = {
        'i': (np.int8, np.int16, np.int32, np.int64),
        'u': (np.uint8, np.uint16, np.uint32, np.uint64),
    }
    if use_float16:
        float_candidates = (np.float16, np.float32, np.float64)
    else:
        float_candidates = (np.float32, np.float64)

    for col in df.columns:
        col_type = df[col].dtype

        # Only plain NumPy int/uint/float columns are safe to downcast; bool,
        # datetime, object and extension dtypes would be corrupted or raise.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        # Empty or all-NaN columns give NaN here; nothing to base a choice on.
        if pd.isna(c_min) or pd.isna(c_max):
            continue

        if col_type.kind == 'f':
            candidates, limits = float_candidates, np.finfo
            c_min, c_max = float(c_min), float(c_max)
        else:
            candidates, limits = int_candidates[col_type.kind], np.iinfo
            # Plain Python ints make the range comparisons exact.
            c_min, c_max = int(c_min), int(c_max)

        for candidate in candidates:
            # Stop once a candidate is no smaller than the current dtype:
            # converting would save nothing (or would widen the column).
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                break
            bounds = limits(candidate)
            if bounds.min <= c_min and c_max <= bounds.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard against a zero-byte frame so the report never divides by zero.
    saved_pct = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(saved_pct))

    return df

In [2]:
# Load the training data, discard rows containing missing values, then
# downcast the numeric columns to reduce memory usage.
training_set = (
    pd.read_csv("../input/train_V2.csv")
    .dropna()
    .pipe(reduce_mem_usage)
)

# Preview the first rows to check the format.
training_set.head(n=5)

In [3]:
# Load the test data and downcast its numeric columns to reduce memory usage.
test_set = pd.read_csv("../input/test_V2.csv").pipe(reduce_mem_usage)

# Preview the first rows to check the format.
test_set.head(n=5)

In [None]:
# Remove the matchType column from the test frame.
# errors='ignore' keeps this cell re-runnable: a second execution is a no-op
# instead of raising KeyError because the column is already gone.
test_set = test_set.drop(columns=['matchType'], errors='ignore')

In [None]:
# Preview the first five rows of test_set to confirm the change.
test_set.head(n=5)

In [None]:
# Remove the matchType column from the training frame.
# errors='ignore' keeps this cell re-runnable: a second execution is a no-op
# instead of raising KeyError because the column is already gone.
training_set = training_set.drop(columns=['matchType'], errors='ignore')

In [None]:
# Preview the first five rows of training_set to confirm the change.
training_set.head(n=5)

In [None]:
# Feature matrix: the columns at positions 3 through 25 of training_set.
# NOTE(review): presumably this skips leading identifier columns and the
# trailing target; confirm against the CSV header.
X_train = training_set.iloc[:, slice(3, 26)]

In [None]:
# Preview the first five rows of the selected training features.
X_train.head(n=5)

In [None]:
# Target vector: the column at position 27 of training_set.
# NOTE(review): presumably this is winPlacePerc; confirm against the CSV header.
y_train = training_set[training_set.columns[27]]

In [None]:
# Preview the first five target values.
y_train.head(n=5)

In [None]:
# Test features: the same positional slice (columns 3 through 25) that is used
# for X_train, so both frames feed the model the same columns.
X_test = test_set.iloc[:, slice(3, 26)]

In [None]:
# Preview the first five rows of the selected test features.
X_test.head(n=5)

In [None]:
# Show the training features once more, for comparison with X_test.
X_train.head(n=5)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out half of the labelled rows for evaluation; the fixed seed makes the
# split reproducible.
(
    X_train_train,
    X_train_test,
    y_train_train,
    y_train_test,
) = train_test_split(
    X_train,
    y_train,
    test_size=0.50,
    random_state=42,
)

In [None]:
from sklearn.linear_model import LinearRegression

# Baseline model: ordinary least-squares linear regression with an intercept.
regressor = LinearRegression(fit_intercept=True)

In [None]:
# Fit the linear baseline on the training half of the split.
regressor.fit(X=X_train_train, y=y_train_train)

In [None]:
# regressor.score() on a regression model returns the coefficient of
# determination (R^2), not a classification accuracy, so label it as such.
# The second figure is measured on the half of the labelled data held out by
# train_test_split, not on the competition test file.
print("R^2 on training split: ", regressor.score(X_train_train, y_train_train))
print("R^2 on held-out split: ", regressor.score(X_train_test, y_train_test))

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize every feature column of X_train: the scaler is fitted on the
# frame and then used to transform that same frame.
scaler = StandardScaler()
sc_X_train = scaler.fit(X_train).transform(X_train)

In [None]:
# Split the raw features first, then fit the scaler on the training part only.
# Fitting it on every row before splitting leaks the held-out rows' mean and
# variance into training and makes the held-out score optimistic.
# The row partition is unchanged: same test_size and random_state as before.
X_split_train, X_split_test, y_train_train_scaled, y_train_test_scaled = train_test_split(
    X_train, y_train, test_size=1/3, random_state=42
)
split_scaler = StandardScaler().fit(X_split_train)
X_train_train_scaled = split_scaler.transform(X_split_train)
X_train_test_scaled = split_scaler.transform(X_split_test)

In [None]:
from sklearn.neural_network import MLPRegressor

# Small multi-layer perceptron (hidden layers of 5 and 2 units) trained with
# L-BFGS on the standardized features; fit() returns the fitted estimator.
regr = MLPRegressor(
    hidden_layer_sizes=(5, 2),
    solver='lbfgs',
    alpha=1e-5,
    max_iter=200,
    random_state=1,
).fit(X_train_train_scaled, y_train_train_scaled)

# Report the model's score on both parts of the split.
print(
    "Score on Training Set: ",
    regr.score(X_train_train_scaled, y_train_train_scaled),
)
print(
    "Score on Test Set: ",
    regr.score(X_train_test_scaled, y_train_test_scaled),
)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Random forests are stochastic (bootstrap samples and feature sub-sampling).
# Seeding the estimator makes the submission reproducible on a fresh
# "Restart & Run All"; 42 matches the seed used for train_test_split.
rf_regressor = RandomForestRegressor(random_state=42)

In [None]:
# Train the random forest on all labelled rows (no hold-out here).
rf_regressor.fit(X=X_train, y=y_train)

In [None]:
# Predict the target for every row of the test features.
y_pred = rf_regressor.predict(X=X_test)

In [None]:
# Build the submission: the test Id column paired with the predictions, then
# write it to disk without the index column.
data_to_submit = test_set[['Id']].assign(winPlacePerc=y_pred)
data_to_submit.to_csv('csv_to_submit.csv', index=False)