In [6]:
import os
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import math

#model import 
from sklearn.neighbors import KNeighborsRegressor

#splitting and scaling
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

#parameter tuning
from sklearn.model_selection import GridSearchCV

#evaluation
from sklearn import linear_model, metrics
from sklearn.metrics import (mean_squared_error, r2_score, mean_absolute_error, r2_score,
mean_squared_log_error, explained_variance_score, max_error)
from sklearn.model_selection import LeaveOneOut, cross_val_score, cross_val_predict

#from sklearn.svm import LinearSVR
#from sklearn.svm import SVR
#from sklearn.datasets import make_regression
#from sklearn.preprocessing import StandardScaler
#from sklearn.preprocessing import QuantileTransformer
#from sklearn.compose import TransformedTargetRegressor
#from sklearn.pipeline import Pipeline
#from sklearn.model_selection import GridSearchCV

import warnings
# We silence warnings concerning future version updates
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=DeprecationWarning)

In [7]:
# Load training and testing data
X_train = np.loadtxt("X_train.csv", delimiter=',', skiprows=1)
X_test = np.loadtxt("X_test.csv", delimiter=',', skiprows=1)
y_train = np.loadtxt("y_train.csv", delimiter=',', skiprows=1)[:,1]


In [8]:
# Substracting Max & Min
X_train2 = np.zeros((len(X_train),4))
X_test2 = np.zeros((len(X_test),4))

X_train2[:,0] = X_train[:,0]
X_train2[:,1] = X_train[:,2]-X_train[:,1]
X_train2[:,2] = X_train[:,3]
X_train2[:,3] = X_train[:,5]-X_train[:,4]

X_test2[:,0] = X_test[:,0]
X_test2[:,1] = X_test[:,2]-X_test[:,1]
X_test2[:,2] = X_test[:,3]
X_test2[:,3] = X_test[:,5]-X_test[:,4]

scaler = StandardScaler()
X_train2_scaled = scaler.fit_transform(X_train2)
X_test2_scaled = scaler.fit_transform(X_test2)

In [9]:
# Averaging Max & Min
X_train3 = np.zeros((len(X_train),4))
X_test3 = np.zeros((len(X_test),4))

X_train3[:,0] = X_train[:,0]
X_train3[:,1] = (X_train[:,2]+X_train[:,1])/2
X_train3[:,2] = X_train[:,3]
X_train3[:,3] = (X_train[:,5]+X_train[:,4])/2

X_test3[:,0] = X_test[:,0]
X_test3[:,1] = (X_test[:,2]+X_test[:,1])/2
X_test3[:,2] = X_test[:,3]
X_test3[:,3] = (X_test[:,5]+X_test[:,4])/2

scaler = StandardScaler()
X_train3_scaled = scaler.fit_transform(X_train3)
X_test3_scaled = scaler.fit_transform(X_test3)

In [10]:
#scaling 
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

In [11]:
def saveFile(y_pred,name):
    test_header = "Id,PRP"
    n_points = X_test.shape[0]
    y_pred_pp = np.ones((n_points, 2))
    y_pred_pp[:, 0] = range(n_points)
    y_pred_pp[:, 1] = y_pred
    np.savetxt(name, y_pred_pp, fmt='%d,%f', delimiter=",",
               header=test_header, comments="")


In [12]:
#split the dataset for training
X_train1, X_test1, y_train1, y_test1 = train_test_split(X_train, y_train, test_size=0.3, random_state=42)

#scaling split training dataset
scaler = StandardScaler()
X_train1_scaled = scaler.fit_transform(X_train1)
X_test1_scaled = scaler.fit_transform(X_test1)

In [13]:
def basicKNR(X_train1, X_test1, y_train1, y_test1):
    knr = KNeighborsRegressor()
    knr.fit(X_train1, y_train1)
    y_pred = knr.predict(X_test1)
    print("R2 score: " + str(knr.score(X_test1, y_test1)))
    print("Explained variance: " + str(explained_variance_score(y_test1, y_pred)))
    print("Max error: " + str(max_error(y_test1, y_pred)))
    print("Mean absolute error: " + str(mean_absolute_error(y_test1, y_pred)))
    print("Root mean squared error: " + str(math.sqrt(mean_squared_error(y_test1, y_pred))))

In [49]:
basicKNR(X_train1, X_test1, y_train1, y_test1)

R2 score: 0.745258685851206
Explained variance: 0.7452720740983032
Max error: 181.0
Mean absolute error: 36.06274509803922
Root mean squared error: 56.25757291071063


In [None]:
basicKNR(X_train1, X_test1, y_train1, y_test1)

In [44]:
basicKNR(X_train1_scaled, X_test1_scaled, y_train1, y_test1)

R2 score: 0.8455452767778092
Explained variance: 0.8497178348941711
Max error: 170.0
Mean absolute error: 28.015686274509807
Root mean squared error: 43.80582824505541


In [31]:
# K neighbors regression - Further Tuning with Grid Search
knn = KNeighborsRegressor()
# Look at parameters used by our regression
print('Parameters currently in use:\n')
print(knn.get_params())


#Creating the parameter grid

param_grid = [
    {'algorithm' : ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'leaf_size' : [10,15,20,25,30,40,50],
    'n_neighbors' : [2,3,4,5,6,7,8,9,10],
    'p' : [1,2],
    'weights' : ['uniform', 'distance']}
]

Parameters currently in use:

{'algorithm': 'auto', 'leaf_size': 30, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 5, 'p': 2, 'weights': 'uniform'}


In [11]:
#fit the model with the parameter
knn = GridSearchCV(KNeighborsRegressor(), param_grid = param_grid, cv = 10, verbose=True, n_jobs=-1)
# Fit the random search model
best_reg= knn.fit(X_train1_scaled, y_train1)


best_reg.best_params_

Fitting 10 folds for each of 1008 candidates, totalling 10080 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  36 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 10080 out of 10080 | elapsed:    3.9s finished


{'algorithm': 'auto',
 'leaf_size': 15,
 'n_neighbors': 6,
 'p': 1,
 'weights': 'distance'}

In [41]:
def tunedKNR(X_train1, X_test1, y_train1, y_test1):
    knr = KNeighborsRegressor(algorithm = 'auto', leaf_size = 15, n_neighbors = 6, p = 1, weights = 'distance')
    knr.fit(X_train1, y_train1)
    y_pred = knr.predict(X_test1)
    print("R2 score: " + str(knr.score(X_test1, y_test1)))
    print("Explained variance: " + str(explained_variance_score(y_test1, y_pred)))
    print("Max error: " + str(max_error(y_test1, y_pred)))
    print("Mean absolute error: " + str(mean_absolute_error(y_test1, y_pred)))
    print("Root mean squared error: " + str(math.sqrt(mean_squared_error(y_test1, y_pred))))
    
    return y_pred

In [43]:
tunedKNR(X_train1_scaled, X_test1_scaled, y_train1, y_test1)

R2 score: 0.8709506172635552
Explained variance: 0.8765968443318394
Max error: 161.95059200774546
Mean absolute error: 24.56248828278559
Root mean squared error: 40.04140104982561


array([270.55602753,  22.65084022, 104.5356992 ,  45.88586952,
        22.20682092,  21.80611253,  20.08584007,  27.89833962,
        81.82786273,  43.33388911,  43.10385853,  71.83508607,
        65.6538148 ,  18.81926319,  30.02271761,  29.50344449,
       255.98519718,  48.44564355, 396.62557943,  48.3275335 ,
        21.58375625,  51.79344579,  74.90410284,  29.76735216,
        36.08516353,  98.23421723,  15.01995688,  75.49195737,
       388.07558583,  58.18111616,  18.81926319,  30.02105801,
        42.3746263 ,  81.00336182,  42.16474203, 301.95059201,
       290.74690112,  53.92733158,  66.90917072, 130.16551757,
        17.57336017,  54.53483926, 112.60385751,  32.68108296,
       176.36683746,  73.70998418,  18.74623587,  30.58593261,
       539.33131313,  26.81589614,  13.87223317])

In [18]:
#run our final prediction
knr_tuned = KNeighborsRegressor(algorithm = 'auto', leaf_size = 15, n_neighbors = 6, p = 1, weights = 'distance')
knr_tuned.fit(X_train_scaled, y_train)
y_pred = knr_tuned.predict(X_test_scaled)
saveFile(y_pred, "kNeighbour_submission.csv")