In [6]:
import os
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import math

#model import 
from sklearn.neighbors import KNeighborsRegressor

#splitting and scaling
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

#parameter tuning
from sklearn.model_selection import GridSearchCV

#evaluation
from sklearn import linear_model, metrics
from sklearn.metrics import (mean_squared_error, r2_score, mean_absolute_error, r2_score,
mean_squared_log_error, explained_variance_score, max_error)
from sklearn.model_selection import LeaveOneOut, cross_val_score, cross_val_predict

#from sklearn.svm import LinearSVR
#from sklearn.svm import SVR
#from sklearn.datasets import make_regression
#from sklearn.preprocessing import StandardScaler
#from sklearn.preprocessing import QuantileTransformer
#from sklearn.compose import TransformedTargetRegressor
#from sklearn.pipeline import Pipeline
#from sklearn.model_selection import GridSearchCV

import warnings
# We silence warnings concerning future version updates
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=DeprecationWarning)

In [7]:
# Load training and testing data
X_train = np.loadtxt("X_train.csv", delimiter=',', skiprows=1)
X_test = np.loadtxt("X_test.csv", delimiter=',', skiprows=1)
y_train = np.loadtxt("y_train.csv", delimiter=',', skiprows=1)[:,1]


In [29]:
# Averaging Max & Min
X_train3 = np.zeros((len(X_train),4))
X_test3 = np.zeros((len(X_test),4))

X_train3[:,0] = X_train[:,0]
X_train3[:,1] = (X_train[:,2]+X_train[:,1])/2
X_train3[:,2] = X_train[:,3]
X_train3[:,3] = (X_train[:,5]+X_train[:,4])/2

X_test3[:,0] = X_test[:,0]
X_test3[:,1] = (X_test[:,2]+X_test[:,1])/2
X_test3[:,2] = X_test[:,3]
X_test3[:,3] = (X_test[:,5]+X_test[:,4])/2

scaler = StandardScaler()
X_train3_scaled = scaler.fit_transform(X_train3)
X_test3_scaled = scaler.fit_transform(X_test3)


In [30]:
#split the dataset for training
X_train1_avg, X_test1_avg, y_train1_avg, y_test1_avg = train_test_split(X_train3, y_train, test_size=0.3, random_state=42)

#scaling split training dataset
scaler = StandardScaler()
X_train1_scaled_avg = scaler.fit_transform(X_train1_avg)
X_test1_scaled_avg = scaler.fit_transform(X_test1_avg)

In [31]:
#scaling 
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.fit_transform(X_test)

In [32]:
def saveFile(y_pred,name):
    test_header = "Id,PRP"
    n_points = X_test.shape[0]
    y_pred_pp = np.ones((n_points, 2))
    y_pred_pp[:, 0] = range(n_points)
    y_pred_pp[:, 1] = y_pred
    np.savetxt(name, y_pred_pp, fmt='%d,%f', delimiter=",",
               header=test_header, comments="")


In [33]:
#split the dataset for training
X_train1, X_test1, y_train1, y_test1 = train_test_split(X_train, y_train, test_size=0.3, random_state=42)

#scaling split training dataset
scaler = StandardScaler()
X_train1_scaled = scaler.fit_transform(X_train1)
X_test1_scaled = scaler.fit_transform(X_test1)

In [34]:
def basicKNR(X_train1, X_test1, y_train1, y_test1):
    knr = KNeighborsRegressor()
    knr.fit(X_train1, y_train1)
    y_pred = knr.predict(X_test1)
    print("R2 score: " + str(knr.score(X_test1, y_test1)))
    print("Explained variance: " + str(explained_variance_score(y_test1, y_pred)))
    print("Max error: " + str(max_error(y_test1, y_pred)))
    print("Mean absolute error: " + str(mean_absolute_error(y_test1, y_pred)))
    print("Root mean squared error: " + str(math.sqrt(mean_squared_error(y_test1, y_pred))))

In [35]:
basicKNR(X_train1, X_test1, y_train1, y_test1)

R2 score: 0.745258685851206
Explained variance: 0.7452720740983032
Max error: 181.0
Mean absolute error: 36.06274509803922
Root mean squared error: 56.25757291071063


In [36]:
#with average
basicKNR(X_train1_scaled_avg, X_test1_scaled_avg, y_train1_avg, y_test1_avg)

R2 score: 0.8413630599621043
Explained variance: 0.8488234242335814
Max error: 215.39999999999998
Mean absolute error: 27.51764705882353
Root mean squared error: 44.39493876823454


In [19]:
basicKNR(X_train1_avg, X_test1_avg, y_train1_avg, y_test1_avg)

R2 score: 0.7473844835093151
Explained variance: 0.7473936384164462
Max error: 178.8
Mean absolute error: 35.79607843137255
Root mean squared error: 56.02234848176007


In [44]:
basicKNR(X_train1_scaled, X_test1_scaled, y_train1, y_test1)

R2 score: 0.8455452767778092
Explained variance: 0.8497178348941711
Max error: 170.0
Mean absolute error: 28.015686274509807
Root mean squared error: 43.80582824505541


In [37]:
# K neighbors regression - Further Tuning with Grid Search
knn = KNeighborsRegressor()
# Look at parameters used by our regression
print('Parameters currently in use:\n')
print(knn.get_params())


#Creating the parameter grid

param_grid = [
    {'algorithm' : ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'leaf_size' : [10,15,20,25,30,40,50],
    'n_neighbors' : [2,3,4,5,6,7,8,9,10],
    'p' : [1,2],
    'weights' : ['uniform', 'distance']}
]

Parameters currently in use:

{'algorithm': 'auto', 'leaf_size': 30, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 5, 'p': 2, 'weights': 'uniform'}


In [38]:
#fit the model with the parameter
knn = GridSearchCV(KNeighborsRegressor(), param_grid = param_grid, cv = 10, verbose=True, n_jobs=-1)
# Fit the random search model
best_reg= knn.fit(X_train1_scaled_avg, y_train1_avg)


best_reg.best_params_

Fitting 10 folds for each of 1008 candidates, totalling 10080 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  39 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 10080 out of 10080 | elapsed:    4.0s finished


{'algorithm': 'ball_tree',
 'leaf_size': 10,
 'n_neighbors': 8,
 'p': 1,
 'weights': 'distance'}

In [39]:
def tunedKNR(X_train1, X_test1, y_train1, y_test1):
    knr = KNeighborsRegressor(algorithm = 'ball_tree', leaf_size = 15, n_neighbors = 8, p = 1, weights = 'distance')
    knr.fit(X_train1, y_train1)
    y_pred = knr.predict(X_test1)
    print("R2 score: " + str(knr.score(X_test1, y_test1)))
    print("Explained variance: " + str(explained_variance_score(y_test1, y_pred)))
    print("Max error: " + str(max_error(y_test1, y_pred)))
    print("Mean absolute error: " + str(mean_absolute_error(y_test1, y_pred)))
    print("Root mean squared error: " + str(math.sqrt(mean_squared_error(y_test1, y_pred))))
    
    return y_pred

In [40]:
tunedKNR(X_train1_scaled, X_test1_scaled, y_train1, y_test1)

R2 score: 0.869169583493462
Explained variance: 0.8708839937974988
Max error: 129.51478583367066
Mean absolute error: 25.267601077971314
Root mean squared error: 40.3167635169427


array([246.99910037,  23.04522103, 104.93539901,  42.75259428,
        20.32654471,  20.04776251,  20.01580697,  28.0693336 ,
        77.28381749,  39.98229911,  42.8742181 ,  68.76990685,
        67.0038312 ,  17.15390789,  33.00948784,  26.33794865,
       227.41840047,  52.33600985, 375.95829329,  45.97717575,
        19.40167632,  48.28718916,  73.70640047,  30.02283555,
        33.56784483, 105.28577498,  16.3335091 ,  72.2240916 ,
       355.27913474,  58.14243973,  17.15390789,  31.24012207,
        42.44004244,  89.00164006,  41.13817992, 269.51478583,
       276.5570052 ,  53.27326785,  68.07453109, 125.14574543,
        17.5483679 ,  55.14754485, 115.25344551,  30.15433801,
       201.99499632,  71.50203997,  21.46979875,  29.57195553,
       493.03135846,  24.27939194,  15.33378089])

In [18]:
#run our final prediction
knr_tuned = KNeighborsRegressor(algorithm = 'auto', leaf_size = 15, n_neighbors = 6, p = 1, weights = 'distance')
knr_tuned.fit(X_train_scaled, y_train)
y_pred = knr_tuned.predict(X_test_scaled)
saveFile(y_pred, "kNeighbour_submission.csv")

In [41]:
tunedKNR(X_train1_scaled_avg, X_test1_scaled_avg, y_train1_avg, y_test1_avg)

R2 score: 0.8505982054646073
Explained variance: 0.8543825959411191
Max error: 184.00594772188225
Mean absolute error: 26.7072571692877
Root mean squared error: 43.0833243280906


array([257.46259783,  28.05454947, 121.56744987,  42.25534251,
        24.50124471,  22.13792332,  22.77544447,  26.62084118,
        81.75741676,  38.20191401,  43.24520578,  86.45552468,
        66.06523668,  18.59037994,  34.73369346,  28.29005983,
       240.13983106,  52.95914704, 350.95532068,  50.42762304,
        21.29235381,  51.56899301,  77.94285838,  31.16763951,
        35.81254068, 119.70455758,  16.07392869,  76.13386116,
       362.30727555,  59.49488616,  18.59037994,  38.50225457,
        53.20726298, 120.95477025,  39.24809048, 324.00594772,
       215.10042153,  48.5524122 ,  87.0947623 , 127.18877519,
        17.71304968,  55.53528038, 100.028145  ,  36.41845515,
       184.5067603 ,  69.65792108,  22.55093334,  30.10179302,
       475.11979077,  27.71817115,  15.2120528 ])

In [42]:
knr_tuned = KNeighborsRegressor(algorithm = 'auto', leaf_size = 15, n_neighbors = 6, p = 1, weights = 'distance')
knr_tuned.fit(X_train3_scaled, y_train)
y_pred = knr_tuned.predict(X_test3_scaled)
saveFile(y_pred, "kNeighbour_submission_avg.csv")

In [None]:
algorithm = 'ball_tree', leaf_size = 15, n_neighbors = 8, p = 1, weights = 'distance'