In [7]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import numpy.random as nr
import math
import pickle

from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split, RepeatedKFold, GridSearchCV, StratifiedKFold, RandomizedSearchCV

from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

In [2]:
def mape(actual, pred):
    """Return the mean absolute percentage error of ``pred`` against ``actual``.

    Parameters
    ----------
    actual : array-like
        Ground-truth values; each one is the denominator of its own
        relative error.
    pred : array-like
        Predicted values with the same number of elements as ``actual``
        (a scalar is broadcast against ``actual``).

    Returns
    -------
    float
        ``mean(|(actual - pred) / actual|) * 100``, i.e. a percentage.

    Notes
    -----
    Singleton dimensions are dropped before subtracting, so a column vector
    of shape (n, 1) is compared element-wise with a 1-D array of shape (n,)
    instead of silently broadcasting to an (n, n) matrix.

    A zero in ``actual`` is still divided by as-is, so the result is
    ``inf``/``nan`` in that case.
    """
    # Float conversion keeps the division well-defined for integer inputs;
    # squeeze aligns (n, 1) and (n,) shaped inputs.
    actual = np.squeeze(np.asarray(actual, dtype=float))
    pred = np.squeeze(np.asarray(pred, dtype=float))
    return np.mean(np.abs((actual - pred) / actual)) * 100

In [3]:
%matplotlib inline
train_data = pd.read_csv('DataMiningProject/train.csv')
train_data.head(20)

Unnamed: 0,id,breath_id,R,C,time_step,u_in,u_out,pressure
0,1,1,20,50,0.0,0.083334,0,5.837492
1,2,1,20,50,0.033652,18.383041,0,5.907794
2,3,1,20,50,0.067514,22.509278,0,7.876254
3,4,1,20,50,0.101542,22.808822,0,11.742872
4,5,1,20,50,0.135756,25.35585,0,12.234987
5,6,1,20,50,0.169698,27.259866,0,12.867706
6,7,1,20,50,0.203708,27.127486,0,14.695562
7,8,1,20,50,0.237723,26.807732,0,15.890699
8,9,1,20,50,0.271776,27.864715,0,15.539188
9,10,1,20,50,0.305732,28.313036,0,15.750094


In [4]:
# Read the test CSV into a DataFrame and preview its first rows.
test_csv_path = 'DataMiningProject/test.csv'
test_data = pd.read_csv(test_csv_path)
test_data.head(20)

Unnamed: 0,id,breath_id,R,C,time_step,u_in,u_out
0,1,0,5,20,0.0,0.0,0
1,2,0,5,20,0.031904,7.515046,0
2,3,0,5,20,0.063827,14.651675,0
3,4,0,5,20,0.095751,21.23061,0
4,5,0,5,20,0.127644,26.320956,0
5,6,0,5,20,0.159557,30.486938,0
6,7,0,5,20,0.191471,33.54595,0
7,8,0,5,20,0.223402,35.7176,0
8,9,0,5,20,0.255337,36.971061,0
9,10,0,5,20,0.28723,37.542219,0


In [5]:
# Build the modelling frame as a new object without the 'id', 'breath_id'
# and 'time_step' columns; train_data itself is left untouched.
data_train = train_data.drop(columns=['id', 'breath_id', 'time_step'])

In [6]:
# Separate the regression target ('pressure') from the predictor columns.
y = data_train['pressure']
X = data_train.drop(columns=['pressure'])

In [21]:
# Hold out 30% of the rows for evaluation; the fixed random_state makes the
# split repeatable.
split_parts = train_test_split(X, y, test_size=0.3, random_state=6)
X_train, X_test, y_train, y_test = split_parts

# Report the shape of each of the four pieces.
print(
    'Dimension of X_train:', X_train.shape,
    '\nDimension of X_test:', X_test.shape,
    '\nDimension of y_train:', y_train.shape,
    '\nDimension of y_test:', y_test.shape,
)

Dimension of X_train: (4225200, 4) 
Dimension of X_test: (1810800, 4) 
Dimension of y_train: (4225200,) 
Dimension of y_test: (1810800,)


In [26]:
# Keep only the first 80000 rows (by position) of the training split.
X_train = X_train.iloc[:80000]
y_train = y_train.iloc[:80000]

# KNN Regressor


In [16]:
%%time
KNN_model = GridSearchCV(
    estimator = KNeighborsRegressor(),
    param_grid = {
        'leaf_size' : list(range(1,25)),
        'n_neighbors' : list(range(30, 50)),
        'p' : [1,2]
    },
    scoring = 'neg_mean_absolute_error',
    n_jobs = 1,
    cv = 3
)

KNN_result = KNN_model.fit(X_train, y_train)
# summarize result
print('Best Score: %s' % KNN_result.best_score_)
print('Best Hyperparameters: %s' % KNN_result.best_params_)

Best Score: -3.2877206212204797
Best Hyperparameters: {'leaf_size': 5, 'n_neighbors': 37, 'p': 1}
Wall time: 4h 5min 24s


In [23]:
%%time
KNN_predict = KNN_model.predict(X_test)

Wall time: 8min 53s


In [24]:
# Score the held-out predictions with several regression metrics.
# Error magnitudes first...
KNN_MAE = mean_absolute_error(y_test, KNN_predict)
KNN_MSE = mean_squared_error(y_test, KNN_predict)
KNN_RMSE = np.sqrt(KNN_MSE)
# ...then the relative / goodness-of-fit measures.
KNN_MAPE = mape(y_test, KNN_predict)
KNN_R2 = r2_score(y_test, KNN_predict)

In [19]:
# Save the fitted grid-search object to disk with pickle.
filename = 'KNN_model.sav'
# The context manager closes (and therefore flushes) the file even if
# pickling raises, instead of leaving the handle open.
with open(filename, 'wb') as model_file:
    pickle.dump(KNN_model, model_file)


In [25]:
# Collect the evaluation metrics into a one-row summary table,
# keyed by the column label each metric is shown under.
knn_metric_row = {
    'R Squared': KNN_R2,
    'Mean Absolute Percentage Error': KNN_MAPE,
    'Mean Absolute Error': KNN_MAE,
    'Mean Squared Error': KNN_MSE,
    'Root Mean Squared Error': KNN_RMSE,
}
KNN_results = pd.DataFrame(knn_metric_row, index=['K Nearest Neighbour Regressor'])

KNN_results

Unnamed: 0,R Squared,Mean Absolute Percentage Error,Mean Absolute Error,Mean Squared Error,Root Mean Squared Error
K Nearest Neighbour Regressor,0.498907,38.749683,3.297574,32.98438,5.743203
