# K-Nearest Neighbour Classifier

In [133]:
import pandas as pd
import seaborn as sns
# Bind the pyplot interface (not the top-level matplotlib package) to `plt`,
# so that conventional calls such as plt.subplots() / plt.plot() resolve.
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

In [134]:
from sklearn.datasets import make_classification

# Synthetic binary classification problem with three features.
# n_redundant=1: one feature is generated as a linear combination of the
# informative features, so it carries no independent signal.
X, Y = make_classification(
    n_samples=1000,
    n_features=3,
    n_redundant=1,
    n_classes=2,
    random_state=42,
)

In [135]:
from sklearn.model_selection import train_test_split

In [136]:
# Hold out 25% of the samples for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=42,
)

In [137]:
# Show the training features as a table (columns are the feature indices).
pd.DataFrame(data=X_train)

Unnamed: 0,0,1,2
0,0.135299,0.226223,-0.118902
1,1.399763,1.338486,-0.819910
2,0.442557,1.336680,-0.633230
3,1.532297,-1.345665,0.253287
4,-1.437485,-2.214393,1.185851
...,...,...,...
745,2.772119,-0.560383,-0.309062
746,-0.453555,0.763122,-0.224332
747,-1.202371,-0.968245,0.629982
748,0.244385,-0.929696,0.333162


In [138]:
from sklearn.neighbors import KNeighborsClassifier

In [139]:
# Baseline model: 5 nearest neighbours, letting scikit-learn choose the
# neighbour-search algorithm.
classifier = KNeighborsClassifier(
    algorithm='auto',
    n_neighbors=5,
)
classifier.fit(X=X_train, y=Y_train)

In [140]:
# Predict class labels for the held-out test features.
y_pred = classifier.predict(X=X_test)

In [141]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [142]:
# Evaluate the baseline classifier on the test split.
print(
    "Confusion Matrix\n",
    confusion_matrix(y_true=Y_test, y_pred=y_pred),
)
print(
    "Accuracy",
    accuracy_score(y_true=Y_test, y_pred=y_pred),
)
print(
    "Report\n",
    classification_report(y_true=Y_test, y_pred=y_pred),
)

Confusion Matrix
 [[110  12]
 [ 12 116]]
Accuracy 0.904
Report
               precision    recall  f1-score   support

           0       0.90      0.90      0.90       122
           1       0.91      0.91      0.91       128

    accuracy                           0.90       250
   macro avg       0.90      0.90      0.90       250
weighted avg       0.90      0.90      0.90       250



## Hyperparameter Tuning with GridSearchCV

In [143]:
# Estimator to tune, plus the candidate values for each hyperparameter.
model = KNeighborsClassifier()

n_neighbors = [7, 11, 17]                       # neighbourhood sizes to try
algorithm = ['ball_tree', 'kd_tree', 'brute']   # neighbour-search strategies
p = [1, 2]                                      # Minkowski power: 1 = Manhattan, 2 = Euclidean
weights = ["uniform", "distance"]               # equal vote vs. inverse-distance vote

# Search space handed to the grid search: parameter name -> candidate list.
params = {
    "n_neighbors": n_neighbors,
    "algorithm": algorithm,
    "p": p,
    "weights": weights,
}

In [144]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# 5-fold stratified cross-validation over the training split only; every
# parameter combination in `params` is scored by accuracy, using all cores.
cv = StratifiedKFold(n_splits=5)
grid = GridSearchCV(
    estimator=model,
    param_grid=params,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,
)
grid.fit(X_train, Y_train)

# Parameter combination that achieved the best cross-validated score.
grid.best_params_

In [145]:
# Mean cross-validated accuracy of the best parameter combination.
grid.best_score_

np.float64(0.9146666666666666)

In [146]:
# Predict on the test split with the best estimator found by the search.
y_pred = grid.best_estimator_.predict(X=X_test)

In [147]:
# Compute the test-set metrics for the tuned classifier first...
score = accuracy_score(y_true=Y_test, y_pred=y_pred)
cm = confusion_matrix(y_true=Y_test, y_pred=y_pred)
cr = classification_report(y_true=Y_test, y_pred=y_pred)

# ...then print them: accuracy (raw and as a percentage), confusion matrix,
# and the per-class report.
print('Score: ', score)
print('Score in pcnt:', score * 100, '%')
print('Confusion Matrix: \n', cm)
print('Report:\n', cr)

Score:  0.924
Score in pcnt: 92.4 %
Confusion Matrix: 
 [[114   8]
 [ 11 117]]
Report:
               precision    recall  f1-score   support

           0       0.91      0.93      0.92       122
           1       0.94      0.91      0.92       128

    accuracy                           0.92       250
   macro avg       0.92      0.92      0.92       250
weighted avg       0.92      0.92      0.92       250



# K-Nearest Neighbour Regressor

In [148]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.datasets import make_regression

In [149]:
# Synthetic regression problem: 1000 samples, 3 features, Gaussian noise with
# standard deviation 10 on the target. Note that this rebinds X and Y, which
# previously held the classification data.
X, Y = make_regression(
    n_samples=1000,
    n_features=3,
    noise=10,
    random_state=42,
)

In [150]:
# Preview the first five rows of the regression features.
pd.DataFrame(data=X).head(n=5)

Unnamed: 0,0,1,2
0,-0.18912,-1.330314,0.92165
1,-0.653329,-0.474945,1.765454
2,-0.224856,0.076852,-0.650003
3,0.570599,-0.662624,-0.763259
4,0.32788,-0.125454,0.085893


In [151]:
from sklearn.model_selection import train_test_split

In [152]:
# Hold out 30% of the regression samples for testing (fixed seed for
# reproducibility). This rebinds the split variables used by the classifier.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.30,
    random_state=42,
)

In [153]:
# Baseline regressor: 6 nearest neighbours, automatic neighbour-search
# algorithm, neighbour queries parallelised across all cores.
knn_regressor = KNeighborsRegressor(
    algorithm='auto',
    n_neighbors=6,
    n_jobs=-1,
)
knn_regressor.fit(X=X_train, y=Y_train)

In [154]:
# Predict target values for the held-out test features.
y_pred = knn_regressor.predict(X=X_test)

In [155]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [156]:
# Test-set error metrics for the baseline regressor.
print("MAE:", mean_absolute_error(y_true=Y_test, y_pred=y_pred))
print("MSE:", mean_squared_error(y_true=Y_test, y_pred=y_pred))
print("R2 Score:", r2_score(y_true=Y_test, y_pred=y_pred))

MAE: 17.346438446011906
MSE: 553.1913759753946
R2 Score: 0.9665725004194289


## Hyperparameter Tuning using GridSearchCV

In [160]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

In [182]:
# Estimator to tune, plus the candidate values for each hyperparameter.
model = KNeighborsRegressor()

n_neighbors = [8, 10]                           # neighbourhood sizes to try
weights = ['uniform', 'distance']               # plain mean vs. inverse-distance weighted mean
p = [1, 2]                                      # Minkowski power: 1 = Manhattan, 2 = Euclidean
algorithm = ['ball_tree', 'kd_tree', 'brute']   # neighbour-search strategies

# Search space handed to the grid search: parameter name -> candidate list.
params = {
    "n_neighbors": n_neighbors,
    "weights": weights,
    "p": p,
    "algorithm": algorithm,
}

# Plain (unshuffled) 10-fold cross-validation.
cv = KFold(n_splits=10)

In [183]:
# Exhaustive search over `params`, scoring each combination by
# cross-validated R^2 on the training split, using all cores.
knn_grid = GridSearchCV(
    estimator=model,
    param_grid=params,
    scoring='r2',
    cv=cv,
    n_jobs=-1,
)
knn_grid.fit(X_train, Y_train)

In [184]:
# Parameter combination that achieved the best cross-validated R^2.
knn_grid.best_params_

{'algorithm': 'ball_tree', 'n_neighbors': 8, 'p': 2, 'weights': 'distance'}

In [185]:
# Mean cross-validated R^2 of the best parameter combination.
knn_grid.best_score_

np.float64(0.9640721289884528)

In [186]:
# Predict with the search object, which delegates to the refitted best
# estimator.
y_pred = knn_grid.predict(X=X_test)

# Test-set error metrics for the tuned regressor.
print("MAE:", mean_absolute_error(y_true=Y_test, y_pred=y_pred))
print("MSE:", mean_squared_error(y_true=Y_test, y_pred=y_pred))
print("R2 Score:", r2_score(y_true=Y_test, y_pred=y_pred))

MAE: 15.849191304585633
MSE: 490.510953277604
R2 Score: 0.9703600681481305


- **Mean Absolute Error (MAE):** **15.85**  
  → On average, predictions deviate by **15.85** units from actual values.

- **Mean Squared Error (MSE):** **490.51**  
  → The average of the squared prediction errors; because errors are squared, large errors are penalised much more heavily than small ones.

- **R² Score:** **0.9704**  
  → The model explains **97.04% of the variance** in the test-set target. A value close to **1** indicates a strong fit.

### 📌 **Analysis & Next Steps:**
✅ **High R² score (0.97) suggests an excellent fit.**  
📉 **MAE & MSE indicate errors exist—can we reduce them?**  