In [24]:
import joblib

X_train = joblib.load("/Users/venkatchandan/Desktop/ML_Projects/CosmicClassifier/pklfiles/X_train.pkl")
X_test  = joblib.load("/Users/venkatchandan/Desktop/ML_Projects/CosmicClassifier/pklfiles/X_test.pkl")
X_valid = joblib.load("/Users/venkatchandan/Desktop/ML_Projects/CosmicClassifier/pklfiles/X_valid.pkl")
y_train = joblib.load("/Users/venkatchandan/Desktop/ML_Projects/CosmicClassifier/pklfiles/y_train.pkl")
y_test  = joblib.load("/Users/venkatchandan/Desktop/ML_Projects/CosmicClassifier/pklfiles/y_test.pkl")
y_valid  = joblib.load("/Users/venkatchandan/Desktop/ML_Projects/CosmicClassifier/pklfiles/y_valid.pkl")


# K-Nearest- Neighbours

### From Scratch
We are Looking to code the Library KNN from scratch, get an Intutive feeling and then use
Math:
$$ i​Nk​(x)y^​(x)​=∥x−xi​∥2 $$
$$ Nk​(x)​=k indices of smallest di$$
$$ y^​(x)​=⎩⎨ ⎧​mode{yi​:i∈Nk​(x)}argmaxc​∑i∈Nk​(x)​di​+ε1{yi​=c}​​(unweighted)(weighted)​​ $$


In [None]:
import numpy as np
from collections import Counter

import numpy as np
from collections import Counter

class KNN:
    def __init__(self, k=5, weighted=False, eps=1e-9):
        self.k = k
        self.weighted = weighted
        self.eps = eps

    def fit(self, X, y):
        self.X = np.asarray(X, dtype=float)
        self.y = np.asarray(y, dtype=int)

    def dist(self, A, b):
        diff = A - b
        return np.sqrt((diff * diff).sum(axis=1))  # Euclidean

    def predict(self, Xq):
        Xq = np.asarray(Xq, dtype=float)
        preds = []
        for i in Xq:
            d = self.dist(self.X, i)                
            idx = np.argpartition(d, self.k)[:self.k]
            if not self.weighted:
                vote = Counter(self.y[idx]).most_common(1)[0][0]
            else:
                w = 1.0 / (d[idx] + self.eps)
                scores = {}
                for cls in (0, 1):
                    scores[cls] = w[self.y[idx] == cls].sum()
                vote = 1 if scores[1] >= scores[0] else 0
            preds.append(vote)
        return np.array(preds)
    



array([0, 0, 1, ..., 1, 0, 1])

#### Without Weight


In [None]:
scratch = KNN(k=5, weighted=False)
scratch.fit(X_train, y_train)
y_pred_s = scratch.predict(X_test)
y_pred_s

In [33]:
from sklearn.metrics import classification_report
print(classification_report(y_pred_s,y_test))

              precision    recall  f1-score   support

           0       0.75      0.72      0.73      1413
           1       0.84      0.86      0.85      2391

    accuracy                           0.81      3804
   macro avg       0.79      0.79      0.79      3804
weighted avg       0.80      0.81      0.81      3804



#### With Weight

In [32]:
scratch_weighted = KNN(k = 5, weighted= True)
scratch_weighted.fit(X_train,y_train)
y_pred_sw = scratch_weighted.predict(X_test)
y_pred_sw

array([0, 0, 1, ..., 1, 0, 1])

In [34]:
from sklearn.metrics import classification_report
print(classification_report(y_pred_sw,y_test))

              precision    recall  f1-score   support

           0       0.75      0.72      0.74      1414
           1       0.84      0.86      0.85      2390

    accuracy                           0.81      3804
   macro avg       0.80      0.79      0.79      3804
weighted avg       0.81      0.81      0.81      3804



#### Using Libraries


In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report
knn_1 = KNeighborsClassifier(n_neighbors= 1)
knn_1.fit(X_train,y_train)
y_pred_1 = knn_1.predict(X_test)
print(classification_report(y_pred_1,y_test))



              precision    recall  f1-score   support

           0       0.70      0.74      0.72      1284
           1       0.87      0.84      0.85      2520

    accuracy                           0.81      3804
   macro avg       0.78      0.79      0.79      3804
weighted avg       0.81      0.81      0.81      3804



In [None]:
knn_2 = KNeighborsClassifier(n_neighbors= 3)
knn_2.fit(X_train,y_train)
y_pred_2 = knn_2.predict(X_test)
print(classification_report(y_pred_2,y_test))

              precision    recall  f1-score   support

           0       0.74      0.70      0.72      1442
           1       0.82      0.85      0.84      2362

    accuracy                           0.79      3804
   macro avg       0.78      0.78      0.78      3804
weighted avg       0.79      0.79      0.79      3804



In [None]:
knn_3 = KNeighborsClassifier(n_neighbors= 5)
knn_3.fit(X_train,y_train)
y_pred_3 = knn_3.predict(X_test)
print(classification_report(y_pred_3,y_test))

              precision    recall  f1-score   support

           0       0.75      0.72      0.73      1413
           1       0.84      0.86      0.85      2391

    accuracy                           0.81      3804
   macro avg       0.79      0.79      0.79      3804
weighted avg       0.80      0.81      0.81      3804



In [None]:
knn_4 = KNeighborsClassifier(n_neighbors= 7)
knn_4.fit(X_train,y_train)
y_pred_4 = knn_4.predict(X_test)
print(classification_report(y_pred_4,y_test))

              precision    recall  f1-score   support

           0       0.76      0.74      0.75      1415
           1       0.85      0.87      0.86      2389

    accuracy                           0.82      3804
   macro avg       0.81      0.80      0.80      3804
weighted avg       0.82      0.82      0.82      3804



#### FInally using the Ultimate usage of GridSearch CV

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, roc_auc_score

In [None]:
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsClassifier())
])

param_grid = {
    'knn__n_neighbors': [1,3,5,7,9,11,15,21,31,41,51],
    'knn__weights': ['uniform', 'distance'],     # distance = weighted kNN
    'knn__metric': ['euclidean', 'manhattan']    # try L1 vs L2
}

gs = GridSearchCV(
    pipe, param_grid,
    scoring='f1',        # or 'roc_auc', 'balanced_accuracy'
    cv=5, n_jobs=-1
)
gs.fit(X_train, y_train)

print("Best params:", gs.best_params_)
y_pred = gs.predict(X_test)
print(classification_report(y_test, y_pred, digits=3))

# Optional probability-style score for ROC-AUC (vote fraction)
if hasattr(gs.best_estimator_['knn'], "predict_proba"):
    y_score = gs.predict_proba(X_test)[:,1]
else:
    # fallback: use kneighbors distances to derive a score (less standard)
    y_score = None

if y_score is not None:
    print("ROC-AUC:", roc_auc_score(y_test, y_score))

Best params: {'knn__metric': 'manhattan', 'knn__n_neighbors': 41, 'knn__weights': 'distance'}
              precision    recall  f1-score   support

           0      0.803     0.751     0.776      1363
           1      0.866     0.897     0.881      2441

    accuracy                          0.845      3804
   macro avg      0.834     0.824     0.829      3804
weighted avg      0.843     0.845     0.843      3804

ROC-AUC: 0.9033754192486332
