# Unit Testing Model

In [None]:
import pandas as pd
import os

xlsx_path = os.path.join('..', 'dataset', 'dataset_ur3_cobotops.xlsx')
csv_path = os.path.join('..', 'dataset', 'dataset_ur3_cobotops.csv')
df = pd.read_excel(xlsx_path)
# df.to_csv(csv_path, index=False)

In [None]:
import time
import numpy as np

arr = np.array([i for i in range(1000000)])
start = time.time()
np.sum(arr)
print("NumPy sum took:", time.time() - start)

start = time.time()
sum(arr)
print("Python sum took:", time.time() - start)

start = time.time()
arr.sum()
print("NumPy sum (method) took:", time.time() - start)

start = time.time()
for i in range(1000000):
    arr[i]
print("Python for loop took:", time.time() - start)


NumPy sum took: 0.0010037422180175781
Python sum took: 0.04256606101989746
NumPy sum (method) took: 0.0
Python for loop took: 0.08015179634094238


## KNNClassifier

In [4]:
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, cross_val_score

# Synthesize data using sklearn
X, y = make_classification(n_samples=100, n_features=2, n_informative=2, n_redundant=0, random_state=42)

# Hold out validation
X_train_hold, X_test_hold, y_train_hold, y_test_hold = train_test_split(X, y, test_size=0.2, random_state=42)
knn_hold = KNeighborsClassifier(n_neighbors=3)
knn_hold.fit(X_train_hold, y_train_hold)
hold_out_score = knn_hold.score(X_test_hold, y_test_hold)
print("Hold out validation accuracy:", hold_out_score)

# K-Fold cross validation
knn_kfold = KNeighborsClassifier(n_neighbors=3)
cv_scores = cross_val_score(knn_kfold, X, y, cv=5)
print("K-Fold cross validation scores:", cv_scores)
print("Mean CV accuracy:", cv_scores.mean())

Hold out validation accuracy: 0.95
K-Fold cross validation scores: [0.95 0.95 1.   1.   1.  ]
Mean CV accuracy: 0.9800000000000001


In [2]:
# Import your custom KNN implementation
import sys
sys.path.append('./supervised-learning')
from knn import KNNClassifier

import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import time

# Generate test data
X, y = make_classification(n_samples=200, n_features=4, n_informative=3, 
                          n_redundant=1, n_classes=3, random_state=42)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

print("Comparing Custom KNN vs Sklearn KNN")
print("="*50)

# Test different k values
k_values = [1, 3, 5, 7]
distance_methods = ["euclidean", "manhattan", "minkowski"]

for k in k_values:
    print(f"\nTesting with k={k}")
    print("-" * 30)
    
    for distance in distance_methods:
        print(f"\nDistance method: {distance}")
        
        # Your custom KNN
        start_time = time.time()
        custom_knn = KNNClassifier(k=k, distance_method=distance)
        custom_knn.fit(X_train, y_train)
        
        # Note: Your predict method needs modification to handle multiple test samples
        custom_predictions = []
        for test_sample in X_test:
            pred = custom_knn.predict(test_sample)
            # Your current implementation returns all k neighbors, we need the most common
            unique, counts = np.unique(pred, return_counts=True)
            most_common = unique[np.argmax(counts)]
            custom_predictions.append(most_common)
        
        custom_time = time.time() - start_time
        custom_accuracy = accuracy_score(y_test, custom_predictions)
        
        # Sklearn KNN
        start_time = time.time()
        if distance == "euclidean":
            sklearn_knn = KNeighborsClassifier(n_neighbors=k, metric='euclidean')
        elif distance == "manhattan":
            sklearn_knn = KNeighborsClassifier(n_neighbors=k, metric='manhattan')
        elif distance == "minkowski":
            sklearn_knn = KNeighborsClassifier(n_neighbors=k, metric='minkowski')

        sklearn_knn.fit(X_train, y_train)
        sklearn_predictions = sklearn_knn.predict(X_test)
        sklearn_time = time.time() - start_time
        sklearn_accuracy = accuracy_score(y_test, sklearn_predictions)
        
        # Compare results
        print(f"Custom KNN    - Accuracy: {custom_accuracy:.4f}, Time: {custom_time:.4f}s")
        print(f"Sklearn KNN  - Accuracy: {sklearn_accuracy:.4f}, Time: {sklearn_time:.4f}s")
        print(f"Accuracy difference: {abs(custom_accuracy - sklearn_accuracy):.4f}")

# Detailed comparison for k=3, minkowski
print("\n" + "="*50)
print("DETAILED COMPARISON (k=3, minkowski)")
print("="*50)

custom_knn = KNNClassifier(k=3, distance_method="minkowski")
custom_knn.fit(X_train, y_train)

sklearn_knn = KNeighborsClassifier(n_neighbors=3, metric='minkowski')
sklearn_knn.fit(X_train, y_train)

# Get predictions for first 10 test samples
print("\nFirst 10 predictions comparison:")
print("Sample | Custom | Sklearn | Actual | Match")
print("-" * 40)

for i in range(min(10, len(X_test))):
    test_sample = X_test[i]
    actual = y_test[i]
    
    # Custom prediction
    # custom_pred = custom_knn.predict(test_sample)
    custom_pred = custom_knn.predict(test_sample)
    unique, counts = np.unique(custom_pred, return_counts=True)
    custom_result = unique[np.argmax(counts)]
    
    # Sklearn prediction00
    sklearn_result = sklearn_knn.predict([test_sample])[0]
    
    match = "✓" if custom_result == sklearn_result else "✗"
    print(f"{i:6d} | {custom_result:6d} | {sklearn_result:7d} | {actual:6d} | {match:5s}")

Comparing Custom KNN vs Sklearn KNN

Testing with k=1
------------------------------

Distance method: euclidean
Custom KNN    - Accuracy: 0.8000, Time: 0.0311s
Sklearn KNN  - Accuracy: 0.8000, Time: 0.0025s
Accuracy difference: 0.0000

Distance method: manhattan
Custom KNN    - Accuracy: 0.8000, Time: 0.0185s
Sklearn KNN  - Accuracy: 0.8000, Time: 0.0025s
Accuracy difference: 0.0000

Distance method: minkowski
Custom KNN    - Accuracy: 0.8000, Time: 0.0175s
Sklearn KNN  - Accuracy: 0.8000, Time: 0.0025s
Accuracy difference: 0.0000

Testing with k=3
------------------------------

Distance method: euclidean
Custom KNN    - Accuracy: 0.8167, Time: 0.0361s
Sklearn KNN  - Accuracy: 0.8167, Time: 0.0045s
Accuracy difference: 0.0000

Distance method: manhattan
Custom KNN    - Accuracy: 0.7667, Time: 0.0220s
Sklearn KNN  - Accuracy: 0.7667, Time: 0.0045s
Accuracy difference: 0.0000

Distance method: minkowski
Custom KNN    - Accuracy: 0.8167, Time: 0.0366s
Sklearn KNN  - Accuracy: 0.8167, Ti