In [57]:
import numpy as np
import pandas as pd
from sklearn import preprocessing, model_selection, neighbors

df = pd.read_csv('breast_cancer.csv')
df.replace('?', -99999, inplace=True)
df.drop(['id'], inplace=True, axis=1)

X = np.array(df.drop(['class'], axis=1))
y = np.array(df['class'])
#X = preprocessing.scale(X)

X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2)
clf = neighbors.KNeighborsClassifier()
clf.fit(X_train, y_train)
accuracy = clf.score(X_test, y_test)
print(accuracy)

example_measures = np.array([4, 2, 1, 1, 1, 2, 3, 2, 1])
example_measures = example_measures.reshape(1, -1)
example_measures_1 = np.array([[4, 2, 1, 1, 1, 2, 3, 2, 1], [5, 2, 1, 2, 1, 2, 1, 2, 1]])
example_measures_1 = example_measures_1.reshape(len(example_measures_1), -1)
prediction_1 = clf.predict(example_measures_1)
prediction = clf.predict(example_measures)
print(prediction)
print(prediction_1)

0.9857142857142858
[2]
[2 2]


In [58]:
from math import sqrt

plot1 = [1, 3]
plot2 = [3, 5]

euclidean_distance = sqrt((plot1[0]-plot2[0])**2 + (plot1[1]-plot2[1])**2)
print(euclidean_distance)

2.8284271247461903


In [60]:
# Manual K-Nearest Neighbors Implementation
import numpy as np
from collections import Counter

# First, let's check and fix the data types
print("Checking data types and shapes:")
print(f"X_train shape: {X_train.shape}")
print(f"X_train dtype: {X_train.dtype}")
print(f"Sample X_train values: {X_train[0]}")
print(f"y_train shape: {y_train.shape}")
print(f"y_train dtype: {y_train.dtype}")
print(f"Sample y_train values: {y_train[:5]}")

# Convert data to float if needed
X_train_numeric = X_train.astype(float)
X_test_numeric = X_test.astype(float)
y_train_numeric = y_train
y_test_numeric = y_test

print(f"\nAfter conversion:")
print(f"X_train_numeric dtype: {X_train_numeric.dtype}")
print(f"Sample X_train_numeric values: {X_train_numeric[0]}")

class ManualKNN:
    def __init__(self, k=3):
        self.k = k
        
    def fit(self, X_train, y_train):
        """Store the training data"""
        self.X_train = X_train.astype(float)
        self.y_train = y_train
        
    def euclidean_distance(self, point1, point2):
        """Calculate euclidean distance between two points"""
        return np.sqrt(np.sum((point1 - point2) ** 2))
    
    def predict(self, X_test):
        """Predict the class labels for the test data"""
        X_test = X_test.astype(float)
        predictions = []
        
        for test_point in X_test:
            # Calculate distances from test point to all training points
            distances = []
            for i, train_point in enumerate(self.X_train):
                distance = self.euclidean_distance(test_point, train_point)
                distances.append((distance, self.y_train[i]))
            
            # Sort by distance and get k nearest neighbors
            distances.sort(key=lambda x: x[0])
            k_nearest = distances[:self.k]
            
            # Get the labels of k nearest neighbors
            k_nearest_labels = [label for distance, label in k_nearest]
            
            # Vote for the most common class
            most_common = Counter(k_nearest_labels).most_common(1)
            predictions.append(most_common[0][0])
            
        return np.array(predictions)
    
    def score(self, X_test, y_test):
        """Calculate accuracy of predictions"""
        predictions = self.predict(X_test)
        accuracy = np.mean(predictions == y_test)
        return accuracy

# Test the manual KNN implementation
print("\nManual KNN Implementation Results:")
print("=" * 40)

# Create and train the manual KNN classifier
manual_knn = ManualKNN(k=5)
manual_knn.fit(X_train_numeric, y_train_numeric)

# Make predictions
manual_predictions = manual_knn.predict(X_test_numeric)
manual_accuracy = manual_knn.score(X_test_numeric, y_test_numeric)

print(f"Manual KNN Accuracy: {manual_accuracy:.4f}")

# Test with the same examples as before
example_prediction = manual_knn.predict(example_measures)
example_prediction_1 = manual_knn.predict(example_measures_1)

print(f"Manual KNN prediction for single example: {example_prediction}")
print(f"Manual KNN prediction for multiple examples: {example_prediction_1}")

# Compare with sklearn results
print("\nComparison with sklearn:")
print(f"sklearn accuracy: {accuracy:.4f}")
print(f"Manual KNN accuracy: {manual_accuracy:.4f}")
print(f"Difference: {abs(accuracy - manual_accuracy):.4f}")

Checking data types and shapes:
X_train shape: (559, 9)
X_train dtype: object
Sample X_train values: [3 1 1 3 8 '1' 5 8 1]
y_train shape: (559,)
y_train dtype: int64
Sample y_train values: [2 2 4 2 2]

After conversion:
X_train_numeric dtype: float64
Sample X_train_numeric values: [3. 1. 1. 3. 8. 1. 5. 8. 1.]

Manual KNN Implementation Results:
Manual KNN Accuracy: 0.9857
Manual KNN prediction for single example: [2]
Manual KNN prediction for multiple examples: [2 2]

Comparison with sklearn:
sklearn accuracy: 0.9857
Manual KNN accuracy: 0.9857
Difference: 0.0000
Manual KNN Accuracy: 0.9857
Manual KNN prediction for single example: [2]
Manual KNN prediction for multiple examples: [2 2]

Comparison with sklearn:
sklearn accuracy: 0.9857
Manual KNN accuracy: 0.9857
Difference: 0.0000


In [61]:
# Detailed demonstration of how KNN works step by step
print("Step-by-step KNN demonstration:")
print("=" * 50)

# Let's trace through one prediction to show how it works
test_sample = X_test_numeric[0]  # Take the first test sample
print(f"Test sample: {test_sample}")

# Calculate distances to all training points
distances_demo = []
for i, train_point in enumerate(X_train_numeric):
    distance = manual_knn.euclidean_distance(test_sample, train_point)
    distances_demo.append((distance, y_train_numeric[i], i))

# Sort by distance
distances_demo.sort(key=lambda x: x[0])

# Show the 5 nearest neighbors
print(f"\nK={manual_knn.k} nearest neighbors:")
for i in range(manual_knn.k):
    dist, label, idx = distances_demo[i]
    print(f"  Neighbor {i+1}: Distance={dist:.4f}, Label={label}, Training Index={idx}")

# Count votes
k_nearest_labels = [label for distance, label, idx in distances_demo[:manual_knn.k]]
vote_count = Counter(k_nearest_labels)
print(f"\nVote count: {dict(vote_count)}")
prediction = vote_count.most_common(1)[0][0]
print(f"Predicted class: {prediction}")
print(f"Actual class: {y_test_numeric[0]}")

# Show the algorithm components
print(f"\nAlgorithm Summary:")
print(f"1. Calculate distance from test point to all {len(X_train_numeric)} training points")
print(f"2. Sort distances and select {manual_knn.k} nearest neighbors")
print(f"3. Take majority vote among the {manual_knn.k} nearest neighbor labels")
print(f"4. Assign the most common class as the prediction")

Step-by-step KNN demonstration:
Test sample: [3. 1. 1. 1. 2. 1. 1. 1. 1.]

K=5 nearest neighbors:
  Neighbor 1: Distance=0.0000, Label=2, Training Index=27
  Neighbor 2: Distance=0.0000, Label=2, Training Index=48
  Neighbor 3: Distance=0.0000, Label=2, Training Index=70
  Neighbor 4: Distance=0.0000, Label=2, Training Index=169
  Neighbor 5: Distance=0.0000, Label=2, Training Index=251

Vote count: {2: 5}
Predicted class: 2
Actual class: 2

Algorithm Summary:
1. Calculate distance from test point to all 559 training points
2. Sort distances and select 5 nearest neighbors
3. Take majority vote among the 5 nearest neighbor labels
4. Assign the most common class as the prediction


In [62]:
# Testing different k values to see their impact
print("Testing different k values:")
print("=" * 30)

k_values = [1, 3, 5, 7, 9, 11, 15, 20]
accuracies = []

for k in k_values:
    knn = ManualKNN(k=k)
    knn.fit(X_train_numeric, y_train_numeric)
    accuracy = knn.score(X_test_numeric, y_test_numeric)
    accuracies.append(accuracy)
    print(f"k={k:2d}: Accuracy = {accuracy:.4f}")

# Find the best k
best_k_idx = np.argmax(accuracies)
best_k = k_values[best_k_idx]
best_accuracy = accuracies[best_k_idx]

print(f"\nBest k value: {best_k} with accuracy: {best_accuracy:.4f}")

# Show some theory about choosing k
print(f"\nChoosing k:")
print(f"- Small k (like 1): More sensitive to noise, may overfit")
print(f"- Large k: Smoother decision boundary, may underfit")
print(f"- Odd k values help avoid ties in binary classification")
print(f"- Rule of thumb: k = sqrt(n_samples), here sqrt({len(X_train_numeric)}) ≈ {int(np.sqrt(len(X_train_numeric)))}")

Testing different k values:
k= 1: Accuracy = 1.0000
k= 3: Accuracy = 0.9929
k= 5: Accuracy = 0.9857
k= 7: Accuracy = 0.9857
k= 9: Accuracy = 0.9786
k=11: Accuracy = 0.9714
k=15: Accuracy = 0.9786
k=20: Accuracy = 0.9786

Best k value: 1 with accuracy: 1.0000

Choosing k:
- Small k (like 1): More sensitive to noise, may overfit
- Large k: Smoother decision boundary, may underfit
- Odd k values help avoid ties in binary classification
- Rule of thumb: k = sqrt(n_samples), here sqrt(559) ≈ 23
