# AI Assignment 1: KNN Classification from Scratch
## Group: Group01_Assignment1

This notebook implements K-Nearest Neighbors (KNN) classification from scratch for:
1. **Task 1**: Binary Classification on Breast Cancer Dataset
2. **Task 2**: Multi-class Classification on CIFAR-10 Dataset

**Note**: No scikit-learn or PyTorch functions are used for the models — everything is implemented from scratch with NumPy.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import pickle
import os
import tarfile
import warnings
# Suppress every warning category for the rest of the session so that library
# notices do not clutter the cell outputs.
warnings.filterwarnings(action='ignore')

print("Libraries imported successfully!")

---
# Part 1: Distance Functions Implementation (From Scratch)
We implement all 5 distance metrics from scratch without using sklearn.

In [None]:
class DistanceMetrics:
    """Distance metrics between two 1-D feature vectors, implemented with NumPy only.

    Every public method takes two equal-length array-likes and returns a scalar
    for which a smaller value means "more similar", so they can be used
    interchangeably as the distance function of a nearest-neighbour search.
    """

    @staticmethod
    def _as_numeric(x):
        """Return ``x`` as an ndarray that is safe to subtract and multiply.

        Floating (and complex) inputs are returned with their dtype unchanged so
        existing results keep their precision. Integer and boolean inputs are
        promoted to float64: unsigned types such as raw uint8 pixel values would
        otherwise wrap around in ``x1 - x2`` and silently yield wrong distances.
        """
        arr = np.asarray(x)
        if not np.issubdtype(arr.dtype, np.inexact):
            arr = arr.astype(np.float64)
        return arr

    @staticmethod
    def euclidean_distance(x1, x2):
        """Euclidean Distance: sqrt(sum((x1 - x2)^2))"""
        diff = DistanceMetrics._as_numeric(x1) - DistanceMetrics._as_numeric(x2)
        return np.sqrt(np.sum(diff ** 2))

    @staticmethod
    def manhattan_distance(x1, x2):
        """Manhattan Distance: sum(|x1 - x2|)"""
        diff = DistanceMetrics._as_numeric(x1) - DistanceMetrics._as_numeric(x2)
        return np.sum(np.abs(diff))

    @staticmethod
    def minkowski_distance(x1, x2, p=3):
        """Minkowski Distance: (sum(|x1 - x2|^p))^(1/p)

        ``p`` is the order of the norm (p=1 is Manhattan, p=2 is Euclidean).
        """
        diff = DistanceMetrics._as_numeric(x1) - DistanceMetrics._as_numeric(x2)
        return np.power(np.sum(np.power(np.abs(diff), p)), 1 / p)

    @staticmethod
    def cosine_similarity(x1, x2):
        """Cosine distance: 1 - (x1.x2)/(||x1|| * ||x2||)

        Despite the method name, the returned value is a *distance*
        (1 - similarity) so that smaller values mean more similar.
        If either vector has zero norm the similarity is undefined and the
        distance 1.0 is returned.
        """
        x1 = DistanceMetrics._as_numeric(x1)
        x2 = DistanceMetrics._as_numeric(x2)
        dot_product = np.dot(x1, x2)
        norm_x1 = np.sqrt(np.sum(x1 ** 2))
        norm_x2 = np.sqrt(np.sum(x2 ** 2))
        if norm_x1 == 0 or norm_x2 == 0:
            return 1.0
        similarity = dot_product / (norm_x1 * norm_x2)
        return 1 - similarity  # Convert to distance

    @staticmethod
    def hamming_distance(x1, x2, threshold=0.5):
        """Hamming Distance: proportion of differing components

        Continuous features are binarised first: a component counts as 1 when it
        is strictly greater than ``threshold`` and as 0 otherwise. The default of
        0.5 is the midpoint for features scaled to [0, 1].
        """
        x1 = np.asarray(x1)
        x2 = np.asarray(x2)
        x1_binary = (x1 > threshold).astype(int)
        x2_binary = (x2 > threshold).astype(int)
        return np.sum(x1_binary != x2_binary) / len(x1)

# Confirm the cell ran, then list the formula behind each metric (one per line).
print("Distance metrics implemented successfully!")
print(
    "\nFormulas:",
    "1. Euclidean: d(x,y) = sqrt(Σ(xi - yi)²)",
    "2. Manhattan: d(x,y) = Σ|xi - yi|",
    "3. Minkowski: d(x,y) = (Σ|xi - yi|^p)^(1/p), p=3",
    "4. Cosine: d(x,y) = 1 - (x·y)/(||x||·||y||)",
    "5. Hamming: d(x,y) = (1/n) × Σ(xi ≠ yi)",
    sep="\n",
)

---
# Part 2: KNN Classifier Implementation (From Scratch)

In [None]:
class KNNClassifier:
    """K-Nearest Neighbors classifier implemented from scratch.

    ``fit`` only stores the training data (lazy learning). ``predict`` measures
    the distance from each query sample to every stored training sample with the
    selected metric and returns the majority label among the ``k`` closest ones.
    """

    def __init__(self, k=3, distance_metric='euclidean'):
        """
        Initialize KNN Classifier

        Parameters:
        - k: Number of neighbors (positive integer)
        - distance_metric: One of 'euclidean', 'manhattan', 'minkowski', 'cosine', 'hamming'

        Raises:
        - ValueError: if k is not a positive integer or the metric name is unknown
        """
        # Map distance metric names to functions
        self.distance_functions = {
            'euclidean': DistanceMetrics.euclidean_distance,
            'manhattan': DistanceMetrics.manhattan_distance,
            'minkowski': DistanceMetrics.minkowski_distance,
            'cosine': DistanceMetrics.cosine_similarity,
            'hamming': DistanceMetrics.hamming_distance
        }

        # Fail fast on bad hyperparameters instead of surfacing an opaque
        # KeyError/IndexError in the middle of a long prediction run.
        if isinstance(k, bool) or not isinstance(k, (int, np.integer)) or k < 1:
            raise ValueError(f"k must be a positive integer, got {k!r}")
        if distance_metric not in self.distance_functions:
            raise ValueError(
                f"Unknown distance_metric {distance_metric!r}; "
                f"expected one of {sorted(self.distance_functions)}"
            )

        self.k = k
        self.distance_metric = distance_metric
        self.X_train = None
        self.y_train = None

    def fit(self, X_train, y_train):
        """Store a copy of the training data (lazy learning) and return self.

        Raises ValueError if the number of samples and labels differ or if the
        training set is empty.
        """
        X_arr = np.array(X_train)
        y_arr = np.array(y_train)
        if len(X_arr) != len(y_arr):
            raise ValueError(
                f"X_train has {len(X_arr)} samples but y_train has {len(y_arr)} labels"
            )
        if len(X_arr) == 0:
            raise ValueError("Cannot fit on an empty training set")
        self.X_train = X_arr
        self.y_train = y_arr
        return self

    def _require_fitted(self):
        """Raise RuntimeError when predict/score is called before fit."""
        if getattr(self, 'X_train', None) is None or getattr(self, 'y_train', None) is None:
            raise RuntimeError("KNNClassifier must be fitted before calling predict/score")

    def _compute_distance(self, x1, x2):
        """Compute distance between two points using the specified metric"""
        distance_func = self.distance_functions[self.distance_metric]
        return distance_func(x1, x2)

    def _predict_single(self, x):
        """Predict the class for a single sample"""
        # Calculate distances from x to all training samples
        distances = np.array([self._compute_distance(x, x_train) for x_train in self.X_train])

        # Get indices of k nearest neighbors. A stable sort keeps equidistant
        # samples in training order, so tied distances (frequent with the
        # discrete Hamming metric) are resolved deterministically.
        # If k exceeds the training-set size, all training samples vote.
        k_indices = np.argsort(distances, kind='stable')[:self.k]

        # Get the labels of k nearest neighbors
        k_nearest_labels = self.y_train[k_indices]

        # Majority voting. Counter keeps first-insertion order for equal counts,
        # so a tied vote goes to the label whose first neighbor is closest.
        most_common = Counter(k_nearest_labels).most_common(1)
        return most_common[0][0]

    def predict(self, X_test):
        """Predict classes for all test samples (one row of X_test per sample)"""
        self._require_fitted()
        X_test = np.array(X_test)
        predictions = [self._predict_single(x) for x in X_test]
        return np.array(predictions)

    def score(self, X_test, y_test):
        """Calculate accuracy: fraction of X_test rows whose prediction equals y_test"""
        y_pred = self.predict(X_test)
        return np.mean(y_pred == np.asarray(y_test))

# Confirm the cell ran, then summarise the algorithm step by step.
print("KNN Classifier implemented successfully!")
print(
    "\nAlgorithm Steps:",
    "1. Store training data (fit)",
    "2. For each test sample:",
    "   a. Calculate distance to all training samples",
    "   b. Find K nearest neighbors",
    "   c. Majority voting to determine class",
    sep="\n",
)

---
# Part 3: Evaluation Metrics Implementation (From Scratch)

In [None]:
class EvaluationMetrics:
    """Evaluation metrics implemented from scratch.

    All methods take the true labels first and the predicted labels second.
    Rows of the confusion matrix are true classes, columns are predicted classes.
    """

    @staticmethod
    def _check_same_length(y_true, y_pred):
        """Raise ValueError when the two label sequences differ in length.

        Without this check ``zip`` would silently drop the surplus labels and an
        element-wise comparison could broadcast, both giving plausible-looking
        but wrong scores.
        """
        if len(y_true) != len(y_pred):
            raise ValueError(
                f"y_true has {len(y_true)} labels but y_pred has {len(y_pred)}"
            )

    @staticmethod
    def _precision_recall_from_cm(cm, labels):
        """Derive per-class precision and recall from an existing confusion matrix.

        Returns two dicts keyed by label. A class that was never predicted gets
        precision 0.0; a class with no true samples gets recall 0.0.
        """
        precisions = {}
        recalls = {}

        for idx, label in enumerate(labels):
            # True Positives
            tp = cm[idx, idx]
            # Column sum = everything predicted as this class (TP + FP)
            predicted_total = np.sum(cm[:, idx])
            # Row sum = everything that truly is this class (TP + FN)
            actual_total = np.sum(cm[idx, :])

            # Precision = TP / (TP + FP)
            precisions[label] = float(tp / predicted_total) if predicted_total > 0 else 0.0
            # Recall = TP / (TP + FN)
            recalls[label] = float(tp / actual_total) if actual_total > 0 else 0.0

        return precisions, recalls

    @staticmethod
    def confusion_matrix(y_true, y_pred, labels=None):
        """Create confusion matrix.

        Parameters:
        - y_true, y_pred: equal-length sequences of labels
        - labels: optional explicit label order; defaults to the sorted unique
          labels found in y_true and y_pred

        Returns (cm, labels) where cm[i][j] counts samples whose true label is
        labels[i] and whose predicted label is labels[j].

        Raises ValueError on a length mismatch, and KeyError if a sample carries
        a label that is missing from an explicitly supplied ``labels``.
        """
        EvaluationMetrics._check_same_length(y_true, y_pred)

        if labels is None:
            labels = np.unique(np.concatenate([y_true, y_pred]))

        n_labels = len(labels)
        label_to_idx = {label: idx for idx, label in enumerate(labels)}

        cm = np.zeros((n_labels, n_labels), dtype=int)
        for true, pred in zip(y_true, y_pred):
            cm[label_to_idx[true], label_to_idx[pred]] += 1

        return cm, labels

    @staticmethod
    def precision_recall(y_true, y_pred, labels=None):
        """Calculate precision and recall for each class.

        Returns (precisions, recalls), two dicts mapping label -> float.
        """
        cm, labels = EvaluationMetrics.confusion_matrix(y_true, y_pred, labels)
        return EvaluationMetrics._precision_recall_from_cm(cm, labels)

    @staticmethod
    def accuracy(y_true, y_pred):
        """Calculate accuracy (fraction of positions where the labels agree).

        Raises ValueError if the two inputs do not have the same shape.
        """
        y_true = np.array(y_true)
        y_pred = np.array(y_pred)
        if y_true.shape != y_pred.shape:
            raise ValueError(
                f"y_true has shape {y_true.shape} but y_pred has shape {y_pred.shape}"
            )
        return np.mean(y_true == y_pred)

    @staticmethod
    def print_classification_report(y_true, y_pred, labels=None):
        """Print a classification report and return (cm, precisions, recalls, acc)."""
        # Build the confusion matrix once and derive every other metric from it.
        cm, labels = EvaluationMetrics.confusion_matrix(y_true, y_pred, labels)
        precisions, recalls = EvaluationMetrics._precision_recall_from_cm(cm, labels)
        acc = EvaluationMetrics.accuracy(y_true, y_pred)

        print("\n" + "="*60)
        print("CLASSIFICATION REPORT")
        print("="*60)
        print(f"\nAccuracy: {acc:.4f}")
        print("\n" + "-"*40)
        print(f"{'Class':<15} {'Precision':<12} {'Recall':<12}")
        print("-"*40)
        for label in labels:
            print(f"{str(label):<15} {precisions[label]:<12.4f} {recalls[label]:<12.4f}")
        print("-"*40)

        # Macro averages: unweighted mean over classes
        macro_precision = np.mean(list(precisions.values()))
        macro_recall = np.mean(list(recalls.values()))
        print(f"{'Macro Avg':<15} {macro_precision:<12.4f} {macro_recall:<12.4f}")
        print("="*60)

        return cm, precisions, recalls, acc

# Confirm the cell ran, then list the definition of each metric.
print("Evaluation metrics implemented successfully!")
print(
    "\nMetrics:",
    "- Precision = TP / (TP + FP)",
    "- Recall = TP / (TP + FN)",
    "- Accuracy = Correct Predictions / Total Predictions",
    sep="\n",
)

---
# TASK 1: Binary Classification on Breast Cancer Dataset

## 1.1 Load and Preprocess Data

In [None]:
# Read the raw breast-cancer table; adjust the path if the CSV lives elsewhere.
df = pd.read_csv(filepath_or_buffer='data.csv')

# Quick sanity check: overall shape plus a preview of the first rows.
print(f"Dataset Shape: {df.shape}")
print("\nFirst 5 rows:")
display(df.head(5))

In [None]:
# Clean the data in one pass: drop the identifier column (if present), any
# auto-generated "Unnamed" columns, and columns that contain no values at all.
df_clean = (
    df.drop(columns=['id'], errors='ignore')
      .loc[:, lambda frame: ~frame.columns.str.contains('^Unnamed')]
      .dropna(axis=1, how='all')
)

print(f"Missing values: {df_clean.isnull().sum().sum()}")
print("\nClass Distribution:")
print(df_clean['diagnosis'].value_counts())

In [None]:
# Split the cleaned frame into the label vector and the feature matrix.
y = df_clean['diagnosis'].values
X = df_clean.drop(columns='diagnosis').values

print("Features shape: {}".format(X.shape))
print("Number of features: {}".format(X.shape[1]))

In [None]:
# Normalize the features (Min-Max Scaling) - implemented from scratch
def min_max_normalize(X, X_min=None, X_max=None):
    """Min-Max normalization implemented from scratch.

    Rescales every feature (column) with ``(X - min) / (max - min)``.

    Parameters:
    - X: array-like of shape (n_samples, n_features); a 1-D array is treated as
      a single feature.
    - X_min, X_max: optional per-feature minimum/maximum to scale with. When
      omitted they are computed from ``X`` itself. Passing the statistics of the
      training split lets held-out data be scaled without looking at it.

    Returns a new float array; ``X`` is not modified. Constant features (zero
    range) are mapped to 0 instead of dividing by zero. Values outside
    [X_min, X_max] fall outside [0, 1].

    Raises ValueError (from NumPy) if ``X`` is empty and no statistics are given.
    """
    X = np.asarray(X)
    if not np.issubdtype(X.dtype, np.inexact):
        X = X.astype(np.float64)

    if X_min is None:
        X_min = np.min(X, axis=0)
    if X_max is None:
        X_max = np.max(X, axis=0)
    X_min = np.asarray(X_min)
    X_max = np.asarray(X_max)

    range_val = X_max - X_min
    # np.where also works when the range is a scalar (1-D input), where
    # item assignment on the result would fail.
    range_val = np.where(range_val == 0, 1, range_val)  # Avoid division by zero
    return (X - X_min) / range_val

# Scale every feature to [0, 1].
# NOTE(review): the min/max are taken over the whole dataset, i.e. before the
# train/test split that follows — confirm this is acceptable for the assignment.
X_normalized = min_max_normalize(X)
print("Features normalized using Min-Max scaling")
print("Min: {:.4f}, Max: {:.4f}".format(X_normalized.min(), X_normalized.max()))

In [None]:
# Train-Test Split (80-20) - implemented from scratch
def train_test_split_scratch(X, y, test_size=0.2, random_state=42):
    """Train-test split implemented from scratch.

    Shuffles the sample indices and assigns the first ``int(n * test_size)`` of
    them to the test set and the rest to the training set.

    Parameters:
    - X, y: array-likes with the same number of samples
    - test_size: fraction of samples for the test set, between 0 and 1
    - random_state: seed for the shuffle (None draws a fresh seed)

    Returns (X_train, X_test, y_train, y_test).

    Raises ValueError if X and y differ in length or test_size is outside [0, 1].
    """
    X = np.asarray(X)
    y = np.asarray(y)

    n_samples = len(X)
    if len(y) != n_samples:
        raise ValueError(f"X has {n_samples} samples but y has {len(y)} labels")
    if not 0 <= test_size <= 1:
        raise ValueError(f"test_size must be within [0, 1], got {test_size!r}")
    n_test = int(n_samples * test_size)

    # A private RandomState yields exactly the permutation that
    # np.random.seed(random_state) + np.random.permutation would, but leaves the
    # global NumPy random state untouched for the rest of the notebook.
    rng = np.random.RandomState(random_state)
    indices = rng.permutation(n_samples)
    test_indices = indices[:n_test]
    train_indices = indices[n_test:]

    return X[train_indices], X[test_indices], y[train_indices], y[test_indices]

# Hold out 20% of the normalized samples for testing.
X_train, X_test, y_train, y_test = train_test_split_scratch(
    X_normalized, y, test_size=0.2
)

print("Training set: {} samples (80%)".format(len(X_train)))
print("Test set: {} samples (20%)".format(len(X_test)))
# Per-class counts show whether the random split kept the classes balanced.
print("\nTraining class distribution: M={}, B={}".format(
    (y_train == 'M').sum(), (y_train == 'B').sum()))
print("Test class distribution: M={}, B={}".format(
    (y_test == 'M').sum(), (y_test == 'B').sum()))

## 1.2 Experiment with Different K Values and Distance Metrics

In [None]:
# Hyperparameter grid: every K is evaluated with every distance metric.
k_values = [3, 4, 9, 20, 47]
distance_metrics = ['euclidean', 'manhattan', 'minkowski', 'cosine', 'hamming']

# results[metric] collects one accuracy per entry of k_values, in order.
results = dict((name, []) for name in distance_metrics)

print("Running experiments...")
print("=" * 70)

for metric in distance_metrics:
    print("\nDistance Metric: {}".format(metric.upper()))
    print("-" * 40)
    for k in k_values:
        # fit() returns the classifier itself, so construction and fitting chain.
        knn = KNNClassifier(k=k, distance_metric=metric).fit(X_train, y_train)
        y_pred = knn.predict(X_test)
        accuracy = EvaluationMetrics.accuracy(y_test, y_pred)
        results[metric].append(accuracy)
        print("  K={:2d}: Accuracy = {:.4f}".format(k, accuracy))

print("\n" + "=" * 70)
print("Experiments completed!")

## 1.3 Results Summary Table

In [None]:
# Accuracy table: one row per K, one column per distance metric.
results_df = pd.DataFrame(results, index=k_values)
results_df.index.name = 'K Value'
results_df.columns = list(map(str.capitalize, distance_metrics))

table_rule = "=" * 80
print("\n" + table_rule)
print("TASK 1: ACCURACY RESULTS TABLE")
print(table_rule)
print(results_df.to_string())
print(table_rule)

# Scan the grid for the highest accuracy; on ties the first configuration
# encountered (metric order, then K order) is kept.
best_accuracy = 0
best_k = None
best_metric = None

for metric in distance_metrics:
    metric_scores = results[metric]
    for i, k in enumerate(k_values):
        if metric_scores[i] > best_accuracy:
            best_accuracy, best_k, best_metric = metric_scores[i], k, metric

print(f"\n*** BEST MODEL: K={best_k}, {best_metric.capitalize()}, Accuracy={best_accuracy:.4f} ({best_accuracy*100:.2f}%) ***")

## 1.4 Plot: K Values vs Accuracy

In [None]:
# One line per distance metric, each with its own colour and marker.
colors = ['#2ecc71', '#3498db', '#e74c3c', '#9b59b6', '#f39c12']
markers = ['o', 's', '^', 'D', 'p']

plt.figure(figsize=(12, 8))

for metric, marker_style, line_color in zip(distance_metrics, markers, colors):
    plt.plot(
        k_values,
        results[metric],
        color=line_color,
        marker=marker_style,
        markersize=10,
        linewidth=2,
        label=metric.capitalize(),
    )

plt.title('Task 1: K Values vs Accuracy for Different Distance Metrics\n(Breast Cancer Binary Classification)', fontsize=16)
plt.xlabel('K (Number of Neighbors)', fontsize=14)
plt.ylabel('Accuracy', fontsize=14)
plt.xticks(k_values, fontsize=12)
plt.grid(True, alpha=0.3)
plt.legend(loc='best', fontsize=12)
plt.tight_layout()
plt.savefig('task1_k_vs_accuracy.png', dpi=300, bbox_inches='tight')
plt.show()

## 1.5 Best Model: Confusion Matrix, Precision, Recall

In [None]:
# Re-fit the winning configuration and predict on the held-out set.
print("Training best model: K={}, {}...".format(best_k, best_metric.capitalize()))

best_knn = KNNClassifier(k=best_k, distance_metric=best_metric).fit(X_train, y_train)
y_pred_best = best_knn.predict(X_test)

# The report prints accuracy plus per-class precision/recall and returns them.
report = EvaluationMetrics.print_classification_report(y_test, y_pred_best)
cm, precisions, recalls, acc = report

In [None]:
# Confusion matrix with a fixed class order: Benign first, Malignant second.
labels = ['B', 'M']
cm_ordered = EvaluationMetrics.confusion_matrix(y_test, y_pred_best, labels)[0]
class_tick_labels = ['Benign (B)', 'Malignant (M)']

plt.figure(figsize=(8, 6))
sns.heatmap(
    cm_ordered,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_tick_labels,
    yticklabels=class_tick_labels,
    annot_kws={'size': 16},
)
plt.title(f'Task 1: Confusion Matrix\n(K={best_k}, {best_metric.capitalize()})', fontsize=14)
plt.xlabel('Predicted Label', fontsize=14)
plt.ylabel('True Label', fontsize=14)
plt.tight_layout()
plt.savefig('task1_confusion_matrix.png', dpi=300, bbox_inches='tight')
plt.show()

## 1.6 BONUS: Decision Boundary Visualization

In [None]:
# Pick the two normalized features with the largest variance for a 2D view.
# argsort is ascending, so the last two entries are the highest-variance columns
# (listed from second-highest to highest).
feature_names = list(df_clean.columns.drop('diagnosis'))
variances = X_normalized.var(axis=0)
top_2_features = variances.argsort()[-2:]

print("Top 2 features for visualization:")
for rank, feature_idx in enumerate(top_2_features, start=1):
    print("  {}. {}".format(rank, feature_names[feature_idx]))

# Restrict the data to those two columns and split with the default settings.
X_2d = X_normalized[:, top_2_features]
X_train_2d, X_test_2d, y_train_2d, y_test_2d = train_test_split_scratch(X_2d, y)

In [None]:
# Fit the best configuration on the two selected features only.
knn_2d = KNNClassifier(k=best_k, distance_metric=best_metric).fit(X_train_2d, y_train_2d)

# Regular grid covering the data range plus a small margin on every side.
h = 0.02
margin = 0.1
x_min = X_2d[:, 0].min() - margin
x_max = X_2d[:, 0].max() + margin
y_min = X_2d[:, 1].min() - margin
y_max = X_2d[:, 1].max() + margin
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

print("Generating decision boundary...")
# Classify every grid point, then encode Malignant as 1 and everything else as 0
# so the predictions can be drawn as a filled contour.
grid_points = np.column_stack((xx.ravel(), yy.ravel()))
Z = knn_2d.predict(grid_points)
Z_numeric = (Z == 'M').astype(int).reshape(xx.shape)
print("Done!")

In [None]:
# Draw the predicted regions, then overlay training points (circles) and
# test points (squares), coloured by their true class.
benign_color, malignant_color = '#3498db', '#e74c3c'

plt.figure(figsize=(12, 8))
plt.contourf(xx, yy, Z_numeric, alpha=0.4, cmap=plt.cm.RdYlBu)
plt.contour(xx, yy, Z_numeric, colors='k', linewidths=0.5)

colors_train = [benign_color if lbl == 'B' else malignant_color for lbl in y_train_2d]
plt.scatter(
    X_train_2d[:, 0], X_train_2d[:, 1],
    c=colors_train, marker='o', s=50,
    edgecolors='black', linewidth=0.5, alpha=0.7, label='Training',
)

colors_test = [benign_color if lbl == 'B' else malignant_color for lbl in y_test_2d]
plt.scatter(
    X_test_2d[:, 0], X_test_2d[:, 1],
    c=colors_test, marker='s', s=80,
    edgecolors='black', linewidth=1.5, label='Test',
)

first_feature, second_feature = top_2_features[0], top_2_features[1]
plt.xlabel(feature_names[first_feature], fontsize=12)
plt.ylabel(feature_names[second_feature], fontsize=12)
plt.title(f'Task 1 BONUS: Decision Boundary\n(K={best_k}, {best_metric.capitalize()})\nBlue=Benign, Red=Malignant', fontsize=14)
plt.tight_layout()
plt.savefig('task1_decision_boundary.png', dpi=300, bbox_inches='tight')
plt.show()

## 1.7 Task 1: Inferences and Observations

In [None]:
# Printed write-up for Task 1. Only section 1 is filled in from the experiment
# results; the remaining sections are fixed text.
# Two statements were corrected: false negatives are governed by recall (not
# precision), and brute-force KNN costs O(n*d) per prediction (n training
# samples, d features), not O(n).
print("="*80)
print("TASK 1: INFERENCES AND OBSERVATIONS")
print("="*80)
print(f"""
1. BEST MODEL PERFORMANCE:
   - Best K Value: {best_k}
   - Best Distance Metric: {best_metric.capitalize()}
   - Test Accuracy: {best_accuracy*100:.2f}%

2. EFFECT OF K VALUE:
   - Smaller K (3, 4): More sensitive to local patterns and noise
   - Larger K (20, 47): Smoother boundaries, more robust to noise
   - Optimal K balances overfitting and underfitting

3. DISTANCE METRIC COMPARISON:
   - Euclidean & Manhattan: Perform well on normalized medical data
   - Cosine: Good for measuring angular similarity
   - Minkowski (p=3): Middle ground between L1 and L2
   - Hamming: Less suitable for continuous features

4. CLINICAL IMPLICATIONS:
   - High recall for the Malignant class is what minimizes false negatives (missed cancers)
   - Good balance between precision and recall
   - KNN provides interpretable predictions based on similar cases

5. LIMITATIONS:
   - KNN is computationally expensive: O(n*d) distance work per prediction
   - Performance depends on feature scaling
   - Curse of dimensionality with many features
""")
print("="*80)

---
# TASK 2: Multi-class Classification on CIFAR-10 Dataset

## 2.1 Load CIFAR-10 Dataset

In [None]:
def unpickle(file):
    """Read one pickled CIFAR-10 batch file and return the unpickled object.

    The file is opened in binary mode and decoded with ``encoding='bytes'``, so
    string keys of the stored dictionary come back as ``bytes``.
    """
    # NOTE: unpickling can execute arbitrary code — only open trusted files.
    with open(file, mode='rb') as batch_file:
        batch = pickle.load(batch_file, encoding='bytes')
    return batch

def load_cifar10(data_dir):
    """Load the CIFAR-10 python batches found under ``data_dir``.

    Expects a ``cifar-10-batches-py`` folder inside ``data_dir`` containing
    ``data_batch_1`` .. ``data_batch_5``, ``test_batch`` and ``batches.meta``.

    Returns (X_train, y_train, X_test, y_test, class_names): the five training
    batches stacked row-wise with their labels, the test batch with its labels,
    and the class names decoded to ``str``.
    """
    cifar_dir = os.path.join(data_dir, 'cifar-10-batches-py')

    # Training data: read the five batches in order, then stack them.
    train_batches = [
        unpickle(os.path.join(cifar_dir, 'data_batch_{}'.format(batch_no)))
        for batch_no in range(1, 6)
    ]
    X_train = np.vstack([batch[b'data'] for batch in train_batches])
    y_train = np.array([lbl for batch in train_batches for lbl in batch[b'labels']])

    # Test data comes from a single batch file.
    test_batch = unpickle(os.path.join(cifar_dir, 'test_batch'))
    X_test = test_batch[b'data']
    y_test = np.array(test_batch[b'labels'])

    # Class names are stored as bytes in the metadata file.
    meta = unpickle(os.path.join(cifar_dir, 'batches.meta'))
    class_names = [raw_name.decode('utf-8') for raw_name in meta[b'label_names']]

    return X_train, y_train, X_test, y_test, class_names

# NOTE: the dataset archive must be downloaded from
# https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz and extracted into the
# same directory as this notebook.
print(
    "CIFAR-10 loading functions defined!",
    "Download link: https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz",
    sep="\n",
)

In [None]:
# Load CIFAR-10 (Update path as needed)
data_dir = './'  # Directory containing cifar-10-batches-py folder

try:
    X_train_cifar, y_train_cifar, X_test_cifar, y_test_cifar, class_names = load_cifar10(data_dir)
    print(f"CIFAR-10 loaded successfully!")
    print(f"Training: {X_train_cifar.shape[0]} images")
    print(f"Test: {X_test_cifar.shape[0]} images")
    print(f"Classes: {class_names}")
except (OSError, EOFError, KeyError, pickle.UnpicklingError) as load_error:
    # Only failures that mean "dataset missing or unreadable" are handled here:
    # missing files (OSError), truncated or corrupt pickles, or batches without
    # the expected keys. Anything else (including KeyboardInterrupt) propagates
    # instead of being hidden behind the download hint.
    print("Please download CIFAR-10 dataset first!")
    print(f"  (load failed: {load_error!r})")
    # Fall back to the standard class names so later cells can still label plots.
    class_names = ['airplane', 'automobile', 'bird', 'cat', 'deer',
                   'dog', 'frog', 'horse', 'ship', 'truck']

In [None]:
# Stratified sampling for computational efficiency
def stratified_sample(X, y, n_per_class=500, random_state=42):
    """Get a stratified sample with up to ``n_per_class`` samples per class.

    For every distinct label in ``y`` (in sorted order) up to ``n_per_class``
    rows are drawn without replacement; a class with fewer rows contributes all
    of them.

    Parameters:
    - X, y: array-likes with the same number of samples
    - n_per_class: maximum number of samples drawn from each class
    - random_state: seed for the draw (None draws a fresh seed)

    Returns (X_sampled, y_sampled), grouped class by class.

    Raises ValueError if X and y differ in length; NumPy also raises ValueError
    when there is nothing to stack (empty input).
    """
    X = np.asarray(X)
    y = np.asarray(y)
    if len(X) != len(y):
        raise ValueError(f"X has {len(X)} samples but y has {len(y)} labels")

    # A private RandomState draws exactly what np.random.seed(random_state) +
    # np.random.choice would, without reseeding the global NumPy random state.
    rng = np.random.RandomState(random_state)
    X_sampled, y_sampled = [], []

    for cls in np.unique(y):
        cls_indices = np.where(y == cls)[0]
        n_take = min(n_per_class, len(cls_indices))
        sampled = rng.choice(cls_indices, n_take, replace=False)
        X_sampled.append(X[sampled])
        y_sampled.extend([cls] * len(sampled))

    return np.vstack(X_sampled), np.array(y_sampled)

# Use subset for efficiency (5000 train, 1000 test)
try:
    X_train_sample, y_train_sample = stratified_sample(X_train_cifar, y_train_cifar, 500)
    X_test_sample, y_test_sample = stratified_sample(X_test_cifar, y_test_cifar, 100)

    # Normalize: scale raw pixel values from [0, 255] to [0, 1]
    X_train_norm = X_train_sample.astype(np.float32) / 255.0
    X_test_norm = X_test_sample.astype(np.float32) / 255.0

    print(f"Sampled: {len(X_train_sample)} train, {len(X_test_sample)} test")
except NameError:
    # The CIFAR arrays are undefined when the loading cell could not find the
    # dataset. No synthetic replacement is generated, so say so instead of
    # claiming a fallback; any other error is a real bug and is left to surface.
    print("CIFAR-10 data is not loaded; skipping sampling (no synthetic fallback is generated).")

## 2.2 Experiment with Different K Values and Distance Metrics

In [None]:
# For computational efficiency, use smaller subsets
k_values_cifar = [3, 5, 7, 9, 11]
distance_metrics_cifar = ['euclidean', 'manhattan', 'minkowski', 'cosine', 'hamming']
# results_cifar[metric] collects one accuracy per entry of k_values_cifar.
results_cifar = {metric: [] for metric in distance_metrics_cifar}

print("Running CIFAR-10 experiments...")
print("(This may take several minutes)")
print("="*70)

try:
    # Use smaller subset for faster experiments (100 train / 20 test per class)
    X_train_exp, y_train_exp = stratified_sample(X_train_cifar, y_train_cifar, 100)
    X_test_exp, y_test_exp = stratified_sample(X_test_cifar, y_test_cifar, 20)
    # Scale raw pixel values from [0, 255] to [0, 1]
    X_train_exp = X_train_exp.astype(np.float32) / 255.0
    X_test_exp = X_test_exp.astype(np.float32) / 255.0

    for metric in distance_metrics_cifar:
        print(f"\nDistance Metric: {metric.upper()}")
        for k in k_values_cifar:
            knn = KNNClassifier(k=k, distance_metric=metric)
            knn.fit(X_train_exp, y_train_exp)
            y_pred = knn.predict(X_test_exp)
            accuracy = EvaluationMetrics.accuracy(y_test_exp, y_pred)
            results_cifar[metric].append(accuracy)
            print(f"  K={k}: Accuracy = {accuracy:.4f}")
except NameError as missing_name:
    # Expected only when the dataset cell never defined the CIFAR arrays.
    # Every other failure (or an interrupt during this long run) propagates, so
    # a genuine bug is not reported as "dataset missing" and half-filled result
    # lists are not silently passed on to the cells below.
    print("Please load CIFAR-10 dataset first!")
    print(f"  ({missing_name})")

## 2.3 Results Summary and Best Model

In [None]:
# Scan the CIFAR-10 grid for the highest accuracy; metrics without results
# (experiments not run) are skipped, and ties keep the first configuration seen.
best_acc_cifar = 0
best_k_cifar = None
best_metric_cifar = None

for metric in distance_metrics_cifar:
    metric_scores = results_cifar[metric]
    if not metric_scores:
        continue
    for i, k in enumerate(k_values_cifar):
        if metric_scores[i] > best_acc_cifar:
            best_acc_cifar, best_k_cifar, best_metric_cifar = metric_scores[i], k, metric

if best_k_cifar:
    print(f"\n*** BEST MODEL: K={best_k_cifar}, {best_metric_cifar.capitalize()}, Accuracy={best_acc_cifar:.4f} ***")

    # Accuracy table: one row per K, one column per distance metric.
    results_cifar_df = pd.DataFrame(results_cifar, index=k_values_cifar)
    results_cifar_df.index.name = 'K Value'
    print("\nResults Table:")
    print(results_cifar_df.to_string())

## 2.4 Plot: K Values vs Accuracy for CIFAR-10

In [None]:
# Plot only when the experiments produced results (the Euclidean list is
# non-empty); one line per distance metric with its own colour and marker.
if results_cifar['euclidean']:
    colors = ['#2ecc71', '#3498db', '#e74c3c', '#9b59b6', '#f39c12']
    markers = ['o', 's', '^', 'D', 'p']

    plt.figure(figsize=(12, 8))
    for metric, marker_style, line_color in zip(distance_metrics_cifar, markers, colors):
        plt.plot(
            k_values_cifar,
            results_cifar[metric],
            color=line_color,
            marker=marker_style,
            markersize=10,
            linewidth=2,
            label=metric.capitalize(),
        )

    plt.title('Task 2: K Values vs Accuracy for Different Distance Metrics\n(CIFAR-10 Classification)', fontsize=16)
    plt.xlabel('K (Number of Neighbors)', fontsize=14)
    plt.ylabel('Accuracy', fontsize=14)
    plt.grid(True, alpha=0.3)
    plt.legend(loc='best', fontsize=12)
    plt.tight_layout()
    plt.savefig('task2_k_vs_accuracy.png', dpi=300, bbox_inches='tight')
    plt.show()

## 2.5 Best Model: Confusion Matrix, Precision, Recall

In [None]:
if best_k_cifar:
    # Re-fit the winning configuration on the experiment subset and predict.
    best_knn_cifar = KNNClassifier(
        k=best_k_cifar, distance_metric=best_metric_cifar
    ).fit(X_train_exp, y_train_exp)
    y_pred_cifar = best_knn_cifar.predict(X_test_exp)

    # Report over all ten class ids so every class appears, even if unpredicted.
    cifar_report = EvaluationMetrics.print_classification_report(
        y_test_exp, y_pred_cifar, labels=[*range(10)])
    cm_cifar, precisions_cifar, recalls_cifar, acc_cifar = cifar_report

    # Heatmap of the confusion matrix, labelled with the class names.
    plt.figure(figsize=(12, 10))
    sns.heatmap(
        cm_cifar,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=class_names,
        yticklabels=class_names,
    )
    plt.title(f'Task 2: Confusion Matrix for CIFAR-10\n(K={best_k_cifar}, {best_metric_cifar.capitalize()})', fontsize=14)
    plt.xlabel('Predicted Label', fontsize=14)
    plt.ylabel('True Label', fontsize=14)
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.savefig('task2_confusion_matrix.png', dpi=300, bbox_inches='tight')
    plt.show()

## 2.6 Task 2: Inferences and Observations

In [None]:
# Printed write-up for Task 2. Section 1 is filled in from the experiment
# results ('N/A' when no experiment ran); the other sections are fixed text.
section_rule = "=" * 80
print(section_rule)
print("TASK 2: INFERENCES AND OBSERVATIONS (CIFAR-10)")
print(section_rule)
print(f"""
1. BEST MODEL PERFORMANCE:
   - Best K Value: {best_k_cifar or 'N/A'}
   - Best Distance Metric: {best_metric_cifar.capitalize() if best_metric_cifar else 'N/A'}
   - Test Accuracy: {best_acc_cifar*100:.2f}% (reasonable for KNN on raw pixels)

2. CHALLENGES WITH IMAGE DATA:
   - CIFAR-10: 32x32x3 = 3072 features per image
   - KNN on raw pixels doesn't capture spatial relationships
   - Deep learning (CNNs) achieve >95% on CIFAR-10

3. DISTANCE METRIC PERFORMANCE:
   - Euclidean: Standard choice for continuous pixel values
   - Manhattan: Less sensitive to outliers
   - Cosine: Measures direction, useful for texture
   - Hamming: Poor performance on continuous data

4. COMPUTATIONAL CONSIDERATIONS:
   - KNN has O(n*d) complexity per prediction
   - Used subset of data for practical computation
   - PCA or feature extraction recommended for full dataset

5. POTENTIAL IMPROVEMENTS:
   - Feature extraction (HOG, SIFT, CNN features)
   - Dimensionality reduction with PCA
   - Weighted KNN for better performance
   - Approximate nearest neighbors for speed
""")
print(section_rule)

---
# Summary

In [None]:
# Closing summary of both tasks. The Task 2 fields fall back to 'N/A' when the
# CIFAR-10 experiments did not run.
hash_rule = "#" * 80
print("\n" + hash_rule)
print("#{}FINAL SUMMARY{}#".format(" " * 30, " " * 35))
print(hash_rule)
print(f"""
TASK 1: BREAST CANCER BINARY CLASSIFICATION
============================================
• Dataset: 569 samples, 30 features
• Best K: {best_k}
• Best Distance: {best_metric.capitalize()}
• Accuracy: {best_accuracy*100:.2f}%

TASK 2: CIFAR-10 MULTI-CLASS CLASSIFICATION
============================================
• Dataset: 60,000 images (used subset)
• Best K: {best_k_cifar or 'N/A'}
• Best Distance: {best_metric_cifar.capitalize() if best_metric_cifar else 'N/A'}
• Accuracy: {best_acc_cifar*100:.2f}% (baseline for raw pixels)

KEY FINDINGS:
• Feature normalization is crucial for KNN
• Choice of K involves bias-variance tradeoff
• Different distance metrics suit different data types
• KNN provides interpretable but slow predictions

FILES GENERATED:
• task1_k_vs_accuracy.png
• task1_confusion_matrix.png
• task1_decision_boundary.png (BONUS)
• task2_k_vs_accuracy.png
• task2_confusion_matrix.png
""")
print(hash_rule)
print("Assignment completed!")