## In-Class Exercise: Nearest Neighbor Classification
### Predicting Fruit Types

In this exercise, we'll use k-nearest neighbors to classify fruits based on their weight, sweetness, and color.

In [None]:
# Import libraries
import numpy as np
from datascience import *
import matplotlib.pyplot as plt
%matplotlib inline

### Load the data

In [None]:
# Load fruits dataset
# 'fruits.csv' has no directory component, so it is resolved relative to the
# notebook's working directory. The bare `fruits` on the last line displays
# the loaded table as the cell's output.
fruits = Table.read_table('fruits.csv')
fruits

**Columns:**
- Weight_g: weight in grams
- Sweetness: sweetness rating (1-10)
- Color_code: 1=Red, 2=Orange, 3=Yellow
- Fruit_Type: Apple, Orange, or Lemon (our target)

### Helper functions (reused from Lab 10)

In [None]:
def distance(pt1, pt2):
    """Return the Euclidean distance between two points.

    Parameters
    ----------
    pt1, pt2 : array-like of numbers
        Coordinates of the two points (NumPy arrays, lists, or tuples) with
        matching lengths.

    Returns
    -------
    numpy.float64
        The square root of the sum of squared coordinate differences.
    """
    # Cast to float first: this accepts plain lists/tuples and prevents
    # unsigned or narrow integer dtypes from wrapping around when the
    # coordinates are subtracted and squared.
    diff = np.asarray(pt1, dtype=float) - np.asarray(pt2, dtype=float)
    # np.sum reduces the whole array in one call instead of iterating
    # element by element like the builtin sum.
    return np.sqrt(np.sum(diff ** 2))

def row_distance(row1, row2):
    """Distance between two table rows, each treated as a point.

    Both rows are converted to NumPy arrays and passed to ``distance``.
    """
    first_point = np.array(row1)
    second_point = np.array(row2)
    return distance(first_point, second_point)

def distances(training, test, target, features):
    """Return ``training`` with an added 'Distance' column.

    Each row of ``training``, restricted to the ``features`` columns, is
    compared against ``test`` with ``row_distance``.

    training: table whose rows are measured against the test point
    test: the point (row of feature values) to measure from
    target: not used in the body; ``closest`` passes it through
    features: column labels selected from ``training`` before measuring
    """
    feature_rows = training.select(features).rows
    row_dists = [row_distance(feature_row, test) for feature_row in feature_rows]
    return training.with_column('Distance', row_dists)

def closest(training, test, k, target, features):
    """Return the first k rows of ``training`` after sorting on 'Distance'.

    The 'Distance' column is produced by ``distances`` and measures each
    training row against ``test`` over the ``features`` columns.
    """
    with_dists = distances(training, test, target, features)
    ordered = with_dists.sort('Distance')
    first_k = np.arange(k)
    return ordered.take(first_k)

### Split into training and test sets

In [None]:
# Split by position: rows 0-14 form the training set, rows 15-19 the test set
N_TRAIN = 15
N_TEST = 5
train = fruits.take(np.arange(N_TRAIN))
test = fruits.take(np.arange(N_TRAIN, N_TRAIN + N_TEST))

print(f"{train.num_rows} training samples")
print(f"{test.num_rows} test samples")

### Define our target and features

In [None]:
# Column holding the class label. Kept as a one-element list because
# predict_knn_class reads the label column name as target[0].
target = ['Fruit_Type']
# Columns used as coordinates when measuring the distance between fruits.
features = ['Weight_g', 'Sweetness', 'Color_code']

### Example: Find the 3 nearest neighbors for the first test fruit

In [None]:
# Feature values of test row 0, shown next to that row's true label
test_features = test.select(features)
test_point = test_features.row(0)
print("Test point features:", test_point)
fruit_types = test.column('Fruit_Type')
print("Actual fruit type:", fruit_types.item(0))

In [None]:
# Find 3 nearest neighbors
# closest() adds a 'Distance' column measured from test_point, sorts on it,
# and keeps the first k rows; the bare `nearest` displays that table.
k = 3
nearest = closest(train, test_point, k, target, features)
nearest

### Make a prediction using majority vote

In [None]:
def predict_knn_class(row, train, test, k=5):
    """Predict the class of one test row by majority vote of its k nearest neighbors.

    row: index of the row in ``test`` to classify
    train: table searched for neighbors
    test: table containing the row to classify
    k: number of neighbors that vote (default 5)

    Reads the notebook-level ``features`` and ``target`` lists. Returns the
    label that occurs most often among the neighbors; on a tie, the tied
    label that appears first in the neighbor order wins, because ``max``
    returns the first maximal item.
    """
    query_point = test.select(features).row(row)
    neighbors = closest(train, query_point, k, target, features)
    votes = list(neighbors.column(target[0]))
    # The label with the highest vote count is the prediction
    return max(votes, key=votes.count)

# Predict for first test point
# Classify test row 0 using k=3 neighbors, then compare the result with the
# label recorded for that row in the 'Fruit_Type' column.
prediction = predict_knn_class(0, train, test, k=3)
actual = test.column('Fruit_Type').item(0)

print(f"Predicted: {prediction}")
print(f"Actual: {actual}")
print(f"Correct: {prediction == actual}")

### Challenge: Predict all test points and calculate accuracy

In [None]:
# Classify every test row with k=3, collecting one predicted label per row
predictions = [
    predict_knn_class(row_index, train, test, k=3)
    for row_index in np.arange(test.num_rows)
]

# Attach the predicted labels as a 'Predicted' column and display the table
test_with_predictions = test.with_column('Predicted', predictions)
test_with_predictions

In [None]:
# Accuracy = number of rows whose predicted label matches the true label,
# divided by the number of test rows
actual_labels = test_with_predictions.column('Fruit_Type')
predicted_labels = test_with_predictions.column('Predicted')
correct = np.count_nonzero(actual_labels == predicted_labels)
accuracy = correct / test.num_rows

print(f"Accuracy: {accuracy:.1%} ({correct}/{test.num_rows} correct)")

### Discussion Questions
1. Why did the algorithm make correct/incorrect predictions?
2. What happens if you change k to 1 or 5?
3. Which features seem most important for classification?