In [None]:
import numpy as np
# Scratch data reused by the exploratory cells below: one query point and
# four reference points.
x1 = np.array([3, 4])
XX = np.array([[0, 0], [1, 1], [3, 4], [-3, -4]])
x2 = XX[0]  # compare the query point against the first reference point

# NOTE: the KNN class is defined in a later cell of this notebook; that cell
# has to be executed before this line can run.
knn = KNN(3)

# The KNN class in this notebook has no `euclidean_distance` method (and `x2`
# was never defined), so the distance is computed directly with NumPy.
print(np.linalg.norm(x1 - x2))

In [None]:
# Majority-vote check on a hand-made label list: 2 appears five times,
# more often than 1 (twice) or 3 (three times).
l = [1, 2, 3, 3, 3, 2, 2, 2, 2, 1]
majority_label = knn._majority(l)
print(majority_label)

In [None]:
# Coordinate-wise offsets between the query point x1 and every row of XX
# (both come from an earlier cell).
d = x1 - XX
print(d)

# Euclidean length of each offset row.
distance = np.sqrt(np.square(d).sum(axis=1))
print(distance)

# Positions of the k smallest distances. argpartition only guarantees that
# the first k slots hold the k smallest values, not that they are sorted.
k = 2
tki = np.argpartition(distance, kth=k)[:k]
print(tki)
XX[tki]

In [None]:
# Same partition trick from the other end: after partitioning at position -2,
# the last two slots hold the two largest distances (in no particular order).
ii = np.argpartition(distance, kth=-2)[-2:]
XX[ii]

In [None]:
# Start from a length-4 float vector of zeros, then switch on the first slot.
l = np.zeros(4)
l[:1] = 1
print(l)


In [14]:
import numpy as np

class KNN:
    """Minimal k-nearest-neighbours classifier using squared Euclidean distance.

    `fit` only stores the training data. `predict` labels each query point
    with the most frequent label among its `k` closest training points.
    Voting ties go to the smallest label, because `np.unique` returns the
    labels sorted and `np.argmax` picks the first maximum.
    """

    def __init__(self, k):
        """Create an unfitted classifier.

        Args:
            k: number of neighbours that take part in each vote (>= 1).

        Raises:
            ValueError: if `k` is smaller than 1.
        """
        if k < 1:
            raise ValueError(f"k must be a positive integer, got {k}")
        self.k = k
        self.X_train = None
        self.y_train = None

    def fit(self, X, y):
        """Store the training set.

        Args:
            X: training points, array-like of shape (n, m).
            y: training labels, array-like of shape (n,).

        Raises:
            ValueError: if `X` is not 2-D or `X` and `y` differ in length.
        """
        # Convert once here so plain Python lists work everywhere downstream
        # (predict relies on `.shape` and `.dtype`).
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError(f"X must be 2-dimensional (n, m), got {X.ndim} dimension(s)")
        if len(X) != len(y):
            raise ValueError(f"X and y disagree on sample count: {len(X)} vs {len(y)}")
        self.X_train = X
        self.y_train = y

    def predict(self, X):
        """Predict a label for every row of `X`.

        Args:
            X: query points, array-like of shape (n, m) with the same `m`
                as the training data. An empty batch is allowed.

        Returns:
            Array of shape (n,) with the dtype of the training labels.

        Raises:
            ValueError: if the model has not been fitted, the feature width
                does not match the training data, or `k` exceeds the number
                of training points.
        """
        if self.X_train is None:
            raise ValueError("Model not trained")

        X = np.asarray(X)
        # Nothing to classify: return an empty result instead of indexing X[0].
        if X.ndim >= 1 and len(X) == 0:
            return np.empty(0, dtype=self.y_train.dtype)

        n_features = self.X_train.shape[1]
        if X.ndim != 2 or X.shape[1] != n_features:
            raise ValueError(f"shape mismatch: need points in {n_features}-dim")

        n_train = len(self.X_train)
        if self.k > n_train:
            raise ValueError(f"k={self.k} exceeds the number of training points ({n_train})")

        return np.array(
            [self._predict_one(point) for point in X],
            dtype=self.y_train.dtype,
        )

    def _predict_one(self, x):
        """Return the majority label among the `k` training points closest to `x`."""
        # Squared distances rank neighbours the same way as true distances,
        # so the square root is skipped.
        squared_distances = np.sum((x - self.X_train) ** 2, axis=1)
        # Partition at k - 1 (not k): the first k slots then hold the k
        # smallest distances, and k == n_train stays within bounds.
        top_k_indices = np.argpartition(squared_distances, self.k - 1)[:self.k]
        top_k_labels = self.y_train[top_k_indices]
        return self._majority(top_k_labels)

    def _majority(self, labels):
        """Return the most frequent value in `labels` (smallest value on ties)."""
        unique_values, counts = np.unique(labels, return_counts=True)
        idx = np.argmax(counts)
        return unique_values[idx]
    

In [15]:
# Smoke test on four 2-D points. The three nearest neighbours of (3, 4) carry
# the labels 2, 0 and 1, so the vote is a three-way tie; the recorded output
# below shows it resolving to label 0.
XX = np.array([[0, 0], [1, 1], [3, 4], [-3, -4]])
x1 = np.array([[3, 4]])

knn = KNN(3)
knn.fit(XX, np.array([1, 0, 2, 1]))
knn.predict(x1)

array([0])

 🟡 Minor Issues (Easy fixes)

  Issue 1: Variable Scope ⚠️
```
  def predict(self, X):
      ...
      if len(X[0]) != len(X_train[0]):  # ❌ Should be self.X_train
          ...
      for i in range(n):
          y_pred[i] = self._predict_one(X[i])  # ❌ _predict_one used X_train, y_train, k without self
```
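  Problem: X_train, y_train, and k were referenced without the `self.` prefix, so Python looks them up as plain local/global names instead of as attributes of the instance. (The KNN class defined above already applies the fix below.)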

  Fix:
  ```
  def predict(self, X):
      n = len(X)
      if len(X[0]) != len(self.X_train[0]):  # ✅ Add self.
          raise ValueError(f"shape mismatch: need points in {len(self.X_train[0])}-dim")
      y_pred = np.zeros(n)
      for i in range(n):
          y_pred[i] = self._predict_one(X[i])
      return y_pred

  def _predict_one(self, x):
      squared_distances = np.sum((x - self.X_train) ** 2, axis=1)  # ✅ Add self.
      top_k_indices = np.argpartition(squared_distances, self.k)[:self.k]  # ✅ Add self.
      top_k_labels = self.y_train[top_k_indices]  # ✅ Add self.
      return self._majority(top_k_labels)
```
  ---
  Issue 2: Return Type 🟡
```
  y_pred = np.zeros(n)  # Returns float array
```
  Problem: Labels might be integers, but np.zeros creates floats by default.

  Fix:
  ```
  y_pred = np.zeros(n, dtype=self.y_train.dtype)  # ✅ Match label type
  # OR
  y_pred = []  # Then convert to array at the end
```


In [16]:
# Sanity check: score the hand-written KNN on iris and compare it with
# scikit-learn's classifier on the same split.

from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Load data
iris = load_iris()
X = iris.data
y = iris.target

# Hold out 20% of the samples; the fixed random_state keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the hand-written model and score its predictions on the held-out rows.
knn = KNN(k=3)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Your K-NN Accuracy: {accuracy:.3f}")

# Reference point: scikit-learn with the same neighbour count and split.
sklearn_knn = KNeighborsClassifier(n_neighbors=3)
sklearn_knn.fit(X_train, y_train)
sklearn_acc = sklearn_knn.score(X_test, y_test)
print(f"Sklearn K-NN Accuracy: {sklearn_acc:.3f}")

Your K-NN Accuracy: 1.000
Sklearn K-NN Accuracy: 1.000


# Optimization

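  Your current version loops over test samples. Here's a version that vectorizes the distance computation and the neighbor selection (the majority vote still runs once per test sample):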
```
  def predict_vectorized(self, X):
      """Predict labels for all rows of X with one batched distance computation.

      Distances and neighbour selection are computed for the whole batch at
      once; the majority vote is still taken once per query row through
      `self._majority`.

      Args:
          X: query points, array-like of shape (n_test, m).

      Returns:
          Array of shape (n_test,) with the dtype of the training labels.

      Raises:
          ValueError: if the model has not been fitted, the feature width
              differs from the training data, or `k` is not between 1 and
              the number of training points.
      """
      if self.X_train is None:
          raise ValueError("Model not trained")

      X = np.asarray(X)
      train_points = np.asarray(self.X_train)
      train_labels = np.asarray(self.y_train)
      n_train, n_features = train_points.shape

      # Check the width explicitly: a query with a single column would
      # otherwise broadcast silently against the training data.
      if X.ndim != 2 or X.shape[1] != n_features:
          raise ValueError(f"shape mismatch: need points in {n_features}-dim")
      if not 1 <= self.k <= n_train:
          raise ValueError(
              f"k={self.k} must be between 1 and the number of training points ({n_train})"
          )

      # Pairwise squared distances, shape (n_test, n_train). The intermediate
      # difference array has shape (n_test, n_train, m). The square root is
      # skipped because it does not change the neighbour ranking.
      offsets = X[:, np.newaxis, :] - train_points[np.newaxis, :, :]
      squared_distances = (offsets ** 2).sum(axis=2)

      # Partition at k - 1 (not k): the first k columns then hold the k
      # nearest neighbours, and k == n_train stays within bounds.
      top_k_indices = np.argpartition(squared_distances, self.k - 1, axis=1)[:, :self.k]

      # Labels of the selected neighbours, shape (n_test, k).
      top_k_labels = train_labels[top_k_indices]

      # Majority vote for each test sample.
      predictions = np.array(
          [self._majority(labels) for labels in top_k_labels],
          dtype=train_labels.dtype,
      )

      return predictions
```
  But your loop version is fine for interviews! It's clearer and easier to explain.