In [25]:
import numpy as np

def knn_clustering(data, k, distance_metric):
  """
  Performs KNN clustering on the given data.

  Args:
      data: A NumPy array representing the data points (with features as columns).
      k: The number of nearest neighbors to consider.
      distance_metric: A function that calculates the distance between two data points.

  Returns:
      A list containing the cluster labels for each data point.
  """

  # Calculate pairwise distances between all data points
  distances = np.zeros((data.shape[0], data.shape[0]))
  for i in range(data.shape[0]):
    for j in range(i + 1, data.shape[0]):
      distances[i, j] = distance_metric(data[i], data[j])
      distances[j, i] = distances[i, j]  # Fill the other half for symmetry

  # Cluster label initialization (all -1 for unassigned)
  cluster_labels = np.full(data.shape[0], -1, dtype=int)

  # Iterate over each data point
  for i in range(data.shape[0]):
    # Find the K nearest neighbors
    nearest_neighbors = np.argsort(distances[i])[:k]

    # Majority vote for cluster label (or average features for unlabeled data)
    neighbor_labels = cluster_labels[nearest_neighbors]
    unique_labels, counts = np.unique(neighbor_labels, return_counts=True)
    if len(unique_labels) == 0:  # Handle case with no labeled neighbors (create new cluster)
      cluster_labels[i] = len(set(cluster_labels))
    else:
      # Most frequent label among neighbors
      majority_label = unique_labels[np.argmax(counts)]
      cluster_labels[i] = majority_label

  return cluster_labels

# Example usage (replace with your data and distance metric)
data = np.array([[1, 2], [3, 4], [1.5, 1.8], [5, 8], [8, 8], [1, 0.6], [9, 11]])

def euclidean_distance(p1, p2):
  return ((p1[0] - p2[0])**2 + (p1[1] - p2[1])**2)**0.5

clusters = knn_clustering(data, 3, euclidean_distance)

# Print the cluster labels for each data point
print("Cluster labels:", clusters)


Cluster labels: [-1 -1 -1 -1 -1 -1 -1]


In [27]:
from sklearn.neighbors import KNeighborsClassifier

def knn_clustering(data, k, distance_metric):
  """
  Performs KNN clustering on the given data using scikit-learn's KNeighborsClassifier.

  Args:
      data: A NumPy array representing the data points.
      k: The number of nearest neighbors to consider.
      distance_metric: A string representing the distance metric ('euclidean', 'manhattan', etc.).

  Returns:
      A list containing the cluster labels for each data point.
  """

  # Create a dummy target variable (all zeros)
  dummy_target = np.zeros(data.shape[0])

  # KNN classifier with specified k and distance metric
  knn = KNeighborsClassifier(n_neighbors=k, metric=distance_metric)

  # Fit the classifier with dummy target variable
  knn.fit(data, dummy_target)

  # Leverage KNeighborsClassifier's kneighbors method for efficient neighbor finding
  _, neighbors = knn.kneighbors(data)

  # Cluster label assignment based on majority vote among K nearest neighbors
  cluster_labels = []
  for neighbor_indices in neighbors:
    neighbor_labels = knn.classes_[neighbor_indices]  # Access labels of neighbors
    unique_labels, counts = np.unique(neighbor_labels, return_counts=True)
    majority_label = unique

In [28]:
print(cluster_labels)

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 2 0 2 2 2 2 0 2 2 2 2
 2 2 0 0 2 2 2 2 0 2 0 2 0 2 2 0 0 2 2 2 2 2 0 0 2 2 2 0 2 2 2 0 2 2 2 0 2
 2 0]
