<a href="https://colab.research.google.com/github/jadhav-rakesh/ML/blob/main/ds15.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# K-Nearest Neighbors

In [2]:
import numpy as np
import pandas as pd

# Finding an Observation’s Nearest Neighbors

In [1]:
from sklearn import datasets
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler

# Load the iris data set; only its feature matrix is used in this cell.
iris = datasets.load_iris()
features = iris.data

# Learn the scaling from the features, then apply it to the same features.
standardizer = StandardScaler()
standardizer.fit(features)
features_standarized = standardizer.transform(features)

# Fit a neighbour search that returns the 2 closest observations per query.
nearest_neighbours = NearestNeighbors(n_neighbors=2)
nearest_neighbours.fit(features_standarized)

# The query point is used as-is: it is not passed through `standardizer`.
new_observation = [1, 1, 1, 1]
distance, indices = nearest_neighbours.kneighbors([new_observation])

# Show the standardized feature rows of the neighbours that were found.
features_standarized[indices]


array([[[1.03800476, 0.55861082, 1.10378283, 1.18556721],
        [0.79566902, 0.32841405, 0.76275827, 1.05393502]]])

In [4]:
# Same 2-neighbour search as before, but with the distance metric named
# explicitly. Relies on `features_standarized` from the preceding cell.
nearestneighbors_euclidean = NearestNeighbors(n_neighbors=2, metric="euclidean")
nearestneighbors_euclidean.fit(features_standarized)


In [5]:
# Distances returned by `nearest_neighbours.kneighbors` for `new_observation`
# (one value per neighbour). This comes from the first model, not from
# `nearestneighbors_euclidean`, which is fitted but never queried.
distance

array([[0.49140089, 0.74294782]])

In [3]:
# Row positions in `features_standarized` of the neighbours found by the
# `kneighbors` query for `new_observation`.
indices

array([[124, 110]])

In [13]:
# Fit a 3-neighbour search with an explicit Euclidean metric on the
# standardized features (defined in an earlier cell).
nearest_neighbours_euclidean = NearestNeighbors(n_neighbors=3, metric="euclidean")
nearest_neighbours_euclidean.fit(features_standarized)

# Query the fitted data against itself and densify the resulting graph, giving
# one row per observation with non-zero entries marking its neighbours.
neighbour_graph = nearest_neighbours_euclidean.kneighbors_graph(features_standarized)
nearest_neighbours_with_self = neighbour_graph.toarray()

# Zero the diagonal so that no observation is marked as its own neighbour.
for row_number in range(len(nearest_neighbours_with_self)):
    nearest_neighbours_with_self[row_number, row_number] = 0

# Neighbour flags for the first observation.
nearest_neighbours_with_self[0]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

# Creating a K-Nearest Neighbors Classifier

In [14]:
from sklearn import datasets
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

# Feature matrix and class labels of the iris data set.
iris = datasets.load_iris()
X = iris.data
y = iris.target

# Standardize the features before fitting the classifier.
standardizer = StandardScaler()
standardizer.fit(X)
X_std = standardizer.transform(X)

# 5-neighbour classifier; n_jobs=-1 is passed through to scikit-learn.
knn = KNeighborsClassifier(n_neighbors=5, n_jobs=-1)
knn.fit(X_std, y)

# Two query points. They are handed to the model directly, without going
# through `standardizer`, so they are interpreted in standardized units.
new_observation = [
    [0.75, 0.75, 0.75, 0.75],
    [1, 1, 1, 1],
]

# Predicted class for each query point.
knn.predict(new_observation)

array([1, 2])

In [15]:
# Class probability estimates from `knn` for the same two observations that
# were passed to `knn.predict` (one row per observation).
knn.predict_proba(new_observation)

array([[0. , 0.6, 0.4],
       [0. , 0. , 1. ]])

# Identifying the Best Neighborhood Size

In [17]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import GridSearchCV

# Raw iris features and class labels.
iris = datasets.load_iris()
features, target = iris.data, iris.target

# Standardization is a pipeline step, so it is re-fitted on the training part
# of every cross-validation split rather than once on the full data set.
standardizer = StandardScaler()
knn = KNeighborsClassifier(n_neighbors=5, n_jobs=-1)
pipe = Pipeline([("standardizer", standardizer), ("knn", knn)])

# Candidate neighbourhood sizes; the "knn__" prefix routes the parameter to
# the pipeline step named "knn".
search_space = [{"knn__n_neighbors": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}]

# Fit on the raw `features` defined in this cell. The previous version passed
# `features_standarized`, a variable that only exists if an earlier cell has
# been run, and which duplicates the scaling the pipeline already performs.
classifier = GridSearchCV(pipe, search_space, cv=5, verbose=0).fit(features, target)


In [18]:
# Neighbourhood size selected by the grid search in the previous cell.
classifier.best_params_["knn__n_neighbors"]

6

# Creating a Radius-Based Nearest Neighbors Classifier

In [19]:
from sklearn import datasets
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.preprocessing import StandardScaler

# Feature matrix and class labels of the iris data set.
iris = datasets.load_iris()
features = iris.data
target = iris.target

# Standardize the features before fitting the classifier.
standardizer = StandardScaler()
standardizer.fit(features)
features_standarized = standardizer.transform(features)

# Radius-based classifier with radius 0.5, fitted on the standardized data.
rnn = RadiusNeighborsClassifier(radius=0.5, n_jobs=-1)
rnn.fit(features_standarized, target)

# Single query point, passed to the model without going through `standardizer`.
new_observation = [[1, 1, 1, 1]]

# Predicted class for the query point.
rnn.predict(new_observation)

array([2])

# Finding Approximate Nearest Neighbors

In [24]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl (30.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m23.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.10.0


In [26]:
import faiss
from sklearn import datasets
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler

# Feature matrix and class labels of the iris data set.
iris = datasets.load_iris()
features, target = iris.data, iris.target

# Standardize the features.
standardizer = StandardScaler()
features_standarized = standardizer.fit_transform(features)

# Faiss operates on contiguous float32 matrices, while StandardScaler returns
# float64. Cast explicitly instead of relying on implicit conversion.
features_float32 = np.ascontiguousarray(features_standarized, dtype="float32")

n_features = features_standarized.shape[1]
nlist = 3  # number of clusters (inverted lists) the index is split into
k = 2      # number of neighbours to retrieve

# IndexIVFFlat is created without a metric argument, which per the Faiss
# documentation means L2 distance. Use an L2 coarse quantizer so that cluster
# assignment and the final distance computation use the same metric.
quantizer = faiss.IndexFlatL2(n_features)
index = faiss.IndexIVFFlat(quantizer, n_features, nlist)

# The IVF index must be trained (clusters learned) before vectors are added.
index.train(features_float32)
index.add(features_float32)

# Query point, created as float32 to match what the index expects.
new_observation = np.array([[1, 1, 1, 1]], dtype="float32")

distances, indices = index.search(new_observation, k)

# Standardized feature rows of the retrieved neighbours.
features_standarized[indices[0]]


array([[1.03800476, 0.55861082, 1.10378283, 1.18556721],
       [0.79566902, 0.32841405, 0.76275827, 1.05393502]])

# Evaluating Approximate Nearest Neighbors

In [27]:
import faiss
from sklearn import datasets
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler

# Number of neighbours compared between the exact and approximate search.
k = 10

iris = datasets.load_iris()
features = iris.data

# Standardize the features.
standardizer = StandardScaler()
features_standarized = standardizer.fit_transform(features)

# Exact search, used as the reference result.
nearest_neighbours = NearestNeighbors(n_neighbors=k).fit(features_standarized)

# Faiss operates on contiguous float32 matrices; cast explicitly.
features_float32 = np.ascontiguousarray(features_standarized, dtype="float32")

n_features = features_standarized.shape[1]
# The original assigned `n_list` here but built the index with `nlist`, so the
# cell only worked when `nlist` had leaked in from a previously run cell.
nlist = 3

# IndexIVFFlat is created without a metric argument, which per the Faiss
# documentation means L2 distance. Use an L2 coarse quantizer so that cluster
# assignment and the final distance computation use the same metric.
quantizer = faiss.IndexFlatL2(n_features)
index = faiss.IndexIVFFlat(quantizer, n_features, nlist)

index.train(features_float32)
index.add(features_float32)
index.nprobe = 1  # number of clusters visited per query

new_observation = np.array([[1, 1, 1, 1]], dtype="float32")

knn_distance, knn_indices = nearest_neighbours.kneighbors(new_observation)
ivf_distances, ivf_indices = index.search(new_observation, k)

# Recall: share of the exact neighbours that the approximate search also found.
recalled_items = set(knn_indices[0].tolist()) & set(ivf_indices[0].tolist())

print(f"Recall @k={k}: {len(recalled_items)/k * 100}%")



Recall @k=10: 100.0%
