In [223]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 

from sklearn import neighbors, datasets
from sklearn.metrics import accuracy_score

In [224]:
# Load the bundled Iris dataset and pull out the feature matrix and labels.
iris = datasets.load_iris()
iris_X, iris_y = iris.data, iris.target

# NOTE(review): despite its label, the first line prints the distinct class
# labels found in iris_y rather than how many there are.
print(f"Number of classes: {np.unique(iris_y)}")
print(f"Number of data points: {len(iris_y)}")

Number of classes: [0 1 2]
Number of data points: 150


In [225]:
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler

# Standardize every feature column. The scaler reads the untouched iris.data
# instead of overwriting its own input, so re-running this cell always starts
# from the raw measurements.
# NOTE(review): the scaler is fit on all samples before the train/test split
# further down, so statistics of the test samples leak into preprocessing.
# Fitting on the training portion only would avoid that.
rc = StandardScaler()
iris_X = rc.fit_transform(iris.data)

In [226]:
# Collect the rows belonging to each class label with a boolean mask.
X0, X1, X2 = (iris_X[iris_y == label] for label in (0, 1, 2))

print("Samples from each class [0, 1, 2]: ", X0.shape, X1.shape, X2.shape)

Samples from each class [0, 1, 2]:  (50, 4) (50, 4) (50, 4)


In [227]:
from sklearn.model_selection import train_test_split

# Hold out half of the samples for testing, stratified on the labels so both
# halves keep the same class proportions. The fixed random_state makes the
# split -- and every accuracy computed from it -- identical on each run.
X_train, X_test, y_train, y_test = train_test_split(
    iris_X,
    iris_y,
    test_size=.5,
    stratify=iris_y,
    random_state=42,
)

`p`: the order of the Minkowski distance used to find neighbors:
- p = 1 → Manhattan distance
- p = 2 → Euclidean distance
- p → ∞ → Chebyshev distance
- p = 3, 4, ... → general Minkowski distance

In [228]:
# Baseline: 1-nearest-neighbor with p=2, fitted on the training half.
model = neighbors.KNeighborsClassifier(p=2, n_neighbors=1).fit(X_train, y_train)
y_pred = model.predict(X_test)

# Share of held-out samples whose predicted label equals the true label.
acc_1 = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuary of model: {acc_1}")

Accuary of model: 0.9066666666666666


Majority voting with K = 10 neighbors
- Small dataset → use a small K (3–7)
- Large dataset or many outliers → use a larger K (10–30)
- If the dataset has imbalanced density → use weights='distance'

In [229]:
# 10 nearest neighbors with p=2 and the classifier's default weighting.
model = neighbors.KNeighborsClassifier(p=2, n_neighbors=10).fit(X_train, y_train)
y_pred = model.predict(X_test)

# Accuracy on the held-out half, kept under its own name for the summary table.
acc_10 = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuary of model: {acc_10}")

Accuary of model: 0.9466666666666667


Weights
- 'uniform' - In majority voting, all K neighbors count equally, regardless of their distance.
- 'distance' - In majority voting, closer neighbors have more influence on the prediction.

In [230]:
# 10 nearest neighbors with p=2; weights='distance' lets closer neighbors
# carry more influence in the vote.
knn_options = dict(n_neighbors=10, p=2, weights='distance')
model = neighbors.KNeighborsClassifier(**knn_options).fit(X_train, y_train)
y_pred = model.predict(X_test)

# Accuracy on the held-out half, kept under its own name for the summary table.
acc_10_distance = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuary of model: {acc_10_distance}")

Accuary of model: 0.96


Customized weights

In [231]:
def customized_weight(distances, sigma2=.5):
    """Weight neighbors with a Gaussian-shaped decay of their distance.

    Computes ``exp(-distances**2 / sigma2)`` element-wise, so a neighbor at
    distance 0 gets weight 1 and the weight shrinks as the distance grows.

    Parameters
    ----------
    distances : array-like
        Distances to the neighbors. Any shape is accepted.
    sigma2 : float, default=0.5
        Bandwidth of the decay; larger values make the weights fall off more
        slowly. Must be positive. When the function is passed directly as a
        ``weights`` callable it is called with ``distances`` only, so the
        default applies; bind another value with ``functools.partial``.

    Returns
    -------
    numpy.ndarray
        Weights with the same shape as ``distances``.

    Raises
    ------
    ValueError
        If ``sigma2`` is not strictly positive.
    """
    if sigma2 <= 0:
        raise ValueError(f"sigma2 must be positive, got {sigma2!r}")
    # np.square also accepts plain lists, unlike the ``**`` operator.
    return np.exp(-np.square(distances) / sigma2)

# 10 nearest neighbors with p=2, voting with the user-defined weight function
# instead of one of the built-in weighting modes.
knn_options = dict(n_neighbors=10, p=2, weights=customized_weight)
model = neighbors.KNeighborsClassifier(**knn_options).fit(X_train, y_train)
y_pred = model.predict(X_test)

# Accuracy on the held-out half, kept under its own name for the summary table.
acc_10_distance_custom = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuary of model: {acc_10_distance_custom}")

Accuary of model: 0.9466666666666667


In [232]:
# Gather the four accuracies into one table, one row per KNN variant, in the
# order the variants were evaluated above.
acc_scores = dict(
    Method=['1NN', '10NN', '10NN-distance', '10NN-(customized weights)'],
    Accuracy=[acc_1, acc_10, acc_10_distance, acc_10_distance_custom],
)

df = pd.DataFrame(data=acc_scores)
df

Unnamed: 0,Method,Accuracy
0,1NN,0.906667
1,10NN,0.946667
2,10NN-distance,0.96
3,10NN-(customized weights),0.946667


### Cons
- KNN is very sensitive to outliers
- Complexity increases when K increases