In [109]:
#import libraries and dataset
import numpy as np
import pandas as pd

from sklearn.datasets import load_iris, load_diabetes,fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor  # for comparison
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler


KNN classifaction(Iris Dataset)

In [110]:
#Load dataset
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df['target'] = iris.target
df.head()


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [111]:
#target distribution 
df.target.value_counts()


target
0    50
1    50
2    50
Name: count, dtype: int64

In [112]:
# feature and target split
X = df.drop('target', axis=1)
y = df['target']


In [113]:
#train test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [114]:
#train KNN model 
model = KNeighborsClassifier(n_neighbors=3)
model.fit(X_train, y_train)


In [115]:
# MOdel prediction and evaluation
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 1.0
[[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



Choosing best K value 

In [116]:

k_values = [3, 5, 7, 9, 11]

for k in k_values:
    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"K={k}, Accuracy={accuracy_score(y_test, y_pred)*100:.2f}%")


K=3, Accuracy=100.00%
K=5, Accuracy=100.00%
K=7, Accuracy=96.67%
K=9, Accuracy=100.00%
K=11, Accuracy=100.00%


Feature scaling- important for KNN

In [117]:
#without scaling 
data = pd.DataFrame({
    'feature_1': [0.5, 0.6, 0.8, 0.9, 0.6, 0.88],
    'feature_2': [10, 20, 30, 40, 50, 60],
    'target': np.random.choice([0, 1], size=6)
})

X = data[['feature_1', 'feature_2']]
y = data['target']

model = KNeighborsClassifier(n_neighbors=3)
model.fit(X, y)

data['predicted_without_scaling'] = model.predict(X)
data


Unnamed: 0,feature_1,feature_2,target,predicted_without_scaling
0,0.5,10,1,1
1,0.6,20,0,1
2,0.8,30,1,1
3,0.9,40,1,1
4,0.6,50,0,0
5,0.88,60,0,0


In [118]:
# with scaling 
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

model.fit(X_scaled, y)
data['predicted_with_scaling'] = model.predict(X_scaled)

data


Unnamed: 0,feature_1,feature_2,target,predicted_without_scaling,predicted_with_scaling
0,0.5,10,1,1,1
1,0.6,20,0,1,1
2,0.8,30,1,1,1
3,0.9,40,1,1,1
4,0.6,50,0,0,0
5,0.88,60,0,0,1


Handling Imbalance data 

In [119]:
#create imbalanced data
X, y = make_classification(
    n_classes=2,
    weights=[0.9, 0.1],
    n_samples=1000,
    n_features=20,
    random_state=42
)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)


In [120]:
#train model(KNN) before handling 
model = KNeighborsClassifier(n_neighbors=5)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)
print("Accuracy before balancing:", accuracy_score(y_test, y_pred))


Accuracy before balancing: 0.93


In [121]:
pd.Series(y_train).value_counts()


0    803
1     97
Name: count, dtype: int64

dataset is highly imbalanced - majority class have more sample than minority class this result in high accuracy but poor representation of minority class

In [122]:
# oversampling using SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

pd.Series(y_resampled).value_counts()


1    803
0    803
Name: count, dtype: int64

applied oversampling (SMOTE) generates new synthetic samples for the minority class and balances data

In [123]:
#undersampling 
rus = RandomUnderSampler(random_state=42)
X_under, y_under = rus.fit_resample(X_train, y_train)

pd.Series(y_under).value_counts()


0    97
1    97
Name: count, dtype: int64

undersampling- address imbalance by reducing the number of samples in the majority class.

Handling categorical Variables( One Hot Encoding)

In [124]:
df_people = pd.DataFrame({
    'weight': [50, 60, 70, 80],
    'height': [150, 160, 170, 180],
    'hair_color': ['black', 'black', 'brown', 'red']
})

encoder = OneHotEncoder(sparse_output=False)
encoded = encoder.fit_transform(df_people[['hair_color']])

encoded_df = pd.DataFrame(
    encoded, columns=encoder.get_feature_names_out(['hair_color'])
)

df_people = pd.concat([df_people.drop('hair_color', axis=1), encoded_df], axis=1)
df_people


Unnamed: 0,weight,height,hair_color_black,hair_color_brown,hair_color_red
0,50,150,1.0,0.0,0.0
1,60,160,1.0,0.0,0.0
2,70,170,0.0,1.0,0.0
3,80,180,0.0,0.0,1.0


One-hot encoding - used to convert categorical variables into a numerical format that machine learning models can understand.

KNN Regression (Diabetes dataset)

In [125]:
# load datasset
diabetes = load_diabetes()
df = pd.DataFrame(diabetes.data, columns=diabetes.feature_names)
df['target'] = diabetes.target

In [126]:
import sklearn
import imblearn

print("sklearn:", sklearn.__version__)
print("imblearn:", imblearn.__version__)


sklearn: 1.2.2
imblearn: 0.10.1


In [127]:
#Train KNN Regressor
X = df.drop('target', axis=1)
y = df['target']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model = KNeighborsRegressor(n_neighbors=5)
model.fit(X_train, y_train)

In [128]:
#Regression Evaluavtion
y_pred = model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("RMSE:", rmse)


RMSE: 54.946114563433675


Assignment1 - KNN from scratch with out sklearn 

In [129]:
# KNN Classification using iris dataset 
df.dtypes

age       float64
sex       float64
bmi       float64
bp        float64
s1        float64
s2        float64
s3        float64
s4        float64
s5        float64
s6        float64
target    float64
dtype: object

iris dataset is having only numeric fields which is easy to calcualate euclidean distance.

In [130]:
def euclidean_distance(x1, x2):
    return np.sqrt(np.sum((x1 - x2) ** 2))


In [131]:
class KNNFromScratch:
    def __init__(self, k=3):
        self.k = k

    def fit(self, X_train, y_train):
        self.X_train = X_train
        self.y_train = y_train

    def predict(self, X_test):
        predictions = []
        for x in X_test:
            distances = [
                (euclidean_distance(x, self.X_train[i]), self.y_train[i])
                for i in range(len(self.X_train))
            ]
            distances.sort(key=lambda x: x[0])
            k_labels = [label for _, label in distances[:self.k]]
            predictions.append(max(set(k_labels), key=k_labels.count))
        return np.array(predictions)


In [132]:
#Train and custom KNN
knn_custom = KNNFromScratch(k=3)
knn_custom.fit(X_train.values, y_train.values)

y_pred_custom = knn_custom.predict(X_test.values)
print("Custom KNN Accuracy:", accuracy_score(y_test, y_pred_custom))


Custom KNN Accuracy: 0.011235955056179775


In [133]:
sklearn_knn = KNeighborsClassifier(n_neighbors=3)
sklearn_knn.fit(X_train, y_train)

y_pred_sklearn = sklearn_knn.predict(X_test)
sklearn_accuracy = accuracy_score(y_test, y_pred_sklearn)

print("Sklearn KNN Accuracy:", sklearn_accuracy)


Sklearn KNN Accuracy: 0.0


Assignment2 - KNN Regressor from scratch


In [134]:
# Load boston dataset
boston = fetch_openml(name="boston", version=1, as_frame=True)
df = boston.frame

df.head()


  warn(


Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


In [135]:
df.dtypes


CRIM        float64
ZN          float64
INDUS       float64
CHAS       category
NOX         float64
RM          float64
AGE         float64
DIS         float64
RAD        category
TAX         float64
PTRATIO     float64
B           float64
LSTAT       float64
MEDV        float64
dtype: object

This data set have non numeric fields. so by converting all columns to numeric fields it will be easier to calculate euclidean distance 

In [136]:
boston = fetch_openml(name="boston", version=1, as_frame=True)
df = boston.frame

# convert all columns to numeric
df = df.apply(pd.to_numeric)

X = df.drop("MEDV", axis=1)
y = df["MEDV"]


  warn(


In [137]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [138]:
#knn regressor from scratch
class KNNRegressorFromScratch:
    def __init__(self, k=5):
        self.k = k

    def fit(self, X_train, y_train):
        self.X_train = X_train
        self.y_train = y_train

    def predict(self, X_test):
        predictions = []

        for x in X_test:
            distances = [
                (euclidean_distance(x, self.X_train[i]), self.y_train[i])
                for i in range(len(self.X_train))
            ]

            distances.sort(key=lambda x: x[0])
            k_values = [value for _, value in distances[:self.k]]
            predictions.append(np.mean(k_values))

        return np.array(predictions)


In [139]:
#evaluate regressor
custom_reg = KNNRegressorFromScratch(k=5)
custom_reg.fit(X_train.values, y_train.values)

y_pred_custom = custom_reg.predict(X_test.values)
rmse_custom = np.sqrt(mean_squared_error(y_test, y_pred_custom))

print("Custom KNN Regression RMSE:", rmse_custom)


Custom KNN Regression RMSE: 5.0852851926117255
