## Task 1: k-NN occupancy classification with scikit-learn (K = 1 to 10)

In [3]:
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Read the pre-split occupancy data from the working directory.
train_data = pd.read_csv("occupancy_train.txt")
test_data = pd.read_csv("occupancy_test.txt")

# Both splits use the same three sensor columns as features and the
# "Occupancy" column as the label.
feature_cols = ["Humidity", "Light", "HumidityRatio"]
x_train, y_train = train_data[feature_cols], train_data["Occupancy"]
x_test, y_test = test_data[feature_cols], test_data["Occupancy"]

# Baseline model: 5 nearest neighbours with scikit-learn's default metric.
knnModel2 = KNeighborsClassifier(n_neighbors=5)
model2 = knnModel2.fit(x_train, y_train)
prediction2 = model2.predict(x_test)

# Accuracy on the data the model was fitted on (reported as a percentage).
print("===================== Training Accuracy =====================")
trac2 = knnModel2.score(x_train, y_train)
trainingAccKNN2 = trac2 * 100
print(trainingAccKNN2)

# Accuracy on the held-out test file (reported as a percentage).
print("===================== Testing Accuracy ======================")
teacKNN2 = accuracy_score(y_test, prediction2)
testingAccKNN2 = teacKNN2 * 100
print(testingAccKNN2)

# Sweep K = 1..10: fit a fresh classifier for each K and record its test
# accuracy in percent. acc[0] corresponds to K = 1.
acc = [
    accuracy_score(
        y_test,
        KNeighborsClassifier(n_neighbors=k).fit(x_train, y_train).predict(x_test),
    ) * 100
    for k in range(1, 11)
]

print("\nAccuracies for K = 1 to 10:")
for k, pct in enumerate(acc, start=1):
    print(f"K = {k} --> Accuracy = {pct:.2f}%")

# max() returns the first maximal entry, so ties resolve to the smallest K.
bestK, maxAcc = max(enumerate(acc, start=1), key=lambda pair: pair[1])
print("\n===================== Best Result =====================")
print(f"Highest Accuracy = {maxAcc:.2f}% at K = {bestK}")


98.91931720496132
95.53470919324579

Accuracies for K = 1 to 10:
K = 1 --> Accuracy = 94.03%
K = 2 --> Accuracy = 92.53%
K = 3 --> Accuracy = 95.72%
K = 4 --> Accuracy = 94.48%
K = 5 --> Accuracy = 95.53%
K = 6 --> Accuracy = 95.27%
K = 7 --> Accuracy = 96.32%
K = 8 --> Accuracy = 96.14%
K = 9 --> Accuracy = 96.47%
K = 10 --> Accuracy = 96.47%

Highest Accuracy = 96.47% at K = 9


## Task 2: k-NN from scratch with chi-squared distance (Iris dataset)

In [4]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.metrics import confusion_matrix, accuracy_score
from collections import Counter

def chi2_dist(a, b):
    """Return the chi-squared distance between two indexable sequences.

    For every position of ``a`` the term ``(a[i] - b[i])**2 / (a[i] + b[i])``
    is added to the total. Positions where ``a[i] + b[i]`` equals zero are
    skipped so that no division by zero occurs. ``b`` is indexed with the
    positions of ``a``.
    """
    total = 0
    for pos in range(len(a)):
        denom = a[pos] + b[pos]
        if denom == 0:
            # Nothing to contribute; also avoids dividing by zero.
            continue
        total += ((a[pos] - b[pos]) ** 2) / denom
    return total

class MyKNN:
    """Brute-force k-nearest-neighbours classifier.

    Every test sample is compared against every stored training sample with
    a distance function; the most frequent label among the ``k`` closest
    training samples is the prediction.

    Parameters
    ----------
    k : int
        Number of neighbours that vote. Must be at least 1. If ``k`` exceeds
        the number of training samples, all training samples vote.
    metric : callable, optional
        ``metric(sample, train_row) -> number``; smaller means closer.
        Defaults to ``chi2_dist`` defined in this notebook.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """

    def __init__(self, k, metric=None):
        # A non-positive k would either slice the wrong neighbours
        # (negative) or leave no voters at all (zero), so reject it early.
        if k < 1:
            raise ValueError(f"k must be a positive integer, got {k!r}")
        self.k = k
        self.metric = chi2_dist if metric is None else metric

    def fit(self, X, y):
        """Store the training samples and their labels.

        ``X`` and ``y`` may be lists, NumPy arrays or pandas objects; they
        are converted to arrays so that rows and labels are always addressed
        by position rather than by a pandas index label.

        Returns ``self`` so calls can be chained, matching the scikit-learn
        estimators used elsewhere in this notebook.

        Raises
        ------
        ValueError
            If ``X`` and ``y`` differ in length or are empty.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same length, got {len(X)} and {len(y)}"
            )
        if len(X) == 0:
            raise ValueError("cannot fit on an empty training set")
        self.Xtrain = X
        self.ytrain = y
        return self

    def predict(self, Xtest):
        """Return a list with one predicted label per row of ``Xtest``.

        Ties in the vote are resolved in favour of the label that appears
        first among the distance-sorted neighbours.

        Raises
        ------
        AttributeError
            If called before ``fit``.
        """
        if not hasattr(self, "Xtrain"):
            raise AttributeError("MyKNN.predict() called before fit()")

        preds = []
        # np.asarray makes iteration row-wise even for a pandas DataFrame,
        # whose plain iteration would yield column names instead.
        for x in np.asarray(Xtest):
            dists = [
                (self.metric(x, row), label)
                for row, label in zip(self.Xtrain, self.ytrain)
            ]
            # list.sort is stable, so equally distant samples keep their
            # training order.
            dists.sort(key=lambda t: t[0])
            k_neigh = [label for (_, label) in dists[:self.k]]
            preds.append(Counter(k_neigh).most_common(1)[0][0])
        return preds

# Iris measurements (features) and integer class ids (labels).
iris = load_iris()
X, y = iris.data, iris.target

# Reproducible 80/20 hold-out split: seed NumPy's global generator, shuffle
# the row positions, then cut the shuffled positions at the 80% mark.
np.random.seed(1)
idx = np.arange(len(X))
np.random.shuffle(idx)
split = int(0.8*len(X))
train_idx, test_idx = idx[:split], idx[split:]

X_train, y_train = X[train_idx], y[train_idx]
X_test, y_test = X[test_idx], y[test_idx]

# Classify the held-out rows with the hand-written 5-NN model.
knn = MyKNN(k=5)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

# Score the predictions against the true labels.
cm = confusion_matrix(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)

print("Accuracy:", acc*100)
print("Confusion Matrix:")
print(cm)


Accuracy: 96.66666666666667
Confusion Matrix:
[[10  0  0]
 [ 0 13  0]
 [ 0  1  6]]


## Task 3: Comparing k-NN distance metrics on the cancer patient dataset

In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# Load the patient table and discard the identifier columns;
# errors="ignore" tolerates files that do not contain them.
data = pd.read_csv("cancer patient data sets.csv")
data = data.drop(columns=["index", "Patient Id"], errors="ignore")

# Encode the "Level" target as integer class ids. le.classes_ keeps the
# original names, used below to label the confusion matrices.
y_raw = data["Level"]
le = LabelEncoder()
y = le.fit_transform(y_raw)

# Feature encoding: object/category columns with exactly two distinct
# non-null values become 0/1 (in order of first appearance); any other
# object/category column is dropped.
X = data.drop("Level", axis=1)
for c in X.columns:
    if X[c].dtype == object or X[c].dtype.name == "category":
        uniques = X[c].dropna().unique()
        if len(uniques) == 2:
            m = {uniques[0]: 0, uniques[1]: 1}
            X[c] = X[c].map(m)
        else:
            X = X.drop(columns=[c])

X = X.select_dtypes(include=[np.number])

# Split BEFORE imputing and scaling. Fitting the imputation means or the
# scaler on the full table would leak validation/test statistics into the
# preprocessing. The seed and stratification are unchanged, so each split
# contains the same rows as before.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=0)
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.3, stratify=y_train, random_state=0)

# Learn the preprocessing from the model-fitting split only, then apply the
# same column means and scaling to every other split.
train_means = X_tr.mean()
scaler = StandardScaler()
X_tr = scaler.fit_transform(X_tr.fillna(train_means))
X_val = scaler.transform(X_val.fillna(train_means))
X_test = scaler.transform(X_test.fillna(train_means))
X_train = scaler.transform(X_train.fillna(train_means))

metrics = [("euclidean", {"metric":"euclidean"}), ("manhattan", {"metric":"manhattan"}), ("minkowski_p3", {"metric":"minkowski", "p":3})]

# Pin the confusion-matrix rows/columns to every encoded class so its shape
# always matches le.classes_, even if a class is absent from a split.
class_ids = np.arange(len(le.classes_))

for name, kwargs in metrics:
    knn = KNeighborsClassifier(n_neighbors=5, **kwargs)
    knn.fit(X_tr, y_tr)
    tr_acc = accuracy_score(y_tr, knn.predict(X_tr))
    val_acc = accuracy_score(y_val, knn.predict(X_val))
    # Predict the test split once and reuse it for accuracy and the matrix.
    preds = knn.predict(X_test)
    te_acc = accuracy_score(y_test, preds)
    cm = confusion_matrix(y_test, preds, labels=class_ids)
    cm_df = pd.DataFrame(cm, index=le.classes_, columns=le.classes_)
    print(name.upper(), "Train:", f"{tr_acc:.2f}", "Val:", f"{val_acc:.2f}", "Test:", f"{te_acc:.2f}")
    print("Confusion Matrix (test):")
    print(cm_df)
    print()


EUCLIDEAN Train: 1.00 Val: 1.00 Test: 1.00
Confusion Matrix (test):
        High  Low  Medium
High      73    0       0
Low        0   61       0
Medium     0    0      66

MANHATTAN Train: 1.00 Val: 1.00 Test: 1.00
Confusion Matrix (test):
        High  Low  Medium
High      73    0       0
Low        0   61       0
Medium     0    0      66

MINKOWSKI_P3 Train: 1.00 Val: 1.00 Test: 1.00
Confusion Matrix (test):
        High  Low  Medium
High      73    0       0
Low        0   61       0
Medium     0    0      66

