<a href="https://colab.research.google.com/github/khandakerrahin/knn-with-classes/blob/main/knn_with_classes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# !pip install numpy
# pip install numpy

In [None]:
import numpy as np # linear algebra, matrix-stuff
import pandas as pd # data science 
import seaborn as sns # plotting

In [None]:
# Fetch the iris dataset through seaborn and persist it as a CSV file.
iris = sns.load_dataset("iris")
# to_csv also writes the DataFrame index, so the file gains an extra leading
# column; later cells skip it by slicing the columns from position 1.
iris.to_csv("iris.csv")

# pd.read_csv reads a csv file
df = pd.read_csv("iris.csv")
# head is a method and has to be called: printing the bare attribute shows
# the bound-method repr (and the whole frame) instead of the first rows.
print(df.head())

In [None]:
# Only the last expression of a cell is displayed, so this bare `df` shows nothing.
df
# All rows, columns at positions 1 to 4 (the stop position 5 is excluded), as a numpy array.
df.iloc[:, 1:5].to_numpy()

In [None]:

# select rows and columns by index
# X: all rows and all columns except the first and last one
# (column 0 is presumably the index written by to_csv and the last column
# the species label -- verify against df.columns)
X = df.iloc[:, 1:-1].to_numpy()
# Y: all rows and just the last column
Y = df.iloc[:, -1].to_numpy()

In [None]:
# side note: "Tuple unpacking"
# instead of
x = 3
y = 4
# we can do
x, y = 3, 4
# and it's the same as
x, y = (3, 4)
# and also
(x, y) = (3, 4)
# it works for any sequence and not just tuples
x, y = [1, 2]
# the unpacked items can themselves be sequences
x, y = ([1, 2], 3)  # the first item is the list [1, 2], the second is 3
# after this line:
# x = [1, 2]
# y = 3

# we can also do this for a function
def mult(a, b):
    """Return the product of ``a`` and ``b`` (whatever ``a * b`` evaluates to)."""
    product = a * b
    return product


# `*sequence` spreads the items of a sequence over the positional parameters
print(mult(*[10, 3]))
params = [10, 3]
print(mult(*params))
# both calls above are equivalent to mult(10, 3)

# we can also do dict (named) unpacking
# `**mapping` passes each key/value pair as a keyword argument
params = {'a': 10, 'b': 3}
print(mult(**params))
# equivalent to mult(a=10, b=3)

In [None]:
# One index per row of X, put into random order in place.
indexes = np.arange(0, X.shape[0])
np.random.shuffle(indexes)

# 70% for training and 30% for testing
break_point = int(0.7 * len(indexes))
train_indexes = indexes[:break_point]  # 0 to break_point, without break_point
test_indexes = indexes[break_point:]  # break_point to the end

# index X and Y with the same shuffled positions so rows and labels stay aligned
X_train, Y_train = X[train_indexes], Y[train_indexes]
X_test, Y_test = X[test_indexes], Y[test_indexes]

print(X_train.shape, Y_train.shape)
print(X_test.shape, Y_test.shape)

(105, 4) (105,)
(45, 4) (45,)


In [None]:
class Element:
    """A single iris sample.

    Stores the four measurements (sepal length/width, petal length/width)
    together with two optional attributes: an ``id`` and the ``species`` label.
    """

    def __init__(
        self, sepal_length, sepal_width, petal_length, petal_width, id=None, species=None
    ):
        self.sepal_length = sepal_length
        self.sepal_width = sepal_width
        self.petal_length = petal_length
        self.petal_width = petal_width
        # optional attributes
        self.id = id
        self.species = species

    def __repr__(self):
        """Return a one-line description of the element.

        A missing species is shown as "Not known" and a missing id as "No id".
        """
        # compare against None explicitly: an id of 0 is a valid id
        species = "Not known" if self.species is None else self.species
        id_ = "No id" if self.id is None else self.id

        # use the local fallbacks, not the raw attributes, so None never leaks out
        return (
            f"Element {id_} (SL={self.sepal_length}, "
            f"SW={self.sepal_width}, PL={self.petal_length}, "
            f"PW={self.petal_width}, species={species})"
        )

In [None]:
# first way to do it: walk over the row positions and index both arrays
X_train_objs = []
# X_train = matrix of shape [105, 4]
# len(X_train) is 105, the same value as X_train.shape[0]

for i in range(len(X_train)):
    x = X_train[i]
    y = Y_train[i]
    # x[0] is the sepal length, x[1] the sepal width,
    # x[2] the petal length and x[3] the petal width
    element = Element(
        sepal_length=x[0], sepal_width=x[1], petal_length=x[2], petal_width=x[3], species=y, id=i
    )
    X_train_objs.append(element)
print(X_train_objs[:5], len(X_train_objs))

[Element 0 (SL=5.8, SW=4.0, PL=1.2, PW=0.2, species=setosa), Element 1 (SL=6.1, SW=3.0, PL=4.6, PW=1.4, species=versicolor), Element 2 (SL=5.2, SW=3.5, PL=1.5, PW=0.2, species=setosa), Element 3 (SL=5.5, SW=2.3, PL=4.0, PW=1.3, species=versicolor), Element 4 (SL=6.8, SW=3.2, PL=5.9, PW=2.3, species=virginica)] 105


In [None]:
# doing it with enumerate and zip
# enumerate "enumerates" a sequence of elements, returning at each step
# both the index and the element itself in the (index, element) format
# e.g. (0, seq[0]), (1, seq[1]), (2, seq[2]) ....

# for index, value in enumerate(sequence):
#     pass

# it's the same as doing:
# for i in range(len(seq)):
#     value = seq[i]
#     print(i, value)

# zip merges together 2 or more sequences and iterates over them all at the same time
# it always stops when the shortest sequence is exhausted
# for value1, value2 in zip(seq1, seq2):
#     pass

# it's the same as doing
# for i in range(min(len(seq1), len(seq2))):
#     value1 = seq1[i]
#     value2 = seq2[i]

X_train_objs = []
# enumerate yields (i, pair) and each pair coming from zip is unpacked into (x, y)
for i, (x, y) in enumerate(zip(X_train, Y_train)):
    element = Element(
        sepal_length=x[0], sepal_width=x[1], petal_length=x[2], petal_width=x[3], species=y, id=i
    )
    X_train_objs.append(element)

print(X_train_objs[:5], len(X_train_objs))

[Element 0 (SL=5.8, SW=4.0, PL=1.2, PW=0.2, species=setosa), Element 1 (SL=6.1, SW=3.0, PL=4.6, PW=1.4, species=versicolor), Element 2 (SL=5.2, SW=3.5, PL=1.5, PW=0.2, species=setosa), Element 3 (SL=5.5, SW=2.3, PL=4.0, PW=1.3, species=versicolor), Element 4 (SL=6.8, SW=3.2, PL=5.9, PW=2.3, species=virginica)] 105


In [None]:
# collect the names of the columns (skipping the first and the last column,
# the same slice that was used to build X)
attr_names = list(df.columns[1:-1])
X_train_objs = []
for i, (x, y) in enumerate(zip(X_train, Y_train)):

    # dict comprehension, the compact form of:
    # params = {}
    # for name, value in zip(attr_names, x):
    #     params[name] = value
    params = {name: value for name, value in zip(attr_names, x)}
    # params = {'sepal_length': 0.5, ...}
    # the keys have to match the parameter names of Element.__init__
    # because they are passed as keyword arguments
    element = Element(
        **params, species=y, id=i
    )
    X_train_objs.append(element)

print(X_train_objs[:5], len(X_train_objs))

[Element 0 (SL=5.8, SW=4.0, PL=1.2, PW=0.2, species=setosa), Element 1 (SL=6.1, SW=3.0, PL=4.6, PW=1.4, species=versicolor), Element 2 (SL=5.2, SW=3.5, PL=1.5, PW=0.2, species=setosa), Element 3 (SL=5.5, SW=2.3, PL=4.0, PW=1.3, species=versicolor), Element 4 (SL=6.8, SW=3.2, PL=5.9, PW=2.3, species=virginica)] 105


In [None]:
# Wrap every test row in an Element; no species is passed, so it stays None.
X_test_objs = []
for i, x in enumerate(X_test):
    # positional unpacking: the four values of the row fill sepal_length,
    # sepal_width, petal_length and petal_width, in that order
    X_test_objs.append(Element(*x, id=i))

print(X_test_objs[:5], len(X_test_objs))

[Element 0 (SL=6.7, SW=3.3, PL=5.7, PW=2.1, species=None), Element 1 (SL=6.5, SW=3.0, PL=5.8, PW=2.2, species=None), Element 2 (SL=6.9, SW=3.2, PL=5.7, PW=2.3, species=None), Element 3 (SL=5.0, SW=2.3, PL=3.3, PW=1.0, species=None), Element 4 (SL=6.3, SW=2.8, PL=5.1, PW=1.5, species=None)] 45


In [None]:
import math

class Element:
    """A single iris sample that can measure its distance to another sample.

    Stores the four measurements (sepal length/width, petal length/width)
    together with two optional attributes: an ``id`` and the ``species`` label.
    """

    def __init__(
        self, sepal_length, sepal_width, petal_length, petal_width, id=None, species=None
    ) -> None:
        self.sepal_length = sepal_length
        self.sepal_width = sepal_width
        self.petal_length = petal_length
        self.petal_width = petal_width
        # optional attributes
        self.id = id
        self.species = species

    def __repr__(self):
        """Return a one-line description of the element.

        A missing species is shown as "Not known" and a missing id as "No id".
        """
        # compare against None explicitly: an id of 0 is a valid id
        species = "Not known" if self.species is None else self.species
        id_ = "No id" if self.id is None else self.id

        # use the local fallbacks, not the raw attributes, so None never leaks out
        return (
            f"Element {id_} (SL={self.sepal_length}, "
            f"SW={self.sepal_width}, PL={self.petal_length}, "
            f"PW={self.petal_width}, species={species})"
        )

    def distance(self, other):
        """Return the euclidean distance between this element and ``other``.

        Only the four measurements are used; ``id`` and ``species`` are ignored.
        """
        # squared difference of every feature
        dist_sl = (self.sepal_length - other.sepal_length) ** 2
        dist_sw = (self.sepal_width - other.sepal_width) ** 2
        dist_pl = (self.petal_length - other.petal_length) ** 2
        dist_pw = (self.petal_width - other.petal_width) ** 2
        # sum(sequence)
        # sequence[0] + sequence[1] + sequence[2] ...
        return math.sqrt(sum([dist_sl, dist_sw, dist_pl, dist_pw]))

# The objects are built again in this cell so that they are instances of the
# Element class defined just above (the one that has the distance method).

# collect the names of the columns
attr_names = list(df.columns[1:-1])

X_train_objs = []
for i, (x, y) in enumerate(zip(X_train, Y_train)):
    # keyword arguments for Element, one per feature column
    params = {name: value for name, value in zip(attr_names, x)}
    element = Element(
        **params, species=y, id=i
    )
    X_train_objs.append(element)

print(X_train_objs[:5], len(X_train_objs))

X_test_objs = []
# y is unpacked but not passed to Element, so the test elements carry no species
for i, (x, y) in enumerate(zip(X_test, Y_test)):
    element = Element(
        sepal_length=x[0], sepal_width=x[1], petal_length=x[2], petal_width=x[3], id=i
    )
    X_test_objs.append(element)

print(X_test_objs[:5], len(X_test_objs))

[Element 0 (SL=5.8, SW=4.0, PL=1.2, PW=0.2, species=setosa), Element 1 (SL=6.1, SW=3.0, PL=4.6, PW=1.4, species=versicolor), Element 2 (SL=5.2, SW=3.5, PL=1.5, PW=0.2, species=setosa), Element 3 (SL=5.5, SW=2.3, PL=4.0, PW=1.3, species=versicolor), Element 4 (SL=6.8, SW=3.2, PL=5.9, PW=2.3, species=virginica)] 105
[Element 0 (SL=6.7, SW=3.3, PL=5.7, PW=2.1, species=None), Element 1 (SL=6.5, SW=3.0, PL=5.8, PW=2.2, species=None), Element 2 (SL=6.9, SW=3.2, PL=5.7, PW=2.3, species=None), Element 3 (SL=5.0, SW=2.3, PL=3.3, PW=1.0, species=None), Element 4 (SL=6.3, SW=2.8, PL=5.1, PW=1.5, species=None)] 45


In [None]:
# distance of the first training element to itself, to the second training
# element and to the first test element
print(X_train_objs[0].distance(X_train_objs[0]))
print(X_train_objs[0].distance(X_train_objs[1]))
print(X_train_objs[0].distance(X_test_objs[0]))

0.0
3.7536648758246915
5.015974481593781


In [None]:
import statistics

# let's go back and normalize the features
# gather every feature into its own list, one value per training element
sepal_lengths = [obj.sepal_length for obj in X_train_objs]
sepal_widths = [obj.sepal_width for obj in X_train_objs]
petal_lengths = [obj.petal_length for obj in X_train_objs]
petal_widths = [obj.petal_width for obj in X_train_objs]

# mean and standard deviation (statistics.stdev) of every feature,
# computed on the training elements only
SL_mean = statistics.mean(sepal_lengths)
SL_std = statistics.stdev(sepal_lengths)
SW_mean = statistics.mean(sepal_widths)
SW_std = statistics.stdev(sepal_widths)
PL_mean = statistics.mean(petal_lengths)
PL_std = statistics.stdev(petal_lengths)
PW_mean = statistics.mean(petal_widths)
PW_std = statistics.stdev(petal_widths)

# one "mean std" line per feature
for feature_mean, feature_std in (
    (SL_mean, SL_std),
    (SW_mean, SW_std),
    (PL_mean, PL_std),
    (PW_mean, PW_std),
):
    print(feature_mean, feature_std)

5.828571428571428 0.8420115892810006
3.085714285714286 0.417508307478051
3.6714285714285713 1.8034536464642097
1.1666666666666667 0.7735615268984335


In [None]:
# (x - mean) / std, applied to every feature of every element.
# Training and test elements both use the same SL/SW/PL/PW statistics.
# NOTE: the elements are modified in place, so running this cell a second
# time would normalize values that are already normalized.
for group in (X_train_objs, X_test_objs):  # training set first, then test set
    for x in group:
        x.sepal_length = (x.sepal_length - SL_mean) / SL_std
        x.sepal_width = (x.sepal_width - SW_mean) / SW_std
        x.petal_length = (x.petal_length - PL_mean) / PL_std
        x.petal_width = (x.petal_width - PW_mean) / PW_std

In [None]:
# the same three distances as before, now measured on the normalized features
print(X_train_objs[0].distance(X_train_objs[0]))
print(X_train_objs[0].distance(X_train_objs[1]))
print(X_train_objs[0].distance(X_test_objs[0]))

0.0
6.180952693684729
5.452852963892606


In [None]:
def compute_knn(k, x_test, X_train):
    """Return the ``k`` nearest training elements of ``x_test``.

    Each element of ``X_train`` must be accepted by ``x_test.distance`` and
    expose a ``species`` attribute. The result is a list of
    ``(distance, species)`` tuples ordered from the closest to the farthest;
    it holds fewer than ``k`` entries when ``X_train`` is shorter than ``k``.
    """
    # neighbours = [(0.6, 'setosa'), (1.5, 'versicolor') ...]
    neighbours = [(x_test.distance(candidate), candidate.species) for candidate in X_train]

    # sort key written as a named function;
    # the lambda form would be: lambda pair: pair[0]
    def by_distance(pair):
        return pair[0]

    # order by the distance only, so ties keep their original relative order
    neighbours.sort(key=by_distance)
    return neighbours[:k]

In [None]:
# the 3 nearest training elements of the first test element, as (distance, species)
print(compute_knn(3, X_test_objs[0], X_train_objs))

[(0.6815360607856585, 'virginica'), (0.796862177424332, 'virginica'), (0.8795704491186667, 'virginica')]


In [None]:
class KNN:
    """Nearest-neighbour lookup over a fixed collection of training elements."""

    def __init__(self, X):
        # the training elements; each one needs a `species` attribute
        self.X = X

    def compute_knn(self, x_test, k):
        """Return the ``k`` closest training elements of ``x_test``.

        The result is a list of ``(distance, species)`` tuples ordered from
        the closest to the farthest.
        """
        scored = [(x_test.distance(neighbour), neighbour.species) for neighbour in self.X]
        # order by the distance only, so ties keep their original relative order
        scored.sort(key=lambda pair: pair[0])
        return scored[:k]

# the model simply keeps the training elements it is given
knn = KNN(X_train_objs)
# a second, independent model could hold a different set of elements, e.g.
# knn2 = KNN(X_train_objs[:10])

In [None]:
# the explicit loop ...
# preds = []
# for x in X_test_objs:
#     pred = knn.compute_knn(x, 3)
#     preds.append(pred)

# ... and the same thing written as a list comprehension
# [what_i_want_to_append FOR_LOOP IF_CLAUSE]  (the IF_CLAUSE is optional)
preds = [knn.compute_knn(x, 3) for x in X_test_objs]
# every entry holds the 3 nearest (distance, species) pairs of one test element
print(preds[0])
print(preds[1])
print(preds[10])

[(0.6815360607856585, 'virginica'), (0.796862177424332, 'virginica'), (0.8795704491186667, 'virginica')]
[(0.37621191266793286, 'virginica'), (0.3817573249780868, 'virginica'), (0.6747859387993344, 'virginica')]
[(0.5289073647273247, 'setosa'), (0.7347551824590665, 'setosa'), (0.7643500146979169, 'setosa')]


**Full Code**

In [None]:
import math
import seaborn
import numpy as np
import pandas as pd

class Element:
    """A single iris sample that can measure its distance to another sample.

    Stores the four measurements (sepal length/width, petal length/width)
    together with two optional attributes: an ``id`` and the ``species`` label.
    """

    def __init__(
        self, sepal_length, sepal_width, petal_length, petal_width, id=None, species=None
    ) -> None:
        self.sepal_length = sepal_length
        self.sepal_width = sepal_width
        self.petal_length = petal_length
        self.petal_width = petal_width
        # optional attributes
        self.id = id
        self.species = species

    def __repr__(self):
        """Return a one-line description of the element.

        A missing species is shown as "Not known" and a missing id as "No id".
        """
        # compare against None explicitly: an id of 0 is a valid id
        species = "Not known" if self.species is None else self.species
        id_ = "No id" if self.id is None else self.id

        # use the local fallbacks, not the raw attributes, so None never leaks out
        return (
            f"Element {id_} (SL={self.sepal_length}, "
            f"SW={self.sepal_width}, PL={self.petal_length}, "
            f"PW={self.petal_width}, species={species})"
        )

    def distance(self, other):
        """Return the euclidean distance between this element and ``other``.

        Only the four measurements are used; ``id`` and ``species`` are ignored.
        """
        # squared difference of every feature
        dist_sl = (self.sepal_length - other.sepal_length) ** 2
        dist_sw = (self.sepal_width - other.sepal_width) ** 2
        dist_pl = (self.petal_length - other.petal_length) ** 2
        dist_pw = (self.petal_width - other.petal_width) ** 2
        return math.sqrt(sum([dist_sl, dist_sw, dist_pl, dist_pw]))

class KNN:
    """Nearest-neighbour lookup over a fixed collection of training elements."""

    def __init__(self, X):
        # the training elements; each one needs a `species` attribute
        self.X = X

    def compute_knn(self, x_test, k):
        """Return the ``k`` closest training elements of ``x_test``.

        The result is a list of ``(distance, species)`` tuples ordered from
        the closest to the farthest.
        """
        ranked = [(x_test.distance(item), item.species) for item in self.X]
        # order by the distance only, so ties keep their original relative order
        ranked.sort(key=lambda entry: entry[0])
        return ranked[:k]

    
# This cell imports the package as `seaborn` (it defines no `sns` alias),
# so the full name is used to keep the cell runnable on its own.
df = seaborn.load_dataset("iris")

# select rows and columns by index
# the frame is used directly here (no CSV round trip), so there is no index
# column to skip: every column except the last one is a feature
X = df.iloc[:, :-1].to_numpy()
# select all rows and just the last column
Y = df.iloc[:, -1].to_numpy()

# shuffle the row positions and split them 70% / 30%
indexes = np.arange(0, X.shape[0])
np.random.shuffle(indexes)
break_point = int(0.7 * len(indexes))
X_train, Y_train = X[indexes[:break_point]], Y[indexes[:break_point]]
X_test, Y_test = X[indexes[break_point:]], Y[indexes[break_point:]]

# compute the per-feature mean and std using X_train only and normalize both
# sets with them, so this cell does not depend on values left over from other
# cells; ddof=1 gives the sample standard deviation, like statistics.stdev
feature_mean = X_train.mean(axis=0)
feature_std = X_train.std(axis=0, ddof=1)
X_train = (X_train - feature_mean) / feature_std
X_test = (X_test - feature_mean) / feature_std

# collect the names of the columns (same slice as X, so names and values line up)
attr_names = list(df.columns[:-1])

# load data into objects
X_train_objs = []
for i, (x, y) in enumerate(zip(X_train, Y_train)):
    params = {name: value for name, value in zip(attr_names, x)}
    element = Element(**params, species=y, id=i)
    X_train_objs.append(element)

# test elements get no species: that is what the model has to find out
X_test_objs = []
for i, x in enumerate(X_test):
    params = {name: value for name, value in zip(attr_names, x)}
    element = Element(**params, id=i)
    X_test_objs.append(element)

knn = KNN(X_train_objs)
preds = [knn.compute_knn(x, 3) for x in X_test_objs]

print(preds[0])
print(preds[6])
print(preds[10])

[(0.3563179823677913, 'versicolor'), (0.43015031535592885, 'versicolor'), (0.4969308476931999, 'versicolor')]
[(0.6714711836679153, 'setosa'), (0.6867958241781682, 'setosa'), (0.697253597312322, 'setosa')]
[(0.46271056380990944, 'virginica'), (0.6356987007104837, 'virginica'), (0.7141194839506259, 'virginica')]


**Snippets**

In [None]:
# loading data
import pandas as pd

# pass the name of the file
df = pd.read_csv('iris.csv') # use header=None if you don't have a header for your csv file

df.columns # gives you access to the column names (including a dummy for the index)
print(df.iloc[:, 1]) # selects using numerical indexes
print(df.loc[:, ['sepal_length']]) # selects using the names

# what you usually want to do to move stuff to numpy
# positional row/column slicing has to go through .iloc;
# df[:, 1:-1] is not a valid way to index a DataFrame
X, Y = df.iloc[:, 1:-1].to_numpy(), df.iloc[:, -1].to_numpy()

In [None]:
# splitting data
import numpy as np

indexes = np.arange(0, X.shape[0])
# shuffle the positions first: without it the "split" just cuts the rows in
# their original order instead of sampling them at random
np.random.shuffle(indexes)
percent = 0.7  # fraction of the rows that goes to the training set
partition = int(percent * len(indexes))
X_train, X_test = X[indexes[:partition]], X[indexes[partition:]]

Numpy tutorials:
- https://www.w3schools.com/python/numpy/default.asp
- https://cs231n.github.io/python-numpy-tutorial/#numpy
- https://numpy.org/doc/stable/user/quickstart.html

Exercise:
- modify the full code so that we can compute the mean and std without needing to reassign stuff (use numpy to compute the mean and std and just normalize directly in the numpy data before assigning)
- do a general cleanup of the code and improve it where you think it's needed
- create another method for the KNN class that applies self.compute_knn to all test data points and computes the accuracy of the predictions
- test different values of k
- re-write the whole code without looking. The idea is not to reproduce what is already here, but to solve the same problem in another way