In [1]:
import numpy as np
import pandas as pd
from tabulate import tabulate
from sklearn import preprocessing
from sklearn.datasets import load_iris
from itertools import islice
from collections import Counter

In [15]:
iris = load_iris()
df = pd.DataFrame(data = iris.data, columns=iris.feature_names)
# Create a dictionary that maps numerical labels to target names
target_mapping = {i: name for i, name in enumerate(iris.target_names)}

# Apply the mapping to the target data to create a Series with target names
df_target = pd.Series(data=iris.target, name='target').map(target_mapping)
# Combine the features and target into one DataFrame
df = pd.concat([df, df_target], axis=1)

# Display the first few rows of the DataFrame
print(tabulate(df, headers = 'keys'))

       sepal length (cm)    sepal width (cm)    petal length (cm)    petal width (cm)  target
---  -------------------  ------------------  -------------------  ------------------  ----------
  0                  5.1                 3.5                  1.4                 0.2  setosa
  1                  4.9                 3                    1.4                 0.2  setosa
  2                  4.7                 3.2                  1.3                 0.2  setosa
  3                  4.6                 3.1                  1.5                 0.2  setosa
  4                  5                   3.6                  1.4                 0.2  setosa
  5                  5.4                 3.9                  1.7                 0.4  setosa
  6                  4.6                 3.4                  1.4                 0.3  setosa
  7                  5                   3.4                  1.5                 0.2  setosa
  8                  4.4                 2.9            

In [16]:
print(df.shape)
df = df.drop('target', axis=1)

(150, 5)


## Check for multicollinearity

In [17]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# VIF dataframe 
vif_data = pd.DataFrame() 
vif_data["feature"] = df.columns 
  
# calculating VIF for each feature 
vif_data["VIF"] = [variance_inflation_factor(df.values, i) 
                          for i in range(len(df.columns))]

print(vif_data)

             feature         VIF
0  sepal length (cm)  262.969348
1   sepal width (cm)   96.353292
2  petal length (cm)  172.960962
3   petal width (cm)   55.502060


In [18]:
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

scaler = preprocessing.StandardScaler()
df = pd.DataFrame(data = scaler.fit_transform(df), columns=iris.feature_names)


In [19]:
# Identify rows where any feature value is greater than 2
outliers = df[(abs(df.iloc[:, :-1]) > 2).any(axis=1)]
print(tabulate(outliers, headers='keys'))


       sepal length (cm)    sepal width (cm)    petal length (cm)    petal width (cm)
---  -------------------  ------------------  -------------------  ------------------
 14           -0.0525061            2.16999             -1.4539             -1.31544
 15           -0.173674             3.09078             -1.28339            -1.05218
 32           -0.779513             2.40018             -1.28339            -1.44708
 33           -0.41601              2.63038             -1.34023            -1.31544
 60           -1.02185             -2.43395             -0.146641           -0.262387
105            2.12852             -0.131979             1.61532             1.18557
117            2.24968              1.70959              1.67216             1.3172
118            2.24968             -1.05277              1.78583             1.44883
122            2.24968             -0.592373             1.67216             1.05394
131            2.49202              1.70959              1.5016

In [20]:
X_train, X_test, y_train, y_test = train_test_split(df, df_target, test_size=0.2, random_state=43)
print(tabulate(X_train.head(10), headers='keys'))
print(tabulate(X_test.head(10), headers='keys'))

       sepal length (cm)    sepal width (cm)    petal length (cm)    petal width (cm)
---  -------------------  ------------------  -------------------  ------------------
 96            -0.173674           -0.362176             0.251221            0.13251
 19            -0.900681            1.70959             -1.28339            -1.18381
 93            -1.02185            -1.74336             -0.260315           -0.262387
 98            -0.900681           -1.28296             -0.430828           -0.130755
108             1.038              -1.28296              1.16062             0.790671
103             0.553333           -0.362176             1.04695             0.790671
 50             1.40151             0.328414             0.535409            0.264142
120             1.28034             0.328414             1.10378             1.44883
 62             0.18983            -1.97355              0.137547           -0.262387
 66            -0.294842           -0.131979             

In [21]:
def max_freq(lst):
    freq = {}

    for a in lst:
        freq[a] = lst.count(a)

    freq = dict(sorted(freq.items(), key=lambda x:x[1], reverse=True))
    result = max(freq.values())

    counter = 0
    for a in freq.values():
        if a == result:
            counter += 1
    
    if counter > 1:
        multiple = 1    # 1 meaning there are ties
    else:
        multiple = 0    # 0 meaning there are no ties
    return multiple, freq

In [22]:
lst = ['a', 'b', 'a', 'b', 'b']
max_freq(lst)

(0, {'b': 3, 'a': 2})

In [28]:
def KNN(X_train, x_test, k):

    b = np.array(x_test)
    distance = {}
    r = {}

    for ind in list(X_train.index):
        a = np.array(X_train.loc[ind])
        norm_2 = np.linalg.norm(a - b)
        distance[ind] = norm_2

    sorted_distance = dict(sorted(distance.items(), key=lambda x:x[1]))
    knn = dict(islice(sorted_distance.items(), k))

    for key in knn.keys():
        r[key] = y_train[key]

    # print(knn)
    # print(r)
    
    multiple, freq_dic = max_freq(list(r.values()))
    
    if multiple == 0:
        result = dict(islice(freq_dic.items(), 1))
        
    elif multiple == 1:
        tie_r = {}
        for a in r.values():
            tie_r[a] = knn.values()
        
    if result:
        print(f'no tie, winner is {list(result.keys())[0]} with {list(result.values())[0]} apperances.')
        return list(result.keys())[0]
    else:
        print('tie:', tie_r)
        return tie_r[0]

In [None]:
y_pred = []
for a in range(len(y_test)):
    y_pred.append(KNN(X_train=X_train, x_test=X_test.iloc[a], k=5))

no tie, winner is setosa with 5 apperances.
no tie, winner is setosa with 5 apperances.
no tie, winner is versicolor with 3 apperances.
no tie, winner is versicolor with 5 apperances.
no tie, winner is virginica with 5 apperances.
no tie, winner is setosa with 5 apperances.
no tie, winner is virginica with 4 apperances.
no tie, winner is versicolor with 5 apperances.
no tie, winner is versicolor with 5 apperances.
no tie, winner is versicolor with 5 apperances.
no tie, winner is setosa with 5 apperances.
no tie, winner is versicolor with 5 apperances.
no tie, winner is virginica with 5 apperances.
no tie, winner is setosa with 5 apperances.
no tie, winner is versicolor with 5 apperances.
no tie, winner is versicolor with 5 apperances.
no tie, winner is setosa with 5 apperances.
no tie, winner is setosa with 5 apperances.
no tie, winner is virginica with 5 apperances.
no tie, winner is virginica with 5 apperances.
no tie, winner is setosa with 5 apperances.
no tie, winner is setosa with

['setosa',
 'setosa',
 'versicolor',
 'versicolor',
 'virginica',
 'setosa',
 'virginica',
 'versicolor',
 'versicolor',
 'versicolor',
 'setosa',
 'versicolor',
 'virginica',
 'setosa',
 'versicolor',
 'versicolor',
 'setosa',
 'setosa',
 'virginica',
 'virginica',
 'setosa',
 'setosa',
 'setosa',
 'versicolor',
 'virginica',
 'virginica',
 'setosa',
 'versicolor',
 'setosa',
 'setosa']

In [31]:
comparison = pd.DataFrame({'y_pred': y_pred,
                           'y_test': y_test})
comparison


Unnamed: 0,y_pred,y_test
30,setosa,setosa
0,setosa,setosa
138,versicolor,virginica
67,versicolor,versicolor
105,virginica,virginica
39,setosa,setosa
113,virginica,virginica
71,versicolor,versicolor
81,versicolor,versicolor
57,versicolor,versicolor


In [34]:
from sklearn.metrics import f1_score

print(f'F1 score of the KNN model is {f1_score(y_pred, y_test, average="macro")}')

F1 score of the KNN model is 0.9212962962962963
