# **Iterative Functions: Practise & Application**

In [364]:
# Import libraries

import pandas as pd
import statistics as stats
import numpy as np

### **Introduction to Iteration**

- **Iterator:** an object that represent a stream of data, with a strict protocol being an iterator object, having a next feature, and being able to stop the iterative process

- **Iterable:** an object that is capable of being looped through one element at a time

In [365]:
# Starting off with iterables

iterables = [
    list([1, 2, 2]),
    dict(one=1, two=2, three=3),
    range(5),
    set([1, 2, 3]),
    tuple((4, 5, 6, 8)),
    str("Quant"),
    np.array([0,1,1,1,1,1])
]

for iterable in iterables:
    print("Type:", type(iterable).__name__, " |  Instance:", iterable, " |  Iter object:", iter(iterable))

Type: list  |  Instance: [1, 2, 2]  |  Iter object: <list_iterator object at 0x3432be440>
Type: dict  |  Instance: {'one': 1, 'two': 2, 'three': 3}  |  Iter object: <dict_keyiterator object at 0x145634270>
Type: range  |  Instance: range(0, 5)  |  Iter object: <range_iterator object at 0x3432be3a0>
Type: set  |  Instance: {1, 2, 3}  |  Iter object: <set_iterator object at 0x3116c3c00>
Type: tuple  |  Instance: (4, 5, 6, 8)  |  Iter object: <tuple_iterator object at 0x3432be440>
Type: str  |  Instance: Quant  |  Iter object: <str_ascii_iterator object at 0x3432be3b0>
Type: ndarray  |  Instance: [0 1 1 1 1 1]  |  Iter object: <iterator object at 0x3432be440>


In [366]:
# Let's start off by getting familiar with lists first

lst = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]

print("THE MAIN LIST: ", lst, "\n")

print("Indexed to zero: ", lst[0], "\n")

print("Grab the last element: ", lst[-1], "\n")

print("Go in reverse (4th to last element): ", lst[-4])
print("Go in reverse (2nd to last element): ", lst[-2], "\n")

print("Reverse a list: ", lst[::-1], "\n")

lst_c = lst[::]
print("Copy of a list: ", lst_c, "\n")

print("Slicing by skipping one element: ", lst[::2])
print("Slicing by skipping two elements: ", lst[::3], "\n")

THE MAIN LIST:  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21] 

Indexed to zero:  1 

Grab the last element:  21 

Go in reverse (4th to last element):  18
Go in reverse (2nd to last element):  20 

Reverse a list:  [21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1] 

Copy of a list:  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21] 

Slicing by skipping one element:  [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21]
Slicing by skipping two elements:  [1, 4, 7, 10, 13, 16, 19] 



In [367]:
# List slicing

print("THE MAIN LIST: ", lst, "\n")

start_idx = 3 # Inclusive
stop_idx = 4 # Exclusive
print("List slicing by index (one el): ", lst[start_idx : stop_idx], "\n")

start_idx = 15 # Inclusive
stop_idx = 15 # Exclusive
print("List slicing by index (no el): ", lst[start_idx : stop_idx], "\n")

start_idx = 10 # Inclusive
stop_idx = 15 # Exclusive
print("List slicing by index (multiple el): ", lst[start_idx : stop_idx], "\n")

start_idx = 18 # Inclusive
stop_idx = 15 # Exclusive
print("List slicing by index (no el): ", lst[start_idx : stop_idx], "\n")

mid_idx = len(lst) // 2 # Get the middle index of the list
left_half = lst[:mid_idx] 
right_half = lst[mid_idx:]
print(f"Floor operator //: \n" 
      f"List length: {len(lst)} \n" 
      f"Middle Element Index: {len(lst) // 2}")
print("\n")
print("List slicing by index (left side): ", left_half)
print("List slicing by index (right side): ", right_half)
print("\n")
print("Do the list slices (halves) concat accurately?: ",  "Yes" if ((left_half + right_half) == lst) else "No")

THE MAIN LIST:  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21] 

List slicing by index (one el):  [4] 

List slicing by index (no el):  [] 

List slicing by index (multiple el):  [11, 12, 13, 14, 15] 

List slicing by index (no el):  [] 

Floor operator //: 
List length: 21 
Middle Element Index: 10


List slicing by index (left side):  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
List slicing by index (right side):  [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21]


Do the list slices (halves) concat accurately?:  Yes


In [368]:
# Moving on to list iteration

print("THE MAIN LIST: ", lst, "\n")

print("Iteration by element")
for el in lst:
    print(el)

print("\n")
print("Iteration by index")
for i in range(0, len(lst)): # Inclusive of 0, exclusive of len(lst) which is 21 but last index is 20 
    print(lst[i])

print("\n")
print("Common list counting, product, summation technique")
factorial = 1
for i in range(1, 10): # Inclusive of 1, exclusive of 10
    factorial *= i

print("9!: ", factorial)

THE MAIN LIST:  [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21] 

Iteration by element
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21


Iteration by index
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21


Common list counting, product, summation technique
9!:  362880


In [369]:
# Dictionaries (iterable methods)

cool_dict = {"MSFT": 495.94, 
             "AAPL": 201.08,
             "MU": 124.75,
             "DECK": 104.20}

print("THE MAIN DICTIONARY: ", cool_dict, "\n")

# .items() method
print(".items() method")
for k, v in cool_dict.items():
    print(k, cool_dict[k])

# .values() method
print("\n.values() method")
for v in cool_dict.values():
    print(v)

print("\n.keys() method")
# .keys() method
for k in cool_dict.keys():
    print(k)

THE MAIN DICTIONARY:  {'MSFT': 495.94, 'AAPL': 201.08, 'MU': 124.75, 'DECK': 104.2} 

.items() method
MSFT 495.94
AAPL 201.08
MU 124.75
DECK 104.2

.values() method
495.94
201.08
124.75
104.2

.keys() method
MSFT
AAPL
MU
DECK


### **itertools Library**

- A built in library to Python, used for more advanced iterative techniques

In [370]:
import itertools

### **Bubble Sort Algorithm**

- A basic algorithm that sorts a list in O(n * n) time; where n is the size of our input variable (in this case the length of the list)

- Iterates through a list and compares pairings of adjacent elements, it makes a swap when the right element is larger than the left element (ie it would swap if left=5 and right=3)

- If the list is in descending order (completely unsorted), then after the first outer iteration completes the first element will be sorted (the largest element that was at the beginning to start will now be at the end), but the rest of the list will still be unsorted

- This algorithm will be used later on in a KNN implementation

- Bubble sort is an in-place algorithm

In [371]:
import random

In [372]:
# Generate a random list of integers

random.seed(42)

rand_lst = [random.randint(1, 100) for _ in range(20)]
print(rand_lst)

[82, 15, 4, 95, 36, 32, 29, 18, 95, 14, 87, 95, 70, 12, 76, 55, 5, 4, 12, 28]


In [373]:
# Bubble sort...
def bubble_sort(unsorted_lst):
    for i in range(len(unsorted_lst)):
        for idx in range(len(unsorted_lst) - 1 - i): # Micro optimization
            if unsorted_lst[idx] > unsorted_lst[idx + 1]:
                unsorted_lst[idx], unsorted_lst[idx + 1] = unsorted_lst[idx + 1], unsorted_lst[idx]

In [374]:
# Bubble sort results

print('Unsorted: ', rand_lst, "\n")
bubble_sort(rand_lst) # Notice how we do not assign the list to a variable, matter of fact our function has no real output!
print('Sorted: ', rand_lst)

Unsorted:  [82, 15, 4, 95, 36, 32, 29, 18, 95, 14, 87, 95, 70, 12, 76, 55, 5, 4, 12, 28] 

Sorted:  [4, 4, 5, 12, 12, 14, 15, 18, 28, 29, 32, 36, 55, 70, 76, 82, 87, 95, 95, 95]


### **K Nearest Neighbors (KNN) Machine Learning Algorithm**

- An iterative implementation to KNN

- A classification algorithm, where the central idea is that the data points with similar attributes tend to fall into similar categories

- Often used for data imputation

- Supervisied algorithm

- Inefficient in it's use of space

- Hyperparameter: k and distance metric

In [375]:
# Before we get into the algorithm let's understand the data first, which will make the KNN function inputs easier to understand

# Chosen set: https://www.kaggle.com/datasets/yasserh/loan-default-dataset?resource=download
fh = '/Users/henrycosentino/Desktop/Python/Projects/Loan Modeling/Loan_Default.csv'
df = pd.read_csv(fh)
df.head(5)

Unnamed: 0,ID,year,loan_limit,Gender,approv_in_adv,loan_type,loan_purpose,Credit_Worthiness,open_credit,business_or_commercial,...,credit_type,Credit_Score,co-applicant_credit_type,age,submission_of_application,LTV,Region,Security_Type,Status,dtir1
0,24890,2019,cf,Sex Not Available,nopre,type1,p1,l1,nopc,nob/c,...,EXP,758,CIB,25-34,to_inst,98.728814,south,direct,1,45.0
1,24891,2019,cf,Male,nopre,type2,p1,l1,nopc,b/c,...,EQUI,552,EXP,55-64,to_inst,,North,direct,1,
2,24892,2019,cf,Male,pre,type1,p1,l1,nopc,nob/c,...,EXP,834,CIB,35-44,to_inst,80.019685,south,direct,0,46.0
3,24893,2019,cf,Male,nopre,type1,p4,l1,nopc,nob/c,...,EXP,587,CIB,45-54,not_inst,69.3769,North,direct,0,42.0
4,24894,2019,cf,Joint,pre,type1,p1,l1,nopc,nob/c,...,CRIF,602,EXP,25-34,not_inst,91.886544,North,direct,0,39.0


### **Data Exploration**

In [376]:
# Checking for missing values

df.isna().sum().sort_values()

ID                               0
Security_Type                    0
Region                           0
co-applicant_credit_type         0
Credit_Score                     0
credit_type                      0
total_units                      0
Secured_by                       0
occupancy_type                   0
construction_type                0
lump_sum_payment                 0
Status                           0
interest_only                    0
Credit_Worthiness                0
year                             0
loan_amount                      0
business_or_commercial           0
open_credit                      0
loan_type                        0
Gender                           0
term                            41
Neg_ammortization              121
loan_purpose                   134
age                            200
submission_of_application      200
approv_in_adv                  908
loan_limit                    3344
income                        9150
property_value      

In [377]:
# Selecting a subset of columns to investigate

cols_to_investigate = ['Status', 'Credit_Worthiness', 'loan_amount', 'Neg_ammortization', 'Credit_Score', 'LTV', 'age', 'interest_only', 'term']

for col in cols_to_investigate:
    if len(df[col].value_counts()) < 50:
        print(col)
        print(df[col].value_counts())
        print("\n")

Status
Status
0    112031
1     36639
Name: count, dtype: int64


Credit_Worthiness
Credit_Worthiness
l1    142344
l2      6326
Name: count, dtype: int64


Neg_ammortization
Neg_ammortization
not_neg    133420
neg_amm     15129
Name: count, dtype: int64


age
age
45-54    34720
35-44    32818
55-64    32534
65-74    20744
25-34    19142
>74       7175
<25       1337
Name: count, dtype: int64


interest_only
interest_only
not_int     141560
int_only      7110
Name: count, dtype: int64


term
term
360.0    121685
180.0     12981
240.0      5859
300.0      2822
324.0      2766
120.0       510
144.0       263
348.0       260
336.0       213
96.0        194
312.0       185
156.0       174
216.0       115
276.0       100
132.0        93
288.0        90
168.0        82
228.0        74
204.0        59
264.0        43
108.0        33
192.0        17
252.0         8
165.0         1
280.0         1
322.0         1
Name: count, dtype: int64




In [378]:
# Dropping rows with no data

prelim_features = cols_to_investigate.copy() # With LTV
cols_to_investigate.remove('LTV')
prelim_features2 = cols_to_investigate.copy() # Without LTV

ltv_df = df[prelim_features].dropna()
noLtv_df = df[prelim_features2].dropna()

In [379]:
# Complete preliminary feature set

ltv_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 133230 entries, 0 to 148669
Data columns (total 9 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Status             133230 non-null  int64  
 1   Credit_Worthiness  133230 non-null  object 
 2   loan_amount        133230 non-null  int64  
 3   Neg_ammortization  133230 non-null  object 
 4   Credit_Score       133230 non-null  int64  
 5   LTV                133230 non-null  float64
 6   age                133230 non-null  object 
 7   interest_only      133230 non-null  object 
 8   term               133230 non-null  float64
dtypes: float64(2), int64(3), object(4)
memory usage: 10.2+ MB


In [380]:
# Complete preliminary feature set (omitting LTV)

noLtv_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 148308 entries, 0 to 148669
Data columns (total 8 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Status             148308 non-null  int64  
 1   Credit_Worthiness  148308 non-null  object 
 2   loan_amount        148308 non-null  int64  
 3   Neg_ammortization  148308 non-null  object 
 4   Credit_Score       148308 non-null  int64  
 5   age                148308 non-null  object 
 6   interest_only      148308 non-null  object 
 7   term               148308 non-null  float64
dtypes: float64(1), int64(3), object(4)
memory usage: 10.2+ MB


In [381]:
# Encoding preliminary features for LTV data frame

features_to_encode = ltv_df[['Credit_Worthiness', 'Neg_ammortization', 'age', 'interest_only']].copy()

# Age Dummy
features_to_encode['age_dummy_<25'] = 0
features_to_encode['age_dummy_25-34'] = 0
features_to_encode['age_dummy_35-44'] = 0
features_to_encode['age_dummy_45-54'] = 0
features_to_encode['age_dummy_55-64'] = 0
features_to_encode['age_dummy_65-74'] = 0
features_to_encode.loc[features_to_encode['age'].isin(['<25']), 'age_dummy_<25'] = 1
features_to_encode.loc[features_to_encode['age'].isin(['25-34']), 'age_dummy_25-34'] = 1
features_to_encode.loc[features_to_encode['age'].isin(['35-44']), 'age_dummy_35-44'] = 1
features_to_encode.loc[features_to_encode['age'].isin(['45-54']), 'age_dummy_45-54'] = 1
features_to_encode.loc[features_to_encode['age'].isin(['55-64']), 'age_dummy_55-64'] = 1
features_to_encode.loc[features_to_encode['age'].isin(['65-74']), 'age_dummy_65-74'] = 1

# Credit Worthiness Dummy
features_to_encode['credit_worthiness_dummy'] = 0
features_to_encode.loc[features_to_encode['Credit_Worthiness'].isin(['l1']), 'credit_worthiness_dummy'] = 1

# Interest Only Dummy
features_to_encode['interest_only_dummy'] = 0
features_to_encode.loc[features_to_encode['interest_only'].isin(['int_only']), 'interest_only_dummy'] = 1 # Interest only type = 1

# Negative Amortization Dummy
features_to_encode['negative_amortization_dummy'] = 0
features_to_encode.loc[features_to_encode['Neg_ammortization'].isin(['neg_amm']), 'negative_amortization_dummy'] = 1 # Negative amortization = 1

# Merging frames
encoded_features = features_to_encode[['age_dummy_<25', 'age_dummy_25-34', 'age_dummy_35-44', 'age_dummy_45-54', 
                                       'age_dummy_55-64', 'age_dummy_65-74', 'credit_worthiness_dummy', 
                                       'interest_only_dummy', 'negative_amortization_dummy']]
ltv_df_nonBinary = ltv_df[['Status', 'loan_amount', 'Credit_Score', 'LTV', 'term']]

prelim_ltv_df = ltv_df_nonBinary.merge(encoded_features, how='left', left_index=True, right_index=True)

prelim_ltv_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 133230 entries, 0 to 148669
Data columns (total 14 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Status                       133230 non-null  int64  
 1   loan_amount                  133230 non-null  int64  
 2   Credit_Score                 133230 non-null  int64  
 3   LTV                          133230 non-null  float64
 4   term                         133230 non-null  float64
 5   age_dummy_<25                133230 non-null  int64  
 6   age_dummy_25-34              133230 non-null  int64  
 7   age_dummy_35-44              133230 non-null  int64  
 8   age_dummy_45-54              133230 non-null  int64  
 9   age_dummy_55-64              133230 non-null  int64  
 10  age_dummy_65-74              133230 non-null  int64  
 11  credit_worthiness_dummy      133230 non-null  int64  
 12  interest_only_dummy          133230 non-null  int64  
 13  nega

In [382]:
# Encoding preliminary features for non-LTV data frame

features_to_encode = noLtv_df[['Credit_Worthiness', 'Neg_ammortization', 'age', 'interest_only']].copy()

# Age Dummy
features_to_encode['age_dummy_<25'] = 0
features_to_encode['age_dummy_25-34'] = 0
features_to_encode['age_dummy_35-44'] = 0
features_to_encode['age_dummy_45-54'] = 0
features_to_encode['age_dummy_55-64'] = 0
features_to_encode['age_dummy_65-74'] = 0
features_to_encode.loc[features_to_encode['age'].isin(['<25']), 'age_dummy_<25'] = 1
features_to_encode.loc[features_to_encode['age'].isin(['25-34']), 'age_dummy_25-34'] = 1
features_to_encode.loc[features_to_encode['age'].isin(['35-44']), 'age_dummy_35-44'] = 1
features_to_encode.loc[features_to_encode['age'].isin(['45-54']), 'age_dummy_45-54'] = 1
features_to_encode.loc[features_to_encode['age'].isin(['55-64']), 'age_dummy_55-64'] = 1
features_to_encode.loc[features_to_encode['age'].isin(['65-74']), 'age_dummy_65-74'] = 1

# Credit Worthiness Dummy
features_to_encode['credit_worthiness_dummy'] = 0
features_to_encode.loc[features_to_encode['Credit_Worthiness'].isin(['l1']), 'credit_worthiness_dummy'] = 1

# Interest Only Dummy
features_to_encode['interest_only_dummy'] = 0
features_to_encode.loc[features_to_encode['interest_only'].isin(['int_only']), 'interest_only_dummy'] = 1

# Negative Amortization Dummy
features_to_encode['negative_amortization_dummy'] = 0
features_to_encode.loc[features_to_encode['Neg_ammortization'].isin(['neg_amm']), 'negative_amortization_dummy'] = 1

# Merging frames
encoded_features = features_to_encode[['age_dummy_<25', 'age_dummy_25-34', 'age_dummy_35-44', 'age_dummy_45-54', 
                                       'age_dummy_55-64', 'age_dummy_65-74', 'credit_worthiness_dummy', 
                                       'interest_only_dummy', 'negative_amortization_dummy']]
noLtv_df_nonBinary = noLtv_df[['Status', 'loan_amount', 'Credit_Score', 'term']]

prelim_noLtv_df = noLtv_df_nonBinary.merge(encoded_features, how='left', left_index=True, right_index=True)

prelim_noLtv_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 148308 entries, 0 to 148669
Data columns (total 13 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Status                       148308 non-null  int64  
 1   loan_amount                  148308 non-null  int64  
 2   Credit_Score                 148308 non-null  int64  
 3   term                         148308 non-null  float64
 4   age_dummy_<25                148308 non-null  int64  
 5   age_dummy_25-34              148308 non-null  int64  
 6   age_dummy_35-44              148308 non-null  int64  
 7   age_dummy_45-54              148308 non-null  int64  
 8   age_dummy_55-64              148308 non-null  int64  
 9   age_dummy_65-74              148308 non-null  int64  
 10  credit_worthiness_dummy      148308 non-null  int64  
 11  interest_only_dummy          148308 non-null  int64  
 12  negative_amortization_dummy  148308 non-null  int64  
dtypes: f

In [383]:
prelim_ltv_df

Unnamed: 0,Status,loan_amount,Credit_Score,LTV,term,age_dummy_<25,age_dummy_25-34,age_dummy_35-44,age_dummy_45-54,age_dummy_55-64,age_dummy_65-74,credit_worthiness_dummy,interest_only_dummy,negative_amortization_dummy
0,1,116500,758,98.728814,360.0,0,1,0,0,0,0,1,0,0
2,0,406500,834,80.019685,360.0,0,0,1,0,0,0,1,0,1
3,0,456500,587,69.376900,360.0,0,0,0,1,0,0,1,0,0
4,0,696500,602,91.886544,360.0,0,1,0,0,0,0,1,0,0
5,0,706500,864,70.089286,360.0,0,0,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148665,0,436500,659,71.792763,180.0,0,0,0,0,1,0,1,0,0
148666,0,586500,569,74.428934,360.0,0,1,0,0,0,0,1,0,0
148667,0,446500,702,61.332418,180.0,0,0,0,1,0,0,1,0,0
148668,0,196500,737,70.683453,180.0,0,0,0,0,1,0,1,0,0


In [384]:
prelim_noLtv_df

Unnamed: 0,Status,loan_amount,Credit_Score,term,age_dummy_<25,age_dummy_25-34,age_dummy_35-44,age_dummy_45-54,age_dummy_55-64,age_dummy_65-74,credit_worthiness_dummy,interest_only_dummy,negative_amortization_dummy
0,1,116500,758,360.0,0,1,0,0,0,0,1,0,0
1,1,206500,552,360.0,0,0,0,0,1,0,1,0,0
2,0,406500,834,360.0,0,0,1,0,0,0,1,0,1
3,0,456500,587,360.0,0,0,0,1,0,0,1,0,0
4,0,696500,602,360.0,0,1,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
148665,0,436500,659,180.0,0,0,0,0,1,0,1,0,0
148666,0,586500,569,360.0,0,1,0,0,0,0,1,0,0
148667,0,446500,702,180.0,0,0,0,1,0,0,1,0,0
148668,0,196500,737,180.0,0,0,0,0,1,0,1,0,0


In [385]:
# Splitting the data
from sklearn.model_selection import train_test_split

# prelim_ltv_df = prelim_ltv_df.iloc[:10000]

y = prelim_ltv_df['Status']
X = prelim_ltv_df.drop(columns='Status')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [386]:
# Scaling the data
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [387]:
# Creating dictionaries for the custom KNN algorithm implementation

y_train_dict = dict(y_train)
y_test_dict = dict(y_test)
X_train_scaled_dict = dict(zip(X_train.index, X_train_scaled))
X_test_scaled_dict = dict(zip(X_test.index, X_test_scaled))

In [388]:
# Testing test keys

if X_test_scaled_dict.keys() == y_test_dict.keys():
    print(True)

True


In [389]:
# Testing train keys

if X_train_scaled_dict.keys() == y_train_dict.keys():
    print(True)

True


In [390]:
X_train_scaled_dict

{101125: array([0.05337079, 0.705     , 0.0074546 , 1.        , 0.        ,
        1.        , 0.        , 0.        , 0.        , 0.        ,
        1.        , 1.        , 0.        ]),
 39828: array([0.13764045, 0.1025    , 0.00501832, 1.        , 0.        ,
        0.        , 0.        , 0.        , 1.        , 0.        ,
        1.        , 0.        , 0.        ]),
 100452: array([0.10393258, 0.2425    , 0.01089422, 1.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 1.        ,
        1.        , 0.        , 0.        ]),
 124446: array([0.08707865, 0.38      , 0.01009632, 1.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 1.        ,
        1.        , 0.        , 0.        ]),
 70360: array([0.07022472, 0.1225    , 0.00842783, 0.81818182, 0.        ,
        0.        , 1.        , 0.        , 0.        , 0.        ,
        1.        , 0.        , 0.        ]),
 96099: array([0.01685393, 0.02      , 0.0081559 , 1

In [391]:
y_train_dict

{101125: 0,
 39828: 0,
 100452: 0,
 124446: 0,
 70360: 0,
 96099: 0,
 47461: 0,
 11646: 0,
 140743: 0,
 13134: 0,
 138729: 0,
 123530: 0,
 130612: 1,
 135641: 0,
 35175: 0,
 98713: 1,
 49713: 0,
 146662: 1,
 105050: 0,
 36347: 0,
 92497: 0,
 134792: 1,
 19689: 0,
 54777: 0,
 107256: 1,
 38438: 0,
 33829: 1,
 40522: 0,
 112141: 0,
 18193: 0,
 59918: 0,
 48497: 0,
 81480: 0,
 74023: 0,
 20523: 0,
 93998: 1,
 134479: 1,
 112410: 0,
 41933: 1,
 21249: 1,
 104560: 1,
 12797: 1,
 53158: 0,
 147492: 0,
 107835: 1,
 49286: 0,
 137751: 0,
 87551: 0,
 60995: 0,
 77790: 0,
 142761: 0,
 58883: 0,
 41538: 0,
 81785: 0,
 63602: 0,
 22984: 0,
 9621: 0,
 136063: 0,
 77390: 0,
 46033: 0,
 17667: 0,
 113698: 0,
 19699: 0,
 74849: 1,
 46063: 0,
 88204: 0,
 128284: 0,
 55653: 0,
 80014: 0,
 101941: 0,
 144293: 0,
 104805: 0,
 24357: 0,
 68592: 0,
 15037: 0,
 53263: 0,
 37154: 0,
 81605: 0,
 58768: 0,
 77891: 0,
 101244: 0,
 86463: 0,
 3170: 0,
 98595: 1,
 9663: 0,
 98976: 0,
 58069: 0,
 63048: 0,
 10012: 

### **KNN Algorithm Implementation**

In [None]:
# Distance helper function & KNN Algorithm

def _distance(unknown, known):
    return sum((u-k)**2 for u, k in zip(unknown, known)) ** 0.5 # Euclidean distance

def knn_classifier(unknown: list, data: dict, labels: list, k: int=5) -> int:
    """
    unknown: contains the feature set of the unclassified item
    data: keys are the unique item identifier, values are the feature set of the classified item (list format)
    labels: binary variable (0 or 1), where 1 is positive and 0 is negative
    k: the number of neighbors
    """
    if len(data) != len(labels):
        raise ValueError("Data and labels must be same length")
    
    if k > len(data):
        raise ValueError("K cannot be greater than the number of data points")
    
    distance_label_pairs = []
    for k, v in data.items():
        dist = _distance(unknown, v)
        distance_label_pairs.append((dist, labels[k]))

    distance_label_pairs.sort(key=lambda x: x[0]) # Sort by distances
    k_neighbor_labels = [x[1] for x in distance_label_pairs[0:k]] # Extract by labels

    return stats.mode(k_neighbor_labels)

In [393]:
# Model Accuracy & Validation
def confusion(training_set: dict, training_labels: dict, validation_set: dict, validation_labels: dict, k: int=5) -> dict:
    confusion_dict = {"TP": 0,
                      "FP": 0,
                      "TN": 0,
                      "FN": 0}

    for key in validation_set.keys():
        classified_label = knn_classifier(validation_set[key], training_set, training_labels, k)

        if (classified_label == validation_labels[key]) and (classified_label == 1):
            confusion_dict["TP"] += 1
        elif (classified_label != validation_labels[key]) and (classified_label == 1):
            confusion_dict["FP"] += 1
        elif (classified_label == validation_labels[key]) and (classified_label == 0):
            confusion_dict["TN"] += 1
        elif (classified_label != validation_labels[key]) and (validation_labels[key] == 1):
            confusion_dict["FN"] += 1

    return confusion_dict

def accuracy(training_set: dict, training_labels: list, validation_set: dict, validation_labels: list, k: int=5) -> dict:
    confusion_dict = confusion(training_set, training_labels, validation_set, validation_labels, k)
    tp = confusion_dict["TP"]
    fp = confusion_dict["FP"]
    tn = confusion_dict["TN"]
    fn = confusion_dict["FN"]

    total = tp + fp + tn + fn
    
    accuracy_val = (tp + tn) / total if total > 0 else 0
    precision_val = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall_val = tp / (tp + fn) if (tp + fn) > 0 else 0
    specificity_val = tn / (tn + fp) if (tn + fp) > 0 else 0
    npv_val = tn / (tn + fn) if (tn + fn) > 0 else 0
    f1_val = 2 * (precision_val * recall_val) / (precision_val + recall_val) if (precision_val + recall_val) > 0 else 0

    return {"Accuracy": accuracy_val,
            "Precision": precision_val,
            "Recall": recall_val,
            "F1 Score": f1_val,
            "Specificity": specificity_val,
            "Negative Predictive Value": npv_val}

In [None]:
model_stats_dict = accuracy(X_train_scaled_dict, y_train_dict, X_test_scaled_dict, y_test_dict)
print(f"Custom KNN Accuracy: {model_stats_dict['Accuracy']}")

### **Scikit-learn Implementation**

In [None]:
from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier(n_neighbors=5)
model.fit(X_train, y_train)

accuracy_score = model.score(X_test, y_test)
print(f"Scikit-learn KNN Accuracy: {accuracy_score}")

Scikit-learn KNN Accuracy: 0.823
