# K-NN algorithm implementation

In [1]:
from package.distance import *
from package.normalize import normalize_data
import pandas as pd
import numpy as np

In [2]:
def knn(df: pd.DataFrame, test_data: pd.DataFrame, options: dict = None, ignore_columns: list = None):
    """Rank the rows of ``df`` by their distance to the first row of ``test_data``.

    Feature columns are all columns of ``df`` except ``'id'``, ``'class'`` and
    anything listed in ``ignore_columns``. If every feature column of ``df``
    contains only 0/1 values, each row is scored with ``jaccard_coefficient``.
    Otherwise ``normalize_data`` is called first and each row is scored with
    ``distance``.

    Args:
        df: Reference data set, one sample per row.
        test_data: Query sample(s); only the first row is used.
        options: Settings forwarded (as a copy) to ``distance``. Defaults to
            an empty dict.
        ignore_columns: Extra column names to leave out of the feature set.

    Returns:
        A new DataFrame with the rows of ``df`` plus a ``'distance'`` column,
        sorted by that column in ascending order.

    Raises:
        IndexError: If ``test_data`` has no rows.
        KeyError: If the Jaccard path is used and ``test_data`` lacks one of
            the feature columns of ``df``.

    Note:
        The ``'distance'`` column is added to the returned frame only, not to
        the caller's ``df``; otherwise a second call on the same frame would
        treat ``'distance'`` as a feature. ``normalize_data`` is still called
        on the caller's objects and its return value is ignored, so it
        presumably normalizes ``df`` and ``test_data`` in place -- confirm
        against ``package.normalize``.
    """
    # Build fresh containers so neither the caller's arguments nor a shared
    # default object can be modified.
    columns_to_ignore = list(ignore_columns or []) + ['id', 'class']
    distance_options = options.copy() if options is not None else {}

    feature_columns = [col for col in df.columns if col not in columns_to_ignore]

    # Jaccard only makes sense when every feature is strictly binary.
    jaccard_applicable = all(df[col].isin([0, 1]).all() for col in feature_columns)

    if not jaccard_applicable:
        # Normalize data if not strictly binary
        normalize_data(df, test_data, columns_to_ignore)
        print('Normalized data:')
        print(df, end='\n\n')

    print('test_data :')
    print(test_data)

    # Read the query row only after the optional normalization step so that
    # any change normalize_data makes to test_data is taken into account.
    test_row = test_data.iloc[0]

    if jaccard_applicable:
        # Use the same column list for both vectors so they stay aligned even
        # when test_data orders its columns differently or has extra ones.
        test_vector = [test_row[col] for col in feature_columns]

        def row_distance(row):
            return jaccard_coefficient([row[col] for col in feature_columns], test_vector)
    else:
        # Computed after normalization, as in the per-row original expression.
        distance_columns = df.columns.difference(columns_to_ignore)

        def row_distance(row):
            return distance(row, test_row, distance_columns, distance_options)

    distances = df.apply(row_distance, axis=1)

    # assign() returns a new frame, leaving the caller's df without the
    # helper column.
    return df.assign(distance=distances).sort_values(by=['distance'])


In [3]:
# Reference data set the query sample is compared against.
data = pd.read_csv('data/data.csv')

# Single query sample: one 0/1 value for each of the feature columns c1..c11.
feature_names = ['c1', 'c2', 'c3', 'c4', 'c5', 'c6', 'c7', 'c8', 'c9', 'c10', 'c11']
feature_values = [0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1]
test_data = pd.DataFrame([feature_values], columns=feature_names)

# Last expression of the cell, so the notebook displays the ranked rows.
knn(data, test_data=test_data)

test_data :
   c1  c2  c3  c4  c5  c6  c7  c8  c9  c10  c11
0   0   0   1   0   0   0   0   0   1    0    1


Unnamed: 0,id,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,class,distance
3,4,0,0,0,0,0,0,0,0,1,1,1,Politique,0.5
2,3,1,0,1,0,0,0,0,0,0,0,0,Sport,0.75
4,5,0,0,0,0,0,1,0,0,1,0,0,Politique,0.75
0,1,1,1,0,0,0,1,0,0,0,0,0,Sport,1.0
1,2,1,0,0,1,0,0,0,1,0,0,0,Sport,1.0
5,6,0,0,0,0,0,0,1,0,0,1,0,Politique,1.0
