## IMPORT LIBRARIES

In [22]:
import pandas as pd
import numpy as np
import pickle

#scikit-learn
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report

## IMPORT DATASET

In [23]:
# Read the three CSV splits (train, validation, unlabeled test) in one pass.
data_train, data_validation, data_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "Tubes2_AI/data/data_train.csv",
        "Tubes2_AI/data/data_validation.csv",
        "Tubes2_AI/data/test.csv",
    )
)

## KNN ALGORITHM

In [24]:
# Constants (read by KNN.predict every time it is called)
k = 50                  # default number of neighbours consulted per prediction
vote_weight = True      # True: a neighbour votes with weight 1/distance; False: every neighbour counts 1
column_weight = False   # True: scale each feature by its Pearson correlation with the target
normalize = False       # True: min-max scale features using the training set's min/max
dist_dimension = 2      # 1 = Manhattan (L1) distance, 2 = Euclidean (L2) distance


class KNN:
    """Brute-force k-nearest-neighbours classifier for non-negative integer labels.

    ``fit`` only stores the training data. ``predict`` computes the distance from
    every test row to every training row and lets the ``k`` closest training rows
    vote on the label. Behaviour is tuned by the module-level constants
    ``vote_weight``, ``column_weight``, ``normalize`` and ``dist_dimension``.
    """

    def fit(self, X, y):
        """Store the training data; no computation happens here.

        Args:
            X: pandas DataFrame of features, one row per training sample.
            y: labels for the rows of ``X`` in the same order (non-negative integers).
        """
        self.X = X
        self.y = y

    def predict(self, X_test, k: int = k) -> list:
        """Predict a label for every row of ``X_test``.

        Args:
            X_test: pandas DataFrame with the same feature columns, in the same
                order, as the frame passed to ``fit``.
            k: number of nearest neighbours that vote. Values larger than the
                training set are clamped to the training-set size.

        Returns:
            A list of Python ints, one predicted label per row of ``X_test``.
            When several labels tie on votes, the smallest label wins.

        Raises:
            ValueError: if ``k`` is below 1, ``dist_dimension`` is not 1 or 2,
                the training set is empty, or a label is negative.
        """
        if k < 1:
            raise ValueError(f"k must be at least 1, got {k}")
        if dist_dimension not in (1, 2):
            raise ValueError(f"dist_dimension must be 1 or 2, got {dist_dimension}")

        raw_labels = np.asarray(self.y).ravel()
        if raw_labels.size == 0:
            raise ValueError("cannot predict: the model was fitted on an empty training set")
        labels = raw_labels.astype(int)

        # No in-place mutation happens below, so the stored frame is used directly.
        X = self.X

        # Normalize train & test data with the training set's statistics
        if normalize:
            all_min = X.min()
            all_max = X.max()
            # A constant column has zero range; dividing by it would fill the
            # column with NaN and poison every distance.
            value_range = (all_max - all_min).replace(0, 1)

            X = (X - all_min) / value_range
            X_test = (X_test - all_min) / value_range

        # Apply weight to each column
        if column_weight:
            # Derive the weights from the fitted data, not from a notebook global,
            # so a pickled model behaves the same wherever it is loaded.
            # Positional indices avoid any index-alignment surprises.
            features = self.X.reset_index(drop=True)
            target = pd.Series(raw_labels)
            # Constant features have an undefined (NaN) correlation: weight them 0.
            pearson_weight = features.corrwith(target).fillna(0)
            X = X.mul(pearson_weight, axis=1)
            X_test = X_test.mul(pearson_weight, axis=1)

        # Pairwise differences, shape (n_test, n_train, n_features)
        diff = X_test.to_numpy()[:, np.newaxis, :] - X.to_numpy()
        if dist_dimension == 1:
            dist = np.sum(np.abs(diff), axis=2)
        else:
            dist = np.linalg.norm(diff, axis=2)

        n_neighbors = min(int(k), labels.size)

        # Predict test value
        y_pred_test = []
        for row in dist:
            # Stable sort: equidistant neighbours are taken in training-set order.
            nearest = np.argsort(row, kind='stable')[:n_neighbors]

            if vote_weight:
                # An exact match (distance 0) gets an infinite vote and therefore
                # wins outright; silence the divide-by-zero warning it would raise.
                with np.errstate(divide='ignore'):
                    weights = 1.0 / row[nearest]
            else:
                weights = None

            # bincount sizes itself to the largest label seen, so any number of
            # classes is supported; argmax returns the smallest label on ties.
            votes = np.bincount(labels[nearest], weights=weights)
            y_pred_test.append(int(np.argmax(votes)))

        return y_pred_test

# Separate the label column from the feature columns for both splits.
X = data_train.drop(columns='price_range')
y = data_train['price_range']

X_test = data_validation.drop(columns='price_range')
y_true_test = data_validation['price_range']

# Fit on the training split, then classify every validation row.
knn = KNN()
knn.fit(X, y)
y_test = knn.predict(X_test, k)

# Score the validation predictions; precision and recall are macro-averaged.
accuracy_score_knn = accuracy_score(y_true_test, y_test)
precision_score_knn = precision_score(y_true_test, y_test, average='macro')
recall_score_knn = recall_score(y_true_test, y_test, average='macro')

print(
    f'Accuracy Score of KNN Algorithm : {accuracy_score_knn}',
    f'Precision Score of KNN Algorithm : {precision_score_knn}',
    f'Recall Score of KNN Algorithm : {recall_score_knn}',
    sep='\n',
)




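# Optional sweep over k (kept disabled): validation error for k = 41..60.
# res = []
# for k_candidate in range(41, 61):
#     res += [[k_candidate, 1 - accuracy_score(y_true_test, knn.predict(X_test, k_candidate))]]

# res_df = pd.DataFrame(res, columns=['k', 'error'])
# res_df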

Accuracy Score of KNN Algorithm : 0.935
Precision Score of KNN Algorithm : 0.9348889404477712
Recall Score of KNN Algorithm : 0.9352940689718892


## EXPORT MODEL

In [25]:
# Serialise the `knn` object to disk with pickle; the `with` block closes the
# file handle even if dumping raises.
# NOTE(review): pickle stores a reference to the KNN class, not its code, so the
# class definition presumably has to be available wherever this file is loaded.
with open('Tubes2_AI/model-knn.pkl', 'wb') as file:
    pickle.dump(knn, file)

## LOAD MODEL

In [27]:
# Read the pickled model back from the same path the export cell wrote to.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# load files that you produced yourself.
with open('Tubes2_AI/model-knn.pkl', 'rb') as file:
    loaded_model = pickle.load(file)


# Predict on the current X_test with 50 neighbours (the same value as the
# constant `k` defined in the KNN cell).
y_pred_test = loaded_model.predict(X_test, 50)

## KNN SUBMISSION FOR KAGGLE

In [None]:
# Submission csv
k = 56                  # neighbour count used for the Kaggle submission
data_test_size = 2000   # maximum number of labelled rows used to fit the submission model

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way to
# stack the train and validation frames.
training_set = pd.concat([data_train, data_validation], ignore_index=True)

# Truncate BEFORE splitting into features/labels so X and y always cover the
# same rows (slicing only X left y longer whenever the frame exceeded the cap).
training_subset = training_set[:data_test_size]
y = training_subset['price_range']
X = training_subset.drop('price_range', axis=1)
X_test = data_test.drop('id', axis=1)

# `knn` is a fitted KNN instance, not a function, so it cannot be called directly.
# Fit a dedicated model on train + validation and keep the validation model intact.
knn_submission = KNN()
knn_submission.fit(X, y)
y_test = knn_submission.predict(X_test, k)

# Pair every test id with its predicted class (both share data_test's row order).
submission = pd.concat(
    [data_test['id'], pd.DataFrame(y_test, columns=['price_range'])],
    axis=1,
)

submission.to_csv('Tubes2_AI/out1.csv', index=False)
submission

Unnamed: 0,id,price_range
0,0,0
1,1,3
2,2,3
3,3,2
4,4,0
...,...,...
1995,1995,2
1996,1996,0
1997,1997,3
1998,1998,0


<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=0cf4af83-b6f2-43d0-9e47-37804035f63d' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>