In [1]:
import csv
import os
import numpy as np
import pandas as pd
from collections import Counter
from sklearn import metrics
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

# Locate the input CSV files.
# The directory can be overridden through the INTERVIEW_DATA_DIR environment
# variable so the notebook is not tied to one machine; when the variable is
# unset the original absolute location is used, so existing setups keep working.
DATA_DIR = os.environ.get("INTERVIEW_DATA_DIR", r"C:\Users\Hoda\Desktop\interview")
trainDT = os.path.join(DATA_DIR, "train_data.csv")
testDT = os.path.join(DATA_DIR, "test_data.csv")

In [2]:
# Load both CSV files into DataFrames: first the training file, then the test file.
train_dt, test_dt = (pd.read_csv(csv_path) for csv_path in (trainDT, testDT))

In [4]:
# Separate predictors from the target.
# assumes the last column of train_dt is the class label and 'id' is only a
# row identifier — TODO confirm against the CSV schema.
# drop() is chained (no inplace=True) so nothing is mutated through a slice of
# train_dt, which pandas may otherwise flag as a write to a copy.
features = train_dt.iloc[:, :-1].drop(columns='id')
categories = train_dt.iloc[:, -1]

# Hold out 30% of the labelled rows for evaluation; the fixed seed keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    features, categories, test_size=0.3, random_state=23
)

In [None]:
import random

# Build a randomly permuted copy of the hold-out labels, re-attached to the
# original y_test index (presumably a chance-level baseline — `x` is not used
# anywhere else in the visible notebook; confirm before relying on it).
temp = list(y_test)
# A dedicated, seeded generator makes the permutation reproducible on re-run
# and leaves the global `random` state untouched.
random.Random(23).shuffle(temp)
x = pd.Series(temp, index=y_test.index)

In [5]:
# Preview the first rows of the predictor frame (the 'id' column has already been dropped).
features.head()

Unnamed: 0,feat_1,feat_2,feat_3,feat_4,feat_5,feat_6,feat_7,feat_8,feat_9,feat_10,...,feat_84,feat_85,feat_86,feat_87,feat_88,feat_89,feat_90,feat_91,feat_92,feat_93
0,1,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,1,6,1,5,0,0,1,...,22,0,1,2,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0


In [6]:
# Baseline: k-nearest-neighbours with scikit-learn's default hyper-parameters.
knn = KNeighborsClassifier()
# fit() trains the estimator in place; its return value is kept in `model`.
model = knn.fit(X_train, y_train)

# Score the baseline on the hold-out split.
prediction = knn.predict(X_test)
accuracy_pct = metrics.accuracy_score(y_test, prediction) * 100
print('Classification accuracy is: ', accuracy_pct, '%')
# metrics.confusion_matrix(y_test, prediction) gives the raw per-class counts if needed.
print(metrics.classification_report(y_test, prediction))

Classification accuracy is:  77.48868778280543 %
              precision    recall  f1-score   support

     Class_1       0.61      0.53      0.56       619
     Class_2       0.69      0.81      0.74      4869
     Class_3       0.51      0.49      0.50      2367
     Class_4       0.65      0.26      0.37       831
     Class_5       0.93      0.98      0.95       807
     Class_6       0.94      0.94      0.94      4206
     Class_7       0.74      0.56      0.64       817
     Class_8       0.91      0.89      0.90      2543
     Class_9       0.83      0.86      0.84      1505

    accuracy                           0.77     18564
   macro avg       0.76      0.70      0.72     18564
weighted avg       0.77      0.77      0.77     18564



In [7]:
from sklearn.preprocessing import StandardScaler

# Standardise the features: the scaler is fitted on the training split only and
# the same fitted transform is then applied to the hold-out split.
scaler = StandardScaler()
X_train_t = scaler.fit_transform(X_train)
X_test_t = scaler.transform(X_test)

# 7-neighbour KNN trained and evaluated on the standardised features.
classifier = KNeighborsClassifier(n_neighbors=7)
classifier.fit(X_train_t, y_train)
y_pred = classifier.predict(X_test_t)

In [8]:
# Report the hold-out performance of the 7-neighbour classifier that was fitted
# on the standardised features.
# `y_pred` already holds that classifier's predictions for X_test_t, so it is
# reused here. The previous code asked `knn` (fitted on the unscaled features)
# to predict on the scaled matrix: an expensive call whose result was never
# used by the report below and did not belong to the model being evaluated.
prediction = y_pred
print('Classification accuracy is: ', metrics.accuracy_score(y_test, prediction)*100, '%')
#print(metrics.confusion_matrix(y_test, prediction))
print(metrics.classification_report(y_test, prediction))

Classification accuracy is:  76.8476621417798 %
              precision    recall  f1-score   support

     Class_1       0.59      0.48      0.53       619
     Class_2       0.68      0.83      0.75      4869
     Class_3       0.52      0.47      0.49      2367
     Class_4       0.63      0.26      0.37       831
     Class_5       0.92      0.96      0.94       807
     Class_6       0.93      0.93      0.93      4206
     Class_7       0.69      0.54      0.61       817
     Class_8       0.89      0.87      0.88      2543
     Class_9       0.85      0.83      0.84      1505

    accuracy                           0.77     18564
   macro avg       0.75      0.69      0.70     18564
weighted avg       0.76      0.77      0.76     18564



In [None]:
import matplotlib.pyplot as plt

# Sweep n_neighbors over 1..39 (the upper bound of range() is exclusive) and
# record the hold-out misclassification rate for each value.
k_values = range(1, 40)
error = []
for k in k_values:
    # NOTE: this rebinds the notebook-level `knn`; once the loop finishes,
    # `knn` is the model fitted with the last value of k.
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    pred_i = knn.predict(X_test)
    # Fraction of hold-out rows whose predicted label differs from the truth.
    error.append(np.mean(pred_i != y_test))

# Error rate as a function of k.
plt.figure(figsize=(12, 6))
plt.plot(
    k_values,
    error,
    color='red',
    linestyle='dashed',
    marker='o',
    markerfacecolor='blue',
    markersize=10,
)
plt.title('Error Rate K Value')
plt.xlabel('K Value')
plt.ylabel('Mean Error')

In [None]:
# Predict a class for every row of the unlabelled set.
# The classifier was fitted on `features`, which excludes the 'id' column, so
# the same columns (in the same order) are selected from test_dt; passing the
# whole frame would hand the model a different column set than it was trained on.
# assumes test_dt carries every column present in `features` — a missing column
# raises KeyError here rather than producing silent garbage.
# NOTE(review): `knn` is whichever estimator was most recently bound and fitted
# in this kernel session — confirm it is the model intended for the final predictions.
predict_unlabeled_data = knn.predict(test_dt[features.columns])

In [None]:
# All-zero indicator frame: one row per prediction, one column per class label.
# Columns follow the order in which labels first appear in `categories`.
class_labels = categories.unique()
df = pd.DataFrame(0, index=range(len(predict_unlabeled_data)), columns=class_labels)

In [None]:
# Turn the predicted labels into one-hot indicator columns.
# The previous form `df.loc[i][label] = 1` was chained indexing: `df.loc[i]`
# can return a temporary Series, in which case the assignment never reaches
# `df`. Assigning whole columns on the frame itself always writes to `df`, and
# replaces a per-row Python loop with one vectorised comparison per class.
predicted_labels = np.asarray(predict_unlabeled_data)
for class_label in df.columns:
    # 1 where the prediction equals this class, 0 elsewhere; keep the column's
    # existing integer dtype.
    df[class_label] = (predicted_labels == class_label).astype(df[class_label].dtype)

In [None]:
# 5-fold cross-validation of the current `knn` configuration on the full
# labelled set.
from sklearn.base import clone
from sklearn.model_selection import train_test_split, KFold
print('---------------------------------------- using K-fold cross validation')
scores = []
# random_state only has an effect when shuffling is enabled, and current
# scikit-learn rejects random_state together with shuffle=False; shuffling with
# a fixed seed gives reproducible, randomly composed folds.
cv = KFold(n_splits=5, shuffle=True, random_state=42)
for train_index, test_index in cv.split(features, y=categories):
    # KFold yields positional indices, so select rows with .iloc (using .loc
    # only works while the frame happens to have a default 0..n-1 index).
    # Fold-local names keep the hold-out split (X_train, X_test, y_train,
    # y_test) intact for the other cells.
    fold_X_train, fold_X_test = features.iloc[train_index], features.iloc[test_index]
    fold_y_train, fold_y_test = categories.iloc[train_index], categories.iloc[test_index]
    # Fit an unfitted copy with the same hyper-parameters so the shared `knn`
    # is not silently refitted on the last fold.
    fold_knn = clone(knn).fit(fold_X_train, fold_y_train)
    scores.append(fold_knn.score(fold_X_test, fold_y_test))

In [None]:
# Show the list of per-fold scores collected by the K-fold loop (one entry per split).
scores