In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import StratifiedKFold



## Import Data (Bert Vectors)

In [2]:
#full sentence bert vector
file = '../input/bt5153-train-test-bert-full-sentence/train_data_bert_fullsent.csv'
testfile = '../input/bt5153-train-test-bert-full-sentence/test_data_bert_fullsent.csv'

df = pd.read_csv(file, dtype={'label':str})
df_test = pd.read_csv(testfile, dtype={'label':str})

df['distil_bert_avg'] = df['distil_bert_avg'].apply(lambda x: np.array(eval(str(x))))
df_test['distil_bert_avg'] = df_test['distil_bert_avg'].apply(lambda x: np.array(eval(str(x))))

print(df.shape)
df.head(1)


(1065, 13)


Unnamed: 0,review_id,review_content,review_stars,delivery,product,service,bert_avg,bert_max,bert_layer0,distil_bert_avg,distil_bert_max,distil_bert_layer0,label
0,9161,"Delivery took more than a week, short expiry d...",2,1,1,0,"[-0.06329992, -0.32380387, 0.35608995, -0.1268...","[0.93797976, 0.25249124, 1.1054325, 0.49609792...","[-0.19600664, -0.4481723, 0.20494524, -0.39789...","[-0.046782605, -0.27473888, 0.2780265, 0.03694...","[0.4727717, 0.23103717, 0.7197178, 0.6083572, ...","[-0.022450736, -0.34166092, 0.13424423, -0.170...",110


## KNN

In [3]:
X = df.distil_bert_avg
X=np.array([x for x in list(X)])
X_test = df_test.distil_bert_avg
X_test=np.array([x for x in list(X_test)])

In [4]:
y_dl = df['delivery'].copy()
y_dl_test = df_test['delivery'].copy()
y_pd = df['product'].copy()
y_pd_test = df_test['product'].copy()
y_sv = df['service'].copy()
y_sv_test = df_test['service'].copy()

In [5]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score, make_scorer

from sklearn.model_selection import GridSearchCV


kf = StratifiedKFold(n_splits=10, shuffle=False)
param={'n_neighbors':[1, 5,6,7,8,9,10,11,12,13,14,15,20,25]}

knn_dl = KNeighborsClassifier()

best_knn=GridSearchCV(estimator=knn_dl, 
                      param_grid=param, 
                      cv=kf,
                      scoring=make_scorer(f1_score , average='macro'))
best_knn.fit(X, y_dl)


print('best estimator',best_knn.best_estimator_)
# print('best params:',best_knn.best_params_)
# print('best score:',best_knn.best_score_)

knn_dl = best_knn.best_estimator_
knn_dl.fit(X, y_dl)


y_dl_pred = knn_dl.predict(X_test)
y_dl_proba = knn_dl.predict_proba(X_test)[:, 1]


precision = precision_score(y_dl_test, y_dl_pred, average='micro')
recall = recall_score(y_dl_test, y_dl_pred, average='micro')
accuracy = accuracy_score(y_dl_test, y_dl_pred)
auc = roc_auc_score(y_dl_test, y_dl_proba, multi_class='ovr')

print('precision: ', precision)
print('recall: ', recall)
print('accuracy: ', accuracy)
print('auc: ', auc)
print('macro_fl_delivery:',round(f1_score(y_dl_test, y_dl_pred,average='macro'),3))
print(classification_report(y_dl_test, y_dl_pred))

best estimator KNeighborsClassifier(n_neighbors=13)
precision:  0.8209606986899564
recall:  0.8209606986899564
accuracy:  0.8209606986899564
auc:  0.866363128045371
macro_fl_delivery: 0.73
              precision    recall  f1-score   support

           0       0.86      0.91      0.89       351
           1       0.65      0.51      0.57       107

    accuracy                           0.82       458
   macro avg       0.75      0.71      0.73       458
weighted avg       0.81      0.82      0.81       458



In [6]:

param={'n_neighbors':[1, 5,10,11,12,13,14,15,16,17,18,20,25]}

knn_pd = KNeighborsClassifier()

best_knn=GridSearchCV(estimator=knn_pd, 
                      param_grid=param, 
                      cv=kf,
                      scoring=make_scorer(f1_score , average='macro'))
best_knn.fit(X, y_pd)
print('best estimator',best_knn.best_estimator_)
knn_pd = best_knn.best_estimator_
knn_pd.fit(X, y_pd)


y_pd_pred = knn_pd.predict(X_test)
y_pd_proba = knn_pd.predict_proba(X_test)[:, 1]

precision = precision_score(y_pd_test, y_pd_pred, average='micro')
recall = recall_score(y_pd_test, y_pd_pred, average='micro')
accuracy = accuracy_score(y_pd_test, y_pd_pred)
auc = roc_auc_score(y_pd_test, y_pd_proba, multi_class='ovr')

print('precision: ', precision)
print('recall: ', recall)
print('accuracy: ', accuracy)
print('auc: ', auc)
print('macro_fl_delivery:',round(f1_score(y_pd_test, y_pd_pred,average='macro'),3))
print(classification_report(y_pd_test, y_pd_pred))

best estimator KNeighborsClassifier(n_neighbors=15)
precision:  0.8275109170305677
recall:  0.8275109170305677
accuracy:  0.8275109170305677
auc:  0.9107463260423786
macro_fl_delivery: 0.808
              precision    recall  f1-score   support

           0       0.74      0.75      0.75       154
           1       0.87      0.87      0.87       304

    accuracy                           0.83       458
   macro avg       0.81      0.81      0.81       458
weighted avg       0.83      0.83      0.83       458



In [7]:
param={'n_neighbors':[1, 5,10,11,12,13,14,15,16,17,18,20,25]}

knn_sv = KNeighborsClassifier()

best_knn=GridSearchCV(estimator=knn_sv, 
                      param_grid=param, 
                      cv=kf,
                      scoring=make_scorer(f1_score , average='macro'))
best_knn.fit(X, y_sv)
print('best estimator',best_knn.best_estimator_)
knn_sv = best_knn.best_estimator_
knn_sv.fit(X, y_sv)



y_sv_pred = knn_sv.predict(X_test)
y_sv_proba = knn_sv.predict_proba(X_test)[:, 1]

precision = precision_score(y_sv_test, y_sv_pred, average='micro')
recall = recall_score(y_sv_test, y_sv_pred, average='micro')
accuracy = accuracy_score(y_sv_test, y_sv_pred)
auc = roc_auc_score(y_sv_test, y_sv_proba, multi_class='ovr')

print('precision: ', precision)
print('recall: ', recall)
print('accuracy: ', accuracy)
print('auc: ', round(auc,3))
print('macro_fl_delivery:',round(f1_score(y_sv_test, y_sv_pred,average='macro'),3))
print(classification_report(y_sv_test, y_sv_pred))

best estimator KNeighborsClassifier(n_neighbors=13)
precision:  0.8231441048034934
recall:  0.8231441048034934
accuracy:  0.8231441048034934
auc:  0.832
macro_fl_delivery: 0.692
              precision    recall  f1-score   support

           0       0.83      0.97      0.89       350
           1       0.76      0.36      0.49       108

    accuracy                           0.82       458
   macro avg       0.80      0.66      0.69       458
weighted avg       0.81      0.82      0.80       458

