In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.metrics import f1_score
import warnings
warnings.filterwarnings('ignore')

## Import Data (BERT Vectors)

In [2]:
# Full-sentence DistilBERT vectors: one CSV for the training split, one for the test split.
file = '../input/bt5153-train-test-bert-full-sentence/train_data_bert_fullsent.csv'
testfile = '../input/bt5153-train-test-bert-full-sentence/test_data_bert_fullsent.csv'

# Only the `label` column is forced to str; every other column keeps the dtype pandas infers.
df = pd.read_csv(file, dtype={'label': str})
df_test = pd.read_csv(testfile, dtype={'label': str})

# The embedding column comes back from the CSV as text, so each cell is turned
# back into a NumPy vector, in place, for both splits.
# NOTE(security): eval() executes whatever expression the cell contains. This is
# only acceptable while the CSVs are trusted; if the cells are plain Python list
# literals (TODO confirm), ast.literal_eval would be the safe drop-in.
for frame in (df, df_test):
    frame['distil_bert_avg'] = frame['distil_bert_avg'].apply(
        lambda x: np.array(eval(str(x)))
    )

In [3]:
#merge negative bert vectors
# neg_distillbert_avg = open("../input/5153-bert-feature-extraction-negative/negative_distil_bert_avg_dic.txt","r")
# neg_distillbert_avg=(neg_distillbert_avg.read())
# neg_distillbert_avg=eval(neg_distillbert_avg)

# df_neg_distillbert_avg = pd.DataFrame({'review_id' : neg_distillbert_avg.keys(), 
#                                      'neg_bert_avg' : neg_distillbert_avg.values()})

# df = pd.merge(df, df_neg_distillbert_avg, on='review_id', how='inner')
# df_test = pd.merge(df_test, df_neg_distillbert_avg, on='review_id', how='inner')
# print(df.shape)
# df.head()

## GaussianNB

In [1]:
from sklearn.metrics import precision_score, recall_score
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.metrics import classification_report

In [5]:
# Stack the per-row embedding vectors into feature matrices (one matrix row per
# DataFrame row) for the training and test splits.
X = np.array(df['distil_bert_avg'].tolist())
X_test = np.array(df_test['distil_bert_avg'].tolist())

# One label series per aspect; .copy() detaches each series from its source frame.
y_dl, y_pd, y_sv = (
    df[col].copy() for col in ('delivery', 'product', 'service')
)
y_dl_test, y_pd_test, y_sv_test = (
    df_test[col].copy() for col in ('delivery', 'product', 'service')
)

In [6]:
# Sanity check: display the training feature matrix built in the previous cell.
X

array([[-0.0467826 , -0.27473888,  0.2780265 , ..., -0.0511116 ,
        -0.0305712 ,  0.09294429],
       [ 0.1399654 , -0.11672436,  0.23911656, ..., -0.12838659,
        -0.14252678, -0.00974948],
       [ 0.07374873, -0.01620108,  0.20269494, ..., -0.06569172,
        -0.12933594, -0.1495965 ],
       ...,
       [ 0.05727965,  0.1664856 ,  0.12964745, ..., -0.04691945,
        -0.14244272,  0.01669729],
       [ 0.11460409,  0.09498134,  0.23205358, ...,  0.05434497,
        -0.09696532,  0.09951015],
       [-0.03872562, -0.10021102,  0.16911487, ...,  0.06784689,
         0.06759077,  0.11119706]])

In [7]:
# Sanity check: number of training labels for the delivery target
# (should match the number of rows in X).
y_dl.shape

(1065,)

### Delivery

In [8]:
# Fit a Gaussian Naive Bayes model on the training embeddings for the delivery label.
# `clf` is re-bound in each target's section below, so the evaluation cell that
# follows must be run right after this one.
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()  # default hyper-parameters
clf.fit(X, y_dl)

GaussianNB()

In [9]:
# Evaluate the delivery model on the test split.
# Uses whichever model `clf` currently holds: it must still be the one fitted
# on y_dl in the cell directly above, because `clf` is reused for every target.
y_dl_pred = clf.predict(X_test)
# Second predict_proba column (the score for clf.classes_[1]) is the ROC score.
y_dl_proba = clf.predict_proba(X_test)[:, 1]

# NOTE(review): with average='micro' on a single-label task, precision and
# recall both reduce to accuracy, so these three numbers are always identical.
# The informative per-class values are in classification_report below.
precision = precision_score(y_dl_test, y_dl_pred, average='micro')
recall = recall_score(y_dl_test, y_dl_pred, average='micro')
accuracy = accuracy_score(y_dl_test, y_dl_pred)
auc = roc_auc_score(y_dl_test, y_dl_proba, multi_class='ovr')

print('precision: ', precision)
print('recall: ', recall)
print('accuracy: ', accuracy)
print('auc: ', auc)
# Label fixed: it previously read "macro_fl_delivery" ("fl" was a typo for "f1").
print('macro_f1_delivery:',round(f1_score(y_dl_test, y_dl_pred,average='macro'),3))
print(classification_report(y_dl_test, y_dl_pred))

precision:  0.777292576419214
recall:  0.777292576419214
accuracy:  0.777292576419214
auc:  0.8429320765769363
macro_fl_delivery: 0.734
              precision    recall  f1-score   support

           0       0.93      0.77      0.84       351
           1       0.51      0.80      0.63       107

    accuracy                           0.78       458
   macro avg       0.72      0.79      0.73       458
weighted avg       0.83      0.78      0.79       458



### Product

In [10]:
# Fit a fresh Gaussian Naive Bayes model for the product label.
# This re-binds `clf` (replacing the previous target's model) and relies on the
# GaussianNB import made in the Delivery section.
clf = GaussianNB()
clf.fit(X, y_pd)

GaussianNB()

In [11]:
# Evaluate the product model on the test split.
# Uses whichever model `clf` currently holds: it must still be the one fitted
# on y_pd in the cell directly above, because `clf` is reused for every target.
y_pd_pred = clf.predict(X_test)
# Second predict_proba column (the score for clf.classes_[1]) is the ROC score.
y_pd_proba = clf.predict_proba(X_test)[:, 1]

# NOTE(review): with average='micro' on a single-label task, precision and
# recall both reduce to accuracy, so these three numbers are always identical.
# The informative per-class values are in classification_report below.
precision = precision_score(y_pd_test, y_pd_pred, average='micro')
recall = recall_score(y_pd_test, y_pd_pred, average='micro')
accuracy = accuracy_score(y_pd_test, y_pd_pred)
auc = roc_auc_score(y_pd_test, y_pd_proba, multi_class='ovr')

print('precision: ', precision)
print('recall: ', recall)
print('accuracy: ', accuracy)
print('auc: ', auc)
# Label fixed: this cell reports the *product* score but was printing the
# copy-pasted "macro_fl_delivery" label.
print('macro_f1_product:',round(f1_score(y_pd_test, y_pd_pred,average='macro'),3))
print(classification_report(y_pd_test, y_pd_pred))

precision:  0.7925764192139738
recall:  0.7925764192139738
accuracy:  0.7925764192139738
auc:  0.8776059466848941
macro_fl_delivery: 0.779
              precision    recall  f1-score   support

           0       0.66      0.81      0.72       154
           1       0.89      0.79      0.83       304

    accuracy                           0.79       458
   macro avg       0.77      0.80      0.78       458
weighted avg       0.81      0.79      0.80       458



### Service

In [12]:
# Fit a fresh Gaussian Naive Bayes model for the service label.
# This re-binds `clf` (replacing the previous target's model) and relies on the
# GaussianNB import made in the Delivery section.
clf = GaussianNB()
clf.fit(X, y_sv)

GaussianNB()

In [13]:
# Evaluate the service model on the test split.
# Uses whichever model `clf` currently holds: it must still be the one fitted
# on y_sv in the cell directly above, because `clf` is reused for every target.
y_sv_pred = clf.predict(X_test)
# Second predict_proba column (the score for clf.classes_[1]) is the ROC score.
y_sv_proba = clf.predict_proba(X_test)[:, 1]

# NOTE(review): with average='micro' on a single-label task, precision and
# recall both reduce to accuracy, so these three numbers are always identical.
# The informative per-class values are in classification_report below.
precision = precision_score(y_sv_test, y_sv_pred, average='micro')
recall = recall_score(y_sv_test, y_sv_pred, average='micro')
accuracy = accuracy_score(y_sv_test, y_sv_pred)
auc = roc_auc_score(y_sv_test, y_sv_proba, multi_class='ovr')

print('precision: ', precision)
print('recall: ', recall)
print('accuracy: ', accuracy)
print('auc: ', auc)
# Label fixed: this cell reports the *service* score but was printing the
# copy-pasted "macro_fl_delivery" label.
print('macro_f1_service:',round(f1_score(y_sv_test, y_sv_pred,average='macro'),3))
print(classification_report(y_sv_test, y_sv_pred))

precision:  0.7336244541484717
recall:  0.7336244541484717
accuracy:  0.7336244541484717
auc:  0.7971560846560846
macro_fl_delivery: 0.678
              precision    recall  f1-score   support

           0       0.88      0.75      0.81       350
           1       0.46      0.68      0.54       108

    accuracy                           0.73       458
   macro avg       0.67      0.71      0.68       458
weighted avg       0.78      0.73      0.75       458

