In [1]:
# Load libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential,Input
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, Bidirectional,MaxPooling1D
from keras.callbacks import EarlyStopping
from keras.layers import Dropout
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from keras.optimizers import Adam

In [2]:
# Load data
df_train=pd.read_csv('../input/bt5153-train-test-bert-full-sentence/train_data_bert_fullsent.csv',dtype={'label':str})
df_test=pd.read_csv('../input/bt5153-train-test-bert-full-sentence/test_data_bert_fullsent.csv',dtype={'label':str})

In [3]:
df_train.head()

Unnamed: 0,review_id,review_content,review_stars,delivery,product,service,bert_avg,bert_max,bert_layer0,distil_bert_avg,distil_bert_max,distil_bert_layer0,label
0,9161,"Delivery took more than a week, short expiry d...",2,1,1,0,"[-0.06329992, -0.32380387, 0.35608995, -0.1268...","[0.93797976, 0.25249124, 1.1054325, 0.49609792...","[-0.19600664, -0.4481723, 0.20494524, -0.39789...","[-0.046782605, -0.27473888, 0.2780265, 0.03694...","[0.4727717, 0.23103717, 0.7197178, 0.6083572, ...","[-0.022450736, -0.34166092, 0.13424423, -0.170...",110
1,8198,"Quality is so-so, loops are too huge resulting...",3,0,1,0,"[0.19020107, -0.3453789, 0.3494842, 0.14938574...","[1.1594802, 0.60506886, 1.084206, 0.8051733, 0...","[0.04546422, -0.29597518, -0.35977763, -0.4039...","[0.1399654, -0.11672436, 0.23911656, 0.1187961...","[0.8445691, 0.4961368, 0.7105234, 0.6109599, 0...","[0.12642525, -0.1857245, 0.08149249, -0.019622...",10
2,5314,Received within 3days. Well packed in a carton...,4,0,1,0,"[-0.028732965, -0.14712504, 0.36105582, 0.0254...","[1.3182856, 0.7953221, 1.3915914, 0.87499446, ...","[-0.38282838, -0.22101544, 0.6115111, -0.33658...","[0.07374873, -0.01620108, 0.20269494, 0.149083...","[0.76418287, 0.48008785, 0.97139245, 0.5615782...","[0.026645288, -0.27306008, 0.20750262, 0.05986...",10
3,17589,Not sure if these were the authentic as I brou...,3,0,1,0,"[0.3033499, 0.02644875, 0.16410081, 0.04581299...","[1.2807441, 0.85781705, 0.80655, 0.49130562, 0...","[-0.078149214, 0.117149524, 0.04559862, -0.218...","[0.18770164, 0.114881225, 0.08257794, 0.121869...","[0.9882787, 0.5043827, 0.54232115, 0.44097498,...","[0.23179623, 0.050810635, 0.07551272, -0.15587...",10
4,31446,its a fashion mask. no filter so no protection...,3,0,1,0,"[-0.18616518, -0.38300017, 0.52194715, 0.18413...","[0.65815246, 0.20897251, 1.0151551, 0.48899576...","[-0.29987, -0.26352054, 0.3190687, -0.22115439...","[-0.0903255, -0.1808794, 0.33290026, 0.2622123...","[0.66676205, 0.099603325, 0.8990194, 0.5398592...","[-0.34698877, -0.3470553, 0.3122651, 0.0037901...",10


In [4]:
f=open('../input/5153-bert-feature-extraction-negative/negative_bert_layer0_dic.txt','r')
neg_bert_layer0=(f.read())
neg_bert_layer0=eval(neg_bert_layer0)
df_neg_feature=pd.DataFrame({'review_id':neg_bert_layer0.keys(),
                            'neg_distil_bert_layer0':neg_bert_layer0.values()})
df_train=df_train.merge(df_neg_feature,on='review_id',how='left')
df_test=df_test.merge(df_neg_feature,on='review_id',how='left')

In [5]:
f=open('../input/5153-bert-feature-extraction-negative/negative_distil_bert_avg_dic.txt','r')
neg_bert_avg=(f.read())
neg_bert_avg=eval(neg_bert_avg)
df_neg_feature=pd.DataFrame({'review_id':neg_bert_avg.keys(),
                            'neg_distil_bert_avg':neg_bert_avg.values()})

df_train=df_train.merge(df_neg_feature,on='review_id',how='left')
df_test=df_test.merge(df_neg_feature,on='review_id',how='left')

In [6]:
def BiLstm_model(X_train, y_train):
    model = Sequential()

    model.add(Bidirectional(LSTM(256, dropout=0.1, recurrent_dropout=0.1))) 

    model.add(Dense(512, activation='relu'))  

    model.add(Dense(128, activation='relu')) 

    model.add(Dense(3, activation='sigmoid')) 

    optimizer=Adam(lr=1e-4, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)

    model.compile(loss='BinaryCrossentropy', optimizer=optimizer, metrics=['accuracy'])

    epochs = 100
    batch_size = 32

    history = model.fit(X_train, y_train, 
            epochs=epochs, 
            batch_size=batch_size,
            validation_split=0.1,
            callbacks=[EarlyStopping(monitor='val_loss', 
            patience=3, min_delta=0.0001)])
    return model

In [7]:
def get_results(model,X_test, y_test):


    results_prob=model.predict(X_test)
    results_pred=np.copy(results_prob)
    results_pred[results_prob>=0.5]=1
    results_pred[results_prob<0.5]=0
    results_pred=results_pred.astype('int')

    print('roc_auc_score_delivery:',round(roc_auc_score(list(df_test['delivery']), results_prob[:,0]),3))
    print('roc_auc_score_product:',round(roc_auc_score(list(df_test['product']), results_prob[:,1]),3))
    print('roc_auc_score_service:',round(roc_auc_score(list(df_test['service']), results_prob[:,2]),3))
    print('macro_fl_delivery:',round(f1_score(list(df_test['delivery']), results_pred[:,0],average='macro'),3))
    print('macro_fl_product:',round(f1_score(list(df_test['product']), results_pred[:,1],average='macro'),3))
    print('macro_fl_service:',round(f1_score(list(df_test['service']), results_pred[:,2],average='macro'),3))


In [8]:
def data_process_1D(feature_name):
    X_train=np.array([eval(str(x)) for x in list(df_train[feature_name])])
    X_train=X_train[:,np.newaxis,:]
    y_train=np.array(df_train[['delivery','product','service']])

    X_test=np.array([eval(str(x)) for x in list(df_test[feature_name])])
    X_test=X_test[:,np.newaxis,:]
    y_test=np.array(df_test[['delivery','product','service']])
    
    return X_train,y_train,X_test,y_test

### use distill_bert_layer0

In [9]:
X_train,y_train,X_test,y_test=data_process_1D('distil_bert_layer0')

model_1=BiLstm_model(X_train,y_train)
get_results(model_1,X_test,y_test)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
roc_auc_score_delivery: 0.899
roc_auc_score_product: 0.909
roc_auc_score_service: 0.846
macro_fl_delivery: 0.79
macro_fl_product: 0.82
macro_fl_service: 0.779


### use distill_bert_avg 

In [10]:
X_train,y_train,X_test,y_test=data_process_1D('distil_bert_avg')

model_2=BiLstm_model(X_train,y_train)
get_results(model_2,X_test,y_test)


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
roc_auc_score_delivery: 0.908
roc_auc_score_product: 0.925
roc_auc_score_service: 0.877
macro_fl_delivery: 0.805
macro_fl_product: 0.851
macro_fl_service: 0.784


### use neg_distil_bert_layer0

In [11]:
X_train,y_train,X_test,y_test=data_process_1D('neg_distil_bert_layer0')

model_3=BiLstm_model(X_train,y_train)
get_results(model_3,X_test,y_test)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
roc_auc_score_delivery: 0.896
roc_auc_score_product: 0.916
roc_auc_score_service: 0.846
macro_fl_delivery: 0.804
macro_fl_product: 0.83
macro_fl_service: 0.723


## use_neg_distllbert_avg

In [12]:
X_train,y_train,X_test,y_test=data_process_1D('neg_distil_bert_avg')

model_4=BiLstm_model(X_train,y_train)
get_results(model_4,X_test,y_test)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
roc_auc_score_delivery: 0.932
roc_auc_score_product: 0.929
roc_auc_score_service: 0.844
macro_fl_delivery: 0.838
macro_fl_product: 0.87
macro_fl_service: 0.761


# 2d FEATURE


In [13]:
twoD_feature=np.load("../input/5153-bert-feature-extraction-negative/distil_bert_dic_2D.pkl",allow_pickle=True)
df_neg_feature=pd.DataFrame({'review_id':twoD_feature.keys(),
                            'neg_2d_distill':twoD_feature.values()})
df_train=df_train.merge(df_neg_feature,on='review_id',how='left')
df_test=df_test.merge(df_neg_feature,on='review_id',how='left')

In [14]:
twoD_feature=np.load("../input/5153-bert-fullsentence-featureextraction-final/distil_full_bert_dic_2D.pkl",allow_pickle=True)
df_feature=pd.DataFrame({'review_id':twoD_feature.keys(),
                         '2d_distill':twoD_feature.values()})
df_train=df_train.merge(df_feature,on='review_id',how='left')
df_test=df_test.merge(df_feature,on='review_id',how='left')

In [15]:
df_train.columns

Index(['review_id', 'review_content', 'review_stars', 'delivery', 'product',
       'service', 'bert_avg', 'bert_max', 'bert_layer0', 'distil_bert_avg',
       'distil_bert_max', 'distil_bert_layer0', 'label',
       'neg_distil_bert_layer0', 'neg_distil_bert_avg', 'neg_2d_distill',
       '2d_distill'],
      dtype='object')

In [16]:
def data_process_2D(feature_name):
    X_train=np.array([x for x in list(df_train[feature_name])])
    X_train=X_train[:,0,:,:]
    y_train=np.array(df_train[['delivery','product','service']])

    X_test=np.array([x for x in list(df_test[feature_name])])
    X_test=X_test[:,0,:,:]
    y_test=np.array(df_test[['delivery','product','service']])
    
    return X_train,y_train,X_test,y_test

In [17]:
X_train,y_train,X_test,y_test=data_process_2D('neg_2d_distill')

model_5=BiLstm_model(X_train,y_train)
get_results(model_5,X_test,y_test)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
roc_auc_score_delivery: 0.931
roc_auc_score_product: 0.923
roc_auc_score_service: 0.831
macro_fl_delivery: 0.846
macro_fl_product: 0.836
macro_fl_service: 0.749


In [18]:
X_train,y_train,X_test,y_test=data_process_2D('2d_distill')

model_5=BiLstm_model(X_train,y_train)
get_results(model_5,X_test,y_test)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
roc_auc_score_delivery: 0.915
roc_auc_score_product: 0.924
roc_auc_score_service: 0.867
macro_fl_delivery: 0.813
macro_fl_product: 0.83
macro_fl_service: 0.747
