In [82]:
import pandas as pd
import numpy as np
#import re
#import nltk
#from datetime import datetime
#from nltk.corpus import stopwords
#from nltk.corpus import wordnet
#from nltk.stem import WordNetLemmatizer
#from nltk import PorterStemmer, LancasterStemmer, SnowballStemmer
#from nltk.tokenize import word_tokenize
#from nltk.tokenize import sent_tokenize
#from nltk.sentiment.vader import SentimentIntensityAnalyzer
#from sklearn.feature_extraction.text import CountVectorizer
#from sklearn.preprocessing import OneHotEncoder
#from sklearn.model_selection import cross_val_score
#import datetime

from sklearn.metrics import precision_recall_curve, average_precision_score
from sklearn.metrics import f1_score, roc_curve, auc, roc_auc_score, recall_score, precision_score, accuracy_score
from sklearn.metrics import confusion_matrix, classification_report
from sklearn import metrics

from keras.models import Model
import numpy as np
from keras.layers import Input, Dense, Embedding, Activation, Flatten
from keras.layers import Conv1D, GlobalMaxPooling1D, Dropout, Concatenate, SimpleRNN,Bidirectional

from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

import warnings
warnings.filterwarnings('ignore')

## Inputs

In [2]:
file = '../input/negativecomments/train_data.csv'
testfile = '../input/negativecomments/test_data.csv'

rename_cols = {'Bad Delivery': 'delivery', 
               'Product':'product', 
               'Customer Service':'service'}

use_cols = ['review_id', 'review_content'] + list(rename_cols.values())

#seed = 5153
#valid_ratio = 0.1

#cv = 5
#num_eval = 5 
#score = 'accuracy'

## Data Preparation

In [3]:
def change_datatype(df):
    df['review_id'] = df['review_id'].astype('int64')
    df['delivery'] = df['delivery'].astype('int32')
    df['product'] = df['product'].astype('int32')
    df['service'] = df['service'].astype('int32')
    return df

In [4]:
df = pd.read_csv(file)
df = change_datatype(df.rename(columns=rename_cols)[use_cols])
print(df.shape)
df.head(3)

(1371, 5)


Unnamed: 0,review_id,review_content,delivery,product,service
0,10880,"Received well packed, Mask quality ok. But ear...",0,1,0
1,16711,If you are getting one mask then the price is ...,0,1,0
2,6351,Delivery was fast! Giving it 3 stars because I...,0,1,0


In [5]:
df_test = pd.read_csv(testfile)
df_test = change_datatype(df_test.rename(columns=rename_cols)[use_cols])
print(df_test.shape)
df_test.head(3)

(153, 5)


Unnamed: 0,review_id,review_content,delivery,product,service
0,16748,it does not cover the entire nose,0,1,0
1,7936,Items received in good condition. \nFast deliv...,0,1,0
2,35144,Order was not shipped by ship by date. 2 days ...,0,0,1


In [6]:
def add_target_label(df):
    df['label'] = df['delivery']*100 + df['product']*10 + df['service']
    df['label'] = '00'+df['label'].astype('str')
    df['label'] = df['label'].apply(lambda x: x[-3:])
    return df

In [7]:
#add target label
df = add_target_label(df)
df.loc[df.review_id==13310, 'label'] = '010'
print(df['label'].unique())
df.head(3)

['010' '001' '100' '110' '101' '011' '111']


Unnamed: 0,review_id,review_content,delivery,product,service,label
0,10880,"Received well packed, Mask quality ok. But ear...",0,1,0,10
1,16711,If you are getting one mask then the price is ...,0,1,0,10
2,6351,Delivery was fast! Giving it 3 stars because I...,0,1,0,10


In [8]:
#add target label
df_test = add_target_label(df_test)
df_test.loc[df_test.review_id==40625, 'label'] = '010'
print(df_test['label'].unique())
df_test.head(3)

['010' '001' '101' '110' '100' '011' '111']


Unnamed: 0,review_id,review_content,delivery,product,service,label
0,16748,it does not cover the entire nose,0,1,0,10
1,7936,Items received in good condition. \nFast deliv...,0,1,0,10
2,35144,Order was not shipped by ship by date. 2 days ...,0,0,1,1


## Merge DataFrames

In [9]:
df_all = pd.concat([df, df_test])
print(df.shape, df_test.shape, df_all.shape)
df_all

(1371, 6) (153, 6) (1524, 6)


Unnamed: 0,review_id,review_content,delivery,product,service,label
0,10880,"Received well packed, Mask quality ok. But ear...",0,1,0,010
1,16711,If you are getting one mask then the price is ...,0,1,0,010
2,6351,Delivery was fast! Giving it 3 stars because I...,0,1,0,010
3,33128,Too big for me,0,1,0,010
4,1304,Mask quality consider not bad for the price. D...,0,0,1,001
...,...,...,...,...,...,...
148,8674,Box a little damaged but product is still fine...,1,0,0,100
149,13363,Boxes deformed and stickers were already torn ...,1,0,1,101
150,32220,"Fits perfectly for guys, but adult size might ...",0,1,0,010
151,39695,i understand you have to change the type of ma...,0,0,1,001


In [10]:
df_all.loc[1, 'label']

1    010
1    010
Name: label, dtype: object

In [11]:
df_all.to_csv('df_all.csv')

# (1) BERT Feature Extraction - BertTokenizer

In [12]:
import torch
from transformers import BertTokenizer, BertModel

In [13]:
# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Load pre-trained model tokenizer (vocabulary)
model = BertModel.from_pretrained('bert-base-uncased')

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/433 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [14]:
#Step 1: Tokenization
tokenized = df_all['review_content'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))

In [15]:
#Step 2: Padding
max_len = 0
for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)

padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])
print(padded.shape)
print(padded[0])
print(tokenizer.convert_ids_to_tokens(0))

(1524, 86)
[  101  2363  2092  8966  1010  7308  3737  7929  1012  2021  4540 15932
  2978  4558  1998  1996  4451  6218  2081  1997  6081  3430  2029  2003
  2025  3733  2000  6260  3426  2025  4906  2007  1996  2227  7919   102
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0]
[PAD]


In [16]:
#Step 3: Masking
attention_mask = np.where(padded != 0, 1, 0)
attention_mask.shape

(1524, 86)

In [17]:
%%time
#Step 4: Use the pre-trained BERT Model
input_ids = torch.tensor(padded)  
attention_mask = torch.tensor(attention_mask)

feature_list = []
# feature_list_avg = []
# feature_list_max = []
with torch.no_grad():
    for batch_idx in range(0,padded.shape[0]):
        #BERT check 10 sample each time.
        input_ids = torch.tensor(padded[batch_idx:batch_idx+1])  
        used_attention_mask = torch.tensor(attention_mask[batch_idx:batch_idx+1])
        last_hidden_states = model(input_ids, attention_mask=used_attention_mask)
        
        features = last_hidden_states[0][:,:,:].numpy()
        feature_list.append(features)

CPU times: user 4min 54s, sys: 917 ms, total: 4min 55s
Wall time: 4min 57s


In [18]:
df_all['bert_vec_2d'] = feature_list
df_all.head()

Unnamed: 0,review_id,review_content,delivery,product,service,label,bert_vec_2d
0,10880,"Received well packed, Mask quality ok. But ear...",0,1,0,10,"[[[-0.71355814, -0.076815225, 0.08607712, -0.0..."
1,16711,If you are getting one mask then the price is ...,0,1,0,10,"[[[-0.08130932, -0.07046799, 0.44845906, -0.05..."
2,6351,Delivery was fast! Giving it 3 stars because I...,0,1,0,10,"[[[-0.0661019, -0.16525678, 0.18251413, -0.268..."
3,33128,Too big for me,0,1,0,10,"[[[-0.45213115, 0.114288636, -0.13416263, -0.1..."
4,1304,Mask quality consider not bad for the price. D...,0,0,1,1,"[[[-0.3741603, -0.3796699, 0.5739704, 0.079023..."


In [19]:
df_all.iloc[0, -1]

array([[[-0.71355814, -0.07681523,  0.08607712, ..., -0.5741174 ,
          0.17513901, -0.01610786],
        [ 0.15930746,  0.0086108 , -0.25385755, ..., -0.04421128,
          0.74882203,  0.47337043],
        [-0.45033374,  0.01329796,  0.74146324, ..., -0.30378473,
         -0.04832435, -0.40636843],
        ...,
        [-0.12282183, -0.21913949,  0.45437086, ..., -0.369274  ,
          0.09420827,  0.11392747],
        [-0.01509623, -0.16046087,  0.41565898, ..., -0.36179453,
          0.06863031,  0.17267777],
        [-0.10153729, -0.17985332,  0.51685977, ..., -0.2876695 ,
         -0.0684239 ,  0.1650774 ]]], dtype=float32)

# (2) BERT Feature Extraction - DistilBert

In [20]:
from transformers import DistilBertTokenizer, DistilBertModel
# Load pre-trained model tokenizer (vocabulary)
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
# Load pre-trained model (weights)
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/442 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [21]:
tokenized = df_all['review_content'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True, max_length=512, truncation=True)))
max_len = 0
for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)

padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])
attention_mask = np.where(padded != 0, 1, 0)
attention_mask.shape

(1524, 86)

In [22]:
max_len = 0
for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)

padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])

In [23]:
attention_mask = np.where(padded != 0, 1, 0)
attention_mask.shape

(1524, 86)

In [24]:
%%time
feature_list_distilbert = []
# feature_list_avg = []
# feature_list_max = []
with torch.no_grad():
    for batch_idx in range(0,padded.shape[0]):
        #BERT check 10 sample each time.
        input_ids = torch.tensor(padded[batch_idx:batch_idx+1])  
        used_attention_mask = torch.tensor(attention_mask[batch_idx:batch_idx+1])
        last_hidden_states = model(input_ids, attention_mask=used_attention_mask)
        
        features = last_hidden_states[0][:,:,:].numpy()
        feature_list_distilbert.append(features)
        #Get the embeddings for the [CLS] tag (position is 0)
        #features_layer0 = last_hidden_states[0][:,0,:].numpy()
        #features_avg = np.mean(last_hidden_states[0].numpy(),axis=1)
        #features_max = np.max(last_hidden_states[0].numpy(),axis=1)
        #feature_list_layer0.append(features_layer0)
        #feature_list_avg.append(features_avg)
        #feature_list_max.append(features_max)
        

CPU times: user 2min 29s, sys: 469 ms, total: 2min 29s
Wall time: 2min 30s


In [25]:
# preprare features
# features_layer0 = np.vstack(feature_list_layer0)
# print(features_layer0.shape)

# features_avg = np.vstack(feature_list_avg)
# print(features_avg.shape)

# features_max = np.vstack(feature_list_max)
# print(features_max.shape)

features_all = np.vstack(feature_list)
print(features_all.shape)

(1524, 86, 768)


In [26]:
df_all['distilbert_vec_2d'] = feature_list_distilbert
df_all.head()

Unnamed: 0,review_id,review_content,delivery,product,service,label,bert_vec_2d,distilbert_vec_2d
0,10880,"Received well packed, Mask quality ok. But ear...",0,1,0,10,"[[[-0.71355814, -0.076815225, 0.08607712, -0.0...","[[[-0.2146607, -0.004405518, 0.1835633, -0.107..."
1,16711,If you are getting one mask then the price is ...,0,1,0,10,"[[[-0.08130932, -0.07046799, 0.44845906, -0.05...","[[[0.08736028, -0.008792455, 0.21259484, -0.18..."
2,6351,Delivery was fast! Giving it 3 stars because I...,0,1,0,10,"[[[-0.0661019, -0.16525678, 0.18251413, -0.268...","[[[0.047636323, -0.019825457, 0.14735073, -0.0..."
3,33128,Too big for me,0,1,0,10,"[[[-0.45213115, 0.114288636, -0.13416263, -0.1...","[[[-0.1816419, -0.120666966, 0.108133174, -0.2..."
4,1304,Mask quality consider not bad for the price. D...,0,0,1,1,"[[[-0.3741603, -0.3796699, 0.5739704, 0.079023...","[[[-0.24662277, -0.23489419, 0.08370317, 0.217..."


In [35]:
type(df_all.iloc[0, -1])

numpy.ndarray

In [28]:
# def save_to_dic(keys, values):
#     dic = {}
#     for i, key in enumerate(keys):
#         dic[key] = list(values[i])
#     return dic

# def save_dic_to_txt(save_name, dic):
#     f = open(save_name, 'w')
#     f.write(str(dic))
#     f.close()

In [29]:
# bert_2d_dict = save_to_dic(df_all.review_id, feature_list)
# save_dic_to_txt('bert_2d_dict.txt', bert_2d_dict)

In [30]:
# distilbert_2d_dict = save_to_dic(df_all.review_id, feature_list_distilbert)
# save_dic_to_txt('distilbert_2d_dict.txt', distilbert_2d_dict)

In [31]:
train = pd.read_csv('../input/bt5153traintestbertfullsentence/train_data_bert_fullsent.csv', dtype={'label':str})
test = pd.read_csv('../input/bt5153traintestbertfullsentence/test_data_bert_fullsent.csv', dtype={'label':str})
train.head()

Unnamed: 0,review_id,review_content,review_stars,delivery,product,service,bert_avg,bert_max,bert_layer0,distil_bert_avg,distil_bert_max,distil_bert_layer0,label
0,9161,"Delivery took more than a week, short expiry d...",2,1,1,0,"[-0.06329992, -0.32380387, 0.35608995, -0.1268...","[0.93797976, 0.25249124, 1.1054325, 0.49609792...","[-0.19600664, -0.4481723, 0.20494524, -0.39789...","[-0.046782605, -0.27473888, 0.2780265, 0.03694...","[0.4727717, 0.23103717, 0.7197178, 0.6083572, ...","[-0.022450736, -0.34166092, 0.13424423, -0.170...",110
1,8198,"Quality is so-so, loops are too huge resulting...",3,0,1,0,"[0.19020107, -0.3453789, 0.3494842, 0.14938574...","[1.1594802, 0.60506886, 1.084206, 0.8051733, 0...","[0.04546422, -0.29597518, -0.35977763, -0.4039...","[0.1399654, -0.11672436, 0.23911656, 0.1187961...","[0.8445691, 0.4961368, 0.7105234, 0.6109599, 0...","[0.12642525, -0.1857245, 0.08149249, -0.019622...",10
2,5314,Received within 3days. Well packed in a carton...,4,0,1,0,"[-0.028732965, -0.14712504, 0.36105582, 0.0254...","[1.3182856, 0.7953221, 1.3915914, 0.87499446, ...","[-0.38282838, -0.22101544, 0.6115111, -0.33658...","[0.07374873, -0.01620108, 0.20269494, 0.149083...","[0.76418287, 0.48008785, 0.97139245, 0.5615782...","[0.026645288, -0.27306008, 0.20750262, 0.05986...",10
3,17589,Not sure if these were the authentic as I brou...,3,0,1,0,"[0.3033499, 0.02644875, 0.16410081, 0.04581299...","[1.2807441, 0.85781705, 0.80655, 0.49130562, 0...","[-0.078149214, 0.117149524, 0.04559862, -0.218...","[0.18770164, 0.114881225, 0.08257794, 0.121869...","[0.9882787, 0.5043827, 0.54232115, 0.44097498,...","[0.23179623, 0.050810635, 0.07551272, -0.15587...",10
4,31446,its a fashion mask. no filter so no protection...,3,0,1,0,"[-0.18616518, -0.38300017, 0.52194715, 0.18413...","[0.65815246, 0.20897251, 1.0151551, 0.48899576...","[-0.29987, -0.26352054, 0.3190687, -0.22115439...","[-0.0903255, -0.1808794, 0.33290026, 0.2622123...","[0.66676205, 0.099603325, 0.8990194, 0.5398592...","[-0.34698877, -0.3470553, 0.3122651, 0.0037901...",10


In [32]:
train = train.merge(df_all[['review_id', 'bert_vec_2d', 'distilbert_vec_2d']], on='review_id')
train.head()

Unnamed: 0,review_id,review_content,review_stars,delivery,product,service,bert_avg,bert_max,bert_layer0,distil_bert_avg,distil_bert_max,distil_bert_layer0,label,bert_vec_2d,distilbert_vec_2d
0,9161,"Delivery took more than a week, short expiry d...",2,1,1,0,"[-0.06329992, -0.32380387, 0.35608995, -0.1268...","[0.93797976, 0.25249124, 1.1054325, 0.49609792...","[-0.19600664, -0.4481723, 0.20494524, -0.39789...","[-0.046782605, -0.27473888, 0.2780265, 0.03694...","[0.4727717, 0.23103717, 0.7197178, 0.6083572, ...","[-0.022450736, -0.34166092, 0.13424423, -0.170...",110,"[[[-0.19600664, -0.4481723, 0.20494524, -0.397...","[[[-0.022450736, -0.34166092, 0.13424423, -0.1..."
1,8198,"Quality is so-so, loops are too huge resulting...",3,0,1,0,"[0.19020107, -0.3453789, 0.3494842, 0.14938574...","[1.1594802, 0.60506886, 1.084206, 0.8051733, 0...","[0.04546422, -0.29597518, -0.35977763, -0.4039...","[0.1399654, -0.11672436, 0.23911656, 0.1187961...","[0.8445691, 0.4961368, 0.7105234, 0.6109599, 0...","[0.12642525, -0.1857245, 0.08149249, -0.019622...",10,"[[[0.04546422, -0.29597518, -0.35977763, -0.40...","[[[0.12642525, -0.1857245, 0.08149249, -0.0196..."
2,5314,Received within 3days. Well packed in a carton...,4,0,1,0,"[-0.028732965, -0.14712504, 0.36105582, 0.0254...","[1.3182856, 0.7953221, 1.3915914, 0.87499446, ...","[-0.38282838, -0.22101544, 0.6115111, -0.33658...","[0.07374873, -0.01620108, 0.20269494, 0.149083...","[0.76418287, 0.48008785, 0.97139245, 0.5615782...","[0.026645288, -0.27306008, 0.20750262, 0.05986...",10,"[[[-0.38282838, -0.22101544, 0.6115111, -0.336...","[[[0.026645288, -0.27306008, 0.20750262, 0.059..."
3,17589,Not sure if these were the authentic as I brou...,3,0,1,0,"[0.3033499, 0.02644875, 0.16410081, 0.04581299...","[1.2807441, 0.85781705, 0.80655, 0.49130562, 0...","[-0.078149214, 0.117149524, 0.04559862, -0.218...","[0.18770164, 0.114881225, 0.08257794, 0.121869...","[0.9882787, 0.5043827, 0.54232115, 0.44097498,...","[0.23179623, 0.050810635, 0.07551272, -0.15587...",10,"[[[-0.078149214, 0.117149524, 0.04559862, -0.2...","[[[0.23179623, 0.050810635, 0.07551272, -0.155..."
4,31446,its a fashion mask. no filter so no protection...,3,0,1,0,"[-0.18616518, -0.38300017, 0.52194715, 0.18413...","[0.65815246, 0.20897251, 1.0151551, 0.48899576...","[-0.29987, -0.26352054, 0.3190687, -0.22115439...","[-0.0903255, -0.1808794, 0.33290026, 0.2622123...","[0.66676205, 0.099603325, 0.8990194, 0.5398592...","[-0.34698877, -0.3470553, 0.3122651, 0.0037901...",10,"[[[-0.29987, -0.26352054, 0.3190687, -0.221154...","[[[-0.34698877, -0.3470553, 0.3122651, 0.00379..."


In [33]:
test = test.merge(df_all[['review_id', 'bert_vec_2d', 'distilbert_vec_2d']], on='review_id')

In [36]:
train.iloc[0,-1].shape

(1, 86, 768)

In [52]:
sequence_length = 86
vector_length = 768
input_shape = (sequence_length,vector_length)
model_input = Input(shape=input_shape)

In [53]:
# Convolutional Layer 
conv_blocks = []
#capture two-grams, 3-grams and 4 grams
filter_sizes = [2,3,4]
#for each filter, the number of filters
num_filters = 30
#loop over the different filter sizes
for sz in filter_sizes:
    # sz is the window size
    conv = Conv1D(filters=num_filters,
                  kernel_size=sz,
                  padding="valid",
                  activation="relu",
                  strides=1)(model_input)
    # Pooling Layer
    conv = GlobalMaxPooling1D()(conv)
    conv_blocks.append(conv)
# Fully-connected Layer
hiddenz = Concatenate()(conv_blocks) if len(conv_blocks) > 1 else conv_blocks[0]

In [54]:
# It is binary classifcation problem. We can use sigmoid layer.
# If it is multi-class classifcaiton problem, we can use softmax layer 
model_output = Dense(3, activation="sigmoid")(hiddenz)
cnn_model = Model(model_input, model_output)
cnn_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["AUC"])

In [55]:
print(cnn_model.summary())

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, 86, 768)]    0                                            
__________________________________________________________________________________________________
conv1d_4 (Conv1D)               (None, 85, 30)       46110       input_3[0][0]                    
__________________________________________________________________________________________________
conv1d_5 (Conv1D)               (None, 84, 30)       69150       input_3[0][0]                    
__________________________________________________________________________________________________
conv1d_6 (Conv1D)               (None, 83, 30)       92190       input_3[0][0]                    
______________________________________________________________________________________________

#### CNN Using Bert Vectors

In [None]:
# reduce one dimension, (1, 86, 768) -> (86, 768)
train.loc[:, 'bert_vec_2d'] = train.loc[:, 'bert_vec_2d'].apply(lambda x: x[0])
test.loc[:, 'bert_vec_2d'] = test.loc[:, 'bert_vec_2d'].apply(lambda x: x[0])

In [67]:
# Convert to numpy array
train_data = np.array(list(train.bert_vec_2d), dtype='float32')
test_data = np.array(list(test.bert_vec_2d), dtype='float32')
train_classes = np.array(train[['delivery', 'product','service']], dtype='int')
test_classes = np.array(test[['delivery', 'product','service']], dtype='int')

In [None]:
print(train_data.shape, test_data.shape)
print(train_classes.shape, test_classes.shape)

In [74]:
# Training
cnn_model.fit(train_data, train_classes,
          validation_data=(test_data, test_classes),
          batch_size=32,
          epochs=10,
          verbose=2)

Epoch 1/10
34/34 - 4s - loss: 0.4315 - auc: 0.8669 - val_loss: 0.3704 - val_auc: 0.9078
Epoch 2/10
34/34 - 0s - loss: 0.2147 - auc: 0.9729 - val_loss: 0.3207 - val_auc: 0.9299
Epoch 3/10
34/34 - 0s - loss: 0.1279 - auc: 0.9950 - val_loss: 0.3101 - val_auc: 0.9355
Epoch 4/10
34/34 - 0s - loss: 0.0861 - auc: 0.9988 - val_loss: 0.3120 - val_auc: 0.9389
Epoch 5/10
34/34 - 0s - loss: 0.0558 - auc: 0.9999 - val_loss: 0.3138 - val_auc: 0.9395
Epoch 6/10
34/34 - 0s - loss: 0.0369 - auc: 1.0000 - val_loss: 0.3168 - val_auc: 0.9415
Epoch 7/10
34/34 - 0s - loss: 0.0265 - auc: 1.0000 - val_loss: 0.3289 - val_auc: 0.9398
Epoch 8/10
34/34 - 0s - loss: 0.0204 - auc: 1.0000 - val_loss: 0.3325 - val_auc: 0.9407
Epoch 9/10
34/34 - 0s - loss: 0.0166 - auc: 1.0000 - val_loss: 0.3418 - val_auc: 0.9391
Epoch 10/10
34/34 - 0s - loss: 0.0126 - auc: 1.0000 - val_loss: 0.3443 - val_auc: 0.9395


<tensorflow.python.keras.callbacks.History at 0x7ff28841bed0>

In [81]:
prediction_cnn_bert = cnn_model.predict(test_data)

In [96]:
test['delivery_cnn_bert_predp'] = prediction_cnn_bert[:, 0]
test['product_cnn_bert_predp'] = prediction_cnn_bert[:, 1]
test['service_cnn_bert_predp'] = prediction_cnn_bert[:, 2]

In [101]:
test['delivery_cnn_bert_pred'] = test['delivery_cnn_bert_predp'].apply(lambda x: round(x))
test['product_cnn_bert_pred'] = test['product_cnn_bert_predp'].apply(lambda x: round(x))
test['service_cnn_bert_pred'] =test['service_cnn_bert_predp'].apply(lambda x: round(x))

In [104]:
def add_pred_target_label(df):
    df['label_pred'] = df['delivery_cnn_bert_pred']*100 + df['product_cnn_bert_pred']*10 + df['service_cnn_bert_pred']
    df['label_pred'] = '00'+df['label_pred'].astype('str')
    df['label_pred'] = df['label_pred'].apply(lambda x: x[-3:])
    return df

In [126]:
train.label.value_counts()

010    631
100    173
001    142
110     43
011     37
101     29
111     10
Name: label, dtype: int64

In [128]:
test.label_pred.value_counts()

010    288
100     62
001     47
000     22
101     18
011     11
110     10
Name: label_pred, dtype: int64

In [105]:
#add prediction target label
test = add_pred_target_label(test)
#df_test.loc[df_test.review_id==40625, 'label'] = '010'
print(test['label_pred'].unique())

['010' '001' '100' '101' '000' '011' '110']


In [124]:
test.head()

Unnamed: 0,review_id,review_content,review_stars,delivery,product,service,bert_avg,bert_max,bert_layer0,distil_bert_avg,...,label,bert_vec_2d,distilbert_vec_2d,delivery_cnn_bert_predp,product_cnn_bert_predp,service_cnn_bert_predp,delivery_cnn_bert_pred,product_cnn_bert_pred,service_cnn_bert_pred,label_pred
0,16748,it does not cover the entire nose,4,0,1,0,"[-0.102646254, -0.22875029, 0.18799533, 0.1218...","[0.9322276, 0.36336058, 0.5335494, 0.6365655, ...","[-0.13417566, 0.14176558, 0.060101416, -0.2363...","[-0.052312087, 0.11116923, 0.09106587, -0.0266...",...,10,"[[-0.13417566, 0.14176558, 0.060101416, -0.236...","[[[-0.25892562, 0.09141161, 0.11670131, -0.278...",0.000302,0.998952,0.000354,0,1,0,10
1,7936,Items received in good condition. \nFast deliv...,3,0,1,0,"[-0.073171206, -0.28532478, 0.4684518, 0.06098...","[0.6989626, 0.60659623, 1.2653117, 0.56227136,...","[-0.3070287, -0.16202989, 0.28549048, -0.38556...","[-0.077326566, -0.08241358, 0.25523275, 0.1806...",...,10,"[[-0.3070287, -0.16202989, 0.28549048, -0.3855...","[[[-0.26072696, -0.15078594, 0.09794032, -0.02...",0.001959,0.975257,0.006896,0,1,0,10
2,35144,Order was not shipped by ship by date. 2 days ...,1,0,0,1,"[0.033353284, -0.15982221, 0.28324175, -0.0954...","[0.78534734, 0.81594664, 1.2221552, 0.7786952,...","[-0.29875648, -0.26088548, 0.4403837, -0.42945...","[-0.044279817, -0.12734465, 0.29842582, 0.0614...",...,1,"[[-0.29875648, -0.26088548, 0.4403837, -0.4294...","[[[-0.141533, -0.18464121, 0.109193765, 0.0123...",0.01116,0.000797,0.9999,0,0,1,1
3,2842,"Never believe ship out within 12 hours , was t...",2,1,0,1,"[0.09460444, -0.21060538, 0.45771155, 0.155521...","[1.3276132, 0.4937837, 2.0165536, 0.51392186, ...","[-0.23258153, 0.108628765, 0.54131097, 0.02621...","[0.24199268, -0.06438464, 0.31322727, 0.142381...",...,101,"[[-0.23258153, 0.108628765, 0.54131097, 0.0262...","[[[-0.005784018, -0.09771741, 0.094822764, -0....",0.289861,0.006982,0.960201,0,0,1,1
4,14016,The grey masks are rougher than the blue ones ...,4,0,1,0,"[0.1367541, -0.21565422, 0.17242633, 0.0259907...","[1.1356148, 0.39919722, 0.8751235, 0.4737515, ...","[0.025354343, -0.047820117, -0.21739256, -0.03...","[0.21196146, -0.111543424, 0.048495315, 0.1669...",...,10,"[[0.025354343, -0.047820117, -0.21739256, -0.0...","[[[0.17126973, -0.03728881, -0.0028430712, -0....",0.00061,0.996696,7.8e-05,0,1,0,10


In [123]:
print('Result of Delivery from CNN with Bert vectors')
print('AUC score: {}'.format(roc_auc_score(test.delivery, test.delivery_cnn_bert_predp)))
print(classification_report(test.delivery, test.delivery_cnn_bert_pred))

print('Result of Product from CNN with Bert vectors')
print('AUC score: {}'.format(roc_auc_score(test['product'], test.product_cnn_bert_predp)))
print(classification_report(test['product'], test.product_cnn_bert_pred))

print('Result of Service from CNN with Bert vectors')
print('AUC score: {}'.format(roc_auc_score(test['service'], test.service_cnn_bert_predp)))
print(classification_report(test['service'], test.service_cnn_bert_pred))

Result of Delivery from CNN with Bert vectors
AUC score: 0.9589690337353889
              precision    recall  f1-score   support

           0       0.92      0.96      0.94       351
           1       0.84      0.71      0.77       107

    accuracy                           0.90       458
   macro avg       0.88      0.84      0.85       458
weighted avg       0.90      0.90      0.90       458

Result of Product from CNN with Bert vectors
AUC score: 0.9484577922077921
              precision    recall  f1-score   support

           0       0.85      0.82      0.83       154
           1       0.91      0.92      0.92       304

    accuracy                           0.89       458
   macro avg       0.88      0.87      0.87       458
weighted avg       0.89      0.89      0.89       458

Result of Service from CNN with Bert vectors
AUC score: 0.8631216931216931
              precision    recall  f1-score   support

           0       0.87      0.95      0.91       350
           

In [125]:
print('Combined Result from CNN with Bert vectors')
print(classification_report(test['label'], test.label_pred))

Combined Result from CNN with Bert vectors
              precision    recall  f1-score   support

         000       0.00      0.00      0.00         0
         001       0.72      0.51      0.60        67
         010       0.85      0.94      0.89       262
         011       0.55      0.27      0.36        22
         100       0.74      0.66      0.70        70
         101       0.33      0.35      0.34        17
         110       0.40      0.22      0.29        18
         111       0.00      0.00      0.00         2

    accuracy                           0.75       458
   macro avg       0.45      0.37      0.40       458
weighted avg       0.76      0.75      0.75       458



#### CNN Distil Bert Vectors

In [129]:
train.head()

Unnamed: 0,review_id,review_content,review_stars,delivery,product,service,bert_avg,bert_max,bert_layer0,distil_bert_avg,distil_bert_max,distil_bert_layer0,label,bert_vec_2d,distilbert_vec_2d
0,9161,"Delivery took more than a week, short expiry d...",2,1,1,0,"[-0.06329992, -0.32380387, 0.35608995, -0.1268...","[0.93797976, 0.25249124, 1.1054325, 0.49609792...","[-0.19600664, -0.4481723, 0.20494524, -0.39789...","[-0.046782605, -0.27473888, 0.2780265, 0.03694...","[0.4727717, 0.23103717, 0.7197178, 0.6083572, ...","[-0.022450736, -0.34166092, 0.13424423, -0.170...",110,"[[-0.19600664, -0.4481723, 0.20494524, -0.3978...","[[[-0.022450736, -0.34166092, 0.13424423, -0.1..."
1,8198,"Quality is so-so, loops are too huge resulting...",3,0,1,0,"[0.19020107, -0.3453789, 0.3494842, 0.14938574...","[1.1594802, 0.60506886, 1.084206, 0.8051733, 0...","[0.04546422, -0.29597518, -0.35977763, -0.4039...","[0.1399654, -0.11672436, 0.23911656, 0.1187961...","[0.8445691, 0.4961368, 0.7105234, 0.6109599, 0...","[0.12642525, -0.1857245, 0.08149249, -0.019622...",10,"[[0.04546422, -0.29597518, -0.35977763, -0.403...","[[[0.12642525, -0.1857245, 0.08149249, -0.0196..."
2,5314,Received within 3days. Well packed in a carton...,4,0,1,0,"[-0.028732965, -0.14712504, 0.36105582, 0.0254...","[1.3182856, 0.7953221, 1.3915914, 0.87499446, ...","[-0.38282838, -0.22101544, 0.6115111, -0.33658...","[0.07374873, -0.01620108, 0.20269494, 0.149083...","[0.76418287, 0.48008785, 0.97139245, 0.5615782...","[0.026645288, -0.27306008, 0.20750262, 0.05986...",10,"[[-0.38282838, -0.22101544, 0.6115111, -0.3365...","[[[0.026645288, -0.27306008, 0.20750262, 0.059..."
3,17589,Not sure if these were the authentic as I brou...,3,0,1,0,"[0.3033499, 0.02644875, 0.16410081, 0.04581299...","[1.2807441, 0.85781705, 0.80655, 0.49130562, 0...","[-0.078149214, 0.117149524, 0.04559862, -0.218...","[0.18770164, 0.114881225, 0.08257794, 0.121869...","[0.9882787, 0.5043827, 0.54232115, 0.44097498,...","[0.23179623, 0.050810635, 0.07551272, -0.15587...",10,"[[-0.078149214, 0.117149524, 0.04559862, -0.21...","[[[0.23179623, 0.050810635, 0.07551272, -0.155..."
4,31446,its a fashion mask. no filter so no protection...,3,0,1,0,"[-0.18616518, -0.38300017, 0.52194715, 0.18413...","[0.65815246, 0.20897251, 1.0151551, 0.48899576...","[-0.29987, -0.26352054, 0.3190687, -0.22115439...","[-0.0903255, -0.1808794, 0.33290026, 0.2622123...","[0.66676205, 0.099603325, 0.8990194, 0.5398592...","[-0.34698877, -0.3470553, 0.3122651, 0.0037901...",10,"[[-0.29987, -0.26352054, 0.3190687, -0.2211543...","[[[-0.34698877, -0.3470553, 0.3122651, 0.00379..."


In [133]:
# reduce one dimension, (1, 86, 768) -> (86, 768)
train.loc[:, 'distilbert_vec_2d'] = train.loc[:, 'distilbert_vec_2d'].apply(lambda x: x[0])
test.loc[:, 'distilbert_vec_2d'] = test.loc[:, 'distilbert_vec_2d'].apply(lambda x: x[0])

In [134]:
train.loc[0, 'distilbert_vec_2d'].shape

(86, 768)

In [135]:
# Convert to numpy array
train_data_distilbert = np.array(list(train.distilbert_vec_2d), dtype='float32')
test_data_distilbert = np.array(list(test.distilbert_vec_2d), dtype='float32')

# Training
cnn_model.fit(train_data_distilbert, train_classes,
          validation_data=(test_data_distilbert, test_classes),
          batch_size=32,
          epochs=10,
          verbose=2)

# Prediction
prediction_cnn_distilbert = cnn_model.predict(test_data_distilbert)

Epoch 1/10
34/34 - 1s - loss: 0.1431 - auc: 0.9889 - val_loss: 0.4016 - val_auc: 0.9238
Epoch 2/10
34/34 - 0s - loss: 0.0873 - auc: 0.9980 - val_loss: 0.3297 - val_auc: 0.9385
Epoch 3/10
34/34 - 0s - loss: 0.0534 - auc: 0.9997 - val_loss: 0.3290 - val_auc: 0.9378
Epoch 4/10
34/34 - 0s - loss: 0.0414 - auc: 1.0000 - val_loss: 0.3359 - val_auc: 0.9401
Epoch 5/10
34/34 - 0s - loss: 0.0296 - auc: 1.0000 - val_loss: 0.3442 - val_auc: 0.9384
Epoch 6/10
34/34 - 0s - loss: 0.0204 - auc: 1.0000 - val_loss: 0.3382 - val_auc: 0.9405
Epoch 7/10
34/34 - 0s - loss: 0.0159 - auc: 1.0000 - val_loss: 0.3443 - val_auc: 0.9409
Epoch 8/10
34/34 - 0s - loss: 0.0135 - auc: 1.0000 - val_loss: 0.3588 - val_auc: 0.9391
Epoch 9/10
34/34 - 0s - loss: 0.0117 - auc: 1.0000 - val_loss: 0.3520 - val_auc: 0.9410
Epoch 10/10
34/34 - 0s - loss: 0.0098 - auc: 1.0000 - val_loss: 0.3581 - val_auc: 0.9398


In [136]:
test['delivery_cnn_distilbert_predp'] = prediction_cnn_distilbert[:, 0]
test['product_cnn_distilbert_predp'] = prediction_cnn_distilbert[:, 1]
test['service_cnn_distilbert_predp'] = prediction_cnn_distilbert[:, 2]

test['delivery_cnn_distilbert_pred'] = test['delivery_cnn_distilbert_predp'].apply(lambda x: round(x))
test['product_cnn_distilbert_pred'] = test['product_cnn_distilbert_predp'].apply(lambda x: round(x))
test['service_cnn_distilbert_pred'] =test['service_cnn_distilbert_predp'].apply(lambda x: round(x))

In [149]:
def add_pred_target_label(df):
    df['label_distilbert_pred'] = df['delivery_cnn_distilbert_pred']*100 + df['product_cnn_distilbert_pred']*10 + df['service_cnn_distilbert_pred']
    df['label_distilbert_pred'] = '00'+df['label_distilbert_pred'].astype('str')
    df['label_distilbert_pred'] = df['label_distilbert_pred'].apply(lambda x: x[-3:])
    return df

#add prediction target label
test = add_pred_target_label(test)
#df_test.loc[df_test.review_id==40625, 'label'] = '010'
print(test['label_distilbert_pred'].unique())

['010' '001' '101' '100' '110' '011' '000']


In [150]:
print('Result of Delivery from CNN with Distil Bert vectors')
print('AUC score: {}'.format(roc_auc_score(test.delivery, test.delivery_cnn_distilbert_predp)))
print(classification_report(test.delivery, test.delivery_cnn_distilbert_pred))

print('Result of Product from CNN with Distil Bert vectors')
print('AUC score: {}'.format(roc_auc_score(test['product'], test.product_cnn_distilbert_predp)))
print(classification_report(test['product'], test.product_cnn_distilbert_pred))

print('Result of Service from CNN with Distil Bert vectors')
print('AUC score: {}'.format(roc_auc_score(test['service'], test.service_cnn_distilbert_predp)))
print(classification_report(test['service'], test.service_cnn_distilbert_pred))

Result of Delivery from CNN with Distil Bert vectors
AUC score: 0.9599275767500067
              precision    recall  f1-score   support

           0       0.94      0.95      0.94       351
           1       0.82      0.79      0.80       107

    accuracy                           0.91       458
   macro avg       0.88      0.87      0.87       458
weighted avg       0.91      0.91      0.91       458

Result of Product from CNN with Distil Bert vectors
AUC score: 0.9464499316473001
              precision    recall  f1-score   support

           0       0.86      0.77      0.81       154
           1       0.89      0.94      0.91       304

    accuracy                           0.88       458
   macro avg       0.87      0.85      0.86       458
weighted avg       0.88      0.88      0.88       458

Result of Service from CNN with Distil Bert vectors
AUC score: 0.8767195767195768
              precision    recall  f1-score   support

           0       0.89      0.95      0.91 

In [151]:
print('Combined Result from CNN with Bert vectors')
print(classification_report(test['label'], test.label_distilbert_pred))

Combined Result from CNN with Bert vectors
              precision    recall  f1-score   support

         000       0.00      0.00      0.00         0
         001       0.66      0.49      0.56        67
         010       0.88      0.94      0.91       262
         011       0.50      0.32      0.39        22
         100       0.77      0.63      0.69        70
         101       0.35      0.41      0.38        17
         110       0.35      0.50      0.41        18
         111       0.00      0.00      0.00         2

    accuracy                           0.76       458
   macro avg       0.44      0.41      0.42       458
weighted avg       0.77      0.76      0.76       458



### RNN with Bert Vectors

In [143]:
input_shape = (sequence_length, vector_length)
model_input = Input(shape=input_shape)

In [144]:
hidden_output = Bidirectional(SimpleRNN(20))(model_input)
model_output = Dense(3, activation="sigmoid")(hidden_output)

In [145]:
rnn_model = Model(model_input, model_output)
rnn_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["AUC"])

In [146]:
print(rnn_model.summary())

Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         [(None, 86, 768)]         0         
_________________________________________________________________
bidirectional (Bidirectional (None, 40)                31560     
_________________________________________________________________
dense_1 (Dense)              (None, 3)                 123       
Total params: 31,683
Trainable params: 31,683
Non-trainable params: 0
_________________________________________________________________
None


In [147]:
# Training
rnn_model.fit(train_data, train_classes,
          validation_data=(test_data, test_classes),
          batch_size=32,
          epochs=10,
          verbose=2)

Epoch 1/10
34/34 - 6s - loss: 0.5339 - auc: 0.7858 - val_loss: 0.4848 - val_auc: 0.8322
Epoch 2/10
34/34 - 4s - loss: 0.4171 - auc: 0.8844 - val_loss: 0.4340 - val_auc: 0.8662
Epoch 3/10
34/34 - 5s - loss: 0.3523 - auc: 0.9210 - val_loss: 0.3997 - val_auc: 0.8898
Epoch 4/10
34/34 - 4s - loss: 0.3047 - auc: 0.9444 - val_loss: 0.4012 - val_auc: 0.8910
Epoch 5/10
34/34 - 4s - loss: 0.2689 - auc: 0.9586 - val_loss: 0.3756 - val_auc: 0.9042
Epoch 6/10
34/34 - 4s - loss: 0.2371 - auc: 0.9700 - val_loss: 0.3640 - val_auc: 0.9108
Epoch 7/10
34/34 - 4s - loss: 0.2104 - auc: 0.9770 - val_loss: 0.3814 - val_auc: 0.9074
Epoch 8/10
34/34 - 4s - loss: 0.1787 - auc: 0.9859 - val_loss: 0.3880 - val_auc: 0.9080
Epoch 9/10
34/34 - 4s - loss: 0.1554 - auc: 0.9902 - val_loss: 0.3819 - val_auc: 0.9075
Epoch 10/10
34/34 - 5s - loss: 0.1491 - auc: 0.9905 - val_loss: 0.4274 - val_auc: 0.9036


In [157]:
# Prediction
prediction_rnn_bert = rnn_model.predict(test_data)

test['delivery_rnn_bert_predp'] = prediction_rnn_bert[:, 0]
test['product_rnn_bert_predp'] = prediction_rnn_bert[:, 1]
test['service_rnn_bert_predp'] = prediction_rnn_bert[:, 2]

test['delivery_rnn_bert_pred'] = test['delivery_rnn_bert_predp'].apply(lambda x: round(x))
test['product_rnn_bert_pred'] = test['product_rnn_bert_predp'].apply(lambda x: round(x))
test['service_rnn_bert_pred'] =test['service_rnn_bert_predp'].apply(lambda x: round(x))

In [158]:
def add_pred_target_label(df):
    df['label_bert_rnn_pred'] = df['delivery_rnn_bert_pred']*100 + df['product_rnn_bert_pred']*10 + df['service_rnn_bert_pred']
    df['label_bert_rnn_pred'] = '00'+df['label_bert_rnn_pred'].astype('str')
    df['label_bert_rnn_pred'] = df['label_bert_rnn_pred'].apply(lambda x: x[-3:])
    return df

#add prediction target label
test = add_pred_target_label(test)
#df_test.loc[df_test.review_id==40625, 'label'] = '010'
print(test['label_bert_rnn_pred'].unique())

['010' '001' '110' '100' '000' '011' '101']


In [160]:
print('Result of Delivery from rnn with Bert vectors')
print('AUC score: {}'.format(roc_auc_score(test.delivery, test.delivery_rnn_bert_predp)))
print(classification_report(test.delivery, test.delivery_rnn_bert_pred))

print('Result of Product from rnn with Bert vectors')
print('AUC score: {}'.format(roc_auc_score(test['product'], test.product_rnn_bert_predp)))
print(classification_report(test['product'], test.product_rnn_bert_pred))

print('Result of Service from rnn with Bert vectors')
print('AUC score: {}'.format(roc_auc_score(test['service'], test.service_rnn_bert_predp)))
print(classification_report(test['service'], test.service_rnn_bert_pred))

Result of Delivery from rnn with Bert vectors
AUC score: 0.9078733658172911
              precision    recall  f1-score   support

           0       0.89      0.93      0.91       351
           1       0.73      0.64      0.68       107

    accuracy                           0.86       458
   macro avg       0.81      0.78      0.80       458
weighted avg       0.86      0.86      0.86       458

Result of Product from rnn with Bert vectors
AUC score: 0.9094113123718386
              precision    recall  f1-score   support

           0       0.86      0.53      0.66       154
           1       0.80      0.96      0.87       304

    accuracy                           0.81       458
   macro avg       0.83      0.74      0.77       458
weighted avg       0.82      0.81      0.80       458

Result of Service from rnn with Bert vectors
AUC score: 0.8475661375661375
              precision    recall  f1-score   support

           0       0.82      0.99      0.90       350
           

In [161]:
print('Combined Result from RNN with Bert vectors')
print(classification_report(test['label'], test.label_bert_rnn_pred))

Combined Result from RNN with Bert vectors
              precision    recall  f1-score   support

         000       0.00      0.00      0.00         0
         001       0.73      0.24      0.36        67
         010       0.76      0.94      0.84       262
         011       0.08      0.05      0.06        22
         100       0.62      0.57      0.60        70
         101       0.50      0.12      0.19        17
         110       0.24      0.33      0.28        18
         111       0.00      0.00      0.00         2

    accuracy                           0.68       458
   macro avg       0.37      0.28      0.29       458
weighted avg       0.67      0.68      0.65       458



### RNN Using Distil Bert Vectors

In [162]:
# Training
rnn_model.fit(train_data_distilbert, train_classes,
          validation_data=(test_data_distilbert, test_classes),
          batch_size=32,
          epochs=10,
          verbose=2)

Epoch 1/10
34/34 - 4s - loss: 0.2502 - auc: 0.9588 - val_loss: 0.3657 - val_auc: 0.9154
Epoch 2/10
34/34 - 4s - loss: 0.2107 - auc: 0.9717 - val_loss: 0.3614 - val_auc: 0.9189
Epoch 3/10
34/34 - 4s - loss: 0.1808 - auc: 0.9797 - val_loss: 0.3602 - val_auc: 0.9209
Epoch 4/10
34/34 - 5s - loss: 0.1495 - auc: 0.9882 - val_loss: 0.3799 - val_auc: 0.9170
Epoch 5/10
34/34 - 4s - loss: 0.1387 - auc: 0.9893 - val_loss: 0.3687 - val_auc: 0.9207
Epoch 6/10
34/34 - 4s - loss: 0.1067 - auc: 0.9955 - val_loss: 0.3801 - val_auc: 0.9214
Epoch 7/10
34/34 - 4s - loss: 0.1080 - auc: 0.9948 - val_loss: 0.3943 - val_auc: 0.9147
Epoch 8/10
34/34 - 4s - loss: 0.1013 - auc: 0.9958 - val_loss: 0.4134 - val_auc: 0.9164
Epoch 9/10
34/34 - 4s - loss: 0.0729 - auc: 0.9986 - val_loss: 0.3951 - val_auc: 0.9209
Epoch 10/10
34/34 - 4s - loss: 0.0807 - auc: 0.9978 - val_loss: 0.4193 - val_auc: 0.9181


<tensorflow.python.keras.callbacks.History at 0x7ff238e12250>

In [164]:
# Prediction
prediction_rnn_distilbert = rnn_model.predict(test_data_distilbert)

test['delivery_rnn_distilbert_predp'] = prediction_rnn_distilbert[:, 0]
test['product_rnn_distilbert_predp'] = prediction_rnn_distilbert[:, 1]
test['service_rnn_distilbert_predp'] = prediction_rnn_distilbert[:, 2]

test['delivery_rnn_distilbert_pred'] = test['delivery_rnn_distilbert_predp'].apply(lambda x: round(x))
test['product_rnn_distilbert_pred'] = test['product_rnn_distilbert_predp'].apply(lambda x: round(x))
test['service_rnn_distilbert_pred'] =test['service_rnn_distilbert_predp'].apply(lambda x: round(x))

In [165]:
def add_pred_target_label(df):
    df['label_distilbert_rnn_pred'] = df['delivery_rnn_distilbert_pred']*100 + df['product_rnn_distilbert_pred']*10 + df['service_rnn_distilbert_pred']
    df['label_distilbert_rnn_pred'] = '00'+df['label_distilbert_rnn_pred'].astype('str')
    df['label_distilbert_rnn_pred'] = df['label_distilbert_rnn_pred'].apply(lambda x: x[-3:])
    return df

#add prediction target label
test = add_pred_target_label(test)
#df_test.loc[df_test.review_id==40625, 'label'] = '010'
print(test['label_distilbert_rnn_pred'].unique())

['010' '001' '100' '011' '000' '101' '110' '111']


In [166]:
print('Result of Delivery from rnn with Distil Bert vectors')
print('AUC score: {}'.format(roc_auc_score(test.delivery, test.delivery_rnn_distilbert_predp)))
print(classification_report(test.delivery, test.delivery_rnn_distilbert_pred))

print('Result of Product from rnn with Distil Bert vectors')
print('AUC score: {}'.format(roc_auc_score(test['product'], test.product_rnn_distilbert_predp)))
print(classification_report(test['product'], test.product_rnn_distilbert_pred))

print('Result of Service from rnn with Distil Bert vectors')
print('AUC score: {}'.format(roc_auc_score(test['service'], test.service_rnn_distilbert_predp)))
print(classification_report(test['service'], test.service_rnn_distilbert_pred))

Result of Delivery from rnn with Distil Bert vectors
AUC score: 0.9071012061666267
              precision    recall  f1-score   support

           0       0.90      0.93      0.92       351
           1       0.74      0.67      0.71       107

    accuracy                           0.87       458
   macro avg       0.82      0.80      0.81       458
weighted avg       0.87      0.87      0.87       458

Result of Product from rnn with Distil Bert vectors
AUC score: 0.9236372180451128
              precision    recall  f1-score   support

           0       0.81      0.71      0.76       154
           1       0.86      0.92      0.89       304

    accuracy                           0.85       458
   macro avg       0.84      0.81      0.82       458
weighted avg       0.85      0.85      0.84       458

Result of Service from rnn with Distil Bert vectors
AUC score: 0.8524338624338625
              precision    recall  f1-score   support

           0       0.86      0.95      0.90 

In [167]:
print('Combined Result from RNN with Distil Bert vectors')
print(classification_report(test['label'], test.label_distilbert_rnn_pred))

Combined Result from RNN with Distil Bert vectors
              precision    recall  f1-score   support

         000       0.00      0.00      0.00         0
         001       0.63      0.43      0.51        67
         010       0.82      0.91      0.86       262
         011       0.29      0.18      0.22        22
         100       0.63      0.63      0.63        70
         101       0.12      0.06      0.08        17
         110       0.22      0.22      0.22        18
         111       0.00      0.00      0.00         2

    accuracy                           0.70       458
   macro avg       0.34      0.30      0.32       458
weighted avg       0.69      0.70      0.69       458



In [170]:
test.to_csv('test_cnn_rnn_pred.csv', index=False)