In [1]:
import pandas as pd
import numpy as np
import pickle
import warnings
warnings.filterwarnings('ignore')

## Inputs

In [2]:
file = '../input/negativecomments/train_data.csv'
testfile = '../input/negativecomments/test_data.csv'

rename_cols = {'Bad Delivery': 'delivery', 
               'Product':'product', 
               'Customer Service':'service'}

use_cols = ['review_id', 'review_content'] + list(rename_cols.values())

#seed = 5153
#valid_ratio = 0.1

#cv = 5
#num_eval = 5 
#score = 'accuracy'

## Data Preparation

In [3]:
def change_datatype(df):
    df['review_id'] = df['review_id'].astype('int64')
    df['delivery'] = df['delivery'].astype('int32')
    df['product'] = df['product'].astype('int32')
    df['service'] = df['service'].astype('int32')
    return df

In [4]:
df = pd.read_csv(file)
df = change_datatype(df.rename(columns=rename_cols)[use_cols])
print(df.shape)
df.head(3)

(1371, 5)


Unnamed: 0,review_id,review_content,delivery,product,service
0,10880,"Received well packed, Mask quality ok. But ear...",0,1,0
1,16711,If you are getting one mask then the price is ...,0,1,0
2,6351,Delivery was fast! Giving it 3 stars because I...,0,1,0


In [5]:
df_test = pd.read_csv(testfile)
df_test = change_datatype(df_test.rename(columns=rename_cols)[use_cols])
print(df_test.shape)
df_test.head(3)

(153, 5)


Unnamed: 0,review_id,review_content,delivery,product,service
0,16748,it does not cover the entire nose,0,1,0
1,7936,Items received in good condition. \nFast deliv...,0,1,0
2,35144,Order was not shipped by ship by date. 2 days ...,0,0,1


In [6]:
def add_target_label(df):
    df['label'] = df['delivery']*100 + df['product']*10 + df['service']
    df['label'] = '00'+df['label'].astype('str')
    df['label'] = df['label'].apply(lambda x: x[-3:])
    return df

In [7]:
#add target label
df = add_target_label(df)
df.loc[df.review_id==13310, 'label'] = '010'
print(df['label'].unique())
df.head(3)

['010' '001' '100' '110' '101' '011' '111']


Unnamed: 0,review_id,review_content,delivery,product,service,label
0,10880,"Received well packed, Mask quality ok. But ear...",0,1,0,10
1,16711,If you are getting one mask then the price is ...,0,1,0,10
2,6351,Delivery was fast! Giving it 3 stars because I...,0,1,0,10


In [8]:
#add target label
df_test = add_target_label(df_test)
df_test.loc[df_test.review_id==40625, 'label'] = '010'
print(df_test['label'].unique())
df_test.head(3)

['010' '001' '101' '110' '100' '011' '111']


Unnamed: 0,review_id,review_content,delivery,product,service,label
0,16748,it does not cover the entire nose,0,1,0,10
1,7936,Items received in good condition. \nFast deliv...,0,1,0,10
2,35144,Order was not shipped by ship by date. 2 days ...,0,0,1,1


## Merge DataFrames

In [9]:
df_all = pd.concat([df, df_test])
print(df.shape, df_test.shape, df_all.shape)
df_all

(1371, 6) (153, 6) (1524, 6)


Unnamed: 0,review_id,review_content,delivery,product,service,label
0,10880,"Received well packed, Mask quality ok. But ear...",0,1,0,010
1,16711,If you are getting one mask then the price is ...,0,1,0,010
2,6351,Delivery was fast! Giving it 3 stars because I...,0,1,0,010
3,33128,Too big for me,0,1,0,010
4,1304,Mask quality consider not bad for the price. D...,0,0,1,001
...,...,...,...,...,...,...
148,8674,Box a little damaged but product is still fine...,1,0,0,100
149,13363,Boxes deformed and stickers were already torn ...,1,0,1,101
150,32220,"Fits perfectly for guys, but adult size might ...",0,1,0,010
151,39695,i understand you have to change the type of ma...,0,0,1,001


# (1) BERT Feature Extraction - BertTokenizer

In [10]:
!pip install transformers



In [11]:
import torch
from transformers import BertTokenizer, BertModel

In [12]:
# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# Load pre-trained model tokenizer (vocabulary)
model = BertModel.from_pretrained('bert-base-uncased')

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [13]:
#Step 1: Tokenization
tokenized = df_all['review_content'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))

In [14]:
#Step 2: Padding
max_len = 0
for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)

padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])
print(padded.shape)
print(padded[0])
print(tokenizer.convert_ids_to_tokens(0))

(1524, 86)
[  101  2363  2092  8966  1010  7308  3737  7929  1012  2021  4540 15932
  2978  4558  1998  1996  4451  6218  2081  1997  6081  3430  2029  2003
  2025  3733  2000  6260  3426  2025  4906  2007  1996  2227  7919   102
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0]
[PAD]


In [15]:
#Step 3: Masking
attention_mask = np.where(padded != 0, 1, 0)
attention_mask.shape

(1524, 86)

In [16]:
%%time
#Step 4: Use the pre-trained BERT Model
input_ids = torch.tensor(padded)  
attention_mask = torch.tensor(attention_mask)

with torch.no_grad():
    ## the output here is a tuple of (last_hidden_state, pooler_output)
    model_outputs = model(input_ids, attention_mask=attention_mask)

CPU times: user 3min 29s, sys: 1min 2s, total: 4min 31s
Wall time: 4min 33s


In [17]:
# get the last_hidden state
last_hidden_state = model_outputs[0].numpy()
last_hidden_state.shape

(1524, 86, 768)

### Save results

In [18]:
def save_to_dic(keys, values):
    dic = {}
    for i, key in enumerate(keys):
        dic[key] = list(values[i])
    return dic

In [19]:
def save_dic_to_txt(save_name, dic):
    f = open(save_name, 'w')
    f.write(str(dic))
    f.close()

In [20]:
bert_dic_2D = save_to_dic(df_all['review_id'], last_hidden_state)
#save_dic_to_txt('negative_bert_dic_2D.txt', bert_dic_2D)
f_save = open('bert_dic_2D.pkl', 'wb')
pickle.dump(bert_dic_2D, f_save)
f_save.close()

In [21]:
#Get the embeddings for position 0
bert_layer0 = last_hidden_state[:,0,:]
print(bert_layer0.shape)

bert_layer0_dic = save_to_dic(df_all['review_id'], bert_layer0)
save_dic_to_txt('bert_layer0_dic.txt', bert_layer0_dic)

(1524, 768)


In [22]:
#Get the embeddings for avg
bert_avg = np.mean(last_hidden_state,axis=1)
print(bert_avg.shape)

bert_avg_dic = save_to_dic(df_all['review_id'], bert_avg)
save_dic_to_txt('bert_avg_dic.txt', bert_avg_dic)

(1524, 768)


In [23]:
#Get the embeddings for max
bert_max = np.max(last_hidden_state,axis=1)
print(bert_max.shape)

bert_max_dic = save_to_dic(df_all['review_id'], bert_max)
save_dic_to_txt('bert_max_dic.txt', bert_max_dic)

(1524, 768)


# (2) BERT Feature Extraction - DistilBert

In [24]:
from transformers import DistilBertTokenizer, DistilBertModel
# Load pre-trained model tokenizer (vocabulary)
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
# Load pre-trained model (weights)
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/442 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [25]:
tokenized = df_all['review_content'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True, max_length=512, truncation=True)))
max_len = 0
for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)

padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])
attention_mask = np.where(padded != 0, 1, 0)
attention_mask.shape

(1524, 86)

In [26]:
max_len = 0
for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)

padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])

In [27]:
attention_mask = np.where(padded != 0, 1, 0)
attention_mask.shape

(1524, 86)

In [28]:
%%time
feature_list_layer0 = []
feature_list_avg = []
feature_list_max = []
feature_list_2d = []
with torch.no_grad():
    for batch_idx in range(0,padded.shape[0]):
        #BERT check 10 sample each time.
        input_ids = torch.tensor(padded[batch_idx:batch_idx+1])  
        used_attention_mask = torch.tensor(attention_mask[batch_idx:batch_idx+1])
        last_hidden_states = model(input_ids, attention_mask=used_attention_mask)
        #Get the embeddings for the [CLS] tag (position is 0)
        features_layer0 = last_hidden_states[0][:,0,:].numpy()
        features_avg = np.mean(last_hidden_states[0].numpy(),axis=1)
        features_max = np.max(last_hidden_states[0].numpy(),axis=1)
        features_2d = last_hidden_states[0].numpy()
        feature_list_layer0.append(features_layer0)
        feature_list_avg.append(features_avg)
        feature_list_max.append(features_max)
        feature_list_2d.append(features_2d)

CPU times: user 2min 42s, sys: 781 ms, total: 2min 43s
Wall time: 2min 45s


In [29]:
# preprare features
features_layer0 = np.vstack(feature_list_layer0)
print(features_layer0.shape)

features_avg = np.vstack(feature_list_avg)
print(features_avg.shape)

features_max = np.vstack(feature_list_max)
print(features_max.shape)

(1524, 768)
(1524, 768)
(1524, 768)


In [30]:
# save to files
distil_bert_layer0_dic = save_to_dic(df_all['review_id'], features_layer0)
save_dic_to_txt('distil_bert_layer0_dic.txt', distil_bert_layer0_dic)

distil_bert_avg_dic = save_to_dic(df_all['review_id'], features_avg)
save_dic_to_txt('distil_bert_avg_dic.txt', distil_bert_avg_dic)

distil_bert_max_dic = save_to_dic(df_all['review_id'], features_max)
save_dic_to_txt('distil_bert_max_dic.txt', distil_bert_max_dic)

distil_bert_dic_2D = save_to_dic(df_all['review_id'], feature_list_2d)
#save_dic_to_txt('negative_distil_bert_dic_2D.txt', distil_bert_dic_2D)
f_save = open('distil_full_bert_dic_2D.pkl', 'wb')
pickle.dump(distil_bert_dic_2D, f_save)
f_save.close()