# RoBERTa combined office and general for wrong_buying classifier

In [1]:
# install transformers
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com


In [1]:
# load: 

import os 
import json
import numpy as np
import pandas as pd
import re
from tqdm import tqdm
import time
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
tf.random.set_seed(1234)

In [2]:
# load the pre-trained models: 
from transformers import RobertaTokenizer, TFRobertaModel
with tf.device('/device:GPU:0'):
    tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
    model = TFRobertaModel.from_pretrained("roberta-base")

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


In [1]:
import pandas as pd
import os
train_path = '/home/ec2-user/SageMaker/amazon_thesis_detectron/general_training_data'
wrong_buying = pd.read_csv(os.path.join(train_path,'wrong_buying_general.csv'))
wrong_buying.columns

Index(['review_id', 'full_review', 'wrong_buying', 'wrong_buying_ohe'], dtype='object')

In [2]:
wrong_buying['sentence_length']= wrong_buying['full_review'].apply(lambda x: len(x.split()))
def create_sentence_list(df, label): 
    df_1 = df[df[label]==1]
    return df_1['sentence_length'].tolist()
wrong = create_sentence_list(wrong_buying, 'wrong_buying_ohe')
# Calculate the average
w = sum(wrong) / len(wrong)

print("Average:", w)

Average: 126.87869822485207


In [5]:
wrong_buying.wrong_buying_ohe.value_counts()

0    10216
1     1014
Name: wrong_buying_ohe, dtype: int64

In [6]:
wrong_buying[wrong_buying['wrong_buying_ohe']==1].iloc[39].full_review

"I purchased this paper trimmer based on reviews and wish I hadn't.  The blade leaves ragged edges to any paper cut, even thin copy paper which I tried to see how it would do.  Will not cut cardstock cleanly or completely at all.  The blade is either not sharp enough or long enough to make a clean cut.Would never purchase again nor recommend."

In [7]:
wrong_buying= wrong_buying[['wrong_buying_ohe', 'full_review']]

In [9]:
# train test split - split trainning into train and validation, take 10% as validation
from sklearn.model_selection import train_test_split
train, test_df = train_test_split(wrong_buying, test_size= 0.3, random_state=42, stratify=wrong_buying['wrong_buying_ohe'])

In [10]:
# train test split - split trainning into train and validation, take 10% as validation
train_df_1, val_df = train_test_split(train, test_size= 0.2, random_state=42, stratify=train['wrong_buying_ohe'])

In [12]:
train_df_1.wrong_buying_ohe.value_counts()

0    5720
1     568
Name: wrong_buying_ohe, dtype: int64

In [10]:

class create_tensor_dataset(): 
    
    '''Purpose: Create a tensorflow dataset for model training
    df_to_convert: The dataframe of interest, that contains the 
    text columns
    text: The column within the dataframe containing the text 
    batch_size: int, batch size
    tokenizer: The tokenizer to be utilised
    labels: The class/label column within the dataframe
    '''
    
    def __init__(self, df_to_convert, text,target_col, batch_size, labels=None ): 
        self.df_to_convert = df_to_convert
        self.text = text # column containing the text (here chunked_data)
        self.target_col = target_col # y column of interest here label
        self.batch_size = batch_size
        self.tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
        self.labels = labels # list of all the labels 
        
    def generate_training_data(self, ids, masks):
        for i, text in tqdm(enumerate(self.df_to_convert[self.text])):
            with tf.device('/device:GPU:0'):
                
                tokenized_text =  self.tokenizer.encode_plus(
                    text,# The sequence or batch of sequences to be encoded.
                    max_length=256, #Controls the maximum length to use by one of the truncation/padding parameters.
                    truncation=True, # Truncate to a maximum length specified with the argument max_length
                    padding='max_length', #pad to a maximum length specified with the argument max_length 
                    add_special_tokens=True,# encode the sequences with the special tokens relative to their model.
                    return_tensors='tf'   #return TensorFlow tf.constant objects.
                )
                ids[i, :] = tokenized_text.input_ids ##List of token ids to be fed to a model
                masks[i, :] = tokenized_text.attention_mask #List of indices specifying which tokens should be attended to by the model
        return ids, masks
    
    def ohe_label(self): 
         # convert label values to a list 
        return self.df_to_convert[self.target_col].values.tolist()

    def DatasetMapFunction(self, input_ids, attn_masks, labels):
        # create and return key-value pair of input ids, attention mask and the corresnponding labels
        return {
            'input_ids': input_ids,
            'attention_mask': attn_masks
        }, labels
    
    def tensor_data(self): 
        # input_ids array
        X_input_ids = np.zeros((len(self.df_to_convert), 256))
        # attention mask array
        X_attn_masks = np.zeros((len(self.df_to_convert), 256))
        # retrieve the input ids and attention masks
        X_input_ids, X_attn_masks = self.generate_training_data(X_input_ids, X_attn_masks)
        # creation of tensorflow dataset: 
        dataset = tf.data.Dataset.from_tensor_slices((X_input_ids, X_attn_masks, self.ohe_label()))
        dataset = dataset.map(self.DatasetMapFunction)
        dataset = dataset.shuffle(10000).batch(self.batch_size, drop_remainder=True)
        return dataset

In [11]:
# creation of tensorflow dataset for training
import numpy as np
import sys
np.set_printoptions(threshold=sys.maxsize)
train_tensor =create_tensor_dataset(train_df_1, 'full_review', 'wrong_buying_ohe',8 )
train_dataset =train_tensor.tensor_data() 

6288it [00:07, 851.62it/s]


In [12]:
# validation tensor dataset: 
with tf.device('/device:GPU:0'):
    val_tensor =create_tensor_dataset(val_df, 'full_review', 'wrong_buying_ohe', 8)
    val_dataset =val_tensor.tensor_data() 

1573it [00:02, 744.53it/s]


In [13]:

# defining 2 input layers for input_ids and attn_masks
with tf.device('/device:GPU:0'):
    input_ids = tf.keras.layers.Input(shape=(256,), name='input_ids', dtype='int32')
    attn_masks = tf.keras.layers.Input(shape=(256,), name='attention_mask', dtype='int32')
    bert_embds = model.roberta(input_ids, attention_mask=attn_masks)[1] # pooled output
    intermediate_layer = tf.keras.layers.Dense(512, activation='relu', name='intermediate_layer')(bert_embds)
    intermediate_layer_2 = tf.keras.layers.Dense(1024, activation='relu', name='intermediate_layer_2')(intermediate_layer)
    intermediate_layer_3= tf.keras.layers.Dense(1024, activation='relu', name='intermediate_layer_3')(intermediate_layer_2)
    output_layer = tf.keras.layers.Dense(1, activation='sigmoid', name='output_layer')(intermediate_layer_3) # sigmoid -> calcs probs of binary classes
    model = tf.keras.Model(inputs=[input_ids, attn_masks], outputs=output_layer)
    model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 256)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 256)]        0           []                               
                                                                                                  
 roberta (TFRobertaMainLayer)   TFBaseModelOutputWi  124645632   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_mask[0][0]']         
                                tentions(last_hidde                                               
                                n_state=(None, 256,                                           

In [14]:
# utilization of adam optimization 
optim = tf.keras.optimizers.Adam(learning_rate=1e-5)
# utilised binary cross entropy loss 
loss_func = tf.keras.losses.BinaryCrossentropy()
model.compile(optimizer=optim, loss=loss_func, metrics=['accuracy'])

# early stopping - administer val_loss: 
from tensorflow.keras.callbacks import EarlyStopping
early_stopping = EarlyStopping()
custom_early_stopping = EarlyStopping(
    monitor='val_loss', 
    patience= 5,
    mode='min'
)

with tf.device('/device:GPU:0'):

    hist = model.fit(
        train_dataset,
        validation_data=val_dataset,
        epochs=100, 
        callbacks =[custom_early_stopping]
    )

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100


In [8]:
def prepare_data(input_text, tokenizer):
    with tf.device('/device:GPU:0'):
        
        token = tokenizer.encode_plus(
            input_text,
            max_length=256, 
            truncation=True, 
            padding='max_length', 
            add_special_tokens=True,
            return_tensors='tf'
        )
        return {
            'input_ids': tf.cast(token.input_ids, tf.float64),
            'attention_mask': tf.cast(token.attention_mask, tf.float64)
        }

def make_predictions(model, input_text,threshold, label_list=None ): 
    with tf.device('/device:GPU:0'):
        processed_data = prepare_data(input_text, tokenizer)
        probs = model.predict(processed_data)
#         print(probs)
        if probs[0]> threshold: 
            return label_list[0]
        else: 
            return label_list[1]
#     return(probs[0])

In [16]:
# tt = test_df.head(20)
label_list= [1, 0]
test_df['result']= test_df.full_review.apply(lambda x: make_predictions(model, x, 0.5, label_list))























In [17]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
print (classification_report(test_df['wrong_buying_ohe'], test_df['result']))

              precision    recall  f1-score   support

           0       0.97      0.98      0.97      3065
           1       0.75      0.72      0.73       304

    accuracy                           0.95      3369
   macro avg       0.86      0.85      0.85      3369
weighted avg       0.95      0.95      0.95      3369



In [18]:
model.save('wrong_buying_general_fnl')



INFO:tensorflow:Assets written to: wrong_buying_general_fnl/assets


INFO:tensorflow:Assets written to: wrong_buying_general_fnl/assets


In [19]:
print('done')

done
