# RoBERTa for Office: fine-tuning a zero-star review classifier

In [1]:
# Install Hugging Face transformers into the environment this kernel runs in.
# %pip (unlike !pip) always targets the kernel's own interpreter; the version
# is pinned to the release this notebook was originally executed with.
%pip install transformers==4.27.3

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Collecting transformers
  Downloading transformers-4.27.3-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m47.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 kB[0m [31m52.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m27.3 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.3 tokenizers-0.13.2 transformers-4.27.3


In [1]:
# imports: standard library, numerical stack, TensorFlow and scikit-learn helpers
import os 
import json
import numpy as np
import pandas as pd
import re
from tqdm import tqdm
import time
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

In [2]:
# Fetch the pre-trained "roberta-base" tokenizer and TensorFlow encoder.
# Both are created inside the GPU device scope, tokenizer first.
from transformers import RobertaTokenizer, TFRobertaModel

with tf.device('/device:GPU:0'):
    tokenizer, model = (
        RobertaTokenizer.from_pretrained("roberta-base"),
        TFRobertaModel.from_pretrained("roberta-base"),
    )

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


In [1]:
# Read the labelled reviews and show which columns the file provides.
# (An earlier variant read 'train_df_processed.csv' / 'test_df_processed.csv'
# into train_df / test_df instead.)
import pandas as pd

zero_star = pd.read_csv(filepath_or_buffer='zero_star_v1.csv')
zero_star.columns

Index(['review_id', 'full_review', 'zero_star', 'zero_star_ohe'], dtype='object')

In [2]:
# Class balance of the binary target over the full dataset.
zero_star['zero_star_ohe'].value_counts()

0    3799
1     651
Name: zero_star_ohe, dtype: int64

In [5]:
# Keep only the target and the review text, in that column order.
zero_star = zero_star.loc[:, ['zero_star_ohe', 'full_review']]

In [7]:
# Train/test split: hold out 30% of the data as the test set, stratified on
# the target so both parts keep the same class ratio.
train, test_df = train_test_split(
    zero_star,
    test_size=0.3,
    random_state=42,
    stratify=zero_star['zero_star_ohe'],
)

In [9]:
# Train/validation split: hold out 20% of the training part as the validation
# set, again stratified on the target.
train_df_1, val_df = train_test_split(
    train,
    test_size=0.2,
    random_state=42,
    stratify=train['zero_star_ohe'],
)

In [11]:
# Class balance of the validation split.
val_df['zero_star_ohe'].value_counts()

0    532
1     91
Name: zero_star_ohe, dtype: int64

In [17]:
# Show the review text of the second row of the validation split.
val_df['full_review'].iloc[1]

'Amazon would not let me give it no stars.Crashes, goes into unending self diagnosis, have to pull the plug. WiFi is mythical.I want my 12 year old HP back (it finally quit) or at least something as good.It is like going back to the early days of printers and unending problems.'

In [18]:

class create_tensor_dataset():

    '''Purpose: Create a batched tensorflow dataset for model training.

    df_to_convert: The dataframe of interest, containing the text column
        and the target column
    text: Name of the column within the dataframe containing the text
    target_col: Name of the column within the dataframe containing the
        class/label of each row
    batch_size: int, batch size
    labels: Optional list of all the labels; only stored on the instance,
        no method of this class reads it
    tokenizer: Optional tokenizer to be utilised; when omitted the
        "roberta-base" RobertaTokenizer is loaded
    max_length: int, every text is truncated/padded to this many tokens
        (default 256)
    '''

    def __init__(self, df_to_convert, text, target_col, batch_size, labels=None,
                 tokenizer=None, max_length=256):
        self.df_to_convert = df_to_convert
        self.text = text  # column containing the text
        self.target_col = target_col  # y column of interest
        self.batch_size = batch_size
        # Re-use a caller-supplied tokenizer when given, so the vocabulary is
        # not reloaded for every dataset that gets built.
        if tokenizer is None:
            tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
        self.tokenizer = tokenizer
        self.labels = labels  # list of all the labels
        self.max_length = max_length

    def generate_training_data(self, ids, masks):
        '''Tokenize every text and write the result into ``ids`` / ``masks``.

        ids, masks: pre-allocated arrays of shape (number of rows, max_length);
            row i is overwritten with the token ids / attention mask of the
            i-th text. Both arrays are modified in place and returned.
        '''
        texts = self.df_to_convert[self.text]
        with tf.device('/device:GPU:0'):
            # Wrapping the column (not the enumerate object) gives tqdm a total.
            for i, text in enumerate(tqdm(texts, total=len(texts))):
                tokenized_text = self.tokenizer.encode_plus(
                    text,  # the sequence to be encoded
                    max_length=self.max_length,  # length used for truncation/padding
                    truncation=True,  # cut sequences longer than max_length
                    padding='max_length',  # pad shorter sequences up to max_length
                    add_special_tokens=True,  # add the model's special tokens
                    return_tensors='tf'  # return TensorFlow tensors
                )
                # encode_plus returns a batch of one; take its single row.
                ids[i, :] = tokenized_text.input_ids[0]  # token ids fed to the model
                masks[i, :] = tokenized_text.attention_mask[0]  # which tokens to attend to
        return ids, masks

    def ohe_label(self):
        '''Return the values of the target column as a plain Python list.'''
        return self.df_to_convert[self.target_col].values.tolist()

    def DatasetMapFunction(self, input_ids, attn_masks, labels):
        '''Regroup one dataset element as (features dict, labels).

        The dict keys are 'input_ids' and 'attention_mask'.
        '''
        return {
            'input_ids': input_ids,
            'attention_mask': attn_masks
        }, labels

    def tensor_data(self):
        '''Tokenize the dataframe and return a shuffled, batched dataset.

        Incomplete final batches are dropped (``drop_remainder=True``).
        '''
        n_rows = len(self.df_to_convert)
        # Token ids and masks are integers, so allocate integer arrays rather
        # than numpy's float64 default.
        X_input_ids = np.zeros((n_rows, self.max_length), dtype=np.int32)
        X_attn_masks = np.zeros((n_rows, self.max_length), dtype=np.int32)
        # retrieve the input ids and attention masks
        X_input_ids, X_attn_masks = self.generate_training_data(X_input_ids, X_attn_masks)
        # creation of tensorflow dataset:
        dataset = tf.data.Dataset.from_tensor_slices((X_input_ids, X_attn_masks, self.ohe_label()))
        dataset = dataset.map(self.DatasetMapFunction)
        dataset = dataset.shuffle(10000).batch(self.batch_size, drop_remainder=True)
        return dataset

In [19]:
# Build the batched training dataset (batch size 8) from the training split.
import sys
import numpy as np

# Make numpy print arrays in full instead of summarising them.
np.set_printoptions(threshold=sys.maxsize)

train_tensor = create_tensor_dataset(
    df_to_convert=train_df_1,
    text='full_review',
    target_col='zero_star_ohe',
    batch_size=8,
)
train_dataset = train_tensor.tensor_data()

2492it [00:02, 1045.41it/s]


In [20]:
# Build the batched validation dataset (batch size 8) from the validation split.
with tf.device('/device:GPU:0'):
    val_tensor = create_tensor_dataset(
        df_to_convert=val_df,
        text='full_review',
        target_col='zero_star_ohe',
        batch_size=8,
    )
    val_dataset = val_tensor.tensor_data()

623it [00:00, 881.11it/s]


In [22]:
# Build the classifier: RoBERTa encoder -> five Dense/Dropout stages -> sigmoid.
with tf.device('/device:GPU:0'):
    # Two input layers, for input_ids and attn_masks: fixed-length (256) int32 sequences.
    input_ids = tf.keras.layers.Input(shape=(256,), name='input_ids', dtype='int32')
    attn_masks = tf.keras.layers.Input(shape=(256,), name='attention_mask', dtype='int32')

    # Element [1] of the encoder output is the pooled output.
    bert_embds = model.roberta(input_ids, attention_mask=attn_masks)[1]

    # Funnel-shaped head: layer width and dropout rate both shrink per stage.
    intermediate_layer = tf.keras.layers.Dense(1024, activation='relu', name='intermediate_layer')(bert_embds)
    intermediate_layer = tf.keras.layers.Dropout(0.5, name='dropout1')(intermediate_layer)

    intermediate_layer = tf.keras.layers.Dense(512, activation='relu', name='intermediate_layer2')(intermediate_layer)
    intermediate_layer = tf.keras.layers.Dropout(0.4, name='dropout2')(intermediate_layer)

    intermediate_layer = tf.keras.layers.Dense(256, activation='relu', name='intermediate_layer3')(intermediate_layer)
    intermediate_layer = tf.keras.layers.Dropout(0.3, name='dropout3')(intermediate_layer)

    intermediate_layer = tf.keras.layers.Dense(128, activation='relu', name='intermediate_layer4')(intermediate_layer)
    intermediate_layer = tf.keras.layers.Dropout(0.2, name='dropout4')(intermediate_layer)

    intermediate_layer = tf.keras.layers.Dense(64, activation='relu', name='intermediate_layer5')(intermediate_layer)
    intermediate_layer = tf.keras.layers.Dropout(0.1, name='dropout5')(intermediate_layer)

    # Sigmoid activation for binary classification.
    output_layer = tf.keras.layers.Dense(1, activation='sigmoid', name='output_layer')(intermediate_layer)

    # Bind the classifier to `roberta_imdb`, the name the training, prediction
    # and save cells use. `model` keeps pointing at the pre-trained encoder, so
    # this cell can be re-run without `model.roberta` disappearing.
    roberta_imdb = tf.keras.Model(inputs=[input_ids, attn_masks], outputs=output_layer)
    roberta_imdb.summary()


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 256)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 256)]        0           []                               
                                                                                                  
 roberta (TFRobertaMainLayer)   TFBaseModelOutputWi  124645632   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_mask[0][0]']         
                                tentions(last_hidde                                               
                                n_state=(None, 256,                                           

In [22]:
# Training setup: Adam optimizer, binary cross-entropy loss, accuracy metric,
# and early stopping driven by the validation loss.
from tensorflow.keras.callbacks import EarlyStopping

optim = tf.keras.optimizers.Adam(learning_rate=1e-5)
loss_func = tf.keras.losses.BinaryCrossentropy()

# `roberta_imdb` is expected to be the Keras classifier (RoBERTa encoder plus
# dense head) that is being fine-tuned.
roberta_imdb.compile(
    optimizer=optim,
    loss=loss_func,
    metrics=['accuracy'],
)

# Default-configured callback; it is not passed to fit() below.
early_stopping = EarlyStopping()
# Callback actually used: watches val_loss (lower is better) with patience 3.
custom_early_stopping = EarlyStopping(monitor='val_loss', patience=3, mode='min')

with tf.device('/device:GPU:0'):
    hist = roberta_imdb.fit(
        train_dataset,
        validation_data=val_dataset,
        epochs=100,
        callbacks=[custom_early_stopping],
    )

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100


In [23]:
def prepare_data(input_text, tokenizer, max_length=256):
    """Tokenize one text into the feature dict the classifier takes as input.

    input_text: the raw text to encode
    tokenizer: tokenizer providing ``encode_plus``
    max_length: the text is truncated/padded to this many tokens (default 256)

    Returns a dict with keys 'input_ids' and 'attention_mask', each an int32
    tensor holding a batch of one sequence.
    """
    with tf.device('/device:GPU:0'):
        token = tokenizer.encode_plus(
            input_text,
            max_length=max_length,
            truncation=True,
            padding='max_length',
            add_special_tokens=True,
            return_tensors='tf'
        )
        # Token ids and mask values are integers and the model's Input layers
        # are declared int32, so keep them integral instead of float64.
        return {
            'input_ids': tf.cast(token.input_ids, tf.int32),
            'attention_mask': tf.cast(token.attention_mask, tf.int32)
        }

def make_predictions(model, input_text, threshold, label_list=None):
    """Classify a single text with ``model`` and return the matching label.

    model: model exposing ``predict``; expected to output one probability
    input_text: the raw text to classify
    threshold: probability above which the first label is returned
    label_list: two labels; ``label_list[0]`` is returned when the predicted
        probability exceeds ``threshold``, ``label_list[1]`` otherwise.
        Defaults to ``(1, 0)`` when omitted.

    The text is encoded with the module-level ``tokenizer``.
    """
    if label_list is None:
        # Previously the None default failed on label_list[0]; fall back to
        # plain binary labels instead.
        label_list = (1, 0)
    with tf.device('/device:GPU:0'):
        processed_data = prepare_data(input_text, tokenizer)
        # verbose=0: this runs once per text, so skip the per-call progress log.
        probs = model.predict(processed_data, verbose=0)
        # predict returns an array for a batch of one; reduce it to a scalar
        # before comparing.
        probability = float(np.ravel(probs)[0])
        if probability > threshold:
            return label_list[0]
        else:
            return label_list[1]

In [24]:
# Classify every test review: label 1 when the predicted probability exceeds
# 0.5, label 0 otherwise.
label_list = [1, 0]
test_df['result'] = test_df['full_review'].apply(
    lambda review: make_predictions(roberta_imdb, review, 0.5, label_list)
)











In [25]:
# Per-class precision / recall / F1 of the predictions on the test split.
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
print(classification_report(y_true=test_df['zero_star_ohe'], y_pred=test_df['result']))

              precision    recall  f1-score   support

           0       0.99      0.98      0.99      1140
           1       0.89      0.96      0.92       195

    accuracy                           0.98      1335
   macro avg       0.94      0.97      0.96      1335
weighted avg       0.98      0.98      0.98      1335



In [26]:
# Persist the fine-tuned classifier under the 'zero_star_v1' path.
roberta_imdb.save(filepath='zero_star_v1')



INFO:tensorflow:Assets written to: zero_star_v1/assets


INFO:tensorflow:Assets written to: zero_star_v1/assets


In [39]:
# Show the text of the second test review that was predicted as class 1.
test_df.loc[test_df['result'] == 1, 'full_review'].iloc[1]

'This is the most overpriced piece of junk I have ever purchased online. It is the equivalent of a piece of cardboard with a gray spray painted surface. I cannot imagine even selling this item. I feel I am cheating the public by giving this even one star.'