In [1]:
# load: 
import os 
import json
import numpy as np
import pandas as pd
import re
from tqdm import tqdm
import time
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

In [2]:
# read_csvs: 
rating_management_df = pd.read_csv('rating_managment_explicit_v1.csv')
disgreement_df = pd.read_csv('disagreement_with_ratings.csv')

In [5]:
rating_management_df

Unnamed: 0,review_id,full_review,rating_managment_explicit,rating_managment_explicit_ohe
0,R1CRQ4TO7TVZYI,So the image shows noodlers polar ink series a...,0,0
1,R19JEPGLD3YJSH,I don’t think I have yet to write a negative r...,0,0
2,RITV3I7304IAG,Doesn’t erase. Even after I used some elbow gr...,1,1
3,R2C71Z7CM7WQ8O,Sadly I wish I had read the reviews before buy...,0,0
4,RGF63NTC51ASI,The media could not be loaded. ...,0,0
...,...,...,...,...
4445,R24E9VLZAXT02B,I wouldn't buy this again.1. The instructions...,0,0
4446,RRQKDX7BGHW99,I printed approx. 40 9x11 pages of my toolbo...,0,0
4447,R1I08PKQ06WQ6W,"Purchased 2/7/15, date of review 3/23/15. Wis...",0,0
4448,RRF0HLVMA6TYH,"To echo prior reviews, it is a gamble when you...",0,0


In [4]:
disgreement_df

Unnamed: 0,review_id,full_review,disagreement_with_ratings,disagreement_with_ratings_ohe
0,R1CRQ4TO7TVZYI,So the image shows noodlers polar ink series a...,0.0,0
1,R19JEPGLD3YJSH,I don’t think I have yet to write a negative r...,0.0,0
2,RITV3I7304IAG,Doesn’t erase. Even after I used some elbow gr...,0.0,0
3,R2C71Z7CM7WQ8O,Sadly I wish I had read the reviews before buy...,0.0,0
4,RGF63NTC51ASI,The media could not be loaded. ...,0.0,0
...,...,...,...,...
4445,R24E9VLZAXT02B,I wouldn't buy this again.1. The instructions...,0.0,0
4446,RRQKDX7BGHW99,I printed approx. 40 9x11 pages of my toolbo...,0.0,0
4447,R1I08PKQ06WQ6W,"Purchased 2/7/15, date of review 3/23/15. Wis...",0.0,0
4448,RRF0HLVMA6TYH,"To echo prior reviews, it is a gamble when you...",0.0,0


In [16]:
# merge the 2 datasets: 
merged_df = rating_management_df.merge(disgreement_df[['review_id','disagreement_with_ratings_ohe']], on= 'review_id', how='left')

In [17]:
merged_df.disagreement_with_ratings_ohe.value_counts()

0    4288
1     162
Name: disagreement_with_ratings_ohe, dtype: int64

In [19]:
disgreement_df.disagreement_with_ratings_ohe.value_counts() # quality check in place

0    4288
1     162
Name: disagreement_with_ratings_ohe, dtype: int64

In [20]:
merged_df

Unnamed: 0,review_id,full_review,rating_managment_explicit,rating_managment_explicit_ohe,disagreement_with_ratings_ohe
0,R1CRQ4TO7TVZYI,So the image shows noodlers polar ink series a...,0,0,0
1,R19JEPGLD3YJSH,I don’t think I have yet to write a negative r...,0,0,0
2,RITV3I7304IAG,Doesn’t erase. Even after I used some elbow gr...,1,1,0
3,R2C71Z7CM7WQ8O,Sadly I wish I had read the reviews before buy...,0,0,0
4,RGF63NTC51ASI,The media could not be loaded. ...,0,0,0
...,...,...,...,...,...
4445,R24E9VLZAXT02B,I wouldn't buy this again.1. The instructions...,0,0,0
4446,RRQKDX7BGHW99,I printed approx. 40 9x11 pages of my toolbo...,0,0,0
4447,R1I08PKQ06WQ6W,"Purchased 2/7/15, date of review 3/23/15. Wis...",0,0,0
4448,RRF0HLVMA6TYH,"To echo prior reviews, it is a gamble when you...",0,0,0


In [23]:
def create_combined_label(rating_management_col, disagreement_rating_col): 
    if rating_management_col ==1 or disagreement_rating_col ==1: 
        return 1 
    return 0

In [24]:
merged_df['combine_tag']= merged_df.apply(lambda x: create_combined_label(x.rating_managment_explicit_ohe, x.disagreement_with_ratings_ohe), axis =1)

In [26]:
merged_df.combine_tag.value_counts()

0    4235
1     215
Name: combine_tag, dtype: int64

In [27]:
# load the pre-trained models: 
from transformers import RobertaTokenizer, TFRobertaModel
with tf.device('/device:GPU:0'):
    tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
    model = TFRobertaModel.from_pretrained("roberta-base")

Some layers from the model checkpoint at roberta-base were not used when initializing TFRobertaModel: ['lm_head']
- This IS expected if you are initializing TFRobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFRobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFRobertaModel were initialized from the model checkpoint at roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.


In [28]:
# train test split - split trainning into train and validation, take 10% as validation
train, test_df = train_test_split(merged_df, test_size= 0.3, random_state=42, stratify=merged_df['combine_tag'])

In [33]:
# train test split - split trainning into train and validation, take 10% as validation
train_df_1, val_df = train_test_split(train, test_size= 0.3, random_state=42, stratify=train['combine_tag'])

In [34]:
val_df.combine_tag.value_counts()

0    890
1     45
Name: combine_tag, dtype: int64

In [35]:

class create_tensor_dataset(): 
    
    '''Purpose: Create a tensorflow dataset for model training
    df_to_convert: The dataframe of interest, that contains the 
    text columns
    text: The column within the dataframe containing the text 
    batch_size: int, batch size
    tokenizer: The tokenizer to be utilised
    labels: The class/label column within the dataframe
    '''
    
    def __init__(self, df_to_convert, text,target_col, batch_size, labels=None ): 
        self.df_to_convert = df_to_convert
        self.text = text # column containing the text (here chunked_data)
        self.target_col = target_col # y column of interest here label
        self.batch_size = batch_size
        self.tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
        self.labels = labels # list of all the labels 
        
    def generate_training_data(self, ids, masks):
        for i, text in tqdm(enumerate(self.df_to_convert[self.text])):
            with tf.device('/device:GPU:0'):
                
                tokenized_text =  self.tokenizer.encode_plus(
                    text,# The sequence or batch of sequences to be encoded.
                    max_length=256, #Controls the maximum length to use by one of the truncation/padding parameters.
                    truncation=True, # Truncate to a maximum length specified with the argument max_length
                    padding='max_length', #pad to a maximum length specified with the argument max_length 
                    add_special_tokens=True,# encode the sequences with the special tokens relative to their model.
                    return_tensors='tf'   #return TensorFlow tf.constant objects.
                )
                ids[i, :] = tokenized_text.input_ids ##List of token ids to be fed to a model
                masks[i, :] = tokenized_text.attention_mask #List of indices specifying which tokens should be attended to by the model
        return ids, masks
    
    def ohe_label(self): 
         # convert label values to a list 
        return self.df_to_convert[self.target_col].values.tolist()

    def DatasetMapFunction(self, input_ids, attn_masks, labels):
        # create and return key-value pair of input ids, attention mask and the corresnponding labels
        return {
            'input_ids': input_ids,
            'attention_mask': attn_masks
        }, labels
    
    def tensor_data(self): 
        # input_ids array
        X_input_ids = np.zeros((len(self.df_to_convert), 256))
        # attention mask array
        X_attn_masks = np.zeros((len(self.df_to_convert), 256))
        # retrieve the input ids and attention masks
        X_input_ids, X_attn_masks = self.generate_training_data(X_input_ids, X_attn_masks)
        # creation of tensorflow dataset: 
        dataset = tf.data.Dataset.from_tensor_slices((X_input_ids, X_attn_masks, self.ohe_label()))
        dataset = dataset.map(self.DatasetMapFunction)
        dataset = dataset.shuffle(10000).batch(self.batch_size, drop_remainder=True)
        return dataset

In [36]:
# creation of tensorflow dataset for training
import numpy as np
import sys
np.set_printoptions(threshold=sys.maxsize)
train_tensor =create_tensor_dataset(train_df_1, 'full_review', 'combine_tag',8 )
train_dataset =train_tensor.tensor_data() 

2180it [00:02, 1004.04it/s]


In [37]:
# validation tensor dataset: 
with tf.device('/device:GPU:0'):
    val_tensor =create_tensor_dataset(val_df, 'full_review', 'combine_tag', 8)
    val_dataset =val_tensor.tensor_data() 

935it [00:00, 953.36it/s]


In [38]:

# defining 2 input layers for input_ids and attn_masks
with tf.device('/device:GPU:0'):
    input_ids = tf.keras.layers.Input(shape=(256,), name='input_ids', dtype='int32')
    attn_masks = tf.keras.layers.Input(shape=(256,), name='attention_mask', dtype='int32')
    bert_embds = model.roberta(input_ids, attention_mask=attn_masks)[1] # pooled output
    # intermediate_layer_1 = tf.keras.layers.Dropout(0.5, name="dropout")(bert_embds)# 0 -> activation layer (3D), 1 -> pooled output layer (2D)
    intermediate_layer = tf.keras.layers.Dense(1024, activation='relu', name='intermediate_layer')(bert_embds)
    # intermediate_layer_2 = tf.keras.layers.Dropout(0.4, name="dropout2")(intermediate_layer)
    output_layer = tf.keras.layers.Dense(1, activation='sigmoid', name='output_layer')(intermediate_layer) # sigmoid -> calcs probs of binary classes
    roberta_imdb = tf.keras.Model(inputs=[input_ids, attn_masks], outputs=output_layer)
    roberta_imdb.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 256)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 256)]        0           []                               
                                                                                                  
 roberta (TFRobertaMainLayer)   TFBaseModelOutputWi  124645632   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_mask[0][0]']         
                                tentions(last_hidde                                               
                                n_state=(None, 256,                                           

In [39]:
# utilization of adam optimization 
optim = tf.keras.optimizers.Adam(learning_rate=1e-5)
# utilised binary cross entropy loss 
loss_func = tf.keras.losses.BinaryCrossentropy()
roberta_imdb.compile(optimizer=optim, loss=loss_func, metrics=['accuracy'])

# early stopping - administer val_loss: 
from tensorflow.keras.callbacks import EarlyStopping
early_stopping = EarlyStopping()
custom_early_stopping = EarlyStopping(
    monitor='val_loss', 
    patience= 3,
    mode='min'
)

with tf.device('/device:GPU:0'):

    hist = roberta_imdb.fit(
        train_dataset,
        validation_data=val_dataset,
        epochs=100, 
        callbacks =[custom_early_stopping]
    )

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100


In [40]:
def prepare_data(input_text, tokenizer):
    with tf.device('/device:GPU:0'):
        
        token = tokenizer.encode_plus(
            input_text,
            max_length=256, 
            truncation=True, 
            padding='max_length', 
            add_special_tokens=True,
            return_tensors='tf'
        )
        return {
            'input_ids': tf.cast(token.input_ids, tf.float64),
            'attention_mask': tf.cast(token.attention_mask, tf.float64)
        }

def make_predictions(model, input_text,threshold, label_list=None ): 
    with tf.device('/device:GPU:0'):
        processed_data = prepare_data(input_text, tokenizer)
        probs = model.predict(processed_data)
#         print(probs)
        if probs[0]> threshold: 
            return label_list[0]
        else: 
            return label_list[1]
#     return(probs[0])

In [41]:
# tt = test_df.head(20)
label_list= [1, 0]
test_df['result']= test_df.full_review.apply(lambda x: make_predictions(roberta_imdb, x, 0.5, label_list))











In [43]:
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
print (classification_report(test_df['combine_tag'], test_df['result']))

              precision    recall  f1-score   support

           0       0.98      0.99      0.99      1271
           1       0.83      0.59      0.69        64

    accuracy                           0.97      1335
   macro avg       0.90      0.79      0.84      1335
weighted avg       0.97      0.97      0.97      1335



In [46]:
test_df[test_df['result']==1]

Unnamed: 0,review_id,full_review,rating_managment_explicit,rating_managment_explicit_ohe,disagreement_with_ratings_ohe,combine_tag,result
3711,R21E0B01DWGBBD,This is not a review of this product. I just f...,1,1,0,1,1
4191,R2KCZJ06LGQHSK,i don't know how they're getting so many poz r...,0,0,1,1,1
3772,R2D1AHZF49E45U,I purchased these pens about a year ago for a ...,0,0,0,0,1
3727,R10OM6C2QVMF5Y,"When I opened the black cartridge, it started ...",0,0,1,1,1
518,RRH3MCHXTB4LK,Ok I don't think the 5 star reviews on this pr...,2,1,0,1,1
1181,R2PYDAGEPRMRN9,After carefully researching replacement ink fo...,0,0,1,1,1
2913,RT8F9XHR2QUUQ,"The Cyan cartridge in this pack is defective, ...",1,1,0,1,1
83,RZXSYABXW4BEM,thin and incredibly flimsy. cover is thin and ...,1,1,0,1,1
2915,R8HS5PMCXZMD3,The problem with this cart is definitely the t...,0,0,1,1,1
3198,R3M7C6CYGI1QQ4,I am not sure how this got a 5 star rating.Pac...,0,0,1,1,1


In [49]:
test_df[test_df['result']==1].iloc[3].full_review

"When I opened the black cartridge, it started to leak. Stained a t-shirt. DON'T BUY THIS PRODUCT! I don't know where the five-star reviews are coming from. If I could give it zero stars, I would."

In [50]:
test_df[test_df['result']==1].iloc[4].full_review

"Ok I don't think the 5 star reviews on this product are honest. The set is extremely cheap. The case is flimsy and the compass is not well built. I doubt this will serve the needs of my son. I am generally very good at confirming 5 star reviews but fell for it hard this time around. Buyer beware!"

In [None]:
print()