# Purpose: 

Data augmentation for rating management explicit and disagreement with ratings training set

In [2]:
import pandas as pd 
import os 
import nlpaug.augmenter.word.context_word_embs as aug
from sklearn.utils import shuffle
from tqdm import tqdm
import spacy
tqdm.pandas()
# Load the English language model
nlp = spacy.load("en_core_web_sm")

In [3]:
disagreement_df = pd.read_csv('/Users/kartikvijay/Documents/MADS/Thesis pt.2/augmentation/train_dataset_disagreement_with_ratings.csv')
rating_management_df = pd.read_csv('/Users/kartikvijay/Documents/MADS/Thesis pt.2/augmentation/training_rating_management_explicit_data.csv')


In [10]:
import numpy as np
np.random.randint(0, 100, 200)

array([44, 29, 35,  0, 76, 45, 71, 11, 14, 87, 26, 26, 86, 52, 86,  5, 30,
       51, 92, 95, 17,  6,  2, 36, 65, 80, 85, 35, 39, 89, 13, 52, 49, 86,
       98, 69, 73, 76, 25, 84, 12, 92, 51, 97, 20, 72, 81, 81, 90, 91, 11,
       32, 17, 12, 65, 72, 35, 87,  7, 66, 74, 71, 19, 74, 70, 23, 85, 11,
       11, 70, 55, 78, 65, 28, 96, 44, 87, 47, 54, 95, 73, 78, 73, 81, 57,
       62, 17, 31, 95, 41, 79, 77, 17, 48, 11, 95, 22, 76, 17, 27, 44, 59,
       11, 92, 64, 63, 45,  1, 19, 25, 63, 26, 72, 61,  6, 79, 12, 25, 63,
       52, 79,  1, 73, 26, 60,  2, 83,  9,  0, 73, 58, 98, 48, 38, 51, 23,
        7, 12, 39, 69, 84, 37, 15, 29, 16, 21, 70, 11, 63, 71, 51, 24, 58,
       67, 53, 64, 98, 22, 30, 54, 29, 59, 38, 12, 31, 23, 66, 65, 84, 61,
       81, 12, 12, 44, 33, 72, 44, 68, 51, 96, 52, 56, 65,  0, 40, 43, 11,
       72,  3, 58, 77, 67, 21, 82, 52,  4, 92, 73, 25, 78])

In [8]:
# using the ContextualWordEmbsAug class from the nlpaug library to perform text augmentation. Specifically, it is creating two instances of the ContextualWordEmbsAug class:

# augmenter_sub is initialized with the model path 'roberta-base' and the action "substitute". This means that it will use the RoBERTa model to substitute words in the text with similar words.

# augmenter_insert is initialized with the model path 'roberta-base' and the action "insert". This means that it will use the RoBERTa model to insert words into the text.

# These augmenters can be used to generate variations of text data by replacing or inserting words using the contextual information from the RoBERTa model.

augmenter_sub = aug.ContextualWordEmbsAug(model_path='roberta-base', action="substitute")
augmenter_insert = aug.ContextualWordEmbsAug(model_path='roberta-base', action="insert")

In [6]:
def augment_sentence(text, augmenter): 
    """Augments a sentence by substituting or inserting words using the given augmenter.

    Args:
        text (str): The input sentence to be augmented.
        augmenter (nlpaug.augmenter.word.ContextualWordEmbsAug): The augmenter object used for augmentation.

    Returns:
        str: The augmented sentence.
    """
    # Process the text with the spacy model
    doc = nlp(text)
    new_sent = ''
    for sent in doc.sents: 
        # begin augmentation of each sentence
        # print(sent.text)
        # print('-----------------------')
        augmented_text= augmenter.augment(sent.text)
        print(augmented_text[0])
        new_sent= new_sent+ ' '+ augmented_text[0] 
    return new_sent

def augmentMyData(df,label_name, label_val,  augmenter, repetitions=1, samples=200):
    """Augments the data by generating new samples based on the given augmentation parameters.

    Args:
        df (pandas.DataFrame): The DataFrame containing the data to be augmented.
        label_name (str): The name of the label column in the DataFrame.
        label_val: The value of the label for which augmentation is performed.
        augmenter (nlpaug.augmenter.word.ContextualWordEmbsAug): The augmenter object used for augmentation.
        repetitions (int, optional): The number of times each sentence is augmented. Defaults to 1.
        samples (int, optional): The number of samples to be generated from the minority class. Defaults to 200.

    Returns:
        pandas.DataFrame: The augmented DataFrame.
    """
    augmented_texts = []
    # select only the minority class samples
    spam_df = df[df[label_name] == label_val].reset_index(drop=True) # removes unecessary index column
    for i in tqdm(np.random.randint(0, len(spam_df), samples)):
        # generating 'n_samples' augmented texts
        for _ in range(repetitions):
            # the random sentence chosen for augmentation
            augmented_text= augment_sentence(spam_df['full_review'].iloc[i], augmenter)
            
            augmented_texts.append(augmented_text)
    
    data = {
        label_name: label_val,
        'full_review': augmented_texts
    }
    aug_df = pd.DataFrame(data)
    df = shuffle(df.append(aug_df).reset_index(drop=True))
    return df

In [17]:
test = "This is a very cheaply made product . I regret buying it.Its not even worth 2$. The reviews here are completely fake. Dont buy."
augment_sentence(test, augmenter_insert)

This is a very very cheaply home made commercial product.
I really regret buying even it.
Its a not really even worth the 2$. The reviews here are completely and fake.
Dont sell buy.


' This is a very very cheaply home made commercial product. I really regret buying even it. Its a not really even worth the 2$. The reviews here are completely and fake. Dont sell buy.'

In [36]:
rating_management_df_1 = rating_management_df[['full_review', 'rating_managment_explicit_ohe']]

In [40]:
rating_management_df_1.rating_managment_explicit_ohe.value_counts()

0    6191
1      97
Name: rating_managment_explicit_ohe, dtype: int64

In [41]:
rating_management_df_2=augmentMyData(rating_management_df_1,'rating_managment_explicit_ohe', 1, augmenter_insert, samples=5000)

100%|██████████| 5000/5000 [4:06:39<00:00,  2.96s/it]    
  df = shuffle(df.append(aug_df).reset_index(drop=True))


In [42]:
rating_management_df_2.to_csv('5000_training_rating_management_explicit.csv', index = False)

In [45]:
disagreement_train_raw_df = pd.read_csv('/Users/kartikvijay/Documents/MADS/Thesis pt.2/augmentation/train_dataset_disagreement_with_ratings.csv')
disagreement_train_raw_df.disagreement_with_ratings_ohe.value_counts()

0    5988
1     300
Name: disagreement_with_ratings_ohe, dtype: int64

In [47]:
disagreement_train_raw_df


Unnamed: 0,disagreement_with_ratings_ohe,full_review
0,0,We purchased a number of these 20 pack Assorte...
1,0,We were a little concerned because a negative ...
2,0,2018 compact pages are misaligned im using a f...
3,1,cant understand WHY this thing is rated hi.It ...
4,0,I bought these for my kids to use on the white...
...,...,...
6283,0,Unfortunately my HP OfficeJet 8600 Plus reject...
6284,0,This is the first book by David Sedaris I have...
6285,0,I bought this for our RV medicine cabinet.. Th...
6286,0,Misrepresentation of a product is a pet peeve ...


In [48]:
disagreement_train_augmented=augmentMyData(disagreement_train_raw_df,'disagreement_with_ratings_ohe', 1, augmenter_insert, samples=2500)

100%|██████████| 2500/2500 [2:01:44<00:00,  2.92s/it]   
  df = shuffle(df.append(aug_df).reset_index(drop=True))


In [49]:
disagreement_train_augmented.to_csv('disagreement_with_ratings_train_augmented_dataset.csv', index = False)

In [50]:
disagreement_train_augmented_1=augmentMyData(disagreement_train_raw_df,'disagreement_with_ratings_ohe', 1, augmenter_insert, samples=3500)

100%|██████████| 3500/3500 [2:31:55<00:00,  2.60s/it]  
  df = shuffle(df.append(aug_df).reset_index(drop=True))


In [51]:
disagreement_train_augmented_1.to_csv('3500_augmented_disagreement_with_ratings.csv', index = False)