In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm
from transformers import MarianMTModel, MarianTokenizer
from collections import defaultdict

In [2]:
data = pd.read_csv("/kaggle/input/fake-reviews-swm/fake reviews dataset.csv")

In [3]:
# function to remove rows containing NaNs and irrelevant columns
def data_clean(data):
    # total rows in data
    print("Total no. of rows in data:", len(data))
    
    # checking numbers of NaNs in all columns
    print("Total no. of NaNs in 'category' column:", data['category'].isnull().sum())
    print("Total no. of NaNs in 'rating' column:", data['rating'].isnull().sum())
    print("Total no. of NaNs in 'label' column:", data['label'].isnull().sum())
    print("Total no. of NaNs in 'text' column:", data['text_'].isnull().sum())
    
    # removing the rows containing NaNs in 'category' column
    data = data[data['category'].notna()]
    print("Total no. of rows in data after removing NaNs:", len(data))
    
    # count of CG and OR reviews
    print("Count of CG and OR reviews:", data['label'].value_counts())
    print("Ratio of CG and OR REVIEWS:", data['label'].value_counts(normalize=True))
    
    # avg number of words in review of CG and OR reviews
    data['word_count'] = data['text_'].str.split().str.len()
    print("Average count of words in CG and OR reviews:", data.groupby('label')['word_count'].mean())
    
    return data

In [4]:
data = data_clean(data)

Total no. of rows in data: 40432
Total no. of NaNs in 'category' column: 0
Total no. of NaNs in 'rating' column: 0
Total no. of NaNs in 'label' column: 0
Total no. of NaNs in 'text' column: 0
Total no. of rows in data after removing NaNs: 40432
Count of CG and OR reviews: CG    20216
OR    20216
Name: label, dtype: int64
Ratio of CG and OR REVIEWS: CG    0.5
OR    0.5
Name: label, dtype: float64
Average count of words in CG and OR reviews: label
CG    61.288237
OR    73.642610
Name: word_count, dtype: float64


In [5]:
# partitioning the data into CG and OR
data_OR = data.loc[data['label'] == 'OR']
data_OR_series = data_OR['text_'].to_list()

In [6]:
# functions for back translation (Reference - https://amitness.com/back-translation/)
target_model_name = 'Helsinki-NLP/opus-mt-en-fr'
target_tokenizer = MarianTokenizer.from_pretrained(target_model_name)
target_model = MarianMTModel.from_pretrained(target_model_name).to('cuda:0')

en_model_name = 'Helsinki-NLP/opus-mt-fr-en'
en_tokenizer = MarianTokenizer.from_pretrained(en_model_name)
en_model = MarianMTModel.from_pretrained(en_model_name).to('cuda:0')

Downloading (…)olve/main/source.spm:   0%|          | 0.00/778k [00:00<?, ?B/s]

Downloading (…)olve/main/target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]



Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/301M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

Downloading (…)olve/main/source.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

Downloading (…)olve/main/target.spm:   0%|          | 0.00/778k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/301M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

In [7]:
def translate(texts, model, tokenizer, language="fr"):
    # Prepare the text data into appropriate format for the model
    template = lambda text: f"{text}" if language == "en" else f">>{language}<< {text}"
    src_texts = [template(text) for text in texts]

    # Tokenize the texts
    encoded = tokenizer.prepare_seq2seq_batch(src_texts,return_tensors="pt").to('cuda:0')
    
    # Generate translation using model
    translated = model.generate(**encoded)

    # Convert the generated tokens indices back into text
    translated_texts = tokenizer.batch_decode(translated, skip_special_tokens=True)
    
    return translated_texts

In [8]:
def back_translate(texts, source_lang="en", target_lang="fr"):
    # Translate from source to target language
    fr_texts = translate(texts, target_model, target_tokenizer, 
                         language=target_lang)

    # Translate from target language back to source language
    back_translated_texts = translate(fr_texts, en_model, en_tokenizer, language=source_lang)
    
    return back_translated_texts

In [9]:
# breaking augmentation data into batches for easier processing
batch_size = 56 # should be a multiple of length of original data (20216)
num_batches = int(len(data_OR_series) / batch_size)

In [10]:
# function which takes a list as input and returns a dictionary of lists containing the outputs of back translation
def back_translation_output(data):
    i = 0
    d_data = defaultdict(list)
    d_data_out = defaultdict(list)
    for i in tqdm(range(num_batches)):
        d_data[i] = data[i*batch_size:(i+1)*batch_size]
        d_data_out[i] = back_translate(d_data[i])
        
    return d_data_out

In [11]:
data_CG_bt = back_translation_output(data_OR_series)

`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and targets.

Here is a short example:

model_inputs = tokenizer(src_texts, text_target=tgt_texts, ...)

If you either need to use different keyword arguments for the source and target texts, you should do two calls like
this:

model_inputs = tokenizer(src_texts, ...)
labels = tokenizer(text_target=tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.

100%|██████████| 361/361 [1:16:31<00:00, 12.72s/it]


In [12]:
data_OR.head()

Unnamed: 0,category,rating,label,text_,word_count
55,Home_and_Kitchen_5,5.0,OR,"These are just perfect, exactly what I was loo...",10
56,Home_and_Kitchen_5,5.0,OR,Such a great purchase can't beat it for the price,10
57,Home_and_Kitchen_5,5.0,OR,What can you say--- cheap and it works as inte...,10
58,Home_and_Kitchen_5,5.0,OR,"These are so nice, sturdy, like the color choi...",10
59,Home_and_Kitchen_5,5.0,OR,It is nice bowl and have had a fast shipping!,10


In [13]:
data_CG = data_OR.drop(['label', 'text_', 'word_count'], axis = 1)

In [14]:
data_CG.head()

Unnamed: 0,category,rating
55,Home_and_Kitchen_5,5.0
56,Home_and_Kitchen_5,5.0
57,Home_and_Kitchen_5,5.0
58,Home_and_Kitchen_5,5.0
59,Home_and_Kitchen_5,5.0


In [15]:
data_CG['label'] = 'CG'

In [17]:
data_CG_bt_list = list(data_CG_bt.values())

In [18]:
final_text_CG_list = []
for i in range(len(data_CG_bt_list)):
    for j in range(len(data_CG_bt_list[i])):
        final_text_CG_list.append(data_CG_bt_list[i][j])

In [21]:
data_CG['text_'] = final_text_CG_list

In [22]:
data_CG

Unnamed: 0,category,rating,label,text_
55,Home_and_Kitchen_5,5.0,CG,"They're perfect, exactly what I was looking for."
56,Home_and_Kitchen_5,5.0,CG,Such a big purchase can't beat for the price
57,Home_and_Kitchen_5,5.0,CG,What can you say... cheap and it works as plan...
58,Home_and_Kitchen_5,5.0,CG,"They are so nice, robust, like color choices too."
59,Home_and_Kitchen_5,5.0,CG,It's a good bowl and had a quick expedition!
...,...,...,...,...
40423,Clothing_Shoes_and_Jewelry_5,4.0,CG,"I don't have much opportunity to ""dress"" these..."
40425,Clothing_Shoes_and_Jewelry_5,5.0,CG,The dimensions indicated on the description ar...
40427,Clothing_Shoes_and_Jewelry_5,4.0,CG,I read a few reviews saying that this bra was ...
40429,Clothing_Shoes_and_Jewelry_5,2.0,CG,I need a 3-in-1 jacket. How is it for 3-in-1 s...


In [23]:
data_CG = data_CG.reset_index()

In [26]:
data_CG = data_CG.drop('index', axis = 1)

In [27]:
data_CG

Unnamed: 0,category,rating,label,text_
0,Home_and_Kitchen_5,5.0,CG,"They're perfect, exactly what I was looking for."
1,Home_and_Kitchen_5,5.0,CG,Such a big purchase can't beat for the price
2,Home_and_Kitchen_5,5.0,CG,What can you say... cheap and it works as plan...
3,Home_and_Kitchen_5,5.0,CG,"They are so nice, robust, like color choices too."
4,Home_and_Kitchen_5,5.0,CG,It's a good bowl and had a quick expedition!
...,...,...,...,...
20211,Clothing_Shoes_and_Jewelry_5,4.0,CG,"I don't have much opportunity to ""dress"" these..."
20212,Clothing_Shoes_and_Jewelry_5,5.0,CG,The dimensions indicated on the description ar...
20213,Clothing_Shoes_and_Jewelry_5,4.0,CG,I read a few reviews saying that this bra was ...
20214,Clothing_Shoes_and_Jewelry_5,2.0,CG,I need a 3-in-1 jacket. How is it for 3-in-1 s...


In [30]:
data_OR = data_OR.reset_index(drop = True)

In [31]:
data_OR

Unnamed: 0,category,rating,label,text_,word_count
0,Home_and_Kitchen_5,5.0,OR,"These are just perfect, exactly what I was loo...",10
1,Home_and_Kitchen_5,5.0,OR,Such a great purchase can't beat it for the price,10
2,Home_and_Kitchen_5,5.0,OR,What can you say--- cheap and it works as inte...,10
3,Home_and_Kitchen_5,5.0,OR,"These are so nice, sturdy, like the color choi...",10
4,Home_and_Kitchen_5,5.0,OR,It is nice bowl and have had a fast shipping!,10
...,...,...,...,...,...
20211,Clothing_Shoes_and_Jewelry_5,4.0,OR,This is a classy looking watch. I don't get m...,341
20212,Clothing_Shoes_and_Jewelry_5,5.0,OR,The stated dimensions on the description are o...,345
20213,Clothing_Shoes_and_Jewelry_5,4.0,OR,I had read some reviews saying that this bra r...,329
20214,Clothing_Shoes_and_Jewelry_5,2.0,OR,"You can wear the hood by itself, wear it with ...",343


In [32]:
data_OR = data_OR.drop('word_count', axis = 1)

In [34]:
common = [data_OR, data_CG]
data_final = pd.concat(common)
data_final.head()

Unnamed: 0,category,rating,label,text_
0,Home_and_Kitchen_5,5.0,OR,"These are just perfect, exactly what I was loo..."
1,Home_and_Kitchen_5,5.0,OR,Such a great purchase can't beat it for the price
2,Home_and_Kitchen_5,5.0,OR,What can you say--- cheap and it works as inte...
3,Home_and_Kitchen_5,5.0,OR,"These are so nice, sturdy, like the color choi..."
4,Home_and_Kitchen_5,5.0,OR,It is nice bowl and have had a fast shipping!


In [36]:
data_final

Unnamed: 0,category,rating,label,text_
0,Home_and_Kitchen_5,5.0,OR,"These are just perfect, exactly what I was loo..."
1,Home_and_Kitchen_5,5.0,OR,Such a great purchase can't beat it for the price
2,Home_and_Kitchen_5,5.0,OR,What can you say--- cheap and it works as inte...
3,Home_and_Kitchen_5,5.0,OR,"These are so nice, sturdy, like the color choi..."
4,Home_and_Kitchen_5,5.0,OR,It is nice bowl and have had a fast shipping!
...,...,...,...,...
20211,Clothing_Shoes_and_Jewelry_5,4.0,CG,"I don't have much opportunity to ""dress"" these..."
20212,Clothing_Shoes_and_Jewelry_5,5.0,CG,The dimensions indicated on the description ar...
20213,Clothing_Shoes_and_Jewelry_5,4.0,CG,I read a few reviews saying that this bra was ...
20214,Clothing_Shoes_and_Jewelry_5,2.0,CG,I need a 3-in-1 jacket. How is it for 3-in-1 s...


In [40]:
data_final = data_final.sample(frac=1).reset_index(drop=True)

In [42]:
data_final.to_csv('/kaggle/working/new_fake_reviews_data.csv', index = False)