#### Code to artificially generate reviews
Note: took 3hrs on a GPU to generate 10k reviews

In [50]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
from textaugment import EDA
from nltk.tokenize import sent_tokenize
import random
import csv
import tqdm
DEBUG = False

#Adapted from: https://github.com/Vamsi995/Paraphrase-Generator#


In [48]:
df = pd.read_csv("./Books_rating.csv")
df.head()


Unnamed: 0,Id,Title,Price,User_id,profileName,review/helpfulness,review/score,review/time,review/summary,review/text
0,1882931173,Its Only Art If Its Well Hung!,,AVCGYZL8FQQTD,"Jim of Oz ""jim-of-oz""",7/7,4.0,940636800,Nice collection of Julie Strain images,This is only for Julie Strain fans. It's a col...
1,826414346,Dr. Seuss: American Icon,,A30TK6U7DNS82R,Kevin Killian,10/10,5.0,1095724800,Really Enjoyed It,I don't care much for Dr. Seuss but after read...
2,826414346,Dr. Seuss: American Icon,,A3UH4UZ4RSVO82,John Granger,10/11,5.0,1078790400,Essential for every personal and Public Library,"If people become the books they read and if ""t..."
3,826414346,Dr. Seuss: American Icon,,A2MVUWT453QH61,"Roy E. Perry ""amateur philosopher""",7/7,4.0,1090713600,Phlip Nel gives silly Seuss a serious treatment,"Theodore Seuss Geisel (1904-1991), aka &quot;D..."
4,826414346,Dr. Seuss: American Icon,,A22X4XUPKF66MR,"D. H. Richards ""ninthwavestore""",3/3,4.0,1107993600,Good academic overview,Philip Nel - Dr. Seuss: American IconThis is b...


In [54]:
# print(len(df['Title'].unique()))
#~212,404 books#
unique_titles = df['Title'].unique()
f = open('gen_text_review.csv', 'w', newline='')
csvoutputfile = csv.writer(f)
csvoutputfile.writerow(['Title','OriginalReview','GeneratedReview'])
num_reviews_per_book  = 3

eda_augment = EDA()
tokenizer = AutoTokenizer.from_pretrained("Vamsi/T5_Paraphrase_Paws") 
model = AutoModelForSeq2SeqLM.from_pretrained("Vamsi/T5_Paraphrase_Paws").cuda()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

for booktitle in tqdm.tqdm(unique_titles):
    df_out = df[df["Title"].str.lower().isin([booktitle.lower()])]
    list_of_reviews = df_out['review/text'].sample(n = min(len(df_out),num_reviews_per_book)).to_list()

    #If not enough reviews, use the same 1, augment with synonym swap
    if len(list_of_reviews)<num_reviews_per_book:
       text_to_augment = list_of_reviews[0]
       syn_augmented = eda_augment.synonym_replacement(text_to_augment)
       if DEBUG:
        print(text_to_augment)
        print(syn_augmented)
    
    #Shuffle sentences here as the paraphrasing method does not make very dissimilar texts
    shuffled_list_of_reviews = []
    for idx, r in enumerate(list_of_reviews):
        sentences = sent_tokenize(r)
        # if more than 5 sentences (very long review) randomly drop 1 sentence?
        if len(sentences)>=5:
            sentences.pop(random.randrange(len(sentences))) 
        shuffled_list_of_reviews.append(''.join([str(w) for w in random.sample(sentences, len(sentences))]))
    
    #Run inference on T5 (NLP model) to paraphrase the shuffled review#
    for idx, r in enumerate(shuffled_list_of_reviews):
       write_out_data = []
       write_out_data.append(booktitle) #book title

       text_input =  "paraphrase: " + r + " </s>"
       encoding = tokenizer.encode_plus(text_input,pad_to_max_length=True, return_tensors="pt")
       input_ids, attention_masks = encoding["input_ids"].to(device), encoding["attention_mask"].to(device)
       outputs = model.generate(
           input_ids=input_ids, attention_mask=attention_masks,
           max_length=256,
           do_sample=True,
           top_k=150,
           top_p=0.95,
           early_stopping=True,
           num_return_sequences=1
        )
       for output in outputs:
            line = tokenizer.decode(output, skip_special_tokens=True,clean_up_tokenization_spaces=True)
            if DEBUG:
                print("\nOriginal:",list_of_reviews[idx], "\nShuffled:", r, "\nParaphrased:", line)
                print()
            write_out_data.append(list_of_reviews[idx])
            write_out_data.append(line)
       csvoutputfile.writerow(write_out_data)
f.close()

  5%|▌         | 5/100 [00:19<06:02,  3.81s/it]Token indices sequence length is longer than the specified maximum sequence length for this model (1426 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 100/100 [06:25<00:00,  3.85s/it]
