## Fine-tuning GPT-2 to generate reviews

In [1]:
import pandas as pd
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import numpy as np
import random
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm, trange
import torch.nn.functional as F
import csv
# from langdetect import detect

Reading in all the data and preparing it

In [2]:
reviews = pd.read_csv("../tripadvisor_dataset/reviews.csv")
reviews = reviews.applymap(str) # convert to string because there are some rows with float

# Keep only reviews shorter than 350 space-separated words; this is a rough
# proxy for staying below GPT-2's 1024-token context limit (it is a word
# count, not a token count).
reviews = reviews[reviews['review'].apply(lambda x: len(str(x).split(' ')) < 350)]

# Create a small test set to compare generated text with the reality.
# A fixed random_state keeps the held-out rows identical across kernel restarts.
test_set = reviews.sample(n = 200, random_state=42)
reviews = reviews.loc[~reviews.index.isin(test_set.index)]

# Reset the indexes (the previous index is kept as an 'index' column)
test_set = test_set.reset_index()
reviews = reviews.reset_index()

# For the test set only, keep the last 20 words in a new column, then remove
# them from the original column. The split is computed once so both columns
# are guaranteed to come from the same tokenisation.
test_words = test_set['review'].str.split()
test_set['review_end'] = test_words.str[-20:].apply(' '.join)
test_set['review'] = test_words.str[:-20].apply(' '.join)

In [3]:
reviews.columns

Index(['index', 'id', 'reviewer name', 'title', 'date', 'rating', 'review'], dtype='object')

We extract the English reviews

Code can be found here: https://www.kaggle.com/hikmatelhaj/extracting-english-reviews

In [4]:
# Load the English-only reviews and show how many there are per star rating.
# NOTE(review): this rebinds `reviews`, replacing the frame built from
# reviews.csv earlier in the notebook, so the 350-word filter and the removal
# of the test-set rows applied there do not carry over to this frame —
# confirm reviews_en.csv was produced with equivalent filtering.
reviews = pd.read_csv("reviews_en.csv")
reviews["rating"].value_counts()

5.0    12647
4.0     8595
3.0     2744
1.0     1486
2.0     1351
Name: rating, dtype: int64

We take 1000 5-star and 1000 1-star reviews to finetune the model

In [5]:
# Balanced fine-tuning subset: 1000 one-star and 1000 five-star reviews.
# random_state is fixed so the same rows are drawn on every run.
reviews = pd.concat([
    reviews.query("rating == 1.0").sample(1000, random_state=42),
    reviews.query("rating == 5.0").sample(1000, random_state=42),
])
reviews = reviews.reset_index() # reset the indices after sampling

In [6]:
test_set.head()

Unnamed: 0,index,id,reviewer name,title,date,rating,review,review_end
0,40614,3682354,453olivierv,Place to be!,"March 26, 2016",5.0,Zeer goed ! Heel,goede bediening en goede koks ! Trots op julli...
1,127453,4097420,700maudea,Goede brasserie,"December 11, 2016",4.0,Zoals ieder jaar zijn wij tijdens de beverse f...,feit dat het enorm druk was. Niet te lang moet...
2,23086,2236342,Tomas D,Muy buen lugar,"September 25, 2016",4.0,Un lugar excelente donde tomar unas,"buenas cervezas, muy buena ubicacion con vista..."
3,51771,12972530,Willy V,Etentje,"January 22, 2018",5.0,,Super gezellig. Voortreffelijke keuken Aan te ...
4,33196,2230743,annie-belgie,Allen daarheen,"July 8, 2021",5.0,"Topklasse, dat is ontegensprekelijk. Elke keer...",hier steeds erg welkom.Het eten is altijd perf...


In [7]:
class ReviewData(Dataset):
    """Pre-tokenized review texts for causal language-model fine-tuning.

    Every text is rendered as ``<|{control_code}|>{text}<|endoftext|>`` and
    encoded once, at construction time, into a 1-D tensor of token ids.

    Args:
        control_code: Value interpolated into the ``<|...|>`` prefix of every
            sample. It is formatted with an f-string, so pass a short string;
            any other object is embedded through its ``str()`` form.
        gpt2_type: Name handed to ``GPT2Tokenizer.from_pretrained``.
        max_length: Maximum number of *characters* (not tokens) kept from each
            text before it is tokenized.
        texts: Optional iterable of review texts. When omitted, the
            module-level ``reviews['review']`` column is used, which is what
            the original implementation always did.

    Entries of ``texts`` that are not ``str`` (e.g. NaN floats coming out of
    ``pd.read_csv``) are skipped, because they cannot be sliced and carry no
    review text.
    """

    def __init__(self, control_code, gpt2_type="gpt2", max_length=1024, texts=None):
        self.tokenizer = GPT2Tokenizer.from_pretrained(gpt2_type)
        if texts is None:
            # Backward-compatible fallback to the notebook-level dataframe.
            texts = reviews['review']
        self.revs = [
            torch.tensor(
                self.tokenizer.encode(f"<|{control_code}|>{row[:max_length]}<|endoftext|>")
            )
            for row in texts
            if isinstance(row, str)
        ]
        self.rev_count = len(self.revs)

    def __len__(self):
        """Return the number of encoded reviews."""
        return self.rev_count

    def __getitem__(self, item):
        """Return the token-id tensor of the review at position ``item``."""
        return self.revs[item]

In [8]:
# The first argument is the control code that gets formatted into the
# "<|...|>" prefix of every sample, so it must be a short string. Passing the
# whole reviews['review'] Series here would embed the Series' printed
# representation into each training example.
dataset = ReviewData("review", gpt2_type="gpt2")

Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [9]:
# Load the pretrained 'gpt2' tokenizer and language-model head; `model` is the
# object that is handed to train() later in the notebook.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

Downloading:   0%|          | 0.00/548M [00:00<?, ?B/s]

In [10]:
# Sequence packing helper: several short samples are concatenated into one
# longer input so that fewer forward passes are needed.
def pack_tensor(new_tensor, packed_tensor, max_seq_len):
    """Try to merge ``new_tensor`` into the running ``packed_tensor``.

    Both tensors are indexed along dimension 1 for their length.

    Returns a ``(tensor, packed_ok, remainder)`` triple:
      * nothing packed yet  -> ``(new_tensor, True, None)``
      * combined length fits -> ``(new + packed[:, 1:], True, None)``; the
        first column of the older tensor is dropped in the concatenation
      * combined length exceeds ``max_seq_len`` ->
        ``(packed_tensor, False, new_tensor)`` so the caller can emit the
        packed batch and decide what to do with the leftover sample
    """
    if packed_tensor is None:
        return new_tensor, True, None

    combined_len = new_tensor.shape[1] + packed_tensor.shape[1]
    if combined_len <= max_seq_len:
        merged = torch.cat((new_tensor, packed_tensor[:, 1:]), dim=1)
        return merged, True, None

    return packed_tensor, False, new_tensor
     

In [11]:
import os

def train(
    dataset, model, tokenizer,
    batch_size=16, epochs=10, lr=2e-5,
    max_seq_len=400, warmup_steps=200,
    gpt2_type="gpt2", output_dir=".", output_prefix="wreckgar",
    test_mode=False,save_model_on_epoch=False,
):
    """Fine-tune ``model`` on ``dataset`` with sequence packing and gradient accumulation.

    Samples are drawn one at a time (shuffled), packed together with
    ``pack_tensor`` until adding another one would exceed ``max_seq_len``
    tokens, and each packed sequence is used as both input and labels.

    Args:
        dataset: Dataset yielding 1-D token-id tensors.
        model: Language model called as ``model(ids, labels=ids)`` whose first
            output is the loss.
        tokenizer: Accepted for interface compatibility; not used here.
        batch_size: Number of packed sequences whose gradients are accumulated
            before one optimizer step.
        epochs: Number of passes over ``dataset``.
        lr: Peak learning rate for AdamW.
        max_seq_len: Maximum packed sequence length passed to ``pack_tensor``.
        warmup_steps: Optimizer steps of linear learning-rate warmup.
        gpt2_type: Accepted for interface compatibility; not used here.
        output_dir: Directory for per-epoch checkpoints.
        output_prefix: File-name prefix for per-epoch checkpoints.
        test_mode: Accepted for interface compatibility; not used here.
        save_model_on_epoch: When true, save ``model.state_dict()`` after
            every epoch as ``{output_prefix}-{epoch}.pt``.

    Returns:
        The trained model (on the device that was used for training).
    """
    # Use the GPU when there is one, otherwise fall back to the CPU instead of crashing.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    model.train()

    train_dataloader = DataLoader(dataset, batch_size=1, shuffle=True)

    optimizer = AdamW(model.parameters(), lr=lr)
    # The linear schedule needs a positive horizon: with a negative
    # num_training_steps its multiplier is clamped to 0 as soon as warmup
    # ends, which freezes the model. The exact number of optimizer steps
    # depends on how samples pack, so use the upper bound (one packed
    # sequence per sample); the learning rate then decays but never reaches 0
    # early.
    steps_per_epoch = max(1, -(-len(train_dataloader) // batch_size))
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=steps_per_epoch * epochs,
    )

    loss=0
    accumulating_batch_count = 0
    input_tensor = None

    def _accumulate(batch):
        """Run forward/backward on one packed sequence; step every ``batch_size`` calls."""
        nonlocal accumulating_batch_count
        batch = batch.to(device)
        batch_loss = model(batch, labels=batch)[0]
        batch_loss.backward()

        # Count first, then test, so a step happens after batch_size
        # accumulated sequences rather than right after the very first one.
        accumulating_batch_count += 1
        if (accumulating_batch_count % batch_size) == 0:
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
        return batch_loss

    for epoch in range(epochs):

        print(f"Training epoch {epoch}")
        print(loss)
        for entry in tqdm(train_dataloader):
            (input_tensor, carry_on, remainder) = pack_tensor(entry, input_tensor, max_seq_len)

            if carry_on:
                # Still room in the packed sequence: keep collecting samples.
                continue

            loss = _accumulate(input_tensor)
            # The sample that did not fit starts the next packed sequence
            # instead of being discarded.
            input_tensor = remainder

        # Flush whatever is still packed at the end of the epoch.
        if input_tensor is not None:
            loss = _accumulate(input_tensor)
            input_tensor = None

        if save_model_on_epoch:
            os.makedirs(output_dir, exist_ok=True)
            torch.save(
                model.state_dict(),
                os.path.join(output_dir, f"{output_prefix}-{epoch}.pt"),
            )
    return model

In [12]:
# Fine-tune with train()'s default hyperparameters and rebind `model` to the
# object it returns.
model = train(dataset, model, tokenizer)



Training epoch 0
0


2000it [01:20, 24.83it/s]


Training epoch 1
tensor(2.7957, device='cuda:0', grad_fn=<NllLossBackward0>)


2000it [01:20, 24.72it/s]


Training epoch 2
tensor(1.1757, device='cuda:0', grad_fn=<NllLossBackward0>)


2000it [01:22, 24.13it/s]


Training epoch 3
tensor(1.0292, device='cuda:0', grad_fn=<NllLossBackward0>)


2000it [01:20, 24.72it/s]


Training epoch 4
tensor(1.3510, device='cuda:0', grad_fn=<NllLossBackward0>)


2000it [01:21, 24.58it/s]


Training epoch 5
tensor(0.8160, device='cuda:0', grad_fn=<NllLossBackward0>)


2000it [01:24, 23.77it/s]


Training epoch 6
tensor(1.4777, device='cuda:0', grad_fn=<NllLossBackward0>)


2000it [01:21, 24.62it/s]


Training epoch 7
tensor(0.7970, device='cuda:0', grad_fn=<NllLossBackward0>)


2000it [01:20, 24.70it/s]


Training epoch 8
tensor(0.6082, device='cuda:0', grad_fn=<NllLossBackward0>)


2000it [01:22, 24.11it/s]


Training epoch 9
tensor(0.9392, device='cuda:0', grad_fn=<NllLossBackward0>)


2000it [01:21, 24.49it/s]


In [57]:
def generate(
    model,
    tokenizer,
    prompt,
    entry_count=10,
    entry_length=30, # maximum number of generated tokens per entry
    top_p=0.8,
    temperature=0.5,
):
    """Sample continuations of ``prompt`` with nucleus (top-p) sampling.

    Args:
        model: Language model called as ``model(ids)``; its first output must
            be the logits of shape (batch, sequence, vocabulary).
        tokenizer: Object providing ``encode(str) -> list[int]`` and
            ``decode(list[int]) -> str``.
        prompt: Text every generated entry starts from.
        entry_count: Number of independent continuations to sample.
        entry_length: Maximum number of tokens sampled per continuation.
        top_p: Cumulative-probability threshold of the nucleus; tokens outside
            it are masked out before sampling.
        temperature: Logits are divided by this value when it is positive;
            otherwise they are left unscaled.

    Returns:
        A list of ``entry_count`` decoded strings. Each one ends with
        ``<|endoftext|>``: either because that token was sampled, or because
        it is appended when ``entry_length`` ran out first.

    Raises:
        ValueError: If ``prompt`` encodes to zero tokens.
    """
    model.eval()
    # Build the inputs on whatever device the model currently lives on.
    device = next(model.parameters()).device
    filter_value = -float("Inf")

    prompt_ids = tokenizer.encode(prompt)
    if len(prompt_ids) == 0:
        raise ValueError("prompt must encode to at least one token")
    # Encode the end marker once instead of on every sampled token.
    eos_ids = set(tokenizer.encode("<|endoftext|>"))

    generated_list = []

    with torch.no_grad():

        for _ in trange(entry_count):

            entry_finished = False
            generated = torch.tensor(
                prompt_ids, dtype=torch.long, device=device
            ).unsqueeze(0)

            for _step in range(entry_length):
                # No labels: only the logits are needed, so skip the loss computation.
                outputs = model(generated)
                logits = outputs[0]
                logits = logits[:, -1, :] / (temperature if temperature > 0 else 1.0)

                sorted_logits, sorted_indices = torch.sort(logits, descending=True)
                cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

                # Mask everything past the nucleus; the shift by one keeps the
                # first token that crosses top_p, and the best token always stays.
                sorted_indices_to_remove = cumulative_probs > top_p
                sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[
                    ..., :-1
                ].clone()
                sorted_indices_to_remove[..., 0] = 0

                indices_to_remove = sorted_indices[sorted_indices_to_remove]
                logits[:, indices_to_remove] = filter_value

                next_token = torch.multinomial(F.softmax(logits, dim=-1), num_samples=1)
                generated = torch.cat((generated, next_token), dim=1)

                if next_token.item() in eos_ids:
                    entry_finished = True
                    break

            # Index the single batch row instead of squeeze(), which would
            # collapse a one-token sequence to a scalar.
            output_text = tokenizer.decode(generated[0].tolist())
            if not entry_finished:
                output_text = f"{output_text}<|endoftext|>"
            generated_list.append(output_text)

    return generated_list

# Generate one continuation for the first review of a dataframe.
def text_generation(test_data):
    """Continue the first entry of ``test_data['review']`` with the notebook's model.

    The notebook-level ``model`` is moved to the CPU and passed to
    ``generate`` together with the notebook-level ``tokenizer``, asking for a
    single entry. The list returned by ``generate`` is wrapped in an outer
    list, so the result has the shape ``[[text]]``.
    """
    cpu_model = model.to('cpu')
    first_prompt = test_data['review'][0]
    completions = generate(cpu_model, tokenizer, first_prompt, entry_count=1)
    return [completions]

# Produce a sample continuation from the held-out test set
generated_lyrics = text_generation(test_set)

100%|██████████| 1/1 [00:08<00:00,  8.56s/it]


In [46]:
def text_generation_tekst_meegeven(text, temperature=0.5):
    """Continue an arbitrary ``text`` prompt with the notebook's model.

    Moves the notebook-level ``model`` to the CPU, asks ``generate`` for one
    entry at the given ``temperature``, prints the list it returns, and
    returns that list wrapped in an outer list (``[[text]]``).
    """
    completions = generate(
        model.to('cpu'),
        tokenizer,
        text,
        entry_count=1,
        temperature=temperature,
    )
    print(completions)
    return [completions]

In [15]:
generated_lyrics

[['Zeer goed! Heel! Heel! Heel! Heel! Heel! Heel! Heel! Heel! Heel! Heel! Heel<|endoftext|>']]

In [16]:
test_set['review'][0]

'Zeer goed ! Heel'

In [32]:
text_generation_tekst_meegeven("the food was delicious")

100%|██████████| 1/1 [00:06<00:00,  6.47s/it]

['the food was delicious and the service was great. The only thing I would change is the menu. I would definitely recommend this place again.<|endoftext|>']





[['the food was delicious and the service was great. The only thing I would change is the menu. I would definitely recommend this place again.<|endoftext|>']]

In [34]:
text_generation_tekst_meegeven("We had a great time")

100%|██████████| 1/1 [00:01<00:00,  1.70s/it]

["We had a great time and we'll be back next year.<|endoftext|>"]





[["We had a great time and we'll be back next year.<|endoftext|>"]]

In [54]:
text_generation_tekst_meegeven("too expensive food and meals", 0.8)

temperature is 0.8


100%|██████████| 1/1 [00:07<00:00,  7.97s/it]

['too expensive food and meals were cooked in the house.\n\nDespite this, the family was able to get some lovely restaurants that would sell you their meals in their place.<|endoftext|>']





[['too expensive food and meals were cooked in the house.\n\nDespite this, the family was able to get some lovely restaurants that would sell you their meals in their place.<|endoftext|>']]

In [55]:
text_generation_tekst_meegeven("bad food ")

temperature is 0.5


100%|██████████| 1/1 [00:09<00:00,  9.44s/it]

['bad food       I went with a group of friends and the service...\n1 person found this review helpful.\n\nReviewed By Date Rating<|endoftext|>']





[['bad food       I went with a group of friends and the service...\n1 person found this review helpful.\n\nReviewed By Date Rating<|endoftext|>']]

In [56]:
text_generation_tekst_meegeven("dry food and too salty ")

temperature is 0.5


100%|██████████| 1/1 [00:08<00:00,  8.37s/it]

['dry food and too salty \xa0for me.\nI was very disappointed with the quality of the food. The only thing I could think of was that it was not a good<|endoftext|>']





[['dry food and too salty \xa0for me.\nI was very disappointed with the quality of the food. The only thing I could think of was that it was not a good<|endoftext|>']]

In [58]:
text_generation_tekst_meegeven("impolite staff")

100%|██████████| 1/1 [00:07<00:00,  7.56s/it]

['impolite staff, who were all in the building.\n\n"I was really upset and scared, I was really upset," said one woman. "I was<|endoftext|>']





[['impolite staff, who were all in the building.\n\n"I was really upset and scared, I was really upset," said one woman. "I was<|endoftext|>']]