## Review generator from huggingface

In [2]:
import pandas as pd
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import numpy as np
import random
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm, trange
import torch.nn.functional as F
import csv

### Prepare data
reviews = pd.read_csv("../tripadvisor_dataset/reviews.csv")
reviews = reviews.applymap(str)

#Drop the songs with lyrics too long (after more than 1024 tokens, does not work)
reviews = reviews[reviews['review'].apply(lambda x: len(str(x).split(' ')) < 350)]

#Create a very small test set to compare generated text with the reality
test_set = reviews.sample(n = 200)
reviews = reviews.loc[~reviews.index.isin(test_set.index)]

#Reset the indexes
test_set = test_set.reset_index()
reviews = reviews.reset_index()

#For the test set only, keep last 20 words in a new column, then remove them from original column
test_set['review_end'] = test_set['review'].str.split().str[-20:].apply(' '.join)
test_set['review'] = test_set['review'].str.split().str[:-20].apply(' '.join)

In [3]:
test_set.head()

Unnamed: 0,index,id,reviewer name,title,date,rating,review,review_end
0,101887,2330189,Steven D,Verassend lekker,"December 8, 2017",4.0,Qua interieur zeker geen hoogvlieger maar dit ...,concurrentie maar gezien de aangeboden kwalite...
1,99175,10259456,Arlette B,goed gegeten voor weinig geld,"July 3, 2017",4.0,"voor 25 euro , hadden wij gekozen voor het men...",saignant gevraagd en de steak was te veel uitg...
2,21322,694721,LD443,Gewoon lekker !,"November 1, 2014",4.0,"Meerdere maal in Le Grand Bleu gegeten,waren m...",eenvoudig decor met een eenvoudige bediening z...
3,131889,8485643,hieldeken,"klein italiaans resto, eten met smaak gebracht","February 22, 2020",3.0,"oud, authentiek pand, (geen modern kader) maar...","tijdens service,maar de moeder compenseert dit..."
4,7740,3792504,Boudewijn B,Heerlijk en eerlijk,"October 22, 2015",4.0,,"Van begin tot einde heerlijk eten, verrassende..."


In [4]:
class SongLyrics(Dataset):
    
    def __init__(self, control_code, truncate=False, gpt2_type="gpt2", max_length=1024):

        self.tokenizer = GPT2Tokenizer.from_pretrained(gpt2_type)
        self.lyrics = []
        counter = 0
        for row in reviews['review']:
            if counter > 1000:
                break
            self.lyrics.append(torch.tensor(
                self.tokenizer.encode(f"<|{control_code}|>{row[:max_length]}<|endoftext|>")
            ))
            counter += 1
                
        if truncate:
            self.lyrics = self.lyrics[:20000]
        self.lyrics_count = len(self.lyrics)
        
    def __len__(self):
        return self.lyrics_count

    def __getitem__(self, item):
        return self.lyrics[item]

In [5]:
dataset = SongLyrics(reviews['review'], truncate=True, gpt2_type="gpt2")

Downloading:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [6]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

Downloading:   0%|          | 0.00/548M [00:00<?, ?B/s]

In [7]:
#Accumulated batch size (since GPT2 is so big)
def pack_tensor(new_tensor, packed_tensor, max_seq_len):
    if packed_tensor is None:
        return new_tensor, True, None
    if new_tensor.size()[1] + packed_tensor.size()[1] > max_seq_len:
        return packed_tensor, False, new_tensor
    else:
        packed_tensor = torch.cat([new_tensor, packed_tensor[:, 1:]], dim=1)
        return packed_tensor, True, None
     

In [8]:
import os

def train(
    dataset, model, tokenizer,
    batch_size=16, epochs=20, lr=2e-5,
    max_seq_len=400, warmup_steps=200,
    gpt2_type="gpt2", output_dir=".", output_prefix="wreckgar",
    test_mode=False,save_model_on_epoch=False,
):

    acc_steps = 100
    device=torch.device("cuda")
    model = model.cuda()
    model.train()

    optimizer = AdamW(model.parameters(), lr=lr)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps=-1
    )

    train_dataloader = DataLoader(dataset, batch_size=1, shuffle=True)
    loss=0
    accumulating_batch_count = 0
    input_tensor = None

    for epoch in range(epochs):

        print(f"Training epoch {epoch}")
        print(loss)
        for idx, entry in tqdm(enumerate(train_dataloader)):
            (input_tensor, carry_on, remainder) = pack_tensor(entry, input_tensor, 768)

            if carry_on and idx != len(train_dataloader) - 1:
                continue

            input_tensor = input_tensor.to(device)
            outputs = model(input_tensor, labels=input_tensor)
            loss = outputs[0]
            loss.backward()

            if (accumulating_batch_count % batch_size) == 0:
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                model.zero_grad()

            accumulating_batch_count += 1
            input_tensor = None
        if save_model_on_epoch:
            torch.save(
                model.state_dict(),
                os.path.join(output_dir, f"{output_prefix}-{epoch}.pt"),
            )
    return model

In [9]:
model = train(dataset, model, tokenizer)



Training epoch 0
0


1001it [00:45, 21.77it/s]


Training epoch 1
tensor(4.1036, device='cuda:0', grad_fn=<NllLossBackward0>)


1001it [00:45, 22.06it/s]


Training epoch 2
tensor(3.4617, device='cuda:0', grad_fn=<NllLossBackward0>)


1001it [00:46, 21.50it/s]


Training epoch 3
tensor(1.6189, device='cuda:0', grad_fn=<NllLossBackward0>)


1001it [00:45, 22.23it/s]


Training epoch 4
tensor(0.9580, device='cuda:0', grad_fn=<NllLossBackward0>)


1001it [00:45, 21.91it/s]


Training epoch 5
tensor(0.7096, device='cuda:0', grad_fn=<NllLossBackward0>)


1001it [00:45, 21.95it/s]


Training epoch 6
tensor(0.5791, device='cuda:0', grad_fn=<NllLossBackward0>)


1001it [00:45, 21.90it/s]


Training epoch 7
tensor(1.3475, device='cuda:0', grad_fn=<NllLossBackward0>)


1001it [00:46, 21.76it/s]


Training epoch 8
tensor(1.1191, device='cuda:0', grad_fn=<NllLossBackward0>)


1001it [00:46, 21.74it/s]


Training epoch 9
tensor(0.7284, device='cuda:0', grad_fn=<NllLossBackward0>)


1001it [00:45, 21.86it/s]


Training epoch 10
tensor(0.3967, device='cuda:0', grad_fn=<NllLossBackward0>)


1001it [00:46, 21.69it/s]


Training epoch 11
tensor(0.7447, device='cuda:0', grad_fn=<NllLossBackward0>)


1001it [00:45, 21.99it/s]


Training epoch 12
tensor(0.6630, device='cuda:0', grad_fn=<NllLossBackward0>)


1001it [00:45, 21.79it/s]


Training epoch 13
tensor(1.1311, device='cuda:0', grad_fn=<NllLossBackward0>)


1001it [00:46, 21.76it/s]


Training epoch 14
tensor(1.8631, device='cuda:0', grad_fn=<NllLossBackward0>)


1001it [00:46, 21.57it/s]


Training epoch 15
tensor(0.6438, device='cuda:0', grad_fn=<NllLossBackward0>)


1001it [00:45, 21.78it/s]


Training epoch 16
tensor(0.6937, device='cuda:0', grad_fn=<NllLossBackward0>)


1001it [00:45, 21.84it/s]


Training epoch 17
tensor(0.8312, device='cuda:0', grad_fn=<NllLossBackward0>)


1001it [00:46, 21.48it/s]


Training epoch 18
tensor(1.4876, device='cuda:0', grad_fn=<NllLossBackward0>)


1001it [00:46, 21.68it/s]


Training epoch 19
tensor(0.5635, device='cuda:0', grad_fn=<NllLossBackward0>)


1001it [00:45, 22.11it/s]


In [14]:
def generate(
    model,
    tokenizer,
    prompt,
    entry_count=10,
    entry_length=30, #maximum number of words
    top_p=0.8,
    temperature=0.2,
):
    model.eval()
    generated_num = 0
    generated_list = []

    filter_value = -float("Inf")

    with torch.no_grad():

        for entry_idx in trange(entry_count):

            entry_finished = False
            generated = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0)

            for i in range(entry_length):
                outputs = model(generated, labels=generated)
                loss, logits = outputs[:2]
                logits = logits[:, -1, :] / (temperature if temperature > 0 else 1.0)

                sorted_logits, sorted_indices = torch.sort(logits, descending=True)
                cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

                sorted_indices_to_remove = cumulative_probs > top_p
                sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[
                    ..., :-1
                ].clone()
                sorted_indices_to_remove[..., 0] = 0

                indices_to_remove = sorted_indices[sorted_indices_to_remove]
                logits[:, indices_to_remove] = filter_value

                next_token = torch.multinomial(F.softmax(logits, dim=-1), num_samples=1)
                generated = torch.cat((generated, next_token), dim=1)

                if next_token in tokenizer.encode("<|endoftext|>"):
                    entry_finished = True

                if entry_finished:

                    generated_num = generated_num + 1

                    output_list = list(generated.squeeze().numpy())
                    output_text = tokenizer.decode(output_list)
                    generated_list.append(output_text)
                    break
            
            if not entry_finished:
              output_list = list(generated.squeeze().numpy())
              output_text = f"{tokenizer.decode(output_list)}<|endoftext|>" 
              generated_list.append(output_text)
                
    return generated_list

#Function to generate multiple sentences. Test data should be a dataframe
def text_generation(test_data):
    generated_lyrics = []
    x = generate(model.to('cpu'), tokenizer, test_data['review'][0], entry_count=1)
    generated_lyrics.append(x)
    return generated_lyrics

#Run the functions to generate the lyrics
generated_lyrics = text_generation(test_set)

100%|██████████| 1/1 [00:23<00:00, 23.73s/it]


In [16]:
def text_generation_tekst_meegeven(text):
    generated_lyrics = []
    x = generate(model.to('cpu'), tokenizer, text, entry_count=1)
    generated_lyrics.append(x)
    print(x)
    return generated_lyrics

In [15]:
generated_lyrics

[['Qua interieur zeker geen hoogvlieger maar dit wordt meer dan gecompenseerd door de heerlijke eerlijke keuken, de prijzen zijn lichtjes hoger dan bij de vriendelijk.\n\nZijn een vriendelijk zeker een vriendelijk zeker een vriendel<|endoftext|>']]

In [13]:
test_set['review'][0]

'Qua interieur zeker geen hoogvlieger maar dit wordt meer dan gecompenseerd door de heerlijke eerlijke keuken, de prijzen zijn lichtjes hoger dan bij de'

In [23]:
text_generation_tekst_meegeven("the food was good ")

100%|██████████| 1/1 [00:09<00:00,  9.39s/it]

['the food was good \xa0and the service was fast \xa0and friendly \xa0and the staff was very friendly \xa0and helpful. \xa0I would definitely recommend this<|endoftext|>']





[['the food was good \xa0and the service was fast \xa0and friendly \xa0and the staff was very friendly \xa0and helpful. \xa0I would definitely recommend this<|endoftext|>']]