In [1]:
import torch
from transformers import BartForConditionalGeneration
import pandas as pd 
from datasets import *

from sklearn.model_selection import train_test_split

In [2]:
torch.cuda.empty_cache()

# T5 - To Do List
## Use T5ForConditionalGeneration
## for summarization change prompts to: ___

```
from transformers import AutoTokenizer, T5ForConditionalGeneration

tokenizer = AutoTokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# training
input_ids = tokenizer("The <extra_id_0> walks in <extra_id_1> park", return_tensors="pt").input_ids
labels = tokenizer("<extra_id_0> cute dog <extra_id_1> the <extra_id_2>", return_tensors="pt").input_ids
outputs = model(input_ids=input_ids, labels=labels)
loss = outputs.loss
logits = outputs.logits

# inference
input_ids = tokenizer(
    "summarize: studies have shown that owning a dog is good for you", return_tensors="pt"
).input_ids  # Batch size 1
outputs = model.generate(input_ids)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
# studies have shown that owning a dog is good for you.
```



# Preprocessing

## Types of Experiments

1. gpt style prompting 
2. just concattenating reviews
3. thorough reviews 
4. adding more meta data to the prompts 


## Types of Models to Run Experiments on
1. BART base
2. GPT2 
3. BART Large
4. T5
5. Trying Distil models (ex: sshleifer/distilbart-cnn-12-6)

Other models:
BART, BigBird-Pegasus, Blenderbot, BlenderbotSmall, Encoder decoder, FairSeq Machine-Translation, GPTSAN-japanese, LED, LongT5, M2M100, Marian, mBART, MT5, MVP, NLLB, NLLB-MOE, Pegasus, PEGASUS-X, PLBart, ProphetNet, SwitchTransformers, T5, XLM-ProphetNet


## Experiment 1. Adding gpt style prompting for BART with the following structure:

"The following are the highest rated reviews of a restaurant on Yelp. Generate a summary that details the opinions about this restaurant:

1. 'Great experience at this new 3.5 month old Korean/Thai fusion gem that offered delicious food, warm, friendly service, and a nice comfortable ambiance. Adam, our server, was very welcoming, fun, engaging, helpful with menu recommendations, and thoughtful to check in on us regularly. The items we ordered were all wonderful and flavorful - the hamachi Kama, Korean spicy noodle with pork jowl, Bibimbap with beef brisket, lavender lemonade, and apple pie cheese cake. Do yourself a favor and ask for their house made chili crisp oil and sauces - so so good. Go and enjoy a great vibe, meal, and environment.'
2. 'Everything here is excellent. My favorites are the Beef Noodle Soup, Korean Spicy Noodle and the newly added salmon green curry. Great gluten-free options too!'
3. ....
4. ...
5. ...

We do the first 5 reviews instead of 10 because the model only has a max length


## Experiment 2. ADdding more meta data to prompts 

Review Set:

Restaurant Name: The Blue Door

Location: San Francisco, CA

Cuisine: American

Review 1: "The Blue Door is a must-visit if you're in San Francisco. The food is outstanding, with a great selection of American classics and some unique dishes. The service is also top-notch, with friendly and attentive staff."

Review 2: "I was blown away by the food at The Blue Door. Every dish we tried was amazing, and the drinks were excellent too. The service was also great - our server was very knowledgeable about the menu and made some great recommendations."

Review 3: "I've been to The Blue Door several times now and it never disappoints. The food is consistently excellent and the service is always friendly and attentive. I highly recommend the fried chicken and the mac and cheese."

Review 4: "The Blue Door has some of the best food I've ever had. The flavors are so well-balanced and everything is cooked to perfection. The service is also excellent - our server was very friendly and made us feel welcome."

Review 5: "This place is a hidden gem. The food is outstanding and the atmosphere is cozy and inviting. I highly recommend the beef brisket and the apple pie."

Prompt:
"Summarize the top 5 reviews for The Blue Door in San Francisco, CA. The restaurant serves American cuisine and is known for its outstanding food and top-notch service."

In [3]:
path = '../data/summarized_reviews_san_francisco.csv'
df = pd.read_csv(path)

In [4]:
seperator = '_______JOINED_THE_MESSAGE_ON THIS_STRING_______'

In [5]:
df['review'] = df['review'].apply(lambda x : x.split(seperator))

In [6]:
def determine_prompt_type(review_type):
    if review_type == 'top_rated':
        top_rated_prompt = "The following are the highest rated reviews of a restaurant on Yelp. Generate a summary that details the opinions about this restaurant:\n\n"
        return top_rated_prompt
    
    elif review_type == "low_rated":
        low_rated_prompt = "The following are the lowest rated reviews of a restaurant on Yelp. Generate a summary that details the opinions about this restaurant:\n\n"
        return low_rated_prompt
    
    elif review_type == "newest_rated":
        newest_rated_prompt = "The following are the newest rated reviews of a restaurant on Yelp. Generate a summary that details the opinions about this restaurant\n\n"
        return newest_rated_prompt
    
    elif review_type == "elited_rated":
    
        elite_rated_prompt = "The following are restaurant Yelp reviews of a restaurant written by Yelp Elite members. Generate a summary that details the opinions about this restaurant:\n\n"
        return elite_rated_prompt
    
    else:
        return "The following are restaurant Yelp reviews of a restaurant. Generate a summary that details the opinions about this restaurant:\n\n"

In [7]:
def create_prompts(df):
    
    for i, row in df.iterrows():

        prompt = determine_prompt_type(row['review_type'])
        for i_, review in enumerate(row['review']):
            
            if i_ > 5:
                break
            prompt += f'{i_+1}. {review} \n'
        df.loc[i,'prompt'] = prompt

    return df 

In [8]:
df = create_prompts(df)

In [9]:
print(df['prompt'][0])

The following are the highest rated reviews of a restaurant on Yelp. Generate a summary that details the opinions about this restaurant:

1. Great experience at this new 3.5 month old Korean/Thai fusion gem that offered delicious food, warm, friendly service, and a nice comfortable ambiance. Adam, our server, was very welcoming, fun, engaging, helpful with menu recommendations, and thoughtful to check in on us regularly. The items we ordered were all wonderful and flavorful - the hamachi Kama, Korean spicy noodle with pork jowl, Bibimbap with beef brisket, lavender lemonade, and apple pie cheese cake. Do yourself a favor and ask for their house made chili crisp oil and sauces - so so good. Go and enjoy a great vibe, meal, and environment. 
1. opopopopopopopopopopopopopopopopopopopopopopopopop­opopopopopopopopopopopopopopopopopopopopopopopopop­opopopopopopopopopop 
1. Everything here is excellent. My favorites are the Beef Noodle Soup, Korean Spicy Noodle and the newly added salmon gree

In [10]:
df_train, df_test = train_test_split(df[['name','review_type','prompt','summary']],test_size=0.2,random_state=0)


In [11]:
train_dataset = Dataset.from_pandas(df_train,split='train')
test_dataset = Dataset.from_pandas(df_test,split='test')
ds_dict = {'train':train_dataset,'test':test_dataset}
dataset = DatasetDict(ds_dict)
dataset

DatasetDict({
    train: Dataset({
        features: ['name', 'review_type', 'prompt', 'summary', '__index_level_0__'],
        num_rows: 2822
    })
    test: Dataset({
        features: ['name', 'review_type', 'prompt', 'summary', '__index_level_0__'],
        num_rows: 706
    })
})

In [12]:
import evaluate

rouge = evaluate.load("rouge")

In [13]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import BartTokenizer, BartForConditionalGeneration
from transformers import DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
import numpy as np


In [14]:
# checkpoint = 'facebook/bart-large'
from transformers import T5ForConditionalGeneration

checkpoint = 't5-base'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = T5ForConditionalGeneration.from_pretrained(checkpoint)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [15]:
def preprocess_function(examples):
    inputs = [doc for doc in examples["prompt"]]
    model_inputs = tokenizer(inputs, max_length=1024, truncation=True)

    labels = tokenizer(text_target=examples["summary"], max_length=150, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

In [16]:
tokenized_dataset = dataset.map(preprocess_function, batched=True)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

Map:   0%|          | 0/2822 [00:00<?, ? examples/s]

Map:   0%|          | 0/706 [00:00<?, ? examples/s]

In [17]:
import numpy as np


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [18]:
dataset

DatasetDict({
    train: Dataset({
        features: ['name', 'review_type', 'prompt', 'summary', '__index_level_0__'],
        num_rows: 2822
    })
    test: Dataset({
        features: ['name', 'review_type', 'prompt', 'summary', '__index_level_0__'],
        num_rows: 706
    })
})

In [19]:
training_args = Seq2SeqTrainingArguments(
    output_dir="T5-base",
    evaluation_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=5,
    predict_with_generate=True
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,2.2946,1.577824,0.1926,0.084,0.1556,0.1558,19.0
2,1.7699,1.492862,0.198,0.0879,0.1597,0.1598,19.0
3,1.6334,1.456033,0.1994,0.0894,0.1612,0.1614,19.0
4,1.6008,1.437524,0.2,0.089,0.1618,0.1618,19.0


TrainOutput(global_step=3530, training_loss=1.7327030851888252, metrics={'train_runtime': 2048.863, 'train_samples_per_second': 6.887, 'train_steps_per_second': 1.723, 'total_flos': 1.659595555759104e+16, 'train_loss': 1.7327030851888252, 'epoch': 5.0})

In [20]:
tuned_model = trainer.model

In [21]:
trainer.evaluate()

{'eval_loss': 1.433504343032837,
 'eval_rouge1': 0.2004,
 'eval_rouge2': 0.0894,
 'eval_rougeL': 0.1614,
 'eval_rougeLsum': 0.1615,
 'eval_gen_len': 19.0,
 'eval_runtime': 91.6828,
 'eval_samples_per_second': 7.7,
 'eval_steps_per_second': 1.931,
 'epoch': 5.0}

In [31]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [23]:
tuned_model_t5_base = tuned_model

In [33]:
# tuned_model_t5_base.to(device)

In [34]:
summaries_t5 = []

T5TokenizerFast(name_or_path='t5-base', vocab_size=32100, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'additional_special_tokens': ['<extra_id_0>', '<extra_id_1>', '<extra_id_2>', '<extra_id_3>', '<extra_id_4>', '<extra_id_5>', '<extra_id_6>', '<extra_id_7>', '<extra_id_8>', '<extra_id_9>', '<extra_id_10>', '<extra_id_11>', '<extra_id_12>', '<extra_id_13>', '<extra_id_14>', '<extra_id_15>', '<extra_id_16>', '<extra_id_17>', '<extra_id_18>', '<extra_id_19>', '<extra_id_20>', '<extra_id_21>', '<extra_id_22>', '<extra_id_23>', '<extra_id_24>', '<extra_id_25>', '<extra_id_26>', '<extra_id_27>', '<extra_id_28>', '<extra_id_29>', '<extra_id_30>', '<extra_id_31>', '<extra_id_32>', '<extra_id_33>', '<extra_id_34>', '<extra_id_35>', '<extra_id_36>', '<extra_id_37>', '<extra_id_38>', '<extra_id_39>', '<extra_id_40>', '<extra_id_41>', '<extra_id_42>', '<extra_id_43>', '<extra_id

In [48]:
for prompt in dataset['test']['prompt']:
    tokenized_prompt = tokenizer.encode(prompt,return_tensors='pt').to(device)
    tokenized_summary = tuned_model_t5_base.generate(tokenized_prompt,max_length=150)
    summary = tokenizer.decode(tokenized_summary.squeeze(), skip_special_tokens=True)
    summaries_t5.append(summary)

    


KeyboardInterrupt: 

In [54]:
pd.DataFrame({'actual':dataset['test']['summary'][:158],'generated':summaries_t5}).to_csv('T5_test.csv',index=False)

In [56]:
trainer.save_model('t5model')