# Project

Using GPT2 Model but was abandoned after no reviews could be generated. 

In [None]:
!pip install datasets
!pip install evaluate

## Imports

In [2]:
import evaluate
import numpy as np
import pandas as pd
import torch
from datasets import Dataset
from transformers import Trainer, TrainingArguments, GPT2Tokenizer, GPT2LMHeadModel

In [3]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

## Helper Methods

In [4]:
# Convert the data into Dataset format for transformers
def preprocess_data(reviews):
    inputs, outputs = [], []
    for _index, row in reviews.iterrows():
        # Prompt template
        prompt = f"Write a review for a {row['name']}:"
        inputs.append(prompt)
        outputs.append(row['reviews.text'])
    return {"prompt": inputs, "review": outputs}

## Pre-Processing

In [5]:
# Load dataset from json file
data = pd.read_csv('reviews_with_predicted_sentiment_category.csv', sep=',', low_memory=False)
data.head()

Unnamed: 0,name,reviews.text,reviews.title,predicted_sentiment,predicted_categorie
0,"All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,...",This product so far has not disappointed. My c...,Kindle,positive,Amazon Fire Tablet
1,"All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,...",great for beginner or experienced person. Boug...,very fast,positive,Amazon Fire Tablet
2,"All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,...",Inexpensive tablet for him to use and learn on...,Beginner tablet for our 9 year old son.,positive,Amazon Fire Tablet
3,"All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,...",I've had my Fire HD 8 two weeks now and I love...,Good!!!,positive,Amazon Fire Tablet
4,"All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,...",I bought this for my grand daughter when she c...,Fantastic Tablet for kids,positive,Amazon Fire Tablet


In [6]:
# create prompts for all products and it's review
processed_data = preprocess_data(data)
dataset = Dataset.from_dict(processed_data)

## Train

### Helper Methods

In [7]:
def tokenize_data(examples):
    """ Tokenize the prompt and review

    Args:
        examples (dict): Dictionary containing prompt and review

    Returns:
        dict: Dictionary containing tokenized data
    """

    tokenized_input = tokenizer(examples['prompt'], truncation=True, padding='max_length', max_length=400)
    tokenized_output = tokenizer(examples['review'], truncation=True, padding='max_length', max_length=400)

    # Return tokenized data
    return {
        "input_ids": tokenized_input['input_ids'],
        "attention_mask": tokenized_input['attention_mask'],
        "labels": tokenized_output['input_ids']
    }

In [8]:
# define evaluation methods
accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")


def compute_metrics(eval_pred):
    """ Compute accuracy and f1 score

    Args:
        eval_pred (tuple): Tuple containing logits and labels

    Returns:
        dict: Dictionary containing accuracy and f1 score
    """

    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # calculate metrics
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)
    f1 = f1_metric.compute(predictions=predictions, references=labels, average="weighted")

    # return as dictionary
    return {
        "accuracy": accuracy["accuracy"],
        "f1": f1["f1"]
    }

In [9]:
def generate_review(product_name):
    """ Generate a review for a given product name

    Args:
        product_name (str): Name of the product

    Returns:
        str: Generated review
    """

    # create prompt
    prompt = f"Write a review for the {product_name}:"
    inputs = tokenizer.encode(prompt, return_tensors="pt")
    inputs = inputs.to(model.device)

    # Move inputs to the same device as the model
    outputs = model.generate(inputs, max_length=100, num_return_sequences=1)

    # decode the generated text
    review = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return review

### GPT2 Model

In [10]:
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Move the model to the GPU
model = model.to(device)

# Add a padding token to the tokenizer
model.config.pad_token_id = model.config.eos_token_id
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.pad_token = tokenizer.eos_token

# Resize tokenizer embeddings to accommodate new tokens
model.resize_token_embeddings(len(tokenizer))



Embedding(50257, 768)

In [11]:
# Create tokenized dataset
tokenized_dataset = dataset.map(tokenize_data, batched=True)

Map:   0%|          | 0/27867 [00:00<?, ? examples/s]

In [12]:
# Split data into train and test data 80/20
train_test_split = tokenized_dataset.train_test_split(test_size=0.2)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

In [13]:
# Defining arguments for the training of the model
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=4,
    evaluation_strategy="epoch",
    learning_rate=3e-5,
    eval_steps=500,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_steps=500,
    fp16=True
)

# Initialize Trainer for prompt tuning
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    #compute_metrics=compute_metrics # This kills the gpu unfortunately
)

# Train the model
train_output = trainer.train()

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss
1,0.6742,0.661008
2,0.6487,0.657055
3,0.6524,0.658586
4,0.6606,0.655814


#### Save Model

In [14]:
# Define the save directory
save_directory = "model_gpt2_fine"
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

('./amazon_reviews_model_2024-10-16_12-45-54/tokenizer_config.json',
 './amazon_reviews_model_2024-10-16_12-45-54/special_tokens_map.json',
 './amazon_reviews_model_2024-10-16_12-45-54/vocab.json',
 './amazon_reviews_model_2024-10-16_12-45-54/merges.txt',
 './amazon_reviews_model_2024-10-16_12-45-54/added_tokens.json')

#### Evaluate

In [15]:
# Have a quick look on the work the trainer done
results_df = pd.DataFrame([trainer.evaluate()])
display(results_df)

Unnamed: 0,eval_loss,eval_runtime,eval_samples_per_second,eval_steps_per_second,epoch
0,0.655814,57.9094,96.254,6.027,4.0


In [18]:
# Example usage
print("Generated Review: ", generate_review("Amazon Fire Tablet"))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Review:  Write a review for the Amazon Fire Tablet:...


#### Load Model

In [17]:
load_directory = "model_gpt2_fine"

# Load the fine-tuned model
model = GPT2LMHeadModel.from_pretrained(load_directory)

# Load the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained(load_directory)

print("Model and tokenizer loaded successfully.")

Model and tokenizer loaded successfully.
