# Project

Fine-tuning a Llama model on Amazon product reviews and using it to generate new reviews.

## Imports

In [None]:
!pip install datasets
!pip install peft

In [16]:
import pandas as pd
import torch
import transformers
from datasets import Dataset
from peft import LoraConfig, get_peft_model, PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, pipeline

In [11]:
# Prefer the GPU when torch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

## Helper Methods

In [10]:
# Convert the data into Dataset format for transformers
def preprocess_data(reviews):
    """Builds one training prompt per review row.

    Args:
      reviews: A DataFrame with the columns 'name', 'predicted_sentiment'
        and 'reviews.text'.

    Returns:
      A dictionary with the single key "prompt" holding the list of prompt
      strings, one per row and in row order.
    """

    # Iterate the three columns in lockstep instead of using iterrows(),
    # which builds a Series per row and may upcast the values.
    columns = (reviews['name'], reviews['predicted_sentiment'], reviews['reviews.text'])

    # Prompt template
    prompts = [
        f"Product: {name} Sentiment: {sentiment} Review: {text}"
        for name, sentiment, text in zip(*columns)
    ]

    return {"prompt": prompts}

In [12]:
def tokenize_reviews(reviews):
    """Tokenizes the given reviews using the notebook-level tokenizer.

    Args:
      reviews: A batch (mapping) whose "prompt" entry holds the prompt strings.

    Returns:
      The tokenizer output for the prompts, padded/truncated to 512 tokens.
    """

    if tokenizer.pad_token is None:
        # Reuse the EOS token for padding, as the generation helper and the
        # training cell do. Adding a new '[PAD]' token would grow the
        # vocabulary and force an embedding resize on the base model.
        tokenizer.pad_token = tokenizer.eos_token

    return tokenizer(reviews["prompt"], padding="max_length", truncation=True, max_length=512)

In [13]:
def generate_amazon_review(used_model, product_name, sentiment, max_length=200):
    """Generates an Amazon review for the given product name and sentiment.

    Args:
      used_model: The model to be used for generating the review.
      product_name: The name of the product (a plain string).
      sentiment: The sentiment of the review.
      max_length: Maximum total sequence length passed to ``generate``;
        this budget includes the prompt tokens.

    Returns:
      The generated review text without the prompt.
    """

    # NOTE(review): this template differs from the "Product: ... Sentiment: ...
    # Review: ..." template built in preprocess_data for fine-tuning -- confirm
    # whether the two are meant to match.
    prompt = f"Write a detailed Amazon review for '{product_name}'. My meaning about this product is {sentiment}"

    # Tokenize input
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    prompt_length = inputs["input_ids"].shape[1]

    # Generate text using the used_model. `early_stopping` is omitted because
    # it only affects beam search, which is not used with sampling here.
    output = used_model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=max_length,
        do_sample=True,
        temperature=0.5,
        top_p=0.9,
        num_return_sequences=1,
        repetition_penalty=1.5,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id
    )

    # Decode only the newly generated tokens. Slicing off the prompt tokens is
    # more reliable than removing the prompt text with str.replace, which
    # fails whenever the decoded prompt differs from the original string.
    review = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True)
    if review.startswith((". ", ", ")):
        return review[2:]  # Drop leading punctuation left over from the prompt

    return review

## Pre-Processing

In [5]:
# Load the reviews (with predicted sentiment and category) from a CSV file
data = pd.read_csv('lists/reviews_with_predicted_sentiment_category.csv', sep=',', low_memory=False)
data.head()

Unnamed: 0,name,reviews.rating,reviews.text,reviews.title,predicted_sentiment,predicted_categorie
0,"All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,...",5,This product so far has not disappointed. My c...,Kindle,positive,Amazon Fire Tablet
1,"All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,...",5,great for beginner or experienced person. Boug...,very fast,positive,Amazon Fire Tablet
2,"All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,...",5,Inexpensive tablet for him to use and learn on...,Beginner tablet for our 9 year old son.,positive,Amazon Fire Tablet
3,"All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,...",4,I've had my Fire HD 8 two weeks now and I love...,Good!!!,positive,Amazon Fire Tablet
4,"All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,...",5,I bought this for my grand daughter when she c...,Fantastic Tablet for kids,positive,Amazon Fire Tablet


In [15]:
# Define the model name
model_name = "meta-llama/Llama-3.2-1B"

# Load the tokenizer and the causal language model for that name.
# The weights are requested in float16; device_map="auto" leaves the
# device placement to the library.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype=torch.float16)

tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/843 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

In [None]:
# Configure the Low-Rank Adaptation (LoRA) which will be used to fine-tune the pre-trained language model.
# Only the modules named "q_proj" and "v_proj" are targeted.
lora_config = LoraConfig(
    r=6,  # rank of the low-rank update matrices
    lora_alpha=12,  # LoRA scaling factor
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,  # dropout applied on the LoRA path
    bias="lora_only",
    task_type="CAUSAL_LM")

In [None]:
# Apply the LoRA config to the base model
peft_model = get_peft_model(model, lora_config)

# print_trainable_parameters() prints the summary itself and returns None,
# so wrapping it in print() would only add a stray "None" line.
peft_model.print_trainable_parameters()

trainable params: 638,976 || all params: 1,236,453,376 || trainable%: 0.0517
None


## Train

In [None]:
# Example usage: generate a review for a specific product.
# This cell sits before the training cell, so it presumably serves as a
# baseline for comparison with the fine-tuned output -- confirm.
product_name = "Amazon - Amazon Tap Portable Bluetooth and Wi-Fi Speaker - Black"
sentiment = "positive"
review = generate_amazon_review(model, product_name, sentiment)
print(review)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


I am using it in my car, while traveling to work or school.
This was the first speaker that i bought from amazon.com since they are very reliable with their products as well. The sound quality of speakers has improved so much over time, you can hear every word clearly now!


In [None]:
# Create one prompt per review row and wrap the prompts in a Dataset
processed_data = preprocess_data(data)
dataset = Dataset.from_dict(processed_data)

In [None]:
# Generate the tokenized dataset; batched=True passes batches of prompts to tokenize_reviews
tokenized_dataset = dataset.map(tokenize_reviews, batched=True)

Map:   0%|          | 0/27867 [00:00<?, ? examples/s]

### Helper Methods

### Llama Model

In [None]:
# Split the data into 80% train and 20% test.
# A fixed seed keeps the split reproducible across kernel restarts.
train_test_split = tokenized_dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

# Check that the split result looks good
print(train_dataset.shape, eval_dataset.shape)
train_sample = train_dataset.select(range(50))
display(train_sample)

(22293, 3) (5574, 3)


Dataset({
    features: ['prompt', 'input_ids', 'attention_mask'],
    num_rows: 50
})

In [None]:
# Print one sample to see what is sent to the training.
# Every example is padded to 512 token ids, so this output is long.
print(train_sample[:1])

{'prompt': ["Product: Fire Tablet, 7 Display, Wi-Fi, 8 GB - Includes Special Offers, Magenta Sentiment: positive Review: I enjoy reading ebooks on this tablet as well as watching videos. It's just so handy to carry around in my purse."], 'input_ids': [[128000, 4921, 25, 6785, 58403, 11, 220, 22, 10848, 11, 17664, 27395, 11, 220, 23, 19397, 482, 27044, 9984, 52418, 11, 7023, 16985, 24248, 3904, 25, 6928, 10506, 25, 358, 4774, 5403, 89002, 389, 420, 21354, 439, 1664, 439, 10307, 6946, 13, 1102, 596, 1120, 779, 26222, 311, 6920, 2212, 304, 856, 53101, 13, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 1

In [None]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    # Evaluate once per epoch. `eval_steps` is intentionally not set: it only
    # applies to the "steps" strategy and was silently ignored here.
    # `eval_strategy` is the current name of the deprecated `evaluation_strategy`.
    eval_strategy="epoch",
    learning_rate=2e-5,
    auto_find_batch_size=True,
    num_train_epochs=2,
    weight_decay=0.01,
    use_cpu=False,
    fp16=True,
)

# Initialize the Trainer
# The EOS token doubles as the padding token so the collator can pad batches.
tokenizer.pad_token = tokenizer.eos_token
trainer = Trainer(
    model=peft_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    # mlm=False selects causal (next-token) language modeling
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False)
)

# Start the fine-tuning process
train_results = trainer.train()

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


Epoch,Training Loss,Validation Loss
1,1.754,1.729197
2,1.7278,1.718633


We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


#### Save Model

In [None]:
# Save the trained model to the "model_llama_fine" directory
trainer.model.save_pretrained("model_llama_fine")

#### Evaluate

In [None]:
# Put the metrics returned by trainer.train() into a one-row DataFrame for display
results = pd.DataFrame([train_results.metrics])
display(results)

Unnamed: 0,train_runtime,train_samples_per_second,train_steps_per_second,total_flos,train_loss,epoch
0,4958.3274,8.992,1.124,1.333776e+17,1.789447,2.0


#### Load Model

In [20]:
# Load the saved adapter from './model_llama_fine' on top of the base model,
# marked as not trainable (inference only).
loaded_model_peft = PeftModel.from_pretrained(model, './model_llama_fine', is_trainable=False)

# NOTE(review): only the model is loaded in this cell; the tokenizer comes from the earlier load cell.
print("Model and tokenizer loaded successfully.")

Model and tokenizer loaded successfully.


#### Test result

In [None]:
# Generate a review with the fine-tuned model for the same product and
# sentiment as in the earlier example cell.
product_name = "Amazon - Amazon Tap Portable Bluetooth and Wi-Fi Speaker - Black"
sentiment = "positive"
review = generate_amazon_review(loaded_model_peft, product_name, sentiment, 200)
print(review)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


but the sound quality could be improved. Overall it's good value.
I bought two of these speakers to use with my Echo Dot as I have an old iPhone 4 that does not support Apple Music (or any other streaming service). These are great little devices! They connect easily via bluetooth or wifi which allows you access your music library on all three platforms without having additional hardware like iPads/Android phones/tablets/etc.. The only downside was there were no options when pairing them up except through Siri so if someone has multiple sets they may need more than one set in order make sure everything works properly... But overall awesome device!
The speaker itself isn't bad at best though its really just ok It doesn’t stand out from others especially since most people will probably get something else anyway. If u want cheap then go ahead buy em,if ur looking better


In [21]:
# For each category, pick one random product and generate a short intro
# (max_length 50) and a full review (max_length 200) for it.
categories = data['predicted_categorie'].unique()
texts = [['intro', 50], ['full review', 200]]

for category in categories.tolist():
    product = data[data['predicted_categorie'] == category].sample(n=1)
    # Extract the scalar product name. Passing the one-row Series itself
    # would put the Series repr (index, dtype, ...) into the prompt.
    name = product['name'].iloc[0]
    print(f"A Review for a {category}")
    print(name)

    for text_type, length in texts:
        print(f"{text_type}:")
        text = generate_amazon_review(loaded_model_peft, name, 'positive', length)
        print(text)
    print("-------------------------------\n")

A Review for a Amazon Fire Tablet
Fire Tablet, 7 Display, Wi-Fi, 8 GB - Includes Special Offers, Magenta
intro:




I have used it to play games
full review:
It's good to read books and play games on it.
I have the kindle fire hd tablet with me since last year...its great! I use my amazon account frequently..but i don't like that its not so powerful as compared o other tablets in market but still worth buying if u r looking out some cheap one!
Kindles are excellent reading devices; they're easy-to-use without being too complicated or expensive (as opposed to an e-reader). They also offer plenty of storage space which makes them very convenient when you want something quick yet portable enough where your kids can take advantage from while playing outside all day long. The only downside would be their limited battery life--which may become inconvenient after awhile depending upon how often users utilize these units throughout each month/week/year respectively
-------------------------------

A Review for a Amazon Echo White
Echo (White) Echo (White)
intro:
I am using it to control my smart home system

In [26]:
# Load the summarizer model.
# The model id is defined once and reused, so it cannot drift out of sync
# with the pipeline call.
summarize_model_name = 'MurkatG/bart-reviews'
summarizer = pipeline("summarization", model=summarize_model_name, device=device)

config.json:   0%|          | 0.00/1.74k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/558M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/384 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]



In [27]:
def summarize(text):
    """Creates a short summary of ``text`` with the notebook-level ``summarizer``.

    Args:
      text: The text to be summarized.

    Returns:
      The 'summary_text' entry of the first result returned by the summarizer.
    """

    # Deterministic decoding (no sampling), bounded to 15-30 tokens
    results = summarizer(text, max_length=30, min_length=15, do_sample=False)
    first_result = results[0]
    return first_result['summary_text']

In [40]:
# For every category, pick one sample product and create a review for it
categories = data['predicted_categorie'].unique()
review_data = []

for category in categories.tolist():
    # Draw a single random product that belongs to the current category
    in_category = data['predicted_categorie'] == category
    product = data[in_category].sample(n=1)
    name = product['name'].iloc[0]

    # Generate the full review and flatten it onto one line
    review = generate_amazon_review(loaded_model_peft, name, 'positive', 200)
    review = review.replace('\n', ' ')

    # The summarized review serves as the short intro text
    intro = summarize(review)

    review_data.append({'category': category, 'product': name, 'intro': intro, 'review': review})

reviews = pd.DataFrame(review_data)



In [41]:
# Have a look at the generated reviews
reviews

Unnamed: 0,category,product,intro,review
0,Amazon Fire Tablet,"Fire Tablet, 7 Display, Wi-Fi, 8 GB - Includes...",I am satisfied with the performance of my Kind...,I am satisfied with the performance of my Kind...
1,Amazon Echo White,Echo (White) Echo (White),I love it and recommend to everyone who want s...,I love it and recommend to everyone who want s...
2,Charger & Accessories,Amazon 5W USB Official OEM Charger and Power A...,"Good quality, but read my full amazon reviews ...","I am satisfied with the quality of it, but if ..."
3,Amazon Kindle Paperwhite,Amazon Kindle Paperwhite - eBook reader - 4 GB...,I have been using it since last year and love ...,I have been using it since last year and love ...
4,Amazon Portable Speaker,Amazon - Amazon Tap Portable Bluetooth and Wi-...,I like the fact that it has 4 different sound ...,I like the fact that it has 4 different sound ...
5,Amazon Fire Tv,Amazon Fire Tv Amazon Fire Tv,but what i like most was how easy they made se...,I bought it because of the price and that you ...
6,Amazon Fire Kids Edition,"Fire Kids Edition Tablet, 7 Display, Wi-Fi, 16...",My kids love this tablet!! Very easy to set up...,I think that it's good to buy the kids tablet ...


In [43]:
# Save the reviews to a CSV file for later use (index=True also writes the row index)
reviews.to_csv('lists/reviews_for_categories.csv', index=True)