In [4]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Initialize the model and tokenizer
model_name = 'gpt2'  # You can use 'gpt2-medium', 'gpt2-large', or 'gpt2-xl' for more complex generation
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Function to generate synthetic text
def generate_synthetic_text(prompt, max_length=100, temperature=0.7):
    input_ids = tokenizer.encode(prompt, return_tensors='pt')
    output = model.generate(
        input_ids,
        max_length=max_length,
        num_return_sequences=1,
        temperature=temperature,
        top_k=50,
        top_p=0.95,
        no_repeat_ngram_size=2
    )
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    return generated_text

# Example prompts
prompts = [
    "In the context of restaurent review, the food is very delicious",
 
]

# Generate and print synthetic data for each prompt
for prompt in prompts:
    synthetic_text = generate_synthetic_text(prompt)
    print(f"Prompt: {prompt}")
    print(f"Synthetic Data: {synthetic_text}\n")



The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Prompt: In the context of restaurent review, the food is very delicious
Synthetic Data: In the context of restaurent review, the food is very delicious. The only thing that I would change is the amount of sugar in the syrup. I think it's a good idea to add more sugar to the mix.

I think the best way to make this is to use a small amount. If you're using a large amount, you can use the smaller amount to get a little more of the sugar. You can also use it to mix the rest of your ingredients.



# Generate Synthetic Reviews

In [11]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Initialize the GPT-2 model and tokenizer
model_name = 'gpt2'  # You can use larger models like 'gpt2-medium', 'gpt2-large', or 'gpt2-xl'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Function to generate synthetic restaurant reviews
def generate_review(prompt, max_length=50, temperature=0.7):
    input_ids = tokenizer.encode(prompt, return_tensors='pt')
    output = model.generate(
        input_ids,
        max_length=max_length,
        num_return_sequences=1,
        temperature=temperature,
        top_k=50,
        top_p=0.95,
        no_repeat_ngram_size=2
    )
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    return generated_text

# Example prompts for restaurant reviews
prompts = [
    "I had a delicious hydrabadi biriyani in restaurant. The",
    "The ambience was terrible at the south indian restaurent. The",
    "The idli was soft and the food is hot in the fast food restaurant. The"
]

# Generate synthetic reviews
reviews = [generate_review(prompt) for prompt in prompts]

# Print generated reviews
for review in reviews:
    print(f"Generated Review: {review}\n")


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Review: I had a delicious hydrabadi biriyani in restaurant. The food was delicious and the service was great. I will definitely be back.

I was very impressed with the food. It was a little bit of a challenge to find

Generated Review: The ambience was terrible at the south indian restaurent. The food was good, but the atmosphere was not good. I was very disappointed.

The food here was pretty good and the service was great. It was a bit

Generated Review: The idli was soft and the food is hot in the fast food restaurant. The food was good and I was happy with the service. I ordered the chicken and it was delicious.

I've been here a few times and this place is



# Prepare the Data for Sentiment Analysis

In [6]:
import pandas as pd

# Create a DataFrame with synthetic reviews and their labels
data = {
    'review': reviews,
    'sentiment': ['positive', 'negative', 'positive']  # Example labels; adjust based on actual reviews
}
df = pd.DataFrame(data)

# Save the DataFrame to a CSV file
df.to_csv('synthetic_reviews.csv', index=False)


In [7]:
from sklearn.model_selection import train_test_split

# Load the synthetic reviews data
df = pd.read_csv('synthetic_reviews.csv')

# Split the data
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)


In [8]:
train_df

Unnamed: 0,review,sentiment
1,The service was terrible at the fast food plac...,negative
2,The sushi was fresh and the ambiance was great...,positive


In [9]:
test_df

Unnamed: 0,review,sentiment
0,I had a wonderful dining experience at the new...,positive


In [10]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset, load_metric
import numpy as np
import torch

# Load tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=3)  # Three sentiment labels: positive, negative, neutral

# Tokenize data
def tokenize_function(examples):
    return tokenizer(examples['review'], padding="max_length", truncation=True)

train_dataset = train_df.apply(lambda row: {'review': row['review'], 'label': row['sentiment']}, axis=1).to_dict()
test_dataset = test_df.apply(lambda row: {'review': row['review'], 'label': row['sentiment']}, axis=1).to_dict()

train_dataset = load_dataset('pandas', data_files={'train': train_dataset})
test_dataset = load_dataset('pandas', data_files={'test': test_dataset})

# Tokenize datasets
train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Define evaluation metric
metric = load_metric('accuracy')

def compute_metrics(p):
    predictions = np.argmax(p.predictions, axis=1)
    return metric.compute(predictions=predictions, references=p.label_ids)

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Train the model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

trainer.train()


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


AttributeError: 'dict' object has no attribute 'decode'