In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found
input_root = '/kaggle/input'
for dirname, _, filenames in os.walk(input_root):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import requests
from PIL import Image
import torch
import random
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoProcessor

In [None]:
import pandas as pd
# Filtered review data; generate_synthetic_reviews reads its 'product_title' column
filtered_csv_path = '/kaggle/input/filtered-data/filtered_combined.csv'
df = pd.read_csv(filtered_csv_path)

In [None]:
from transformers import pipeline

# Local Kaggle directory the model and tokenizer are loaded from
model_dir = "/kaggle/input/llama-3.2/transformers/3b-instruct/1"

# dtype / device placement settings passed to both the model loader and the pipeline
_placement_kwargs = {"torch_dtype": torch.bfloat16, "device_map": "auto"}

# Tokenizer and model are both read from local files only
tokenizer = AutoTokenizer.from_pretrained(model_dir, local_files_only=True)
model = AutoModelForCausalLM.from_pretrained(
    model_dir,
    local_files_only=True,
    **_placement_kwargs,
)

# Text-generation pipeline wrapping the already-loaded model and tokenizer
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    **_placement_kwargs,
)

def generate_synthetic_data(prompt, max_length):
    """Run ``prompt`` through the module-level ``pipe`` and return the generated text.

    Args:
        prompt: Text handed to the pipeline as its input.
        max_length: Forwarded to the pipeline as ``max_new_tokens``.

    Returns:
        The ``generated_text`` field of the first pipeline result, with every
        occurrence of ``prompt`` removed from it.
    """
    generation_kwargs = {
        "max_new_tokens": max_length,
        "temperature": 0.5,
        # The tokenizer's EOS id doubles as the padding id
        "pad_token_id": pipe.tokenizer.eos_token_id,
    }
    first_result = pipe(prompt, **generation_kwargs)[0]
    generated = first_result["generated_text"]

    # Drop the prompt text from the generated string before returning it
    return generated.replace(prompt, "")

In [None]:
def generate_synthetic_reviews(num_reviews=30):
    """Build a DataFrame of synthetic product reviews.

    Text fields (product title, review text, review title, rating, timestamp,
    date, time) are produced by ``generate_synthetic_data``; identifiers, the
    helpful-vote count and the verified-purchase flag are drawn with ``random``;
    the two category columns are fixed strings.

    Each review is seeded from a row of the module-level ``df['product_title']``
    column, selected by the loop index and wrapping around when ``num_reviews``
    exceeds the number of rows.

    Args:
        num_reviews: Number of synthetic rows to generate.

    Returns:
        pandas.DataFrame with one row per review and the columns ``rating``,
        ``review_title``, ``text``, ``asin``, ``parent_asin``, ``user_id``,
        ``timestamp``, ``helpful_vote``, ``verified_purchase``, ``date``,
        ``time``, ``product_title``, ``categories`` and ``cat1``. The
        LLM-generated columns hold the raw generated strings (``rating`` is
        not parsed into a number).

    Raises:
        ValueError: If reviews are requested but ``df['product_title']`` has
            no rows to seed the prompts from.
    """
    id_alphabet = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ"

    # Fixed category / subcategory written to every row
    categories = "Health & Household"
    cat1 = "Vitamins & Supplements"

    # These prompts do not depend on the loop state, so build them once
    timestamp_prompt = "Suggest a numerical value for a timestamp in the format MM/DD/YYYY HH:MM:SS AM/PM. For example, '10/3/2012 1:48:18 AM'. Do not give extra information like steps for calculation!"
    date_prompt = "Choose a random date in the format MM/DD/YYYY. For example, '10/3/2012'. Do not give any python code as output. Only date is required. Do not give extra information like steps for calculation!"
    time_prompt = "Choose a random time in the format HH:MM in 24-hour format. For example, '16:06'. Don't give any python code as output. Only time is required. Do not give extra information like steps for calculation!"

    data = []
    if num_reviews <= 0:
        return pd.DataFrame(data)

    seed_titles = df['product_title']
    n_titles = len(seed_titles)
    if n_titles == 0:
        raise ValueError("df['product_title'] is empty; cannot seed product title prompts.")

    for i in range(num_reviews):
        # Index by the loop counter, wrapping around, so consecutive reviews are
        # seeded from different rows and the lookup never runs past the end of df.
        seed_title = seed_titles.iloc[i % n_titles]

        # Generate the individual fields using LLaMA prompts
        product_title_prompt = f"Choose any row value of {seed_title} . Just tell a single name of the product title from the given dataframe. Do not write anything extra. Be precise."
        product_title = generate_synthetic_data(product_title_prompt, max_length=25)

        text_prompt = f"Write a brief product review for a product {product_title} as if you were a human. Do not put the rating in  this."
        review_text = generate_synthetic_data(text_prompt, max_length=100)

        review_title_prompt = f"Generate a product review title for the product having title {product_title}. It should be to the point and point out the summary in fewer words of {review_text}."
        review_title = generate_synthetic_data(review_title_prompt, max_length=15)

        rating_prompt = f"Based on the review given, which is {review_text}, just give a single numerical value which could be any integer value ranging from 1 to 5. Do not write anything extra."
        rating = generate_synthetic_data(rating_prompt, max_length=20)

        # "B0" followed by 8 random alphanumerics; parent_asin is an exact copy of asin
        asin = "B0" + ''.join(random.choices(id_alphabet, k=8))
        parent_asin = asin
        user_id = ''.join(random.choices(id_alphabet, k=28))

        timestamp = generate_synthetic_data(timestamp_prompt, max_length=200)

        helpful_vote = random.randint(0, 50)
        verified_purchase = random.choice([True, False])

        review_date = generate_synthetic_data(date_prompt, max_length=200)
        review_time = generate_synthetic_data(time_prompt, max_length=200)

        # Progress indicator: index of the review that has just been generated
        print(i)

        # Append the generated data to the list
        data.append({
            "rating": rating,
            "review_title": review_title,
            "text": review_text,
            "asin": asin,
            "parent_asin": parent_asin,
            "user_id": user_id,
            "timestamp": timestamp,
            "helpful_vote": helpful_vote,
            "verified_purchase": verified_purchase,
            "date": review_date,
            "time": review_time,
            "product_title": product_title,
            "categories": categories,
            "cat1": cat1
        })

    # Creating the synthetic DataFrame
    df_synthetic = pd.DataFrame(data)
    return df_synthetic

# Build 50 synthetic reviews and write them to the Kaggle working directory
synthetic_csv_path = "/kaggle/working/synthetic_reviews_llama.csv"
df_synthetic = generate_synthetic_reviews(num_reviews=50)
df_synthetic.to_csv(synthetic_csv_path, index=False)

# Report that the export finished
print("Synthetic dataset generated and saved as 'synthetic_reviews_llama.csv'.")