<a href="https://colab.research.google.com/github/kmalhotra18/HuggingFace/blob/main/Synthetic_Data_Generator_Product.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q transformers sentencepiece accelerate bitsandbytes gradio

In [None]:
!pip install gradio

In [None]:
# Imports
import torch
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

In [None]:
# Model (choose from: mistralai/Mixtral-8x7B-Instruct-v0.1 or meta-llama/Meta-Llama-3.1-8B-Instruct)
# This identifier is passed to both the tokenizer and the model loader below.
# NOTE(review): the meta-llama repositories are gated on the Hugging Face Hub —
# presumably a token with granted access must be configured before loading; confirm.
MODEL_NAME = "meta-llama/Meta-Llama-3.1-8B-Instruct"

In [None]:
# 4-bit loading settings: NF4 quantization type with double quantization
# enabled, and bfloat16 as the compute dtype.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

In [None]:
# Tokenizer for the selected checkpoint; the EOS token doubles as the padding token.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token

# Model weights loaded with the 4-bit quantization settings; device placement
# is delegated to the loader via device_map="auto".
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=quant_config,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

In [None]:
# Main generation function
def generate_synthetic_data(prompt, rows=5):
    """Ask the loaded chat model for synthetic data and return only its reply.

    Relies on the module-level ``tokenizer`` and ``model`` objects.

    Args:
        prompt: Free-text description of the dataset to generate.
        rows: Number of rows to request. Coerced to ``int`` so a float value
            (e.g. from a UI slider) is rendered as ``5`` rather than ``5.0``.

    Returns:
        The decoded model reply with surrounding whitespace stripped. The
        system and user prompt text are not included.
    """
    messages = [
        {"role": "system", "content": "You are a helpful assistant that generates synthetic data in structured formats (CSV/JSON/Markdown) for testing."},
        {"role": "user", "content": f"Please generate {int(rows)} rows of synthetic data. {prompt}"}
    ]
    # add_generation_prompt=True appends the assistant header so the model
    # answers the request instead of continuing the user's turn.
    # return_dict=True yields input_ids together with the attention mask.
    encoded = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt",
    ).to(model.device)  # follow the model's device instead of hard-coding "cuda"
    prompt_length = encoded["input_ids"].shape[-1]

    outputs = model.generate(
        **encoded,
        max_new_tokens=1024,
        # pad and EOS are the same token for this tokenizer; pass it explicitly.
        pad_token_id=tokenizer.eos_token_id,
    )

    # generate() returns prompt + continuation; keep only the newly generated
    # tokens so the prompt text does not leak into the result.
    new_tokens = outputs[0][prompt_length:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

In [None]:
# Gradio UI: a dataset-description textbox and a row-count slider are passed to
# generate_synthetic_data; the returned text is shown in a code component.
interface = gr.Interface(
    title="Synthetic Data Generator",
    description="Describe the type of data you need and how many rows. This tool will generate realistic-looking synthetic data using open-source models.",
    fn=generate_synthetic_data,
    inputs=[
        gr.Textbox(label="Describe the Dataset (e.g. 'fake user data with name, email, and age')"),
        gr.Slider(minimum=1, maximum=50, step=1, value=5, label="Number of Rows"),
    ],
    outputs=gr.Code(label="🧪 Synthetic Data Output"),
)

In [None]:
# Start the Gradio app; share=True additionally requests a public share link.
# NOTE(review): anyone holding that link can submit prompts to the model — confirm this is intended.
interface.launch(share=True)