# Build a Model to generate Synthetic Data

This notebook was written in Google Colab and is intended to be run there.

## Imports

In [0]:
!pip install -q gradio

In [0]:
import os
import requests
import json
from google.colab import userdata

from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig
import torch

import gradio as gr

In [0]:
!pip install -U bitsandbytes

In [0]:
# Read the Hugging Face access token stored under the key 'HF_TOKEN' via
# google.colab's `userdata` (presumably the Colab "Secrets" panel - confirm).
hf_token = userdata.get('HF_TOKEN')

## Open Source Models from HF

In [0]:
# Hugging Face Hub repository ids of the chat models offered in the UI.
deepseek_model = "deepseek-ai/deepseek-llm-7b-chat"
llama_model = "meta-llama/Meta-Llama-3.1-8B-Instruct"
qwen2 = "Qwen/Qwen2-7B-Instruct"

In [0]:
# Authenticate this session with the Hugging Face Hub using the token read
# earlier (presumably required to download gated checkpoints - confirm).
login(hf_token, add_to_git_credential=True)

## Creating Prompts

In [0]:
# System message sent with every request: sets the assistant's role.
# Each sentence ends with a space so adjacent sentences are not glued
# together when the literals are concatenated.
system_prompt = (
    "You are an expert in generating synthetic datasets. "
    "Your goal is to generate realistic datasets based on a given business "
    "and its requirements from the user. "
    "You will also be given the desired dataset format. "
    "Do not repeat the instructions."
)

# Generic instructions combined with the user's business description: a
# one-shot example of the expected output plus a fallback rule for columns.
user_prompt = (
    "Please provide me a dataset for the following business.\n"
    "For example:\n"
    "The Business: A retail store selling luxury watches.\n"
    "The Data Format: CSV.\n"
    "Output:\n"
    "Item,Price,Quantity,Brand,Sale Date\n"
    "Superocean II, 20.000$, 3, Breitling, 2025-04-08 \n"
    "If I don't provide you the necessary columns, please create the columns based on your knowledge about the given business"
)

In [0]:
def dataset_format(data_format, num_records):
    """Build the prompt sentences that request an output format and size.

    Args:
        data_format: One of 'CSV', 'JSON' or 'Tabular'. Any other value
            produces no format sentence (only the record count is requested).
        num_records: Number of records the model should generate.

    Returns:
        A string such as
        'Please provide the dataset in a CSV format. Please generate 50 records.'
    """
    supported_formats = ('CSV', 'JSON', 'Tabular')

    if data_format in supported_formats:
        # Trailing period and space keep this sentence separated from the
        # record-count sentence that follows.
        format_message = f'Please provide the dataset in a {data_format} format. '
    else:
        format_message = ''

    return f'{format_message}Please generate {num_records} records.'

In [0]:
def complete_user_prompt(user_input, data_format, num_records):
    """Assemble the chat messages sent to the model.

    Args:
        user_input: Free-text business description and data requirements.
        data_format: Desired output format, forwarded to `dataset_format`.
        num_records: Number of records to request, forwarded to
            `dataset_format`.

    Returns:
        A list of two chat messages (dicts with 'role' and 'content'):
        the system prompt and the combined user message.
    """
    sections = [
        (user_input or '').strip(),
        user_prompt,
        dataset_format(data_format, num_records),
    ]
    # Separate the sections with blank lines so the user's text does not run
    # straight into the instructions; empty sections are skipped.
    user_content = '\n\n'.join(section for section in sections if section)

    return [
        {'role': 'system', 'content': system_prompt},
        {'role': 'user', 'content': user_content},
    ]

## Accessing the Models

In [0]:
# Report whether a CUDA device is visible to PyTorch and, if so, its name.
print("CUDA available:", torch.cuda.is_available())
if not torch.cuda.is_available():
    print("No GPU found.")
else:
    print("GPU-Device:", torch.cuda.get_device_name(torch.cuda.current_device()))

In [0]:
# Quantization settings passed to `from_pretrained` when loading a model:
# 4-bit weights of type 'nf4', bfloat16 compute dtype, no double quantization.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=False,
)

In [0]:
def generate_model(model_id, messages):
    """Load a quantized chat model, run one generation and return its reply.

    The model is loaded for each call and released afterwards so that
    different models can be used one after another on a single GPU.

    Args:
        model_id: Hugging Face Hub repository id of the model to load.
        messages: Chat messages (list of dicts with 'role' and 'content').

    Returns:
        The text generated by the model (without the prompt), or a string
        starting with 'Error during generation:' if anything fails. Errors
        are returned rather than raised so the UI can display them.
    """
    import gc  # local import: only needed for the cleanup below

    tokenizer = model = streamer = inputs = outputs = None
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
        model = AutoModelForCausalLM.from_pretrained(
            model_id, device_map='auto', quantization_config=quant_config
        )

        # add_generation_prompt=True appends the tokens that open the
        # assistant turn, so the model answers instead of continuing the
        # user's message. return_dict=True also yields the attention mask.
        inputs = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            return_tensors='pt',
            return_dict=True,
        ).to(model.device)  # follow the model's device instead of hard-coding 'cuda'

        streamer = TextStreamer(tokenizer)
        outputs = model.generate(**inputs, max_new_tokens=2000, streamer=streamer)

        # generate() returns prompt + completion; keep only the new tokens so
        # the prompt is not echoed back into the result.
        prompt_length = inputs['input_ids'].shape[-1]
        return tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    except Exception as e:
        return f'Error during generation: {str(e)}'

    finally:
        # Drop every reference (also after a failure) and release cached GPU
        # memory so the next model load has room.
        tokenizer = model = streamer = inputs = outputs = None
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

## Generate Dataset

In [0]:
def generate_dataset(user_input, target_format, model_choice, num_records):
    """Generate a synthetic dataset with the selected model.

    Args:
        user_input: Free-text business description and data requirements.
        target_format: Desired output format ('CSV', 'JSON' or 'Tabular').
        model_choice: UI label of the model: 'DeepSeek', 'Llama-3.1-8B'
            or 'Qwen2'.
        num_records: Number of records to request.

    Returns:
        The text returned by `generate_model`, or an error message if
        `model_choice` is not one of the known labels.
    """
    model_ids = {
        'DeepSeek': deepseek_model,
        'Llama-3.1-8B': llama_model,
        'Qwen2': qwen2,
    }

    model_id = model_ids.get(model_choice)
    if model_id is None:
        # Without this guard an unknown label would leave model_id unbound
        # and raise UnboundLocalError further down.
        return f'Error during generation: unknown model choice {model_choice!r}'

    messages = complete_user_prompt(user_input, target_format, num_records)
    return generate_model(model_id, messages)

## Creating Gradio UI

In [0]:
# Two-column page: inputs and the Generate button on the left, the generated
# text on the right. Clicking the button calls `generate_dataset`.
with gr.Blocks(title='Synthetic Data Generator') as ui:
    gr.Markdown('# Synthetic Data Generator')

    with gr.Row():
        # Left column: request description and generation options.
        with gr.Column(min_width=600):
            user_inputs = gr.Textbox(
                label='Enter your Business details and data requirements',
                placeholder='Type here...',
                lines=15,
            )

            model_choice = gr.Dropdown(
                ['DeepSeek', 'Llama-3.1-8B', 'Qwen2'],
                value='DeepSeek',
                label='Choose your Model',
            )

            target_format = gr.Dropdown(
                ['CSV', 'JSON', 'Tabular'],
                value='CSV',
                label='Choose your Format',
            )

            num_records = gr.Dropdown(
                [50, 100, 150, 200],
                value=50,
                label='Number of Records',
            )

            generate_button = gr.Button('Generate')

        # Right column: model output.
        with gr.Column():
            output = gr.Textbox(label='Generated Synthetic Data', lines=30)

    # The order of `inputs` matches the parameter order of generate_dataset.
    generate_button.click(
        fn=generate_dataset,
        inputs=[user_inputs, target_format, model_choice, num_records],
        outputs=output,
    )

In [0]:
# Start the Gradio app defined above.
ui.launch(inbrowser=True)