# Generate Synthetic Data

- Write models that can generate datasets
- Use a variety of models and prompts for diverse outputs
- Create a Gradio UI for your product

In [3]:
import os
import requests
from dotenv import load_dotenv
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig
import torch
from IPython.display import Markdown, display, update_display

from openai import OpenAI
import ollama

import gradio as gr


In [4]:
# Pull variables from the local .env file; override=True lets them replace values already set
load_dotenv(override=True)

# Authenticate against the Hugging Face Hub with the token stored under HF_API_KEY
hf_token = os.environ.get('HF_API_KEY')
login(token=hf_token)

In [5]:
# Hugging Face Hub id of the instruct model that is passed to generate() further down
MODEL = "meta-llama/Meta-Llama-3.1-8B-Instruct"

In [11]:
# System prompt, written as adjacent string literals (one clause per literal) that
# Python concatenates at compile time. Trailing spaces separate the sentences.
system_prompt = (
    "You are a helpful assistant that can take a topic and a list of data types, and use this information to generate a dataset on the topic mentioned, and include an attribute of each type in the list of data types. "
    "If the user doesn't provide a topic or a list of data types to include, generate a dataset with a variety of attribute types that has many use cases, "
    "for example, include attributes that are categorical, datetime, boolean, numeric or float, etc. "
    "Your answer should include a link to download the generated dataset as json and as csv. "
    "Your answer should also include a dictionary that lists all included attributes with their names and their data type. "
)

In [12]:
# Echo the assembled prompt to check the concatenated text
system_prompt

"You are a helpful assistant that can take a topic and a list of data types, and use this information to generate a dataset on the topic mentioned, and include an attribute of each type in the list of data types. If the user doesn't provide a topic or a list of data types to include, generate a dataset with a variety of attribute types that has many use cases, for example, include attributes that are categorical, datetime, boolean, numeric or float, etc. Your answer should include a link to download the generated dataset as json and as csv. Your answer should also include a dictionary that lists all included attributes with their names and their data type. "

In [21]:
def user_prompt(topic, datatypes: "list | None" = None):
    """Build the user message asking the model for a dataset on ``topic``.

    Args:
        topic: Subject of the dataset; interpolated into the prompt as-is.
        datatypes: Data types that must each be represented by an attribute.
            Entries are converted with ``str()``. When ``None`` or empty, the
            data-type request is omitted, leaving the choice of attribute
            types to the model.

    Returns:
        The prompt text: the dataset request, the optional list of data types
        (one per line), and the instructions about what the answer must contain.
    """
    # Collect the leading lines first so the data-type section can be skipped
    # cleanly instead of leaving a dangling "Include an attribute..." header.
    lines = [f"Generate a dataset for the topic {topic}. "]
    if datatypes:
        lines.append("Include an attribute for each of the following data types: ")
        lines.extend(str(datatype) for datatype in datatypes)

    # Named `prompt` rather than `user_prompt` so the local does not shadow the function.
    prompt = "\n".join(lines)
    prompt += "\n\nIn your answer, include a preview of the first 10 rows of the dataset, " \
    "a link to download the dataset in json and in csv, " \
    "and a dictionary mapping of attributes included in the dataset and their data type."
    return prompt

In [22]:
# Smoke-test the prompt builder with a sample topic and print the resulting text
flowers = user_prompt(topic="flowers", datatypes=["categorical", "boolean", "date"])
print(flowers)

Generate a dataset for the topic flowers. 
Include an attribute for each of the following data types: 
categorical
boolean
date

In your answer, include a preview of the first 10 rows of the dataset, a link to download the dataset in json and in csv, and a dictionary mapping of attributes included in the dataset and their data type.


In [None]:
# Chat transcript for the flowers test: system instructions first, then the user request
messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": user_prompt("flowers", ["categorical", "boolean", "date"])},
]

In [24]:
# Quantization config: load the weights in 4-bit NF4 with double quantization and
# run the computation in bfloat16.
quant_config = BitsAndBytesConfig(
    # The option is spelt `load_in_4bit`; the previous `load_in_4_bit` is not a
    # recognised keyword, so 4-bit loading was never actually switched on.
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4"
)

In [25]:
# Generate function adapted from the Colab example
def generate(model, messages, max_new_tokens=1000):
    """Stream a chat completion from a quantized Hugging Face model and return it.

    Args:
        model: Hub repo id (or local path) handed to ``from_pretrained`` for both
            the tokenizer and the causal LM.
        messages: Chat history as a list of ``{"role": ..., "content": ...}`` dicts.
        max_new_tokens: Upper bound on the number of newly generated tokens.

    Returns:
        The decoded completion with the prompt and special tokens stripped.
        The text is also streamed to stdout while it is being generated.
    """
    tokenizer = AutoTokenizer.from_pretrained(model)
    tokenizer.pad_token = tokenizer.eos_token

    # Kept under a separate name so `model` keeps meaning "repo id" throughout.
    llm = AutoModelForCausalLM.from_pretrained(model, device_map="auto", quantization_config=quant_config)

    # return_dict=True yields input_ids *and* attention_mask; the mask matters
    # because the pad token is the same as the eos token. Inputs follow the
    # model's device instead of assuming "cuda", since device_map="auto"
    # decides the placement.
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    ).to(llm.device)

    streamer = TextStreamer(tokenizer)
    outputs = llm.generate(**inputs, max_new_tokens=max_new_tokens, streamer=streamer)

    # The output sequence starts with the prompt tokens; return only the new ones.
    prompt_length = inputs["input_ids"].shape[-1]
    return tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

In [None]:
# Run the flowers test end to end; the trailing semicolon keeps the cell from
# echoing a return value on top of the streamed text.
generate(MODEL, messages=messages);

config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]