# Dataset generator (Meta-Llama-3.1-8B-Instruct model)

In [None]:
#lib install
!pip install -q requests torch bitsandbytes transformers sentencepiece accelerate gradio

**Imports**

In [None]:
#imports
import json
import os

import gradio as gr
import requests
import torch
from google.colab import drive
from google.colab import userdata
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig

# Read the Hugging Face access token from the Colab secret named HF_TOKEN
# and log in to the Hub with it (also stored as a git credential).
hf_token = userdata.get('HF_TOKEN')
login(token=hf_token, add_to_git_credential=True)

**Model**

In [None]:
# Load the causal LM with 4-bit bitsandbytes quantization.
model_name = "meta-llama/Meta-Llama-3.1-8B-Instruct"

# NF4 weights with double quantization; compute runs in bfloat16.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# device_map="auto" lets the loader decide where the weights are placed.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quant_config,
    device_map="auto",
)

**Tokenizer**

In [None]:
# Load the tokenizer published under the same checkpoint name as the model.
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

**Functions**

In [None]:

#generation dataset function
def generate_dataset(topic, number_of_data, inst1, resp1, inst2, resp2, inst3, resp3):
    """Generate an instruction/response dataset for ``topic`` with the chat model.

    The three instruction/response pairs are embedded in the system prompt as
    few-shot examples, the conversation is rendered with the tokenizer's chat
    template, and only the newly generated text is returned.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        topic: Subject the generated dataset should cover.
        number_of_data: How many examples the model is asked to produce.
        inst1, resp1, inst2, resp2, inst3, resp3: Three few-shot
            instruction/response pairs.

    Returns:
        str: The decoded completion, without the prompt and without special
        tokens.
    """
    # Convert user inputs into multi-shot examples
    multi_shot_examples = [
        {"instruction": inst1, "response": resp1},
        {"instruction": inst2, "response": resp2},
        {"instruction": inst3, "response": resp3}
    ]

    # The prompt asks for JSON, so show the examples as real JSON (double
    # quotes) rather than as a Python repr with single quotes.
    examples_json = json.dumps(multi_shot_examples, ensure_ascii=False)

    # System prompt
    system_prompt = f"""
    You are a helpful assistant whose main purpose is to generate datasets.
    Topic: {topic}
    Return the dataset in JSON format. Use examples with simple, and easy-to-understand instructions for basic understanding of provided topic.
    Include the following examples: {examples_json}
    Return {number_of_data} examples each time.
    Do not repeat the provided examples.
    """

    # Example Messages
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": f"Please generate my dataset for {topic}"}
    ]

    # Tokenize Input
    # add_generation_prompt=True appends the assistant header so the model
    # starts its own turn; return_dict=True also yields the attention mask.
    # The tensors follow the model's device instead of assuming "cuda".
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt",
    ).to(model.device)
    streamer = TextStreamer(tokenizer)

    # Generate Output
    outputs = model.generate(**inputs, max_new_tokens=2000, streamer=streamer)

    # Decode and Return
    # generate() returns prompt + completion; drop the prompt tokens so the
    # system prompt is not echoed back to the caller.
    prompt_length = inputs["input_ids"].shape[-1]
    return tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)


def gradio_interface(topic, number_of_data, inst1, resp1, inst2, resp2, inst3, resp3):
    """Gradio callback that forwards the form values to ``generate_dataset``.

    Returns whatever ``generate_dataset`` returns for the same arguments.
    """
    few_shot_fields = (inst1, resp1, inst2, resp2, inst3, resp3)
    return generate_dataset(topic, number_of_data, *few_shot_fields)

In [None]:
# Fix: keep the system prompt from being returned as part of the output

def generate_dataset2(topic, number_of_data, inst1, resp1, inst2, resp2, inst3, resp3):
    """Generate an instruction/response dataset from a plain-text prompt.

    Unlike a chat-template call, the prompt is a single instructional text
    containing the three few-shot examples. Only the text produced after the
    prompt is returned.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        topic: Subject the generated dataset should cover.
        number_of_data: How many examples the model is asked to produce.
        inst1, resp1, inst2, resp2, inst3, resp3: Three few-shot
            instruction/response pairs.

    Returns:
        str: The decoded completion with the prompt removed, special tokens
        skipped and surrounding whitespace stripped.
    """
    # Multi-shot examples as plain text
    example_text = f"""
Examples:
Instruction: {inst1}
Response: {resp1}

Instruction: {inst2}
Response: {resp2}

Instruction: {inst3}
Response: {resp3}
"""

    # Plain prompt (instructional style)
    prompt = f"""You are a helpful assistant that generates simple JSON datasets.
Topic: {topic}
Your goal is to generate {number_of_data} new instruction-response examples in JSON format.
Use clear and easy-to-understand language.
Do not repeat the examples already given.

{example_text}

Now generate the dataset:
"""

    # Tokenize without the chat template; tensors follow the model's device
    # instead of assuming "cuda".
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    streamer = TextStreamer(tokenizer)

    # Generate the answer; unpacking passes the attention mask together with
    # the input ids.
    outputs = model.generate(**inputs, max_new_tokens=2000, streamer=streamer)

    # generate() returns prompt + completion. Cut the prompt off by token
    # count: matching the decoded text against the prompt string breaks as
    # soon as decoding does not reproduce the prompt character for character.
    prompt_length = inputs["input_ids"].shape[-1]
    generated_ids = outputs[0][prompt_length:]

    # Decode only the new tokens
    return tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

def gradio_interface2(topic, number_of_data, inst1, resp1, inst2, resp2, inst3, resp3):
    """Gradio callback that forwards the form values to ``generate_dataset2``.

    Returns whatever ``generate_dataset2`` returns for the same arguments.
    """
    few_shot_fields = (inst1, resp1, inst2, resp2, inst3, resp3)
    return generate_dataset2(topic, number_of_data, *few_shot_fields)


**Default Values**

In [None]:
# Values used to pre-fill the form fields.
default_topic = "Explaining basic financial concepts to beginners"
default_number_of_data = 10

# Three few-shot examples, written as (instruction, response) pairs and
# expanded into dicts with "instruction" / "response" keys.
default_multi_shot_examples = [
    {"instruction": instruction, "response": response}
    for instruction, response in (
        (
            "What is a budget?",
            "A budget is a simple plan to help you decide how to spend your money so you don’t run out.",
        ),
        (
            "Why is saving money important?",
            "Saving money helps you buy things later or handle emergencies without stress.",
        ),
        (
            "What does interest mean in finance?",
            "Interest is extra money you earn when you save, or extra money you pay when you borrow.",
        ),
    )
]

**Init gradio**

In [None]:
# Build the form: topic, example count, then an instruction/response textbox
# pair for each of the three few-shot examples, in the same positional order
# as the parameters of gradio_interface2.
gr_interface = gr.Interface(
    fn=gradio_interface2,
    inputs=[
        gr.Textbox(label="Topic", value=default_topic, lines=2),
        gr.Number(label="Number of Examples", value=default_number_of_data, precision=0),
        *(
            gr.Textbox(
                label=f"{caption} {slot + 1}",
                value=default_multi_shot_examples[slot][key],
            )
            for slot in range(3)
            for caption, key in (("Instruction", "instruction"), ("Response", "response"))
        ),
    ],
    outputs=gr.Textbox(label="Generated Dataset"),
)

**Run the app**

In [None]:
# Start the Gradio app for the interface held in `gr_interface`.
gr_interface.launch()