In [1]:
# Install required packages in Google Colab
%pip install -q python-dotenv gradio anthropic openai requests torch bitsandbytes transformers sentencepiece accelerate

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m308.0/308.0 kB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.3/61.3 MB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import re
import sys
import subprocess
import threading
import anthropic
import torch
import gradio as gr
from openai import OpenAI
from huggingface_hub import InferenceClient, login
from google.colab import userdata
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer, BitsAndBytesConfig

In [3]:
# Load API credentials from Colab's secret store (`google.colab.userdata`).
# The three secrets below must exist under exactly these names in the Colab
# environment before this cell runs; nothing is hard-coded in the notebook.
openai_api_key = userdata.get("OPENAI_API_KEY")
anthropic_api_key = userdata.get("ANTHROPIC_API_KEY")
hf_token = userdata.get('HF_TOKEN')

In [4]:
# Model identifiers used by the streaming helpers defined later in the notebook.
OPENAI_MODEL = "gpt-4o-mini"
CLAUDE_MODEL = "claude-3-5-sonnet-20240620"
LLAMA = "meta-llama/Meta-Llama-3.1-8B-Instruct"

# CodeQwen: `code_qwen` is the repo id used to fetch the tokenizer, while
# CODE_QWEN_URL is the inference endpoint that stream_code_qwen() sends requests to.
code_qwen = "Qwen/CodeQwen1.5-7B-Chat"
CODE_QWEN_URL = "https://zfkokxzs1xrqv13v.us-east-1.aws.endpoints.huggingface.cloud"

# Authenticate with Hugging Face and create the shared API clients
# (module-level side effects: this cell must run before any stream_* helper).
login(hf_token, add_to_git_credential=True)
openai = OpenAI(api_key=openai_api_key)
claude = anthropic.Anthropic(api_key=anthropic_api_key)

In [5]:
# System prompt shared by every model backend.
# Two deliberate wording choices:
#   * numpy/pandas are named explicitly as the only allowed third-party
#     packages (they are not part of the standard library).
#   * The model is asked to wrap the script in one ```python fence, because
#     extract_code() looks for that fence when the "run code" button is used.
system_message = """
You are a specialized AI assistant for synthetic data generation. Your sole purpose is to create and save synthetic datasets based on a user's request.

The user will provide a business problem and the desired output format (e.g., 'CSV', 'JSON'). You must generate a single, complete Python script that performs the following actions:

1.  Use only Python's built-in libraries plus `numpy` and `pandas`; do not import any other third-party package.
2.  Generate a synthetic dataset that fits the described business problem.
3.  Save the DataFrame to a file in the requested format.
4.  When saving to JSON, use a standard `with open(...)` block to handle file encoding explicitly.
5.  Print a confirmation message upon successful generation, including the name of the file created.

Ensure all Python code blocks are correctly indented. The final output must contain **only** the executable Python code, wrapped in a single ```python fenced code block, with no other text or explanation outside the fence.
"""


In [6]:
def user_prompt(**input_data):
  """Build the user-facing prompt describing the dataset to generate.

  Expected keyword arguments:
    dataset_type: dataset category; inserted lower-cased.
    output_format: file format; inserted upper-cased.
    business_problem: free-text description of the problem.
    num_samples: number of samples to request.

  Returns:
    The multi-line prompt string.
  """
  kind = input_data["dataset_type"].lower()
  file_format = input_data["output_format"].upper()
  problem = input_data["business_problem"]
  sample_count = input_data["num_samples"]

  # Every line of the prompt carries the same six-space indent.
  pad = " " * 6
  return (
      f"\n{pad}Generate a synthetic {kind} dataset in {file_format} format."
      f"\n{pad}Business problem: {problem}"
      f"\n{pad}Samples: {sample_count}"
      f"\n{pad}"
  )


In [7]:
def stream_gpt(user_prompt):
  """Stream a completion from the OpenAI chat API.

  Sends the shared system prompt plus `user_prompt` to OPENAI_MODEL and yields
  the accumulated response text after every streamed chunk, so each yielded
  value is the full text received so far.
  """
  conversation = [
      {"role": "system", "content": system_message},
      {"role": "user","content": user_prompt},
  ]
  completion_chunks = openai.chat.completions.create(
      model=OPENAI_MODEL,
      messages=conversation,
      stream=True,
  )

  accumulated = ""
  for piece in completion_chunks:
      delta_text = piece.choices[0].delta.content
      # Chunks without content still produce a yield of the text so far.
      if delta_text:
          accumulated = accumulated + delta_text
      yield accumulated

  return accumulated


def stream_claude(user_prompt):
  """Stream a completion from the Anthropic messages API.

  Sends the shared system prompt plus `user_prompt` to CLAUDE_MODEL and yields
  the accumulated reply after every streamed text fragment, so each yielded
  value is the full text received so far.

  Unlike the earlier version, fragments are not echoed to stdout: that was
  leftover debugging output that the other stream_* helpers do not produce.
  """
  reply = ""
  # The stream manager is used as a context manager so it is closed even if
  # the consumer stops iterating early or an error occurs.
  with claude.messages.stream(
      model=CLAUDE_MODEL,
      max_tokens=2000,
      system=system_message,
      messages=[
          {"role": "user", "content": user_prompt}
      ],
  ) as stream:
      for text in stream.text_stream:
          reply += text
          yield reply
  return reply


In [8]:
def _load_llama():
  """Load the Llama tokenizer and 4-bit quantized model once, then reuse them.

  Loading the model is by far the most expensive step, so the pair is cached on
  the function object instead of being rebuilt for every request.
  """
  if not hasattr(_load_llama, "_cached"):
    tokenizer = AutoTokenizer.from_pretrained(LLAMA)
    tokenizer.pad_token = tokenizer.eos_token

    quant_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_quant_type="nf4"
    )

    model = AutoModelForCausalLM.from_pretrained(
        LLAMA,
        device_map="auto",
        quantization_config=quant_config
    )
    _load_llama._cached = (tokenizer, model)
  return _load_llama._cached


def stream_llama(user_prompt):
  """Stream a completion from the locally loaded Llama model.

  Yields the accumulated reply after every streamed text fragment, so each
  yielded value is the full text generated so far.

  Fixes relative to the earlier version:
    * The chat template is rendered with `add_generation_prompt=True`, so the
      model is explicitly cued to answer as the assistant. Previously the loop
      waited for the assistant header to appear inside a single streamed
      chunk, which could result in nothing being yielded at all.
    * Special tokens are stripped by the streamer instead of by string
      matching on "<|eot_id|>".
    * Whitespace-only fragments (newlines, indentation) are kept; dropping
      them corrupted the generated Python code.
    * Inputs are moved to the model's device rather than a hard-coded "cuda".
    * The model is loaded once (see _load_llama) and the worker thread is joined.

  Raises:
    Any exception from loading or generation is logged and re-raised.
  """
  try:
    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_prompt},
    ]

    tokenizer, model = _load_llama()

    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to(model.device)
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    # generate() blocks until completion, so it runs in a worker thread while
    # this generator consumes the streamer.
    thread = threading.Thread(target=model.generate, kwargs={
        "input_ids": inputs,
        "max_new_tokens": 1000,
        "pad_token_id": tokenizer.eos_token_id,
        "streamer": streamer
    })
    thread.start()

    reply = ""
    for new_text in streamer:
        if not new_text:
            continue
        reply += new_text
        yield reply

    thread.join()
    return reply

  except Exception as e:
    print(f"LLaMA error: {e}")
    raise


In [9]:
def stream_code_qwen(user_prompt):
    """Stream a completion from the CodeQwen inference endpoint.

    The chat prompt is rendered locally with the CodeQwen tokenizer's chat
    template and sent to CODE_QWEN_URL. Yields the accumulated text after each
    streamed token, so each yielded value is the full text received so far.

    Tokens flagged as special by the endpoint (for example the end-of-turn
    marker) are skipped so they do not end up inside the generated code.
    """
    tokenizer = AutoTokenizer.from_pretrained(code_qwen)
    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_prompt},
    ]
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    client = InferenceClient(CODE_QWEN_URL, token=hf_token)
    stream = client.text_generation(text, stream=True, details=True, max_new_tokens=3000)
    result = ""
    for r in stream:
        # getattr keeps this tolerant of token objects without a `special` flag.
        if getattr(r.token, "special", False):
            continue
        result += r.token.text
        yield result

In [10]:
def generate_from_inputs(model, **input_data):
  """Build the prompt from `input_data` and stream the chosen model's output.

  Args:
    model: backend label; one of "GPT", "Claude", "Llama" or "Code Qwen".
    **input_data: forwarded unchanged to user_prompt().

  Yields:
    Each partial response produced by the selected stream_* helper.

  Raises:
    ValueError: if `model` is not one of the known labels.
  """
  prompt_text = user_prompt(**input_data)

  backends = (
      ("GPT", stream_gpt),
      ("Claude", stream_claude),
      ("Llama", stream_llama),
      ("Code Qwen", stream_code_qwen),
  )

  # Pick the first backend whose label equals the requested model.
  for label, streamer in backends:
    if model == label:
      result = streamer(prompt_text)
      break
  else:
    raise ValueError("Unknown model")

  for partial in result:
    yield partial

  return result


In [11]:
def handle_generate(business_problem, dataset_type, dataset_format, num_samples, model):
  """Gradio callback: stream the generated script for the given form values.

  The positional form inputs are mapped onto the keyword names that
  generate_from_inputs() / user_prompt() expect (note that `dataset_format`
  is passed on as `output_format`), and every partial response is re-yielded.
  """
  partial_outputs = generate_from_inputs(
      model,
      business_problem=business_problem,
      dataset_type=dataset_type,
      output_format=dataset_format,
      num_samples=num_samples,
  )
  for partial in partial_outputs:
      yield partial


In [12]:
def extract_code(text):
  """Extract the Python source from a model response.

  Selection rules, in order:
    1. The first fenced block tagged `python`, `python3` or `py`
       (case-insensitive, LF or CRLF line endings).
    2. Otherwise the first fenced block with no language tag.
    3. If the response contains no fence at all, the whole stripped text is
       treated as code, since the system prompt asks models for code only.

  A fence that is never closed (e.g. the response hit the token limit) is
  accepted and runs to the end of the text.

  Args:
    text: the raw model response; may be empty or None.

  Returns:
    The extracted source, or "" when nothing usable is found.
  """
  if not text or not text.strip():
      return ""

  # Each match is (language tag, body); pairing opening and closing fences in
  # a single pass prevents a closing fence from being read as an opening one.
  blocks = re.findall(r"```([^\r\n`]*)\r?\n(.*?)(?:```|\Z)", text, re.DOTALL)

  for accepted_tags in (("python", "python3", "py"), ("",)):
      for tag, body in blocks:
          if tag.strip().lower() in accepted_tags:
              return body

  if "```" not in text:
      return text.strip()

  print("No matching substring found.")
  return ""


def execute_code_in_virtualenv(text, python_interpreter=sys.executable):
  """Run the Python code contained in a model response and return its output.

  SECURITY NOTE: this executes model-generated code with the privileges of the
  notebook process. Only run code you have reviewed, and do not expose this
  callback on a public deployment without sandboxing.

  Args:
    text: model response; the code is pulled out with extract_code().
    python_interpreter: interpreter used to run the code (defaults to the
      current one).

  Returns:
    The script's stdout on success; a message starting with
    "Execution error:" that includes the child's stderr (traceback) on
    failure; or a short notice when no code could be extracted.

  Raises:
    EnvironmentError: if `python_interpreter` is empty.
  """
  if not python_interpreter:
      raise EnvironmentError("Python interpreter not found in the specified virtual environment.")

  code_str = extract_code(text)
  if not code_str or not code_str.strip():
      # Running an empty script would "succeed" with blank output, which hides
      # the fact that nothing was executed.
      return "No Python code found to execute."

  command = [python_interpreter, '-c', code_str]

  try:
      result = subprocess.run(command, check=True, capture_output=True, text=True)
  except subprocess.CalledProcessError as e:
      # str(e) only carries the command line and exit status; the traceback the
      # user needs is in the captured stderr.
      details = (e.stderr or "").strip() or str(e)
      return f"Execution error:\n{details}"

  return result.stdout


In [None]:
def update_output_format(dataset_type):
    """Return the output-format dropdown update matching a dataset type.

    "Text" datasets only offer JSON; "Tabular" and "Time-series" offer JSON and
    csv. JSON is always the selected value. Any other dataset type yields None.
    """
    if dataset_type == "Text":
        return gr.update(choices=["JSON"], value="JSON")
    if dataset_type in ("Tabular", "Time-series"):
        return gr.update(choices=["JSON", "csv"], value="JSON")
    return None

# Build the Gradio interface: a form describing the dataset, one button that
# streams a generated script from the selected model, and one button that
# executes that script.
with gr.Blocks() as ui:
    gr.Markdown("## Create a dataset for a business problem")

    with gr.Column():
        business_problem = gr.Textbox(label="Business problem", lines=2)
        # An explicit default is set because user_prompt() calls .lower() on the
        # selected dataset type, which fails if nothing is selected (None).
        # "Tabular" matches the initial choices of the output-format dropdown.
        dataset_type = gr.Dropdown(
            ["Tabular", "Time-series", "Text"], label="Dataset type", value="Tabular"
        )

        output_format = gr.Dropdown(choices=["JSON", "csv"], value="JSON", label="Output Format")

        num_samples = gr.Number(label="Number of samples", value=10, precision=0)

        model = gr.Dropdown(["GPT", "Claude", "Llama", "Code Qwen"], label="Select model", value="GPT")

        # Keep the available output formats in sync with the dataset type.
        dataset_type.change(update_output_format, inputs=[dataset_type], outputs=[output_format])

    with gr.Row():
        with gr.Column():
            dataset_run = gr.Button("データセットを作成")
            gr.Markdown("""⚠️ Llama や Code Qwen を使用する場合、生成されるコードは最適でない可能性があります。
                              実行前に内容を確認することを推奨します。誤りが含まれる場合があります。""")

        with gr.Column():
            code_run = gr.Button("データセット用コードを実行")
            gr.Markdown("""⚠️ このアプリをコード実行付きで公開する際は注意が必要です。
                            ユーザーが生成したコードの実行は、潜在的な脆弱性につながる可能性があり、
                            本ツールの利用は責任を持って行ってください。""")

    with gr.Row():
        # dataset_out receives the streamed model response (the generated script);
        # code_out receives whatever execute_code_in_virtualenv() returns.
        dataset_out = gr.Textbox(label="Generated Dataset")
        code_out = gr.Textbox(label="Executed code")

    dataset_run.click(
        handle_generate,
        inputs=[business_problem, dataset_type, output_format, num_samples, model],
        outputs=[dataset_out]
    )

    code_run.click(
        execute_code_in_virtualenv,
        inputs=[dataset_out],
        outputs=[code_out]
    )

In [14]:
# Start the Gradio app defined in the `ui` Blocks context.
ui.launch(inbrowser=True)

It looks like you are running Gradio on a hosted Jupyter notebook, which requires `share=True`. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://a98162bd476efce801.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


