### 1. Set Up Environment Variables
To store credentials securely, rename the `.env.sample` file to `.env` (keeping it in the same directory as the notebook) and update the variables with the required connection information.

### 2. Install Dependencies


In [6]:
import os
from azure.identity import DefaultAzureCredential, get_bearer_token_provider
from azure.core.credentials import AzureKeyCredential
from openai import AzureOpenAI
import dotenv
import json

# Load the variables defined in the local .env file into the process
# environment so the os.getenv(...) lookups in the next cell can read them.
dotenv.load_dotenv()

True

### 3. Load environment variables and instantiate your OpenAI client

In [None]:
# Pull the Azure OpenAI connection settings from the environment.
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
AZURE_OPENAI_CHAT_COMPLETION_DEPLOYED_MODEL_NAME = os.getenv("AZURE_OPENAI_CHAT_COMPLETION_DEPLOYED_MODEL_NAME")

# Options shared by both authentication modes.
client_options = {
    "azure_endpoint": AZURE_OPENAI_ENDPOINT,
    "api_version": "2024-10-21",
}

if not AZURE_OPENAI_API_KEY:
    # No API key configured: authenticate with a bearer token obtained
    # through DefaultAzureCredential instead.
    azure_credential = DefaultAzureCredential()
    token_provider = get_bearer_token_provider(
        azure_credential,
        "https://cognitiveservices.azure.com/.default",
    )
    client_options["azure_ad_token_provider"] = token_provider
else:
    # An API key is available: use key-based authentication.
    client_options["api_key"] = AZURE_OPENAI_API_KEY

# Single construction point for the client used by the rest of the notebook.
openai_client = AzureOpenAI(**client_options)

### 4. Prepare the system prompt

You want consistent outputs in a structured form, so start with a clear system prompt and optionally include few-shot examples.

In [5]:
# System instruction sent to the model that generates the synthetic Q&A data.
# Split across lines for readability; the pieces concatenate to one string.
SYSTEM_PROMPT = (
    "You are a data generator. "
    "Create realistic and varied Q&A pairs about Volkswagen Vans technical glossary terms. "
    "Each pair should explain a concept or abbreviation in a factual and helpful way, "
    "like a glossary copilot would."
)


In [4]:
# Hand-written question/answer pairs illustrating the desired output style.
FEW_SHOT_EXAMPLES = [
    {
        "question": "What does TDI stand for?",
        "answer": (
            "TDI stands for Turbocharged Direct Injection. "
            "It refers to a type of diesel engine developed by Volkswagen that uses "
            "a turbocharger and direct fuel injection to improve performance and efficiency."
        ),
    },
    {
        "question": "What is 4MOTION?",
        "answer": (
            "4MOTION is Volkswagen's all-wheel-drive system that automatically "
            "distributes power between the front and rear wheels to improve "
            "traction and stability on various surfaces."
        ),
    },
]


### 5. Read the data that you want to create synthetic test data from
In this example, we read VW technical glossary terms that were extracted from this page: https://www.volkswagen-vans.co.uk/en/technology/technical-glossary.html
The terms are stored locally in this repo in JSON format. You can store any other source data there instead.

In [18]:
# Generate one synthetic Q&A pair per glossary term and save the results as a
# chat-format JSONL fine-tuning dataset.

GLOSSARY_PATH = "data/vw-technical-glossary/technical_glossary_vw_van.json"
OUTPUT_PATH = "vw_glossary_finetune_dataset.jsonl"

# System message stored in every fine-tuning record (not the generation prompt).
FINE_TUNE_SYSTEM_MESSAGE = "VW Glossary Copilot is a factual chatbot that explains concepts and abbreviations about Volkswagen technical terms."


def _format_few_shot(examples):
    """Render few-shot examples in the 'Q: ...' / 'A: ...' layout the parser expects."""
    return "\n\n".join(
        f"Q: {example['question']}\nA: {example['answer']}" for example in examples
    )


def _parse_qa_pair(text):
    """Extract the first (question, answer) pair from 'Q: ... A: ...' text.

    Returns:
        A ``(question, answer)`` tuple, or ``None`` when a marker is missing,
        the markers are out of order, or the question or answer is empty.
    """
    _, q_marker, after_q_marker = text.partition("Q:")
    # Look for "A:" only after the "Q:" marker, and keep everything that
    # follows it, so an answer that itself contains "A:" is not truncated.
    question, a_marker, answer = after_q_marker.partition("A:")
    # If the model returned several pairs, keep only the first one instead of
    # merging the next question into this answer.
    answer = answer.partition("\nQ:")[0]
    question, answer = question.strip(), answer.strip()
    if not (q_marker and a_marker and question and answer):
        return None
    return question, answer


# The parser depends on 'Q:' / 'A:' markers, so ask for that format explicitly
# and show the few-shot examples in the same layout.
generation_system_prompt = (
    f"{SYSTEM_PROMPT}\n\n"
    "For each term, respond with exactly one question/answer pair and nothing else, "
    "formatted like these examples:\n\n"
    f"{_format_few_shot(FEW_SHOT_EXAMPLES)}"
)

# Explicit UTF-8 so reading does not depend on the platform default encoding.
with open(GLOSSARY_PATH, "r", encoding="utf-8") as f:
    glossary_data = json.load(f)

fine_tune_entries = []

# Each glossary entry is expected to provide "term" and "definition" keys.
for entry in glossary_data:
    term = entry["term"]
    definition = entry["definition"]

    user_prompt = f"Term: {term}\nDefinition: {definition}"

    try:
        response = openai_client.chat.completions.create(
            model=AZURE_OPENAI_CHAT_COMPLETION_DEPLOYED_MODEL_NAME,
            messages=[
                {"role": "system", "content": generation_system_prompt},
                {"role": "user", "content": user_prompt}
            ],
            temperature=0.5
        )

        # content can be None (no text returned); treat that as unparseable
        # output rather than failing on .strip().
        output = (response.choices[0].message.content or "").strip()

        qa_pair = _parse_qa_pair(output)
        if qa_pair is not None:
            question, answer = qa_pair
            fine_tune_entries.append({
                "messages": [
                    {"role": "system", "content": FINE_TUNE_SYSTEM_MESSAGE},
                    {"role": "user", "content": question},
                    {"role": "assistant", "content": answer}
                ]
            })
            print(f"✅ Parsed Q&A for: {term}")
        else:
            print(f"⚠️ Could not parse Q&A for: {term}")

    except Exception as e:
        # Best effort: report the failing term and continue with the rest.
        print(f"Error processing term '{term}': {e}")

# Save to JSONL (one JSON object per line).
with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
    for item in fine_tune_entries:
        f.write(json.dumps(item) + "\n")

print(f"✅ Saved {len(fine_tune_entries)} entries to {OUTPUT_PATH}")

✅ Saved 33 entries to vw_glossary_finetune_dataset.jsonl
