In [2]:
%pip install python-dotenv openai

Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Collecting openai
  Downloading openai-1.59.3-py3-none-any.whl.metadata (27 kB)
Collecting anyio<5,>=3.5.0 (from openai)
  Downloading anyio-4.8.0-py3-none-any.whl.metadata (4.6 kB)
Collecting distro<2,>=1.7.0 (from openai)
  Downloading distro-1.9.0-py3-none-any.whl.metadata (6.8 kB)
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.28.1-py3-none-any.whl.metadata (7.1 kB)
Collecting jiter<1,>=0.4.0 (from openai)
  Downloading jiter-0.8.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.2 kB)
Collecting sniffio (from openai)
  Downloading sniffio-1.3.1-py3-none-any.whl.metadata (3.9 kB)
Collecting typing-extensions<5,>=4.11 (from openai)
  Downloading typing_extensions-4.12.2-py3-none-any.whl.metadata (3.0 kB)
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai)
  Downloading httpcore-1.0.7-py3-none-any.whl.metadata (21 kB)
Collecting h11<0.15,>=0.13 (fro

In [1]:
import re
import os
from openai import AzureOpenAI
import json
from dotenv import load_dotenv

In [2]:
# Load environment variables
load_dotenv(override=True)

# Configuration
AZURE_OPENAI_ENDPOINT = os.getenv("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_API_KEY = os.getenv("AZURE_OPENAI_API_KEY")
SUBSCRIPTION_ID = os.getenv("SUBSCRIPTION_ID")
RESOURCE_GROUP = os.getenv("RESOURCE_GROUP")
AML_WORKSPACE_NAME = os.getenv("AML_WORKSPACE_NAME")

In [3]:
response_format = {
    "type": "json_schema",
    "json_schema": {
      "name": "augment_seed_data",
      "strict": True,
      "schema": {
        "type": "object",
        "properties": {
          "datasets": {
            "type": "array",
            "items": {
              "type": "object",
              "properties": {
                "user_content": {
                  "type": "string"
                },
                "assistant_content": {
                  "type": "string",
                }
              },
              "required": ["user_content", "assistant_content"],
              "additionalProperties": False
            }
          },
        },
        "required": ["datasets"],
        "additionalProperties": False
      }
    }
  }

In [5]:
tool_metadata_path = "../../data/tools_metadata.json"

with open(tool_metadata_path, "r", encoding="utf-8") as file:
    tool_metadata_data = json.load(file)

tool_metadata_data_string = json.dumps(tool_metadata_data, ensure_ascii=False)

In [7]:
client = AzureOpenAI(
  azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT"), 
  api_key=os.getenv("AZURE_OPENAI_API_KEY"),  
  api_version="2024-10-21"
)

output_format = """
{"user_content": "Set temperature to 25 degrees.", "assistant_content": "<tool_call>\n{\"name\": \"set_cabin_temperature\", \"arguments\": {\"temperature\": 25}}\n</tool_call>"}
"""


def augment_seed_annotaion_data(query, augment_number):
    system_message = f"""
	Your role is to augment the provided dataset. Without changing the function name or argument names in `assistant_content` (you can modify the values of the arguments), augment the dataset by following the variations specified for `user_content`. 
	You must select a function and argument from the metadata of the given function list.
 	You need to expand the data up to '{augment_number}' items. 

	### Examples of variations for `user_content`:
	- Change the values or relative degree of patterns.
	- Make the instructions more abstract(e.g. not including the specific value.)
	- Provide more detailed and specific instructions.
	- Make the instructions shorter and more concise.
	- Present the instructions in a natural conversational style.

	### Dataset Description:
	- `user_content`: Instructions from a user (driver, passenger).
	- `assistant_content`: A function and its arguments designed to fulfill the user's instructions. 
 
	### Output Json format:
	{output_format}
 
	### Function List:
	{tool_metadata_data_string}
 
	"""
    message_text = [
		{"role":"system","content": system_message},
		{"role":"user","content": query}
	]
    completion = client.chat.completions.create(
		model="gpt-4o-20240806", # model = "deployment_name"
		messages = message_text,
		response_format=response_format,
		temperature=0.7,
		)
    return completion.choices[0].message.content

In [8]:
test_query = {"user_content": "Please set the air suspension rigidity to level 3.", "assistant_content": "<tool_call>\n{\"name\": \"adjust_air_suspension_rigidity\", \"arguments\": {\"rigidity_level\": 3}}\n</tool_call>"}

test_respone = augment_seed_annotaion_data(str(test_query), 5)

In [11]:
import json

input_file = "../../data/seed_annotation_dataset.jsonl"
output_file = "../../data/output_annotation_dataset_temperature.jsonl"

all_datasets = []

with open(input_file, "r", encoding="utf-8") as infile:
    for line in infile:
        # 1行ずつ読み込む
        seed_annotation_data = json.loads(line.strip())
        
        # 内容をプリント
        print(seed_annotation_data)
        augment_respone = augment_seed_annotaion_data(str(seed_annotation_data), 50)
        json_augment_respone = json.loads(augment_respone)
        
        print(json_augment_respone)
        print("\n")
        
        all_datasets.extend(json_augment_respone.get("datasets", []))

with open(output_file, "w", encoding="utf-8") as file:
	for dataset in all_datasets:
		file.write(json.dumps(dataset, ensure_ascii=False) + "\n")


{'user_content': 'Set the cabin temperature to 22.5 degrees Celsius.', 'assistant_content': '<tool_call>\n{"name": "set_cabin_temperature", "arguments": {"temperature": 22.5}}\n</tool_call>'}
{'datasets': [{'user_content': 'Make the cabin 22.5 degrees.', 'assistant_content': '<tool_call>\n{"name": "set_cabin_temperature", "arguments": {"temperature": 22.5}}\n</tool_call>'}, {'user_content': 'I want the cabin to be 22.5°C.', 'assistant_content': '<tool_call>\n{"name": "set_cabin_temperature", "arguments": {"temperature": 22.5}}\n</tool_call>'}, {'user_content': 'Adjust the cabin to 22.5 degrees Celsius, please.', 'assistant_content': '<tool_call>\n{"name": "set_cabin_temperature", "arguments": {"temperature": 22.5}}\n</tool_call>'}, {'user_content': 'Could you set the cabin temperature to 22.5 degrees?', 'assistant_content': '<tool_call>\n{"name": "set_cabin_temperature", "arguments": {"temperature": 22.5}}\n</tool_call>'}, {'user_content': "Let's set the cabin at 22.5 degrees Celsius."

In [13]:
dataset_synthetic_path = "../../data/dataset_synthetic_temperature.json"
dataset_synthetic = []

system_prompt_obj = {
	"content": f"You are a function calling AI model. You are provided with function signatures within <tools> </tools> XML tags. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Ask for clarification if a user request is ambiguous.\n<tools>\n{tool_metadata_data_string}\n</tools>\nFor each function call return a json object with function name and arguments within <tool_call> </tool_call> tags with the following schema:\n<tool_call>\n{{'arguments': <args-dict>, 'name': <function-name>}}\n</tool_call>\n",
	"role": "system"
}

with open(output_file, "r", encoding="utf-8") as infile:
    for line in infile:

        output_data = json.loads(line.strip())
        
        print(output_data)
        
        user_prompt_obj = {
			"content": output_data["user_content"],
			"role": "user"
		}
        
        assistant_prompt_obj = {
			"content": output_data["assistant_content"],
			"role": "assistant"
		}
        
        line_temp_array = [
			system_prompt_obj,
			user_prompt_obj,
			assistant_prompt_obj
		]
        
        dataset_synthetic.append(line_temp_array)
        
        
with open(dataset_synthetic_path, "w", encoding="utf-8") as file:
	file.write(json.dumps(dataset_synthetic, ensure_ascii=False))

{'user_content': 'Make the cabin 22.5 degrees.', 'assistant_content': '<tool_call>\n{"name": "set_cabin_temperature", "arguments": {"temperature": 22.5}}\n</tool_call>'}
{'user_content': 'I want the cabin to be 22.5°C.', 'assistant_content': '<tool_call>\n{"name": "set_cabin_temperature", "arguments": {"temperature": 22.5}}\n</tool_call>'}
{'user_content': 'Adjust the cabin to 22.5 degrees Celsius, please.', 'assistant_content': '<tool_call>\n{"name": "set_cabin_temperature", "arguments": {"temperature": 22.5}}\n</tool_call>'}
{'user_content': 'Could you set the cabin temperature to 22.5 degrees?', 'assistant_content': '<tool_call>\n{"name": "set_cabin_temperature", "arguments": {"temperature": 22.5}}\n</tool_call>'}
{'user_content': "Let's set the cabin at 22.5 degrees Celsius.", 'assistant_content': '<tool_call>\n{"name": "set_cabin_temperature", "arguments": {"temperature": 22.5}}\n</tool_call>'}
{'user_content': 'Please adjust the temperature to 22.5°C in the cabin.', 'assistant_co