## Environment Preparation

In [None]:
%pip install python-dotenv > /dev/null 2>&1
%pip install datasets > /dev/null 2>&1
%pip install openai > /dev/null 2>&1

In [None]:
from dotenv import load_dotenv

# Load the .env file so that the settings read from os.environ further down
# (AZURE_OPENAI_ENDPOINT and AZURE_OPENAI_API_KEY) are available.
load_dotenv()

## Create Data for Training

Load a dataset from local CSV files and convert it into JSONL chat-format data for fine-tuning an Azure OpenAI model.

In [8]:
import json
import os
import re

from datasets import load_dataset

# Paths to the CSV data files; replace them with the correct paths.
train_file = "./training_example.csv"
test_file = "./testing_example.csv"

# Load both CSV files in one call so each becomes a named split of `ds`.
ds = load_dataset(
    'csv',
    data_files={'train': train_file, 'test': test_file},
    delimiter=',',
)
print(ds)

def sanitize_text(text: str) -> str:
    """
    Clean up a piece of text before it is placed in a training record.

    Leading and trailing whitespace is stripped, every run of two or more
    whitespace characters (spaces, tabs, newlines) is collapsed into a single
    space, and all double and single quote characters are removed. A lone
    whitespace character inside the text (for example a single newline) is
    left untouched.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned text.
    """
    # Trim first, then squeeze whitespace runs of length >= 2 down to one space.
    collapsed = re.sub(r"\s{2,}", " ", text.strip())

    # Drop " and ' from the text.
    return collapsed.replace('"', "").replace("'", "")

def prepare_training_data(key:str):
    """
    Build chat-format fine-tuning records for one dataset split and save them as JSONL.

    Every row of ``ds[key]`` becomes one record holding a system, a user and an
    assistant message, each passed through ``sanitize_text``. The records are
    written, one JSON object per line, to ``training_data/<key>.jsonl`` under
    the current working directory. The directory is created when missing and an
    existing file of the same name is overwritten.

    Args:
        key: Name of the split in the module-level ``ds`` to export
            (the notebook uses 'train' and 'test').
    """
    system_prompt = """
        You are an Xbox customer support agent whose primary goal is to help users with issues they are experiencing with their Xbox devices. You are friendly and concise. You only provide factual answers to queries, and do not provide answers that are not related to Xbox.
        """
    # The system message is the same for every record, so sanitize it only once.
    system_message = {"role": "system", "content": sanitize_text(system_prompt)}

    training_data = [
        {
            "messages": [
                system_message,
                # Replace 'user' / 'assistant' with the correct column names.
                {"role": "user", "content": sanitize_text(example['user'])},
                {"role": "assistant", "content": sanitize_text(example['assistant'])}
            ]
        }
        for example in ds[key]
    ]

    data_dir = os.path.join(os.getcwd(), 'training_data')
    os.makedirs(data_dir, exist_ok=True)

    # Serialize with json.dumps rather than str(dict) with swapped quotes:
    # repr() emits escapes such as \xa0 or \x1b for non-printable characters,
    # which are not valid JSON, whereas json.dumps always yields a valid line.
    with open(os.path.join(data_dir, f"{key}.jsonl"), "w", encoding="utf-8") as f:
        for record in training_data:
            f.write(json.dumps(record) + "\n")

# Export both dataset splits, one JSONL file per split.
for split_name in ('train', 'test'):
    prepare_training_data(split_name)

Generating train split: 3 examples [00:00, 415.69 examples/s]
Generating test split: 2 examples [00:00, 804.82 examples/s]

DatasetDict({
    train: Dataset({
        features: ['user', 'assistant'],
        num_rows: 3
    })
    test: Dataset({
        features: ['user', 'assistant'],
        num_rows: 2
    })
})





## Upload Data to Azure OpenAI

In [None]:
import os
from openai import AzureOpenAI

# Build the client from the endpoint and API key held in environment variables.
client = AzureOpenAI(
  azure_endpoint = os.environ.get("AZURE_OPENAI_ENDPOINT",""),
  api_key=os.environ.get("AZURE_OPENAI_API_KEY",""),
  api_version="2024-05-01-preview"  # This API version or later is required to access seed/events/checkpoint capabilities
)

training_file_name = os.path.join(os.getcwd(), 'training_data', 'train.jsonl')
validation_file_name = os.path.join(os.getcwd(), 'training_data', 'test.jsonl')

# Upload the training and validation dataset files to Azure OpenAI with the SDK.
# Each file is opened in a `with` block so its handle is closed as soon as the
# upload call returns, instead of being left open for the life of the kernel.

with open(training_file_name, "rb") as training_file:
    training_response = client.files.create(
        file=training_file, purpose="fine-tune"
    )
training_file_id = training_response.id

with open(validation_file_name, "rb") as validation_file:
    validation_response = client.files.create(
        file=validation_file, purpose="fine-tune"
    )
validation_file_id = validation_response.id

print("Training file ID:", training_file_id)
print("Validation file ID:", validation_file_id)

## Create a Fine-Tune Job

In [None]:
# Submit a fine-tuning job that uses the uploaded training and validation files.
response = client.fine_tuning.jobs.create(
    training_file=training_file_id,
    validation_file=validation_file_id,
    model="gpt-35-turbo-0613", # Enter base model name. Note that in Azure OpenAI the model name contains dashes and cannot contain dot/period characters.
    seed = 105  # seed parameter controls reproducibility of the fine-tuning job. If no seed is specified one will be generated automatically.
)

job_id = response.id

# You can use the job ID to monitor the status of the fine-tuning job.
# The fine-tuning job will take some time to start and complete.

print("Job ID:", response.id)
# Report the job's status here (this line previously printed the job ID a second time).
print("Status:", response.status)
print(response.model_dump_json(indent=2))

## Check Fine-Tune Job Status

In [None]:
# Fetch the current state of the fine-tuning job identified by job_id.
# Re-run this cell to refresh the status shown below.
response = client.fine_tuning.jobs.retrieve(job_id)

print("Job ID:", response.id)
print("Status:", response.status)
# Dump the full job object as indented JSON for inspection.
print(response.model_dump_json(indent=2))