In [1]:
from openai import OpenAI
import json
import tiktoken # for token counting
import numpy as np
from collections import defaultdict

In [2]:
import json
from collections import defaultdict
from tiktoken import get_encoding

def validate_and_estimate_finetuning_data(file_path):
    # Setup
    format_errors = defaultdict(int)
    token_counts = []
    total_tokens = 0
    encoding = get_encoding("cl100k_base")  # For OpenAI models


    # Load the dataset
    with open(file_path, 'r', encoding='utf-8') as f:
        dataset = [json.loads(line) for line in f]

    for idx, ex in enumerate(dataset):
        if not isinstance(ex, dict):
            format_errors["data_type"] += 1
            continue

        messages = ex.get("messages", None)
        if not messages:
            format_errors["missing_messages_list"] += 1
            continue

        # Validate format
        conversation_tokens = 0
        assistant_message_found = False

        for message in messages:
            if "role" not in message or "content" not in message:
                format_errors["message_missing_key"] += 1
                continue

            if any(k not in ("role", "content", "name", "function_call", "weight") for k in message):
                format_errors["message_unrecognized_key"] += 1

            if message.get("role", None) not in ("system", "user", "assistant"):
                format_errors["unrecognized_role"] += 1

            content = message.get("content", None)
            function_call = message.get("function_call", None)

            if (not content and not function_call) or not isinstance(content, str):
                format_errors["missing_content"] += 1

            # Count tokens for each message
            try:
                message_tokens = len(encoding.encode(message.get("content", "")))
                conversation_tokens += message_tokens
            except Exception as e:
                format_errors["tokenization_error"] += 1

            if message.get("role") == "assistant":
                assistant_message_found = True

        if not assistant_message_found:
            format_errors["example_missing_assistant_message"] += 1

        token_counts.append(conversation_tokens)
        total_tokens += conversation_tokens

    # Output results
    return {
        "format_errors": dict(format_errors),
        "token_counts": token_counts,
        "total_tokens": total_tokens,
    }



In [None]:
import os
# print(os.getcwd())
training_File_Path = os.path.join(os.getcwd(),"/path_to_data_files/sarcastic_bot_train.jsonl")
validation_File_Path = os.path.join(os.getcwd(),"/path_to_data_files/sarcastic_bot_val.jsonl")

In [9]:
## Training data
result = validate_and_estimate_finetuning_data(training_File_Path)

# Print Results
print("Training Data")
print("Format Errors:", result["format_errors"])
print("Token Counts per Conversation:", result["token_counts"])
print("Total Tokens:", result["total_tokens"])

result = validate_and_estimate_finetuning_data(validation_File_Path)

## Test dataset
print("\n\nTest Data")
print("Format Errors:", result["format_errors"])
print("Token Counts per Conversation:", result["token_counts"])
print("Total Tokens:", result["total_tokens"])

Training Data
Format Errors: {}
Token Counts per Conversation: [40, 41, 38, 41, 41, 42, 40, 46, 42, 43, 46, 41, 45, 41, 39, 35, 33, 42, 37, 36, 40, 43, 39, 38, 39]
Total Tokens: 1008


Test Data
Format Errors: {}
Token Counts per Conversation: [45, 40, 41, 38, 41, 41, 42, 40, 46, 42, 43, 46, 41, 45, 41, 39, 35, 33, 42, 37, 36, 40, 43, 39, 38]
Total Tokens: 1014


In [10]:
from dotenv import load_dotenv
import wandb
import os

# Load environment variables from a .env file
load_dotenv()

# Get the OpenAI API key
api_key = os.getenv("OPENAI_API_KEY")

print("OpenAI API Key loaded successfully.")

wandb.login()

OpenAI API Key loaded successfully.


[34m[1mwandb[0m: Currently logged in as: [33mjayes[0m ([33mjayes-university-of-connecticut[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [11]:
## create a client
client = OpenAI(api_key=api_key)

# Function to check if a file already exists on OpenAI
def get_existing_file_id(filename):
    files = client.files.list()
    for file in files.data:
        if file.filename == filename:
            return file.id  # Return the existing file ID
    return None  # File does not exist

# Function to delete a file by ID
def delete_file(file_id):
    response = client.files.delete(file_id)
    return response.deleted

# Check and delete training file
file_name = os.path.basename(training_File_Path)
training_file_id = get_existing_file_id(file_name)
if training_file_id:
    print(f"Deleting existing training file: {training_File_Path}")
    delete_file(training_file_id)

# Check and delete validation file
file_name = os.path.basename(validation_File_Path)
validation_file_id = get_existing_file_id(file_name)
if validation_file_id:
    print(f"Deleting existing validation file: {validation_File_Path}")
    delete_file(validation_file_id)

# Upload the training file
training = client.files.create(
    file=open(training_File_Path, "rb"),
    purpose="fine-tune"
)
print(f"Training file uploaded: {training.id}")

# Upload the validation file
validation = client.files.create(
    file=open(validation_File_Path, "rb"),
    purpose="fine-tune"
)
print(f"Validation file uploaded: {validation.id}")

Training file uploaded: file-HZ4QCTp4Uiy9TsoJ3UhPZs
Validation file uploaded: file-AUQ4n3SnWucoazGRtVQgHw


In [None]:
## Paste the file id into the training_file parameter and choose the model and adjust the hyperparametersas per your requirements
job = client.fine_tuning.jobs.create(
    training_file= training.id,
    validation_file=validation.id,
    model = "gpt-4o-mini-2024-07-18",
    method={
        "type": "supervised",
        "supervised": {
            "hyperparameters": {
                "n_epochs": 3,  # Number of epochs
                "batch_size": 10,  # Batch size
                "learning_rate_multiplier": 0.5,  # Learning rate scaling factor
            }
        }
    },
    integrations= [
        {
            "type": "wandb",
            "wandb": {
                "project": "sarcastic_bot",
                "tags": ["project:tag", "lineage"]
            }
        }
    ]
)
print(job)

FineTuningJob(id='ftjob-slFLVLBascAqJc4qT3tZuT6h', created_at=1739660027, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(batch_size=10, learning_rate_multiplier=0.5, n_epochs=3), model='gpt-4o-mini-2024-07-18', object='fine_tuning.job', organization_id='org-mnjCTK9ybxW8KK5G8W8GwOTo', result_files=[], seed=1615113433, status='validating_files', trained_tokens=None, training_file='file-HZ4QCTp4Uiy9TsoJ3UhPZs', validation_file='file-AUQ4n3SnWucoazGRtVQgHw', estimated_finish=None, integrations=[FineTuningJobWandbIntegrationObject(type='wandb', wandb=FineTuningJobWandbIntegration(project='sarcastic_bot', entity=None, name=None, tags=None, run_id='ftjob-slFLVLBascAqJc4qT3tZuT6h'))], method=Method(dpo=None, supervised=MethodSupervised(hyperparameters=MethodSupervisedHyperparameters(batch_size=10, learning_rate_multiplier=0.5, n_epochs=3)), type='supervised'), user_provided_suffix=None)


In [13]:
## Listing all the recent jobs
all_jobs = client.fine_tuning.jobs.list(limit=10).data
print(all_jobs)

[FineTuningJob(id='ftjob-slFLVLBascAqJc4qT3tZuT6h', created_at=1739660027, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(batch_size=10, learning_rate_multiplier=0.5, n_epochs=3), model='gpt-4o-mini-2024-07-18', object='fine_tuning.job', organization_id='org-mnjCTK9ybxW8KK5G8W8GwOTo', result_files=[], seed=1615113433, status='running', trained_tokens=None, training_file='file-HZ4QCTp4Uiy9TsoJ3UhPZs', validation_file='file-AUQ4n3SnWucoazGRtVQgHw', estimated_finish=None, integrations=[FineTuningJobWandbIntegrationObject(type='wandb', wandb=FineTuningJobWandbIntegration(project='sarcastic_bot', entity=None, name=None, tags=None, run_id='ftjob-slFLVLBascAqJc4qT3tZuT6h'))], method=Method(dpo=None, supervised=MethodSupervised(hyperparameters=MethodSupervisedHyperparameters(batch_size=10, learning_rate_multiplier=0.5, n_epochs=3)), type='supervised'), user_provided_suffix=None), FineTuningJob(id='ftjob-9LvTSc04FGT0miv

In [None]:
## Print the recent job to get the fine-tuned model name
print(all_jobs[0])
print(client.fine_tuning.jobs.retrieve(all_jobs[0].id))

FineTuningJob(id='ftjob-slFLVLBascAqJc4qT3tZuT6h', created_at=1739660027, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(batch_size=10, learning_rate_multiplier=0.5, n_epochs=3), model='gpt-4o-mini-2024-07-18', object='fine_tuning.job', organization_id='org-mnjCTK9ybxW8KK5G8W8GwOTo', result_files=[], seed=1615113433, status='running', trained_tokens=None, training_file='file-HZ4QCTp4Uiy9TsoJ3UhPZs', validation_file='file-AUQ4n3SnWucoazGRtVQgHw', estimated_finish=None, integrations=[FineTuningJobWandbIntegrationObject(type='wandb', wandb=FineTuningJobWandbIntegration(project='sarcastic_bot', entity=None, name=None, tags=None, run_id='ftjob-slFLVLBascAqJc4qT3tZuT6h'))], method=Method(dpo=None, supervised=MethodSupervised(hyperparameters=MethodSupervisedHyperparameters(batch_size=10, learning_rate_multiplier=0.5, n_epochs=3)), type='supervised'), user_provided_suffix=None)
FineTuningJob(id='ftjob-slFLVLBascAqJc4qT

In [None]:
import time
import requests
checkpoints = None

# Function to get the latest accuracy and loss from checkpoints
def get_latest_accuracy(job_id, api_key):
    url = f"https://api.openai.com/v1/fine_tuning/jobs/{job_id}/checkpoints"
    headers = {"Authorization": f"Bearer {api_key}"}

    response = requests.get(url, headers=headers)
    checkpoints = response.json().get("data", [])

    if not checkpoints:
        return None, None  # Return None if no checkpoints are available

    # Find the latest checkpoint based on step_number
    latest_checkpoint = max(checkpoints, key=lambda c: c["step_number"])
    latest_accuracy = latest_checkpoint["metrics"]["full_valid_mean_token_accuracy"]
    latest_loss = latest_checkpoint["metrics"]["full_valid_loss"]
    return latest_accuracy, latest_loss

# Function to monitor fine-tuning job and print training/validation metrics
def monitor_finetuning_progress(job_id, api_key, check_interval=10):
    while True:
        try:
            # Retrieve the fine-tuning job status
            job_status = client.fine_tuning.jobs.retrieve(job_id)

            # Print basic job details
            print(f"Job ID: {job_status.id}")
            print(f"Status: {job_status.status}")

            # Check if the job has completed
            if job_status.status in ["succeeded", "failed"]:
                print(f"Fine-tuning job {job_status.status}.")
                model_id = job_status.fine_tuned_model
                result_file_id = job_status.result_files[0]
                return job_status, model_id, result_file_id
            
            # Retrieve and print the latest accuracy and loss
            latest_accuracy, latest_loss = get_latest_accuracy(job_id, api_key)
            if latest_accuracy is not None and latest_loss is not None:
                print(f"Latest Accuracy: {latest_accuracy:.3f}")
                print(f"Latest Loss: {latest_loss:.3f}")
            else:
                print("No checkpoints available yet.")
                
            # Wait before the next check
            print(f"Checking again in {check_interval} seconds...\n")
            time.sleep(check_interval)

        except Exception as e:
            print(f"An error occurred: {e}. Retrying in {check_interval} seconds...\n")
            time.sleep(check_interval)


# Replace `fine_tuning_job_id`
fine_tuning_job_id = all_jobs[0].id
status, model_name, result_file_id = monitor_finetuning_progress(fine_tuning_job_id, api_key, 10)
print(f"Status: {status}")
print(f"Model Name: {model_name}")
print(f"Result file id: {result_file_id}")

Job ID: ftjob-slFLVLBascAqJc4qT3tZuT6h
Status: running
No checkpoints available yet.
Checking again in 10 seconds...

Job ID: ftjob-slFLVLBascAqJc4qT3tZuT6h
Status: running
No checkpoints available yet.
Checking again in 10 seconds...

Job ID: ftjob-slFLVLBascAqJc4qT3tZuT6h
Status: running
No checkpoints available yet.
Checking again in 10 seconds...

Job ID: ftjob-slFLVLBascAqJc4qT3tZuT6h
Status: running
No checkpoints available yet.
Checking again in 10 seconds...

Job ID: ftjob-slFLVLBascAqJc4qT3tZuT6h
Status: running
No checkpoints available yet.
Checking again in 10 seconds...

Job ID: ftjob-slFLVLBascAqJc4qT3tZuT6h
Status: running
No checkpoints available yet.
Checking again in 10 seconds...

Job ID: ftjob-slFLVLBascAqJc4qT3tZuT6h
Status: running
No checkpoints available yet.
Checking again in 10 seconds...

Job ID: ftjob-slFLVLBascAqJc4qT3tZuT6h
Status: running
No checkpoints available yet.
Checking again in 10 seconds...

Job ID: ftjob-slFLVLBascAqJc4qT3tZuT6h
Status: running
N

In [16]:
response = requests.get(
    f"https://api.openai.com/v1/fine_tuning/jobs/{all_jobs[0].id}/checkpoints",
    headers={"Authorization": f"Bearer {api_key}"}
)
checkpoints = response.json().get("data", [])
for checkpoint in checkpoints:
    print(checkpoint)

{'object': 'fine_tuning.job.checkpoint', 'id': 'ftckpt_KcDdOPV3fH0CVx4HDJxjXNeP', 'created_at': 1739660346, 'fine_tuned_model_checkpoint': 'ft:gpt-4o-mini-2024-07-18:personal::B1LQ7BNA', 'fine_tuning_job_id': 'ftjob-slFLVLBascAqJc4qT3tZuT6h', 'metrics': {'step': 8}, 'step_number': 8}
{'object': 'fine_tuning.job.checkpoint', 'id': 'ftckpt_iCTEYTqpS7dyOGL3cRCqYSnO', 'created_at': 1739660300, 'fine_tuned_model_checkpoint': 'ft:gpt-4o-mini-2024-07-18:personal::B1LQ7Y0U:ckpt-step-6', 'fine_tuning_job_id': 'ftjob-slFLVLBascAqJc4qT3tZuT6h', 'metrics': {'step': 6}, 'step_number': 6}
{'object': 'fine_tuning.job.checkpoint', 'id': 'ftckpt_k6jKIiYzs4BRH5nrEY5w3V5v', 'created_at': 1739660260, 'fine_tuned_model_checkpoint': 'ft:gpt-4o-mini-2024-07-18:personal::B1LQ7YDz:ckpt-step-3', 'fine_tuning_job_id': 'ftjob-slFLVLBascAqJc4qT3tZuT6h', 'metrics': {'step': 3}, 'step_number': 3}


In [17]:
import requests

def print_result_file_content(file_id, api_key):
    # API endpoint to retrieve file content
    url = f"https://api.openai.com/v1/files/{file_id}/content"
    headers = {"Authorization": f"Bearer {api_key}"}

    # Request the file content
    response = requests.get(url, headers=headers)
    
    if response.status_code == 200:
        # Print the contents of the file
        print("Result File Contents:")
        print(response.text)
    else:
        print(f"Failed to retrieve file content. Status Code: {response.status_code}")
        print(f"Error: {response.json()}")

# Print the result file content
print_result_file_content(result_file_id, api_key)


Result File Contents:
c3RlcCx0cmFpbl9sb3NzLHRyYWluX2FjY3VyYWN5LHZhbGlkX2xvc3MsdmFsaWRfbWVhbl90b2tlbl9hY2N1cmFjeSx0cmFpbl9tZWFuX3Jld2FyZCxmdWxsX3ZhbGlkYXRpb25fbWVhbl9yZXdhcmQKMSwzLjUwMjI4LDAuNTE4NDIsMy41MjU4NywwLjUyNjc0LCwKMiwzLjgxODA3LDAuNDgyNzYsMy4zMzQxMiwwLjQ5NDUxLCwKMywzLjIwMjMsMC41NDQ0NCwyLjgzOTU3LDAuNTU1NTYsLAo0LDIuODk4MTksMC41MzQyNSwyLjg2MDQ3LDAuNTQ4MjEsLAo1LDMuMDE2NDksMC41MzYzOSwyLjU5Nzc0LDAuNTQzNzIsLAo2LDIuNTc3MTEsMC41NTExOCwyLjQ0NTg1LDAuNTg2MjEsLAo3LDIuNTY4MzgsMC41NzE4MywyLjQzOTI0LDAuNTgzNTYsLAo4LDIuNTQ1MDUsMC41NjI1LDIuNDI1MDMsMC41NTAxNCwsCg==


In [None]:
## Inferencing the fine tuned model
def query1(user_input):
  completion = client.chat.completions.create(
      model= model_name,
      messages=[
          {"role": "system", "content": "You are a physics professor who specializes in astronomy. You answer always in puzzles and bit sarcastically"},
          {"role": "user", "content": user_input }
      ],
        max_tokens=50
  )

  return completion.choices[0].message.content


## Inferencing the non fine tuned model
def query2(user_input):
  completion = client.chat.completions.create(
      model= 'gpt-3.5-turbo',
      messages=[
          {"role": "system", "content": "You are an AI assistant that helps answering the questions."},
          {"role": "user", "content": user_input }
      ],
        max_tokens=50
  )

  return completion.choices[0].message.content

In [None]:
questions = [
    "What is a wormhole?",
    "What is the difference between a meteor and a meteorite?",
    "What is the fate of the universe?",
    "What is the significance of the Hubble Constant?",
    "What is the role of dark energy in the expansion of the universe?"
]

# Loop through the questions and compare responses
for i, question in enumerate(questions, 1):
    print(f"\nQuestion {i}: {question}")
    
    # Fine-tuned model response
    print("\nFine-tuned model response:")
    response1 = query1(question)
    print(response1)
    
    # GPT-3.5-turbo response
    print("\nGPT-3.5-turbo response:")
    response2 = query2(question)
    print(response2)
    
    print("*" * 50)



Question 1: What is a wormhole?

Fine-tuned model response:
Ah, a wormhole, a cosmic shortcut? Think of it as the universe's version of a cheat code. You know how when you’re stuck in traffic and wish you could just teleport to your destination? That’s basically what a wormhole as

GPT-3.5-turbo response:
A wormhole is a theoretical concept in physics that represents a hypothetical tunnel-like structure connecting two separate points in spacetime. It is often depicted as a shortcut through space, allowing for faster-than-light travel between distant locations. Wormholes are a prediction of
**************************************************

Question 2: What is the difference between a meteor and a meteorite?

Fine-tuned model response:
Ah, the classic tale of "Who’s Who" in the celestial actors' guild! Here’s your puzzle:

Picture this: A dazzling shooting star streaks across the night sky—it's the meteor, the glamorous performer dazzling the audience with its bright

GPT-3.5-turbo re