In [48]:
from llama_index.llms import openai_utils
import pandas as pd
from projectgurukul import prompt_templates
import json
from llama_index.llms import ChatMessage, MessageRole
from collections import defaultdict

In [14]:
# Load the labelled question/answer sheet and keep only the rows whose Split
# is "Train", restricted to the three columns used to build examples.
all_df = pd.read_csv("./Gurukul Data - data_labelled.csv")
# A single .loc call selects rows and columns in one step, and .copy() makes
# train_df an independent frame, so columns assigned to it afterwards do not
# raise pandas' SettingWithCopyWarning.
train_df = all_df.loc[
    all_df["Split"] == "Train",
    ["Question", "used_context", "Final_label"],
].copy()
train_df.head()

Unnamed: 0,Question,used_context,Final_label
0,Why does Vasishta refuse to give Sabala to Vis...,"[""sarga: 54\nfile_path: data/ramayana/data/bal...",While Vasishta's refusal to give Sabala to Vis...
1,"Did Hanuman's devotion to Rama ever waver, eve...","[""sarga: 13\nfile_path: data/ramayana/data/sun...","Hanuman's devotion to Rama remained steadfast,..."
2,"What drove Ravana to kidnap Sita, and was it s...","[""sarga: 55\nfile_path: data/ramayana/data/ara...",Ravana's abduction of Sita was primarily drive...
3,Describe Bharata's character and his approach ...,"[""sarga: 16\nfile_path: data/ramayana/data/ara...",**Character:**\n\nBharata is depicted as a vir...
4,How did Ravana react upon learning about Dhumr...,"[""sarga: 51\nfile_path: data/ramayana/data/yud...",The provided context does not provide details ...


In [15]:
# Each used_context cell is a JSON-encoded list of strings: decode it and
# merge the entries into one string, separated by blank lines.
train_df["context_str"] = [
    "\n\n".join(json.loads(raw_contexts))
    for raw_contexts in train_df["used_context"]
]

In [45]:
def get_example(row):
    """Turn one labelled row into a chat fine-tuning example.

    The row's ``Question`` and ``context_str`` fill the project's training QA
    prompt template, and its ``Final_label`` is appended as the assistant
    message.

    Returns:
        A dict with a single ``"messages"`` key holding the conversation as
        converted by ``openai_utils.to_openai_message_dicts``.
    """
    target_reply = ChatMessage(
        role=MessageRole.ASSISTANT,
        content=row.Final_label,
    )
    conversation = prompt_templates.training_text_qa_template.format_messages(
        query_str=row.Question,
        context_str=row.context_str,
    )
    conversation.append(target_reply)
    return {"messages": openai_utils.to_openai_message_dicts(conversation)}

# Build one example dict per training row, then serialise every example to a
# JSON string (ensure_ascii=False keeps non-ASCII characters unescaped).
train_df["open_ai_examples"] = train_df.apply(get_example, axis=1)
messages = [
    json.dumps(example, ensure_ascii=False)
    for example in train_df["open_ai_examples"]
]
len(messages)

53

In [46]:
FILE_NAME = "openai_input.jsonl"
# The examples were serialised with ensure_ascii=False, so the lines may
# contain non-ASCII characters. Write them explicitly as UTF-8 rather than
# relying on the platform's default text encoding.
with open(FILE_NAME, "w", encoding="utf-8") as f:
    # One JSON document per line.
    for message in messages:
        f.write(message)
        f.write("\n")

In [None]:
# Read the JSONL file back, parsing every line as one JSON document
with open(FILE_NAME, "r", encoding="utf-8") as jsonl_file:
    dataset = list(map(json.loads, jsonl_file))

# Quick sanity check: example count and the messages of the first example
print("Num examples:", len(dataset))
print("First example:")
for message in dataset[0]["messages"]:
    print(message)

In [49]:
# Validate the structure of every example and count each kind of problem found
ALLOWED_MESSAGE_KEYS = ("role", "content", "name", "function_call")
ALLOWED_ROLES = ("system", "user", "assistant", "function")

format_errors = defaultdict(int)

for ex in dataset:
    # Every example must be a JSON object ...
    if not isinstance(ex, dict):
        format_errors["data_type"] += 1
        continue

    # ... carrying a non-empty "messages" list.
    messages = ex.get("messages")
    if not messages:
        format_errors["missing_messages_list"] += 1
        continue

    for message in messages:
        if not ("role" in message and "content" in message):
            format_errors["message_missing_key"] += 1

        if not all(key in ALLOWED_MESSAGE_KEYS for key in message):
            format_errors["message_unrecognized_key"] += 1

        if message.get("role") not in ALLOWED_ROLES:
            format_errors["unrecognized_role"] += 1

        content = message.get("content")
        function_call = message.get("function_call")

        # Content must be a string; an empty one also counts as missing
        # unless a function_call is present.
        if not isinstance(content, str) or not (content or function_call):
            format_errors["missing_content"] += 1

    # Each example needs at least one assistant message.
    if all(msg.get("role") != "assistant" for msg in messages):
        format_errors["example_missing_assistant_message"] += 1

if not format_errors:
    print("No errors found")
else:
    print("Found errors:")
    for error_name, count in format_errors.items():
        print(f"{error_name}: {count}")

No errors found


In [55]:
from openai import OpenAI
from openai.types.fine_tuning.fine_tuning_job import Hyperparameters
client = OpenAI()

# Upload the training file. Opening it in a with-block guarantees the handle
# is closed once the upload call returns or raises, instead of leaking it.
with open(FILE_NAME, "rb") as training_file:
    uploaded_file = client.files.create(
        file=training_file,
        purpose="fine-tune",
    )
# Show the returned file object as the cell output.
uploaded_file

FileObject(id='file-0JYtMd5VLwtO8fElt5Q3bKpM', bytes=560135, created_at=1705849148, filename='openai_input.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)

In [58]:
# Start a fine-tuning job on the uploaded training file; the returned job
# object is displayed as the cell output.
TRAINING_FILE_ID = "file-0JYtMd5VLwtO8fElt5Q3bKpM"
BASE_MODEL = "gpt-3.5-turbo-1106"

client = OpenAI()
client.fine_tuning.jobs.create(training_file=TRAINING_FILE_ID, model=BASE_MODEL)

FineTuningJob(id='ftjob-AD07tu8yAlazKOE98zRwsYHY', created_at=1705849245, error=None, fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs='auto', batch_size='auto', learning_rate_multiplier='auto'), model='gpt-3.5-turbo-1106', object='fine_tuning.job', organization_id='org-H3gA3aIB39O3UI9PxCnRxxWD', result_files=[], status='validating_files', trained_tokens=None, training_file='file-0JYtMd5VLwtO8fElt5Q3bKpM', validation_file=None)

In [64]:
# Role of the first message of the last example. Read it from `dataset`
# directly: the `messages` name is first bound to the list of JSON strings
# and only later rebound by the format-check loop, so indexing it here
# depends on which cell ran most recently.
print(dataset[-1]["messages"][0]["role"])

system


In [74]:
# Request one page of at most 10 fine-tuning jobs; the returned page is
# displayed as the cell output.
MAX_JOBS_LISTED = 10
client.fine_tuning.jobs.list(limit=MAX_JOBS_LISTED)



SyncCursorPage[FineTuningJob](data=[FineTuningJob(id='ftjob-AD07tu8yAlazKOE98zRwsYHY', created_at=1705849245, error=None, fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs=3, batch_size=1, learning_rate_multiplier=2), model='gpt-3.5-turbo-1106', object='fine_tuning.job', organization_id='org-H3gA3aIB39O3UI9PxCnRxxWD', result_files=[], status='running', trained_tokens=None, training_file='file-0JYtMd5VLwtO8fElt5Q3bKpM', validation_file=None)], object='list', has_more=False)

In [88]:
# Fetch the current state of the fine-tuning job by its id and display it
FINE_TUNING_JOB_ID = "ftjob-AD07tu8yAlazKOE98zRwsYHY"
job = client.fine_tuning.jobs.retrieve(FINE_TUNING_JOB_ID)
job

FineTuningJob(id='ftjob-AD07tu8yAlazKOE98zRwsYHY', created_at=1705849245, error=None, fine_tuned_model='ft:gpt-3.5-turbo-1106:macro-mate::8jTl73oZ', finished_at=1705849845, hyperparameters=Hyperparameters(n_epochs=3, batch_size=1, learning_rate_multiplier=2), model='gpt-3.5-turbo-1106', object='fine_tuning.job', organization_id='org-H3gA3aIB39O3UI9PxCnRxxWD', result_files=['file-V5Yh4ovNAdS03xiE6HaKasHv'], status='succeeded', trained_tokens=479898, training_file='file-0JYtMd5VLwtO8fElt5Q3bKpM', validation_file=None)

In [87]:
# Request up to 20 events from the fine-tuning job and print the data
# attribute of each one
events = client.fine_tuning.jobs.list_events(
    fine_tuning_job_id="ftjob-AD07tu8yAlazKOE98zRwsYHY",
    limit=20,
)
for event in events.data:
    print(event.data)

{}
{}
{'step': 151, 'train_loss': 0.5254576802253723, 'train_mean_token_accuracy': 0.8252595067024231}
{'step': 141, 'train_loss': 0.5257101655006409, 'train_mean_token_accuracy': 0.8579465746879578}
{'step': 131, 'train_loss': 0.8190422654151917, 'train_mean_token_accuracy': 0.7857142686843872}
{'step': 121, 'train_loss': 0.3676629960536957, 'train_mean_token_accuracy': 0.8921568393707275}
{'step': 111, 'train_loss': 0.44338780641555786, 'train_mean_token_accuracy': 0.862500011920929}
{'step': 101, 'train_loss': 0.34354105591773987, 'train_mean_token_accuracy': 0.8834951519966125}
{'step': 91, 'train_loss': 1.0396333932876587, 'train_mean_token_accuracy': 0.7714285850524902}
{'step': 81, 'train_loss': 1.409155011177063, 'train_mean_token_accuracy': 0.6800000071525574}
{'step': 71, 'train_loss': 0.21233804523944855, 'train_mean_token_accuracy': 0.9389401078224182}
{'step': 61, 'train_loss': 0.3999319076538086, 'train_mean_token_accuracy': 0.8715789318084717}
{'step': 51, 'train_loss': 