## OpenAI Fintune Job

- References:
    - [Basic intro](https://platform.openai.com/docs/guides/fine-tuning)

In [1]:
import json
import tiktoken # for token counting
import numpy as np
from collections import defaultdict
import os,sys
sys.path.insert(0,'../libs')
from oai_fintune_utils import load_jsonl,check_format_errors,token_analysis

In [41]:
import pandas as pd
import openai
from dotenv import load_dotenv
from tqdm import tqdm
env_path = '../../.env'
load_dotenv(dotenv_path=env_path)

True

#### First validate dataset

In [15]:
data_folder='/data/home/xiong/data/Fund/CSR'
fintune_data_folder=os.path.join(data_folder,'Fintuning_data','Monetary')
train_path = os.path.join(fintune_data_folder,'train_mon_stance.jsonl')
test_path = os.path.join(fintune_data_folder,'test_mon_stance.jsonl')

In [3]:
#### run checks
data_path = train_path
sample_dataset = load_jsonl(data_path,verbose=True)
check_format_errors(sample_dataset)
token_analysis(sample_dataset)

Num examples: 462
First example:
{'role': 'system', 'content': 'You are an experience macroeconomist from IMF. \nGiven a piece of text concerning a particular country in a given year expressing the views of IMF staff, complete the following two tasks. \n\nFirst, classify the country\'s recent or current monetary policy stance as described in the text into \n**restrictive/neutral/accommodative/unclear/irrelevant**; \nif it discusses monetary policy but the specific stance is not clear, assign unclear; \nif it does not discuss monetary policy, assign irrelevant. \n\nSecond, classify the IMF staff\'s recommended or planned near-future (next year) direction of change in monetary policy stance \nas described in the text into **tightening/tightening bias/no change/loosening bias/loosening/unclear/irrelevant**; \nif it discusses monetary policy stance but the direction of change is not clear, assign no change; \nif it does not discuss monetary policy stance, assign unclear (if it discusses mo

#### Create a training job 

In [14]:
## create clinet and test api key
client = openai.OpenAI(
    api_key=os.getenv("OPENAI_API_KEY"),
)
## run one test 
messages=[
    {"role": "system", "content": "You are a helpful AI assistant"},
    {"role": "user", "content": 'say : api working properly'},
]
## just run one test, make sure the api works 
response = client.chat.completions.create(
    model="gpt-4o-mini", 
    messages=messages,
    temperature=0,
)
print(response.choices[0].message.content)


API is working properly.


In [44]:
## upload training data
if os.path.exists(data_path):
    print('path exists')
    file = client.files.create(
      file=open(data_path, "rb"),
      purpose="fine-tune"
    )
    print(file.id)
else:
    print('path does not exists')

## upload validateion  data
if os.path.exists(test_path):
    print('path exists')
    eval_file = client.files.create(
      file=open(test_path, "rb"),
      purpose="fine-tune"
    )
    print(eval_file.id)
else:
    print('path does not exists')

path exists
file-DzmK6c28Ra77B9FgKfhb7uUC
path exists
file-nVDsECpM6EEwiFYkVu2RIjNt


In [45]:

FineTuningJob = client.fine_tuning.jobs.create(
    training_file=file.id,  # Specify the training file ID
    validation_file=eval_file.id,  # Specify the validation file ID
    model="gpt-4o-mini-2024-07-18",  # Specify the model to use
    hyperparameters={
        "n_epochs": "auto",     #2,  # Specify the number of epochs for training
        "batch_size":"auto",
        "learning_rate_multiplier":"auto"
    }
)

In [46]:
print(FineTuningJob)

FineTuningJob(id='ftjob-avBwxJWen5OjjFoYDev1D83g', created_at=1729487540, error=Error(code=None, message=None, param=None), fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs='auto', batch_size='auto', learning_rate_multiplier='auto'), model='gpt-4o-mini-2024-07-18', object='fine_tuning.job', organization_id='org-3AqEuQZh0o1lNQSUihUxYYSd', result_files=[], seed=116659446, status='validating_files', trained_tokens=None, training_file='file-DzmK6c28Ra77B9FgKfhb7uUC', validation_file='file-nVDsECpM6EEwiFYkVu2RIjNt', estimated_finish=None, integrations=[], user_provided_suffix=None)


In [49]:
# List up to 10 events from a fine-tuning job
res = client.fine_tuning.jobs.list_events(fine_tuning_job_id=FineTuningJob.id, limit=10)
df_res = pd.DataFrame(res.data)
df_res.head()

Unnamed: 0,0,1,2,3,4,5,6
0,"(id, ftevent-t5acFCgKyim715ihy4b6AZ2g)","(created_at, 1729487540)","(level, info)","(message, Validating training file: file-DzmK6...","(object, fine_tuning.job.event)","(data, {})","(type, message)"
1,"(id, ftevent-Vk9tQtXTIK6P8zeF2jI8W9za)","(created_at, 1729487540)","(level, info)","(message, Created fine-tuning job: ftjob-avBwx...","(object, fine_tuning.job.event)","(data, {})","(type, message)"
