In [45]:
from openai import OpenAI
import os
import json
import signal
import datetime
import time

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv())

client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))

# Fine-Tuning GPT-3 Using the OpenAI API and Python

## Dataset

In [37]:
training_data = [
	{
    	"prompt": "What is the capital of France?->",
    	"completion": """ The capital of France is Paris.\n"""
	},
	{
    	"prompt": "What is the primary function of the heart?->",
    	"completion": """ The primary function of the heart is to pump blood throughout the body.\n"""
	},
	{
    	"prompt": "What is photosynthesis?->",
    	"completion": """ Photosynthesis is the process by which green plants and some other organisms convert sunlight into chemical energy stored in the form of glucose.\n"""
	},
	{
    	"prompt": "Who wrote the play 'Romeo and Juliet'?->",
    	"completion": """ William Shakespeare wrote the play 'Romeo and Juliet'.\n"""
	},
	{
    	"prompt": "Which element has the atomic number 1?->",
    	"completion": """ Hydrogen has the atomic number 1.\n"""
	},
	{
    	"prompt": "What is the largest planet in our solar system?->",
    	"completion": """ Jupiter is the largest planet in our solar system.\n"""
	},
	{
    	"prompt": "What is the freezing point of water in Celsius?->",
    	"completion": """ The freezing point of water in Celsius is 0 degrees.\n"""
	},
	{
    	"prompt": "What is the square root of 144?->",
    	"completion": """ The square root of 144 is 12.\n"""
	},
	{
    	"prompt": "Who is the author of 'To Kill a Mockingbird'?->",
    	"completion": """ The author of 'To Kill a Mockingbird' is Harper Lee.\n"""
	},
	{
    	"prompt": "What is the smallest unit of life?->",
    	"completion": """ The smallest unit of life is the cell.\n"""
	}
]

validation_data = [
	{
    	"prompt": "Which gas do plants use for photosynthesis?->",
    	"completion": """ Plants use carbon dioxide for photosynthesis.\n"""
	},
	{
    	"prompt": "What are the three primary colors of light?->",
    	"completion": """ The three primary colors of light are red, green, and blue.\n"""
	},
	{
    	"prompt": "Who discovered penicillin?->",
    	"completion": """ Sir Alexander Fleming discovered penicillin.\n"""
	},
	{
    	"prompt": "What is the chemical formula for water?->",
    	"completion": """ The chemical formula for water is H2O.\n"""
	},
	{
    	"prompt": "What is the largest country by land area?->",
    	"completion": """ Russia is the largest country by land area.\n"""
	},
	{
    	"prompt": "What is the speed of light in a vacuum?->",
    	"completion": """ The speed of light in a vacuum is approximately 299,792 kilometers per second.\n"""
	},
	{
    	"prompt": "What is the currency of Japan?->",
    	"completion": """ The currency of Japan is the Japanese Yen.\n"""
	},
	{
    	"prompt": "What is the smallest bone in the human body?->",
    	"completion": """ The stapes, located in the middle ear, is the smallest bone in the human body.\n"""
	}
]

## Prepare training data

In [38]:
training_file_name = "training_data.jsonl"
validation_file_name = "validation_data.jsonl"

def prepare_data(dictionary_data, final_file_name): 
  with open(final_file_name, 'w') as outfile:
    for entry in dictionary_data:
      json.dump(entry, outfile)
      outfile.write('\n')

prepare_data(training_data, "training_data.jsonl")
prepare_data(validation_data, "validation_data.jsonl")

!openai tools fine_tunes.prepare_data -f "training_data.jsonl"
!openai tools fine_tunes.prepare_data -f "validation_data.jsonl"

Analyzing...

- Your file contains 10 prompt-completion pairs. In general, we recommend having at least a few hundred examples. We've found that performance tends to linearly increase for every doubling of the number of examples
- All prompts end with suffix `?->`
- All prompts start with prefix `Wh`
- All completions end with suffix `.\n`

No remediations found.

You can use your file for fine-tuning:
> openai api fine_tunes.create -t "training_data.jsonl"

After you’ve fine-tuned a model, remember that your prompt has to end with the indicator string `?->` for the model to start generating completions, rather than continuing with the prompt. Make sure to include `stop=[".\n"]` so that the generated texts ends at the expected place.
Once your model starts training, it'll approximately take 2.58 minutes to train a `curie` model, and less for `ada` and `babbage`. Queue will approximately take half an hour per job ahead of you.
Analyzing...

- Your file contains 8 prompt-completion pairs

In [39]:
def upload_data_to_OpenAI(file_name):
  result = client.files.create(
    file=open(file_name, "rb"),
    purpose="fine-tune"
  )
  return result.id

training_file_id = upload_data_to_OpenAI(training_file_name)
validation_file_id = upload_data_to_OpenAI(validation_file_name)

print(f"Training File ID: {training_file_id}")
print(f"Validation File ID: {validation_file_id}")

Training File ID: file-tV1rpaL0GnbfCVCfNz4PHyoC
Validation File ID: file-TbVXTGAgAsIAzq9IqOYK51Kt


## Create a fine-tuning job

In [40]:
create_args = {
	"training_file": training_file_id,
	"validation_file": validation_file_id,
	"model": "davinci-002",
	# "n_epochs": 15,
	# "batch_size": 3,
	# "learning_rate_multiplier": 0.3
}

response = client.fine_tuning.jobs.create(**create_args)
job_id = response.id
status = response.status

print(f'Fine-tunning model with jobID: {job_id}.')
print(f"Training Response: {response}")
print(f"Training Status: {status}")

Fine-tunning model with jobID: ftjob-j1wI3gywlOdWwVdozcxxQEma.
Training Response: FineTuningJob(id='ftjob-j1wI3gywlOdWwVdozcxxQEma', created_at=1705139088, error=None, fine_tuned_model=None, finished_at=None, hyperparameters=Hyperparameters(n_epochs='auto', batch_size='auto', learning_rate_multiplier='auto'), model='davinci-002', object='fine_tuning.job', organization_id='org-IwOD4bbgH6aDvW0CURl9aBb4', result_files=[], status='validating_files', trained_tokens=None, training_file='file-tV1rpaL0GnbfCVCfNz4PHyoC', validation_file='file-TbVXTGAgAsIAzq9IqOYK51Kt')
Training Status: validating_files


## Obtain information about the training status

In [44]:
def signal_handler(sig, frame):
  status = client.fine_tuning.jobs.retrieve(job_id).status
  print(f"Stream interrupted. Job is still {status}.")
  return

print(f'Streaming events for the fine-tuning job: {job_id}')
signal.signal(signal.SIGINT, signal_handler)

events = client.fine_tuning.jobs.list_events(job_id)
try:
  for event in events:
    print(f'{datetime.datetime.fromtimestamp(event.created_at)} {event.message}')
except Exception:
  print("Stream interrupted (client disconnected).")

Streaming events for the fine-tuning job: ftjob-j1wI3gywlOdWwVdozcxxQEma
2024-01-13 10:46:55 The job has successfully completed
2024-01-13 10:46:53 New fine-tuned model created: ft:davinci-002:personal::8gUtI1hR
2024-01-13 10:46:43 Step 91/100: training loss=0.00, validation loss=0.23
2024-01-13 10:46:41 Step 81/100: training loss=0.00, validation loss=0.01
2024-01-13 10:46:38 Step 71/100: training loss=0.00, validation loss=2.73
2024-01-13 10:46:36 Step 61/100: training loss=0.00, validation loss=1.33
2024-01-13 10:46:34 Step 51/100: training loss=0.00, validation loss=0.82
2024-01-13 10:46:31 Step 41/100: training loss=0.01, validation loss=0.15
2024-01-13 10:46:29 Step 31/100: training loss=0.03, validation loss=0.02
2024-01-13 10:46:24 Step 21/100: training loss=0.11, validation loss=0.53
2024-01-13 10:46:22 Step 11/100: training loss=0.50, validation loss=0.48
2024-01-13 10:46:19 Step 1/100: training loss=0.84, validation loss=1.42
2024-01-13 10:45:11 Fine-tuning job started
2024-

## Check the fine-tuning job status

In [53]:
status = client.fine_tuning.jobs.retrieve(job_id).status
if status not in ["succeeded", "failed"]:
  print(f'Job not in terminal status: {status}. Waiting.')
  while status not in ["succeeded", "failed"]:
    time.sleep(2)
    status = client.fine_tuning.jobs.retrieve(job_id).status
    print(f'Status: {status}')
else:
	print(f'Finetune job {job_id} finished with status: {status}')

print('Checking other finetune jobs in the subscription.')
result = client.fine_tuning.jobs.list()
print(f'Found {len(result.data)} finetune jobs.')

Finetune job ftjob-j1wI3gywlOdWwVdozcxxQEma finished with status: succeeded
Checking other finetune jobs in the subscription.
Found 5 finetune jobs.


## Validation of the model

In [59]:
print(result)

fine_tuned_model = result.data[0].fine_tuned_model
print(fine_tuned_model)

SyncCursorPage[FineTuningJob](data=[FineTuningJob(id='ftjob-j1wI3gywlOdWwVdozcxxQEma', created_at=1705139088, error=None, fine_tuned_model='ft:davinci-002:personal::8gUtI1hR', finished_at=1705139211, hyperparameters=Hyperparameters(n_epochs=10, batch_size=1, learning_rate_multiplier=2), model='davinci-002', object='fine_tuning.job', organization_id='org-IwOD4bbgH6aDvW0CURl9aBb4', result_files=['file-u99SZ1uAuTj9ST36rlgvD5HE'], status='succeeded', trained_tokens=2240, training_file='file-tV1rpaL0GnbfCVCfNz4PHyoC', validation_file='file-TbVXTGAgAsIAzq9IqOYK51Kt'), FineTuningJob(id='ftjob-JPbKPLIp5btL65SgN4RyJDcq', created_at=1705138911, error=None, fine_tuned_model='ft:davinci-002:personal::8gUqWTGO', finished_at=1705139039, hyperparameters=Hyperparameters(n_epochs=10, batch_size=1, learning_rate_multiplier=2), model='davinci-002', object='fine_tuning.job', organization_id='org-IwOD4bbgH6aDvW0CURl9aBb4', result_files=['file-ZohYZWc1cdxouyDXoOkmPc2K'], status='succeeded', trained_tokens=2

In [68]:
new_prompt = "Which part is the smallest bone in the entire human body?"
answer = client.completions.create(
  model=fine_tuned_model,
  prompt=new_prompt
)

print(answer.choices[0].text)

 The smallest bone in the entire human body is the stapes bone of the middle


In [76]:
new_prompt = """ Which type of gas is utilized by plants during the process of photosynthesis?"""
answer = client.completions.create(
  model=fine_tuned_model,
  prompt=new_prompt
)

print(answer.choices[0].text)

 The type of gas utilized by plants during the process of photosynthesis is carbon dioxide
