# Finetuning GPT-3.X-FeaTxT

https://github.com/norahsakal/fine-tune-gpt3-model/blob/main/fine_tune_step_by_step.ipynb

Fine-tuning GPT-3.5-turbo with the **all-in-one predictions strategy**.
The prompts are slightly improved and we fine-tune for 4 epochs.

Data files: `data_train_v3b.jsonl`, `data_val_v3b.jsonl`, `data_test_v3b.jsonl`

## Libraries

In [1]:
# !pip install --upgrade pip
# !pip install openai
# (choose "base" kernel)

In [2]:
import ast
import json
import os
import pickle
from pathlib import Path

import pandas as pd
from sklearn.metrics import classification_report

import openai
from openai import OpenAI

## API key

In [3]:
# Prefer the OPENAI_API_KEY environment variable so that the secret never has
# to be written into (and committed with) the notebook. The placeholder is
# only kept as a fallback for backward compatibility.
api_key = os.environ.get("OPENAI_API_KEY") or "<your key>"
openai.api_key = api_key

## Upload data to OpenAI

In [4]:
data_dir = os.path.join(os.getcwd(), "data")

In [5]:
# Full paths of the train / validation / test JSONL files inside `data_dir`.
train_file_name, val_file_name, test_file_name = (
    os.path.join(data_dir, file_name)
    for file_name in ("data_train_v3b.jsonl", "data_val_v3b.jsonl", "data_test_v3b.jsonl")
)

In [6]:
client = OpenAI(api_key=api_key)

### Train set

In [7]:
# Upload the training file with purpose "fine-tune".
train_upload_response = client.files.create(purpose="fine-tune", file=Path(train_file_name))

In [8]:
train_upload_response

FileObject(id='file-pvEGGkdOIwepD1fwW3HMMWcb', bytes=1389444, created_at=1715366028, filename='data_train_v3b.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)

In [9]:
train_file_id = train_upload_response.id
train_file_id

'file-pvEGGkdOIwepD1fwW3HMMWcb'

### Validation set

In [10]:
# Upload the validation file with purpose "fine-tune".
val_upload_response = client.files.create(purpose="fine-tune", file=Path(val_file_name))

In [11]:
val_upload_response

FileObject(id='file-vG7xak3cmlNkJYOcbHx90Mwm', bytes=157144, created_at=1715366043, filename='data_val_v3b.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)

In [12]:
val_file_id = val_upload_response.id
val_file_id

'file-vG7xak3cmlNkJYOcbHx90Mwm'

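### Test set

The test set is not uploaded to OpenAI: it is read from the local `data` directory in the evaluation section below.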

## Fine-tune model

In [13]:
# Start the fine-tuning job on the uploaded train / validation files.
# Only the number of epochs is set explicitly; no other hyperparameter is passed.
finetune_response = client.fine_tuning.jobs.create(
    model="gpt-3.5-turbo",
    training_file=train_file_id,
    validation_file=val_file_id,
    suffix="pe_finetune_v3d",
    hyperparameters={"n_epochs": 4},
)

In [14]:
print(finetune_response.id)

ftjob-pKcSQs5dvcQ9vY0DriFOgPPS


In [20]:
# Progress check: list events of the fine-tuning job (limit=10) and show them as a plain dict.
finetune_events = client.fine_tuning.jobs.list_events(
    fine_tuning_job_id=finetune_response.id,
    limit=10,
)
finetune_events.model_dump()

{'data': [{'id': 'ftevent-pbWLfHYtF9rII5n3W7JytYHD',
   'created_at': 1715369681,
   'level': 'info',
   'message': 'The job has successfully completed',
   'object': 'fine_tuning.job.event',
   'data': {},
   'type': 'message'},
  {'id': 'ftevent-P4aglHodMhWbS6LXjl3YhNOW',
   'created_at': 1715369676,
   'level': 'info',
   'message': 'New fine-tuned model created: ft:gpt-3.5-turbo-0125:personal:pe-finetune-v3d:9NQIm43k',
   'object': 'fine_tuning.job.event',
   'data': {},
   'type': 'message'},
  {'id': 'ftevent-HrB5Qv47MdmPBsQ0FPva37WF',
   'created_at': 1715369676,
   'level': 'info',
   'message': 'Checkpoint created at step 870 with Snapshot ID: ft:gpt-3.5-turbo-0125:personal:pe-finetune-v3d:9NQIl5p9:ckpt-step-870',
   'object': 'fine_tuning.job.event',
   'data': {},
   'type': 'message'},
  {'id': 'ftevent-1wBvFWLTlcrsxSbBvzVKg9Ei',
   'created_at': 1715369676,
   'level': 'info',
   'message': 'Checkpoint created at step 580 with Snapshot ID: ft:gpt-3.5-turbo-0125:personal:pe

In [19]:
# Fetch the current state of the fine-tuning job.
# NOTE: the output captured below was taken while the job was still running
# (status 'running', fine_tuned_model None); re-run this cell once the job has completed.
retrieve_response = client.fine_tuning.jobs.retrieve(
    finetune_response.id
)
retrieve_response.model_dump()

{'id': 'ftjob-pKcSQs5dvcQ9vY0DriFOgPPS',
 'created_at': 1715366089,
 'error': {'code': None, 'message': None, 'param': None},
 'fine_tuned_model': None,
 'finished_at': None,
 'hyperparameters': {'n_epochs': 4,
  'batch_size': 1,
  'learning_rate_multiplier': 2},
 'model': 'gpt-3.5-turbo-0125',
 'object': 'fine_tuning.job',
 'organization_id': 'org-ao865HLfwm7KSarTu10iG90O',
 'result_files': [],
 'status': 'running',
 'trained_tokens': None,
 'training_file': 'file-pvEGGkdOIwepD1fwW3HMMWcb',
 'validation_file': 'file-vG7xak3cmlNkJYOcbHx90Mwm',
 'user_provided_suffix': 'pe_finetune_v3d',
 'seed': 1009528146,
 'estimated_finish': None,
 'integrations': []}

## Save fine-tuned model

In [25]:
job_id = "ftjob-pKcSQs5dvcQ9vY0DriFOgPPS"

In [26]:
# Resolve the name of the fine-tuned model.
# `retrieve_response` may have been fetched while the job was still running,
# in which case it carries no model name yet: query the job again via `job_id`.
finetuned_model = retrieve_response.fine_tuned_model

if finetuned_model is None:
    finetuned_model = client.fine_tuning.jobs.retrieve(job_id).fine_tuned_model

# Fail early with a clear message instead of passing `None` as the model name
# to the evaluation cell.
if finetuned_model is None:
    raise RuntimeError(
        f"Fine-tuning job {job_id} has not produced a model yet; "
        "wait for the job to finish and re-run this cell."
    )

In [27]:
finetuned_model

'ft:gpt-3.5-turbo-0125:personal:pe-finetune-v3d:9NQIm43k'

## Evaluate on test set

In [28]:
# Evaluate the whole test set

# Load all test examples first, so that the file handle is closed before the
# (slow) API calls start. Blank lines are skipped because `json.loads` would
# raise on them, and the encoding is explicit so that the result does not
# depend on the platform default.
with open(test_file_name, "r", encoding="utf-8") as fh:
    lines_l = [json.loads(line) for line in fh if line.strip()]

predictions_l = []

for i, line in enumerate(lines_l):

    if i % 10 == 0:
        print(f"{i} essays processed")

    # NOTE(review): the "messages" list is sent unchanged; this presumes the
    # test file contains no gold assistant turn - confirm.
    response = client.chat.completions.create(
        model=finetuned_model,
        messages=line["messages"],
    )

    # Keep only the text of the first choice.
    predictions_l.append(response.choices[0].message.content)

0 essays processed
10 essays processed
20 essays processed
30 essays processed
40 essays processed
50 essays processed
60 essays processed
70 essays processed


In [29]:
len(predictions_l)

80

In [51]:
# Some answers come back as a Python-style dict (single quotes) rather than
# strict JSON. Normalise every answer to JSON text so that it can be parsed
# with `json.loads` afterwards.
def _to_json_text(raw):
    """Return `raw` as JSON text.

    Valid JSON is returned unchanged. Otherwise the text is read as a Python
    literal and re-serialised as JSON, which keeps apostrophes inside values
    intact. If that fails as well, fall back to swapping single quotes for
    double quotes.
    """
    try:
        json.loads(raw)
        return raw
    except json.JSONDecodeError:
        pass

    try:
        return json.dumps(ast.literal_eval(raw))
    except (ValueError, SyntaxError, TypeError):
        return raw.replace("'", '"')


predictions_clean_l = [_to_json_text(d) for d in predictions_l]

In [59]:
# Parse every JSON string of `predictions_clean_l` in place.
# Entries that are already parsed (dict or list) are skipped, so re-running
# this cell no longer raises a TypeError.
for i, d in enumerate(predictions_clean_l):

    if isinstance(d, (dict, list)):
        continue

    predictions_clean_l[i] = json.loads(d)

In [71]:
# Turn every parsed dict into a list of (key, value) tuples, in place.
# A list of tuples can only stem from a previous run of this cell (JSON
# parsing never yields tuples), so such entries are skipped and re-running
# the cell no longer raises an AttributeError.
for i, d in enumerate(predictions_clean_l):

    if isinstance(d, list) and all(isinstance(pair, tuple) for pair in d):
        continue

    predictions_clean_l[i] = list(d.items())

In [78]:
# Sanity check: within each essay, the keys must read "Argument 1",
# "Argument 2", ... with no gap. Prints "error" for every number that does
# not directly follow the previous one.
for essay_pairs in predictions_clean_l:

    previous_num = 0

    for pair in essay_pairs:

        current_num = int(pair[0].split("Argument ")[1])

        if current_num - previous_num != 1:
            print("error")

        # continue counting from the number actually seen
        previous_num = current_num

In [79]:
# Persist the cleaned predictions as a pickle file in the working directory.
results_path = "results_v3b.pkl"

with open(results_path, mode="wb") as results_file:
    pickle.dump(predictions_clean_l, results_file)

## Results

In [80]:
# Load the annotated dataset; the first CSV column is used as the index.
dataset_path = os.path.join(data_dir, "persuasive_essays_dataset.csv")
df = pd.read_csv(dataset_path, index_col=0)

In [81]:
# Keep only the rows whose `split` column equals "TEST".
is_test_row = df["split"] == "TEST"
test_df = df[is_test_row]

In [82]:
# Essay files of the test split, in the order returned by `value_counts()`.
# NOTE(review): the predictions follow the line order of the test JSONL;
# presumably both orders agree - confirm.
test_essay_counts = test_df["essay_file"].value_counts()
essay_files_l = list(test_essay_counts.index)
len(essay_files_l)

80

In [83]:
# Gold labels of each test essay, one list per essay, in the order of `essay_files_l`.
grounds_l = [
    list(test_df[test_df.essay_file == essay].label.values)
    for essay in essay_files_l
]

    
def rename_labels(l):
    """Return a new list with the dataset label names rewritten in lower-case form.

    Within every string, "MajorClaim" becomes "major claim", then "Claim"
    becomes "claim", then "Premise" becomes "premise". The input list is not
    modified.
    """
    renamed = []

    for label in l:
        # order matters: "MajorClaim" must be handled before "Claim"
        label = label.replace("MajorClaim", "major claim")
        label = label.replace("Claim", "claim")
        label = label.replace("Premise", "premise")
        renamed.append(label)

    return renamed

# Rewrite the gold labels of every essay with `rename_labels`, keeping the same list object.
grounds_l[:] = [rename_labels(essay_labels) for essay_labels in grounds_l]

In [91]:
# Keep only the predicted label (second element of each pair) for every essay.
predictions_l_final = [
    [pair[1] for pair in essay_pairs]
    for essay_pairs in predictions_clean_l
]

In [97]:
# Report every essay whose number of predictions differs from its number of gold labels.
for essay_no, (essay_preds, essay_labels) in enumerate(zip(predictions_l_final, grounds_l), start=1):

    if len(essay_preds) == len(essay_labels):
        continue

    print(f"Essay no {essay_no}:", f"nb of pred: {len(essay_preds)}", f"/ nb of labels: {len(essay_labels)}")

In [98]:
def flatten_list(ll):
    """Concatenate the items of every element of `ll` into one flat list (one level deep)."""
    flat = []
    for sub in ll:
        flat.extend(sub)
    return flat

# Flatten gold labels and predictions into two parallel lists and show their lengths.
grounds_l_final, predictions_l_final_v2 = map(flatten_list, (grounds_l, predictions_l_final))

len(grounds_l_final), len(predictions_l_final_v2)

(1266, 1266)

In [99]:
print(classification_report(grounds_l_final, predictions_l_final_v2, digits=3))

              precision    recall  f1-score   support

       claim      0.798     0.832     0.815       304
 major claim      0.940     0.928     0.934       153
     premise      0.946     0.933     0.940       809

    accuracy                          0.908      1266
   macro avg      0.895     0.898     0.896      1266
weighted avg      0.910     0.908     0.909      1266



```
Results for 1 epoch

              precision    recall  f1-score   support

       claim      0.677     0.691     0.684       304
 major claim      0.922     0.922     0.922       153
     premise      0.894     0.888     0.891       809

    accuracy                          0.844      1266
   macro avg      0.831     0.833     0.832      1266
weighted avg      0.845     0.844     0.845      1266
```

```
Results for 4 epochs

              precision    recall  f1-score   support

       claim      0.798     0.832     0.815       304
 major claim      0.940     0.928     0.934       153
     premise      0.946     0.933     0.940       809

    accuracy                          0.908      1266
   macro avg      0.895     0.898     0.896      1266
weighted avg      0.910     0.908     0.909      1266

```