# Finetuning GPT-3.X-FeaTxT

https://github.com/norahsakal/fine-tune-gpt3-model/blob/main/fine_tune_step_by_step.ipynb

Fine-tuning `gpt-3.5-turbo` with the **all-in-one predictions strategy** (a single completion returns the labels of all argument components of an essay).

Data files: `data_train_v3.jsonl`, `data_val_v3.jsonl`, `data_test_v3.jsonl`

## Libraries

In [1]:
# !pip install --upgrade pip
# !pip install openai
# (choose "base" kernel)

In [2]:
import os
import json
import pickle
import pandas as pd
from pathlib import Path

from sklearn.metrics import classification_report

import openai
from openai import OpenAI

## API key

In [3]:
# Prefer the OPENAI_API_KEY environment variable so the key is never saved with the notebook.
# The placeholder is only a fallback: replace it (or set the variable) before calling the API.
api_key = os.environ.get("OPENAI_API_KEY", "<your key>")
openai.api_key = api_key

## Upload data to OpenAI

In [4]:
data_dir = os.path.join(os.getcwd(), "data")

In [5]:
train_file_name = os.path.join(data_dir, "data_train_v3.jsonl")
val_file_name = os.path.join(data_dir, "data_val_v3.jsonl")
test_file_name = os.path.join(data_dir, "data_test_v3.jsonl")

In [6]:
client = OpenAI(api_key=api_key)

### Train set

In [7]:
# Upload the training JSONL; the returned file object carries the ID used when creating the job.
train_upload_response = client.files.create(file=Path(train_file_name), purpose="fine-tune")

In [8]:
train_upload_response

FileObject(id='file-guu73kQEEqLyKDrYT7bogdh2', bytes=1309172, created_at=1705237534, filename='data_train_v3.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)

In [9]:
train_file_id = train_upload_response.id
train_file_id

'file-guu73kQEEqLyKDrYT7bogdh2'

### Validation set

In [10]:
# Upload the validation JSONL; the returned file object carries the ID used when creating the job.
val_upload_response = client.files.create(file=Path(val_file_name), purpose="fine-tune")

In [11]:
val_upload_response

FileObject(id='file-KezBTR34IUBHwB9DUijxnvAe', bytes=147676, created_at=1705237541, filename='data_val_v3.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)

In [12]:
val_file_id = val_upload_response.id
val_file_id

'file-KezBTR34IUBHwB9DUijxnvAe'

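### Test set

The test set is not uploaded: it is read from the local `data` directory when the fine-tuned model is evaluated below.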

## Fine-tune model

In [13]:
# Launch fine-tuning

# One-epoch job on the uploaded training file; the validation file is passed alongside it.
finetune_response = client.fine_tuning.jobs.create(
    model="gpt-3.5-turbo",
    training_file=train_file_id,
    validation_file=val_file_id,
    hyperparameters={"n_epochs": 1},
    suffix="pe_finetune_v3d",
)

In [14]:
print(finetune_response.id)

ftjob-EBORDTXBnPozUJU5LIKLF5TO


In [21]:
# Checking progress

finetune_events = client.fine_tuning.jobs.list_events(fine_tuning_job_id=finetune_response.id, limit=10)
finetune_events.model_dump()

{'data': [{'id': 'ftevent-gc2EVm1k9Yt9DnxsqaWF30iK',
   'created_at': 1705238392,
   'level': 'info',
   'message': 'The job has successfully completed',
   'object': 'fine_tuning.job.event',
   'data': {},
   'type': 'message'},
  {'id': 'ftevent-0cAWhrFtopjFJVx3vu5rCqHS',
   'created_at': 1705238389,
   'level': 'info',
   'message': 'New fine-tuned model created: ft:gpt-3.5-turbo-0613:personal:pe-finetune-v3d:8gugueoR',
   'object': 'fine_tuning.job.event',
   'data': {},
   'type': 'message'},
  {'id': 'ftevent-tjO8J67FrLZSX34koGimxnQC',
   'created_at': 1705238367,
   'level': 'info',
   'message': 'Step 281/290: training loss=0.05, validation loss=0.10',
   'object': 'fine_tuning.job.event',
   'data': {'step': 281,
    'train_loss': 0.05166623368859291,
    'valid_loss': 0.10081902256718388,
    'train_mean_token_accuracy': 0.9743589758872986,
    'valid_mean_token_accuracy': 0.7407407407407407},
   'type': 'metrics'},
  {'id': 'ftevent-VXfEzuhvi5RUJqK0ScmN8HbG',
   'created_at'

In [22]:
retrieve_response = client.fine_tuning.jobs.retrieve(finetune_response.id)
retrieve_response.model_dump()

{'id': 'ftjob-EBORDTXBnPozUJU5LIKLF5TO',
 'created_at': 1705237553,
 'error': None,
 'fine_tuned_model': 'ft:gpt-3.5-turbo-0613:personal:pe-finetune-v3d:8gugueoR',
 'finished_at': 1705238387,
 'hyperparameters': {'n_epochs': 1,
  'batch_size': 1,
  'learning_rate_multiplier': 2},
 'model': 'gpt-3.5-turbo-0613',
 'object': 'fine_tuning.job',
 'organization_id': 'org-ao865HLfwm7KSarTu10iG90O',
 'result_files': ['file-0w7xgbRJvAWLT0Y3LLKX5Eby'],
 'status': 'succeeded',
 'trained_tokens': 260386,
 'training_file': 'file-guu73kQEEqLyKDrYT7bogdh2',
 'validation_file': 'file-KezBTR34IUBHwB9DUijxnvAe'}

## Save fine-tuned model

In [23]:
# Resolve the name of the fine-tuned model.
# If the job object retrieved above does not carry a model name, look the job up again
# by its ID (replace the placeholder with the actual job ID first).

if retrieve_response.fine_tuned_model is None:

    finetuned_model = client.fine_tuning.jobs.retrieve("<JOB ID from API>").fine_tuned_model

else:

    # Same attribute access as in the branch above, instead of going through model_dump().
    finetuned_model = retrieve_response.fine_tuned_model

In [24]:
finetuned_model

'ft:gpt-3.5-turbo-0613:personal:pe-finetune-v3d:8gugueoR'

## Evaluate on test set

In [25]:
# Evaluate the whole test set

# Load every sample first so the file is closed before the (slow) API calls start.
# `test_file_name` is the same path that was previously re-built inline; blank lines are skipped.
with open(test_file_name, 'r') as fh:
    lines_l = [json.loads(line) for line in fh if line.strip()]

predictions_l = []

for i, line in enumerate(lines_l):

    if i%10 == 0:
        print(f"{i} samples processed")

    # NOTE(review): the stored "messages" list is sent unchanged. If the test file follows the
    # fine-tuning format and ends with the gold assistant turn, that turn would be sent to the
    # model as well -- TODO confirm against the file contents.
    messages = line["messages"]

    response = client.chat.completions.create(
        model=finetuned_model,
        messages=messages
    )

    # Keep the raw text of the first choice; it is parsed into a list of labels later.
    predictions_l.append(response.choices[0].message.content)

0 samples processed
10 samples processed
20 samples processed
30 samples processed
40 samples processed
50 samples processed
60 samples processed
70 samples processed


In [39]:
len(predictions_l)

80

In [41]:
# predictions_l[0].strip("][").split(", ")#[0][1:-1]

In [42]:
def str2list(l):
    """Convert raw model outputs into lists of label strings.

    Each element of ``l`` is expected to be the textual form of a list of
    quoted labels, e.g. ``"['premise', 'major claim']"``.

    Parameters
    ----------
    l : iterable of str
        Raw completions, one per essay.

    Returns
    -------
    list of list of str
        One list of labels per input string. An empty list literal (``"[]"``)
        or an empty string yields an empty list.
    """
    # Local import so the function is self-contained within its cell.
    import ast

    results_l = []

    for preds in l:

        text = preds.strip()

        # Preferred path: the completion is a valid Python list literal.
        try:
            parsed = ast.literal_eval(text)
        except (ValueError, SyntaxError, TypeError):
            parsed = None

        if isinstance(parsed, (list, tuple)):
            labels = [str(x) for x in parsed]
        else:
            # Fallback for malformed output (e.g. unquoted labels): split on commas and
            # drop surrounding whitespace and quotes instead of blindly slicing characters.
            inner = text.strip("][").strip()
            labels = [tok.strip().strip("'\"") for tok in inner.split(",")] if inner else []

        results_l.append(labels)

    return results_l

In [43]:
predictions_l_final = str2list(predictions_l)
len(predictions_l_final)

80

In [44]:
predictions_l_final

[['premise',
  'premise',
  'major claim',
  'claim',
  'premise',
  'premise',
  'premise',
  'premise',
  'claim',
  'claim',
  'premise',
  'premise',
  'premise',
  'premise',
  'claim',
  'premise',
  'premise',
  'premise',
  'premise',
  'premise',
  'premise',
  'claim',
  'claim',
  'claim',
  'major claim',
  'claim',
  'claim',
  'premise'],
 ['major claim',
  'claim',
  'claim',
  'premise',
  'premise',
  'premise',
  'premise',
  'claim',
  'premise',
  'premise',
  'premise',
  'premise',
  'premise',
  'premise',
  'premise',
  'claim',
  'premise',
  'premise',
  'premise',
  'premise',
  'premise',
  'premise',
  'premise',
  'claim',
  'major claim',
  'claim'],
 ['major claim',
  'premise',
  'premise',
  'premise',
  'premise',
  'claim',
  'claim',
  'premise',
  'premise',
  'premise',
  'premise',
  'claim',
  'premise',
  'major claim',
  'premise',
  'premise',
  'premise',
  'premise',
  'claim',
  'claim',
  'claim',
  'premise',
  'premise',
  'major claim'

In [33]:
with open(os.path.join(data_dir, 'predictions_l_v3.pkl'), 'wb') as f:
    
    pickle.dump(predictions_l_final, f)

## Results

In [67]:
df = pd.read_csv(os.path.join(data_dir, "persuasive_essays_dataset.csv"), index_col=0)

In [68]:
test_df = df[df.split == 'TEST']
# test_df

In [69]:
essay_files_l = list(df[df.split=="TEST"].essay_file.value_counts().index)
len(essay_files_l)

80

In [70]:
# Gold labels of each test essay, one list per entry of `essay_files_l` (same order).
# NOTE(review): the later comparison pairs these lists with the predictions by position,
# so this order is assumed to match the order of the test JSONL -- TODO confirm.
grounds_l = [
    list(test_df[test_df.essay_file == essay].label.values)
    for essay in essay_files_l
]

    
def rename_labels(l):
    """Return a new list with the dataset label names rewritten to the prediction vocabulary.

    The substitutions are applied to every string in this order:
    "MajorClaim" -> "major claim", "Claim" -> "claim", "Premise" -> "premise".
    """
    return [
        name.replace("MajorClaim", "major claim")
            .replace("Claim", "claim")
            .replace("Premise", "premise")
        for name in l
    ]

# Rewrite the gold labels of every essay to the vocabulary used by the predictions.
grounds_l = [rename_labels(essay_labels) for essay_labels in grounds_l]

In [75]:
# Report every essay whose number of predictions differs from its number of gold labels.
counter = 1

for preds, golds in zip(predictions_l_final, grounds_l):

    n_pred, n_gold = len(preds), len(golds)

    if n_pred != n_gold:
        print(f"Essay no {counter}:", f"nb of pred: {n_pred}", f"/ nb of labels: {n_gold}")

    counter += 1

Essay no 6: nb of pred: 23 / nb of labels: 22


- Only for essay #6, the model predicts 23 labels for 22 components. We remove the last prediction for matching purposes.
- For the rest, the number of predictions matches the number of labels.

In [86]:
# Essay #6 (index 5) has one surplus prediction: keep only as many predictions as there are
# gold labels. Slicing to the label count, rather than dropping the last item, leaves the
# list unchanged if this cell is run again.
predictions_l_final[5] = predictions_l_final[5][:len(grounds_l[5])]

# Re-check all essays; nothing is printed when every essay lines up.
# enumerate(start=1) numbers the essays from 1 for this pass.
for counter, (l1, l2) in enumerate(zip(predictions_l_final, grounds_l), start=1):

    if len(l1) != len(l2):
        print(f"Essay no {counter}:", f"nb of pred: {len(l1)}", f"/ nb of labels: {len(l2)}")

In [87]:
def flatten_list(ll):
    """Concatenate the items of every inner iterable of ``ll`` into one flat list, in order."""
    flat = []
    for inner in ll:
        flat.extend(inner)
    return flat

grounds_l_final = flatten_list(grounds_l)
predictions_l_final_v2 = flatten_list(predictions_l_final)

len(grounds_l_final), len(predictions_l_final_v2)

(1266, 1266)

In [88]:
print(classification_report(grounds_l_final, predictions_l_final_v2, digits=3))

              precision    recall  f1-score   support

       claim      0.677     0.691     0.684       304
 major claim      0.922     0.922     0.922       153
     premise      0.894     0.888     0.891       809

    accuracy                          0.844      1266
   macro avg      0.831     0.833     0.832      1266
weighted avg      0.845     0.844     0.845      1266



```
Results for 1 epoch

              precision    recall  f1-score   support

       claim      0.677     0.691     0.684       304
 major claim      0.922     0.922     0.922       153
     premise      0.894     0.888     0.891       809

    accuracy                          0.844      1266
   macro avg      0.831     0.833     0.832      1266
weighted avg      0.845     0.844     0.845      1266
```