# Finetuning GPT-3.X-FeaTxT

https://github.com/norahsakal/fine-tune-gpt3-model/blob/main/fine_tune_step_by_step.ipynb

Finetuning GPT-turbo with **features as text (explicit form)** with the **all-in-one prerdictions strategy**.

Data files: `data_train_v4.jsonl`, `data_val_v4.jsonl`, `data_test_v4.jsonl`

## Libraries

In [1]:
# !pip install --upgrade pip
# !pip install openai
# (choose "base" kernel)

In [2]:
import os
import json
import pickle
import pandas as pd
from pathlib import Path

from sklearn.metrics import classification_report

import openai
from openai import OpenAI

## API key

In [3]:
api_key ="<your key>"
openai.api_key = api_key

## Upload data to OpenAI

In [4]:
data_dir = os.path.join(os.getcwd(), "data")

In [5]:
train_file_name = os.path.join(data_dir, "data_train_v4.jsonl")
val_file_name = os.path.join(data_dir, "data_val_v4.jsonl")
test_file_name = os.path.join(data_dir, "data_test_v4.jsonl")

In [6]:
client = OpenAI(api_key=api_key)

### Train set

In [7]:
train_upload_response = client.files.create(
    file = Path(train_file_name),
    purpose = "fine-tune"
)

In [8]:
train_upload_response

FileObject(id='file-BhaoZzUWz2Pq7Sas70cpbJBP', bytes=1584699, created_at=1705247644, filename='data_train_v4.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)

In [9]:
train_file_id = train_upload_response.id
train_file_id

'file-BhaoZzUWz2Pq7Sas70cpbJBP'

### Validation set

In [10]:
val_upload_response = client.files.create(
    file = Path(val_file_name),
    purpose = "fine-tune"
)

In [11]:
val_upload_response

FileObject(id='file-J0YJJaVCoK9TdWra09z8ZwRR', bytes=179708, created_at=1705247650, filename='data_val_v4.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)

In [12]:
val_file_id = val_upload_response.id
val_file_id

'file-J0YJJaVCoK9TdWra09z8ZwRR'

### Test set

## Fine-tune model

In [13]:
# Launch fine-tuning

finetune_response = client.fine_tuning.jobs.create(
    training_file = train_file_id,
    validation_file = val_file_id,
    model = "gpt-3.5-turbo",
    hyperparameters = {"n_epochs": 1},
    suffix = "pe_finetune_v4"
)

In [14]:
print(finetune_response.id)

ftjob-BCJP4ueFBs5bYzLNA1c7J77r


In [17]:
# Checking progress

finetune_events = client.fine_tuning.jobs.list_events(fine_tuning_job_id=finetune_response.id, limit=10)
finetune_events.model_dump()

{'data': [{'id': 'ftevent-V8V5zkJC45DGFBx8jkKh0L5H',
   'created_at': 1705248226,
   'level': 'info',
   'message': 'Step 141/290: training loss=0.13, validation loss=0.06',
   'object': 'fine_tuning.job.event',
   'data': {'step': 141,
    'train_loss': 0.1271219253540039,
    'valid_loss': 0.05853742168795678,
    'train_mean_token_accuracy': 0.9347826242446899,
    'valid_mean_token_accuracy': 0.7580645161290323},
   'type': 'metrics'},
  {'id': 'ftevent-lYf0JpYjcy1KggnS0v3gMfwX',
   'created_at': 1705248206,
   'level': 'info',
   'message': 'Step 131/290: training loss=0.07, validation loss=0.08',
   'object': 'fine_tuning.job.event',
   'data': {'step': 131,
    'train_loss': 0.07114148885011673,
    'valid_loss': 0.0757993475183264,
    'train_mean_token_accuracy': 0.9767441749572754,
    'valid_mean_token_accuracy': 0.7402597402597403},
   'type': 'metrics'},
  {'id': 'ftevent-gky5xjaHzIP5ooQTt51RxSCF',
   'created_at': 1705248188,
   'level': 'info',
   'message': 'Step 121/29

In [18]:
retrieve_response = client.fine_tuning.jobs.retrieve(finetune_response.id)
retrieve_response.model_dump()

{'id': 'ftjob-BCJP4ueFBs5bYzLNA1c7J77r',
 'created_at': 1705247654,
 'error': None,
 'fine_tuned_model': 'ft:gpt-3.5-turbo-0613:personal:pe-finetune-v4:8gxKRVCr',
 'finished_at': 1705248526,
 'hyperparameters': {'n_epochs': 1,
  'batch_size': 1,
  'learning_rate_multiplier': 2},
 'model': 'gpt-3.5-turbo-0613',
 'object': 'fine_tuning.job',
 'organization_id': 'org-ao865HLfwm7KSarTu10iG90O',
 'result_files': ['file-ejF8LosyejQJ8eVEysf1GV6s'],
 'status': 'succeeded',
 'trained_tokens': 376063,
 'training_file': 'file-BhaoZzUWz2Pq7Sas70cpbJBP',
 'validation_file': 'file-J0YJJaVCoK9TdWra09z8ZwRR'}

## Save fine-tuned model

In [21]:
# Option 3

if retrieve_response.fine_tuned_model == None:
    
    finetuned_model = client.fine_tuning.jobs.retrieve("<JOB ID from API>").fine_tuned_model

else:
    
    finetuned_model = retrieve_response.model_dump()["fine_tuned_model"]

In [22]:
finetuned_model

'ft:gpt-3.5-turbo-0613:personal:pe-finetune-v4:8gxKRVCr'

## Evaluate on test set

In [23]:
# Evaluate the whole test set

predictions_l = []

with open(os.path.join(data_dir, "data_test_v4.jsonl"), 'r') as fh:
    
    lines_l = [json.loads(line) for line in fh]
    
    for i, line in enumerate(lines_l):
        
        if i%10 == 0:
            print(f"{i} samples processed")
        
        messages = line["messages"]
        
        response = client.chat.completions.create(
        model=finetuned_model,
        messages=messages
        )

        predictions_l.append(response.choices[0].message.content)

0 samples processed
10 samples processed
20 samples processed
30 samples processed
40 samples processed
50 samples processed
60 samples processed
70 samples processed


In [127]:
len(predictions_l)

80

In [128]:
# predictions_l[0].strip("][").split(", ")#[0][1:-1]

In [129]:
def str2list(l):
    
    results_l = []
    
    for preds in l:
        
        tmp = preds.strip("][").split(", ")
        tmp = [x[1:-1] for x in tmp]

        results_l.append(tmp)
    
    return results_l

In [130]:
predictions_l_final = str2list(predictions_l)
len(predictions_l_final)

80

In [131]:
def flatten_list(ll):
    return [item for l in ll for item in l]

In [132]:
set(flatten_list(predictions_l_final))

{"'premise", 'claim', 'major claim', 'premise', "premise',\n'major claim"}

In [133]:
# CLEAN THE DATA !!!

for l in predictions_l_final:
    
    for i in range(len(l)):
        
        if l[i] == "'premise":
            l[i] = 'premise'
            
        if l[i] == "premise',\n'major claim":
            l[i] = 'premise'
            l.insert(i+1, 'major claim')
            
len(predictions_l_final)

80

In [134]:
with open(os.path.join(data_dir, 'predictions_l_v4.pkl'), 'wb') as f:
    
    pickle.dump(predictions_l_final, f)

## Results

In [135]:
df = pd.read_csv(os.path.join(data_dir, "persuasive_essays_dataset.csv"), index_col=0)

In [136]:
test_df = df[df.split == 'TEST']
# test_df

In [137]:
essay_files_l = list(df[df.split=="TEST"].essay_file.value_counts().index)
len(essay_files_l)

80

In [138]:
grounds_l = []

for essay in essay_files_l:
    
    sub_df = test_df[test_df.essay_file==essay]
    labels = list(sub_df.label.values)
    
    grounds_l.append(labels)

    
def rename_labels(l):
    
    l = list(map(lambda x: x.replace("MajorClaim", "major claim"), l))
    l = list(map(lambda x: x.replace("Claim", "claim"), l))
    l = list(map(lambda x: x.replace("Premise", "premise"), l))
    
    return l

for i, labels in enumerate(grounds_l):
    
    grounds_l[i] = rename_labels(labels)

In [139]:
counter = 1

for l1, l2 in zip(predictions_l_final, grounds_l):
    
    if len(l1) != len(l2):
        print(f"Essay no {counter}:", f"nb of preds: {len(l1)}", f"/ nb of labels: {len(l2)}")
    
    counter += 1

Essay no 12: nb of preds: 20 / nb of labels: 21


- Only for essay #12, the model predictions and the labels do not match. We add a last **wrong** prediction for matching purposes.
- For the rest, the number of predictions matches the number of labels.

In [140]:
predictions_l_final[11] = predictions_l_final[11] + ["premise"]

In [141]:
for l1, l2 in zip(predictions_l_final, grounds_l):
    
    if len(l1) != len(l2):
        print(f"Essay no {counter}:", f"nb of pred: {len(l1)}", f"/ nb of pred: {len(l2)}")
    
    counter += 1

In [142]:
def flatten_list(ll):
    return [item for l in ll for item in l]

grounds_l_final = flatten_list(grounds_l)
predictions_l_final_v2 = flatten_list(predictions_l_final)

len(grounds_l_final), len(predictions_l_final_v2)

(1266, 1266)

In [143]:
print(classification_report(grounds_l_final, predictions_l_final_v2, digits=3))

              precision    recall  f1-score   support

       claim      0.662     0.664     0.663       304
 major claim      0.907     0.895     0.901       153
     premise      0.889     0.890     0.889       809

    accuracy                          0.836      1266
   macro avg      0.819     0.817     0.818      1266
weighted avg      0.837     0.836     0.837      1266



```
Results for 1 epoch

              precision    recall  f1-score   support

       claim      0.662     0.664     0.663       304
 major claim      0.907     0.895     0.901       153
     premise      0.889     0.890     0.889       809

    accuracy                          0.836      1266
   macro avg      0.819     0.817     0.818      1266
weighted avg      0.837     0.836     0.837      1266
```