# OpenAI models Fine Tuning

In this notebook, we will fine-tune the OpenAI models `Davinci` & `GPT-3.5` on the dataset generated by `GPT-4` described in the notebook [DatasetCreation](a_DatasetCreation.ipynb). We will then evaluate the performance of these models with and without fine-tuning in the following notebook.

In [1]:
# ---------------------------- PREPARING NOTEBOOK ---------------------------- #
# Autoreload
%load_ext autoreload
%autoreload 2

# Random seed
# Seed NumPy's global random generator so NumPy-based randomness is repeatable.
import numpy as np
np.random.seed(42)

# External modules
import os
from IPython.display import display

# Set global log level
# INFO is required to see the progress messages logged by the fine-tuning helper.
import logging
logging.basicConfig(level=logging.INFO)
# NOTE(review): presumably silences the Hugging Face tokenizers parallelism
# warning -- no tokenizer is used in the visible cells, confirm it is needed.
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

# Define PWD as the current git repository
# `search_parent_directories=True` lets the notebook be started from any
# sub-folder of the repository; every later path is built from `pwd`.
import git
repo = git.Repo('.', search_parent_directories=True)
pwd = repo.working_dir
# Side effect: the kernel's working directory becomes the repository root.
os.chdir(pwd)

# import

In [2]:
# -------------------------- LOAD PREVIOUS NOTEBOOKS ------------------------- #
# Purpose: turn the "exportable" cells of earlier notebooks into a Python
# script, write that script to `<pwd>/scratch/`, and execute it in `__main__`
# so that its functions are reachable as `<notebook_name>.<function>`.
# A cell is exportable when it is a code cell whose last source line is
# exactly "# import".
import json
import __main__
import black

# Notebooks to import, in execution order.
paths = [
    os.path.join(pwd, "notebooks", "text_simplification", "a_DatasetCreation.ipynb"),
]

# Read notebooks
# Maps the path of each generated script to its source code.
code_dict = {}
for path in paths:
    code = ""
    with open(path, "r") as f:
        temp = json.load(f)

    # Keep only non-empty code cells that end with the "# import" marker.
    cells = [
        cell
        for cell in temp["cells"]
        if cell["cell_type"] == "code"
        and len(cell["source"]) > 0
        and cell["source"][-1] == "# import"
    ]
    # Drop the marker itself, empty entries and lines starting with "%"
    # (IPython magics, which are not valid in a plain Python script).
    notebook_code = "\n".join(
        line
        for cell in cells
        for line in cell["source"]
        if line != "# import" and len(line) > 0 and line[0] != "%"
    )
    # Create something like a header
    code += f"# {'-'*76} #\n"
    code += f"# {os.path.basename(path).upper():^76} #\n"
    code += f"# {'-'*76} #\n"
    code += notebook_code

    # Add "Module Creation"
    # The notebook name (file name without "imported_" and ".ipynb") becomes
    # the name of the namespace object created by the snippet below.
    notebook_name = (
        os.path.basename(path).replace("imported_", "").replace(".ipynb", "")
    )
    # The snippet is appended as text and runs later, inside `__main__`:
    # it creates an empty `MyNotebook` instance and copies into it every class
    # and function found in `locals()` whose name does not start with "_".
    # Because the script is executed with `__main__.__dict__` as its namespace,
    # `locals()` there is the whole `__main__` namespace, not only the names
    # defined by this notebook.
    code += """
# --------------------------------- IMPORTER --------------------------------- #
import types


class MyNotebook:
    pass


NOTEBOOK_NAME = MyNotebook()
# Put every function defined in the notebook in the class
NOTEBOOK_NAME.__dict__.update(
    {
        name: obj
        for name, obj in locals().items()
        if isinstance(obj, (type, types.FunctionType))
        if not (name.startswith("_") or name == "MyNotebook")
    }
)
    """.replace(
        "NOTEBOOK_NAME", notebook_name
    )

    # Remove empty lines
    code = "\n".join([line for line in code.split("\n") if len(line) > 0])
    # Format code
    code = black.format_str(code, mode=black.FileMode())

    # Write scratch file
    # NOTE: `path` is rebound here from the notebook path to the path of the
    # generated script; `code_dict` is keyed by the generated script path.
    path = os.path.join(
        pwd, "scratch", f"imported_{os.path.basename(path).replace('ipynb', 'py')}"
    )
    if not os.path.exists(os.path.dirname(path)):
        os.makedirs(os.path.dirname(path))
    with open(path, "w") as f:
        f.write(code)
    code_dict[path] = code


# Mainify code
# Compiling with the script path as file name makes tracebacks point at the
# generated file; executing in `__main__.__dict__` defines everything globally.
for path, code in code_dict.items():
    compiled = compile(code, path, "exec")
    exec(compiled, __main__.__dict__)

# import

## Création des fichiers JSON

Dans un premier temps, nous devons formater les exemples présents dans notre jeu de données sous la forme d'un fichier JSON Lines, où chaque ligne est un objet JSON au format suivant :
1. Pour le modèle `GPT3.5`
```json
{"messages": [{"role": "system", "content": "Context"}, {"role": "user", "content": "Original"}, {"role": "assistant", "content": "Simplified"}]}
```
2. Pour le modèle `Davinci`
```json
{"prompt": "Original", "completion": "Simplified"}
```

In [3]:
# --------------------------- FORMATTING FUNCTIONS --------------------------- #
import json

# Define Context and templates
CONTEXT = "Vous êtes un modèle de langage naturel capable de simplifier des phrases en français. La phrase simplifiée doit avoir un sens aussi proche que possible de la phrase originale, mais elle est d'un niveau inférieur du CECRL et donc plus facile à comprendre. Par exemple, si une phrase est au niveau C1 du CECRL, simplifiez-la en B2. Si elle se situe au niveau B2, simplifiez-la en B1. Si elle se situe au niveau B1, simplifiez-la en A2. Si le niveau A2 est atteint, simplifiez en A1."
# The templates document the layout of one training line. They are kept so
# that existing references still resolve, but the lines are now produced with
# `json.dumps`, which escapes quotes, backslashes and newlines found in the data.
GPT_TEMPLATE = """{"messages": [{"role": "system", "content": "CONTEXT"}, {"role": "user", "content": "INPUT"}, {"role": "assistant", "content": "OUTPUT"}]}"""
DAVINCI_TEMPLATE = """{"prompt": "INPUT", "completion": "OUTPUT"}"""

# CECRL level of the sentence to simplify and, at the same position, the level
# requested for the simplified sentence (always one step easier).
_SOURCE_LEVELS = ["A2", "B1", "B2", "C1", "C2"]
_TARGET_LEVELS = ["A1", "A2", "B1", "B2", "C1"]
_LEVEL_TO_INDEX = {level: position for position, level in enumerate(_SOURCE_LEVELS)}


def _difficulty_index(row, training):
    """Return the position of the row's source level in `_SOURCE_LEVELS`.

    Training rows carry no level: it is derived from the row number
    (`row["index"] % 5`). Other rows provide it in `row["Difficulty"]`;
    a level outside A2..C2 raises `KeyError`.
    """
    if training:
        return row["index"] % 5
    return _LEVEL_TO_INDEX[row["Difficulty"]]


def _build_instruction(row, training, newline):
    """Build the French instruction that wraps `row["Original"]`.

    `newline` is the separator placed before and after the quoted sentence.
    """
    difficulty = _difficulty_index(row, training)
    return (
        f"Voici une phrase en français de niveau CECRL {_SOURCE_LEVELS[difficulty]} à simplifier :"
        f"{newline}'''{row['Original']}'''{newline}"
        f"Donne moi une phrase simplifiée au niveau CECRL {_TARGET_LEVELS[difficulty]} "
        "tout en conservant au maximum son sens original"
    )


def _evaluation_instruction(row):
    """Return the bare instruction used when `training=False`.

    The separator is the two-character sequence backslash + "n", exactly as
    before this refactoring.
    NOTE(review): the training files decode to a real line break at the same
    place, so prompts differ between training and evaluation unless the caller
    converts the sequence -- confirm against the evaluation notebook.
    """
    return _build_instruction(row, training=False, newline="\\n")


# Define conversation for GPT
def create_gpt_conversation(row, training=True):
    """Format one dataset row for the chat (GPT-3.5) fine-tuning format.

    Args:
        row: mapping with "Original" plus, when `training` is true, "index"
            and "Simplified"; otherwise "Difficulty" (one of A2..C2).
        training: when true, return one JSON line holding the system, user and
            assistant messages; when false, return only the user instruction.

    Returns:
        A single-line JSON string (training) or the instruction text.

    Raises:
        KeyError: if `training` is false and the level is not A2..C2.
    """
    if not training:
        return _evaluation_instruction(row)
    messages = [
        {"role": "system", "content": CONTEXT},
        {"role": "user", "content": _build_instruction(row, True, "\n")},
        {"role": "assistant", "content": row["Simplified"]},
    ]
    # ensure_ascii=False keeps accented characters readable; allow_nan=False
    # fails loudly on a missing (NaN) text instead of writing invalid JSON.
    return json.dumps({"messages": messages}, ensure_ascii=False, allow_nan=False)


# Define conversation for Davinci
def create_davinci_conversation(row, training=True):
    """Format one dataset row for the prompt/completion (Davinci) format.

    Takes the same arguments as `create_gpt_conversation`. When `training` is
    true, returns one JSON line with "prompt" and "completion"; otherwise
    returns only the instruction text.
    """
    if not training:
        return _evaluation_instruction(row)
    example = {
        "prompt": _build_instruction(row, True, "\n"),
        "completion": row["Simplified"],
    }
    return json.dumps(example, ensure_ascii=False, allow_nan=False)


# import

In [4]:
# ---------------------------- JSON FILES CREATION --------------------------- #
import csv

# Fetch the dataset produced by the previous notebook and name its two columns
train_df = a_DatasetCreation.download_data()
train_df.columns = ["Original", "Simplified"]

# reset_index() adds the "index" column that the formatting functions read
train_df = train_df.reset_index()
gpt_conversation = train_df.apply(create_gpt_conversation, axis=1)
davinci_conversation = train_df.apply(create_davinci_conversation, axis=1)

# Output folder (also reused by the fine-tuning cells through `path`)
path = os.path.join(pwd, "scratch", "text_simplification", "OpenAIFineTuning")
if not os.path.exists(path):
    os.makedirs(path)

# One conversation per line: the values are written verbatim, without header,
# index or CSV quoting.
for conversation, file_name in (
    (gpt_conversation, "gpt_conversation.json"),
    (davinci_conversation, "davinci_conversation.json"),
):
    conversation.to_csv(
        os.path.join(path, file_name),
        index=False,
        header=False,
        sep="\n",
        quoting=csv.QUOTE_NONE,
    )

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

  data = pd.read_csv(


## Training Models

We are now going to train the `GPT3.5` and `Davinci` models on the data generated by `GPT-4`.

### Connect to OpenAI API

To simplify access to the OpenAI API, we will use the `openai` library. Let's define a helper function that loads the API key and configures the library with it.

In [5]:
# ----------------------------- CONNECT TO OPENAI ---------------------------- #
import openai


# Connect to OpenAI
def connect_to_openai():
    """Configure `openai.api_key` from `<pwd>/.openai_key`, asking for it if needed.

    The key is read from the file and stripped of surrounding whitespace, since
    a trailing newline left by a text editor would otherwise become part of the
    key. If the file is missing or empty, the key is requested interactively
    without being echoed, and saved to the file for the next run.
    """
    key_path = os.path.join(pwd, ".openai_key")

    try:
        with open(key_path, "r") as f:
            key = f.read().strip()
    except FileNotFoundError:
        # Only a missing file triggers the prompt; other errors (permissions,
        # decoding, ...) are real problems and are left to propagate.
        key = ""

    if not key:
        from getpass import getpass  # local import: only needed on first use

        key = getpass("Please enter your OpenAI key: ").strip()
        if key:
            with open(key_path, "w") as f:
                f.write(key)

    openai.api_key = key


# import

In [6]:
# Set `openai.api_key` before any API call made by the cells below
connect_to_openai()

### Define Fine-Tuning Function

To make it easy to train several models later on, let's wrap the upload, fine-tuning and result-saving steps in a single function.

In [7]:
# ---------------------------- FINE TUNE FUNCTION ---------------------------- #
import time
import json


def fine_tune_openai(file: str, model: str, save_path: str, poll_interval: float = 5):
    """Upload a training file, fine-tune `model` on it and save the result.

    Args:
        file: path of the training file (one JSON example per line).
        model: name of the base model to fine-tune, e.g. "davinci-002".
        save_path: folder where `<model>_trained.json` is written; created if
            it does not exist.
        poll_interval: seconds to wait between two status checks.

    Returns:
        A `(job, uploaded_file)` tuple: the final fine-tuning job object and
        the uploaded file object, as returned by the OpenAI API.

    Raises:
        RuntimeError: if the file processing ends in "error", or if the job
            ends as "failed" or "cancelled", instead of polling forever.
    """
    # Push training data to OpenAI; the `with` block closes the local file
    # handle once the upload call returns.
    with open(file, "rb") as training_data:
        uploaded = openai.File.create(
            file=training_data,
            purpose="fine-tune",
        )

    # Wait for the file to be processed
    logging.info(f"Waiting for file {uploaded['id']} to be processed")
    while uploaded.status != "processed":
        if uploaded.status == "error":
            raise RuntimeError(
                f"OpenAI could not process file {uploaded['id']}: "
                f"{uploaded.get('status_details')}"
            )
        time.sleep(poll_interval)
        uploaded = openai.File.retrieve(uploaded.id)

    # Fine tune model
    logging.info(f"Fine tuning {model} with {uploaded['id']}")
    job = openai.FineTuningJob.create(
        training_file=uploaded["id"],
        model=model,
    )

    # Wait for the fine tuning to be completed
    while job.status != "succeeded":
        if job.status in ("failed", "cancelled"):
            raise RuntimeError(
                f"Fine-tuning job {job['id']} ended with status "
                f"{job.status!r}: {job.get('error')}"
            )
        time.sleep(poll_interval)
        job = openai.FineTuningJob.retrieve(job.id)

    # Save model and file in one json
    # The file is named after the base model name passed by the caller; the
    # job object is kept in its own variable so it never ends up in the name.
    os.makedirs(save_path, exist_ok=True)
    result_file = os.path.join(save_path, f"{model}_trained.json")
    with open(result_file, "w", encoding="utf-8") as f:
        json.dump(
            {"model": job, "file": uploaded},
            f,
            indent=4,
            ensure_ascii=False,
        )

    return job, uploaded


# import

### Fine-Tune GPT-3.5 & Davinci

In [8]:
# ---------------------------- FINE TUNING DAVINCI --------------------------- #
# Fine-tune the base Davinci model on the prompt/completion file written above
davinci_training_file = os.path.join(path, "davinci_conversation.json")
davinci_results_dir = os.path.join(
    pwd, "results", "text_simplification", "OpenAIFineTuning"
)
davinci_model, davinci_file = fine_tune_openai(
    file=davinci_training_file,
    model="davinci-002",
    save_path=davinci_results_dir,
)

ServiceUnavailableError: The server is overloaded or not ready yet.

In [None]:
# ---------------------------- FINE TUNING GPT-3.5 --------------------------- #
# Fine-tune GPT-3.5 on the chat-format file. The result is stored under its
# own names so that it does not overwrite `davinci_model` / `davinci_file`.
gpt_model, gpt_file = fine_tune_openai(
    file=os.path.join(path, "gpt_conversation.json"),
    model="gpt-3.5-turbo-1106",
    save_path=os.path.join(pwd, "results", "text_simplification", "OpenAIFineTuning"),
)

INFO:root:Waiting for file file-Yxm1w5GG8MMmXA4QDviKc27l to be processed
INFO:root:Fine tuning gpt-3.5-turbo-1106 with file-Yxm1w5GG8MMmXA4QDviKc27l
