<img width="10%" alt="Naas" src="https://landen.imgix.net/jtci2pxwjczr/assets/5ice39g4.png?w=160"/>

# Google Sheet - Fine tune dataset with OpenAI

**Tags**: #gsheet #openai #finetuning #ai #llm

**Author:** [Maxime Jublou](https://www.linkedin.com/in/maximejublou)

**Last update:** 2023-11-08 (Created: 2023-11-08)

**Description:** This notebook fine-tunes an OpenAI model on a question/answer dataset stored in a Google Sheets spreadsheet, so you can create your own custom model.

**References:** list of references and websites utilized in the creation of this notebook
- [OpenAI Fine-tuning](https://platform.openai.com/docs/guides/fine-tuning)

## Input

### Import libraries

In [None]:
import naas
from naas_drivers import gsheet
import pandas as pd
import json
try:
    import openai
except:
    !pip install --user --upgrade openai openai[wandb]
    import openai
import time
import psycopg2
from IPython.display import clear_output
import os
import naas_data_product

### Setup variables

In [None]:
# Avatar meta
avatar_name = "Jeremy Ravenel"
# Alias of the model behind the avatar; it names the last folder of the output path.
# It must be defined before output_dir is built, otherwise this cell raises NameError.
# NOTE(review): value assumed to match the base model used in the fine-tuning cell - confirm.
avatar_model_alias = "gpt-3.5-turbo"
output_dir = os.path.join(naas_data_product.OUTPUTS_PATH, "ai-characters", avatar_name.lower().replace(" ", "_"), avatar_model_alias.replace("/", "_"))
print("Output directory:", output_dir)

# Google Sheets
spreadsheet_url = "https://docs.google.com/spreadsheets/d/1wediMdHcq5WDqLMZ7ryNrcPxCmlX8BX4ZEl3JNWT8wg/edit#gid=0"
sheet_name = "Jeremy"
question_col = "question"
answer_col = "answer"

# OpenAI
openai_api_key = naas.secret.get("OPENAI_API_KEY")

# JSONL
dataset_file = "dataset.jsonl"

## Model

### Create system prompt

In [None]:
# System message shared by every training example built from the sheet
system_prompt = "{} avatar assistant.".format(avatar_name)
print("System prompt:", system_prompt)

### Get data from Google Sheets spreadsheet

In [None]:
# Connect to the spreadsheet first, then read the requested worksheet
spreadsheet = gsheet.connect(spreadsheet_url)
df_gsheet = spreadsheet.get(sheet_name=sheet_name)
print("Rows:", len(df_gsheet))
df_gsheet.head(1)

### Generate JSONL file

In [None]:
def build_dataset(
    df_gsheet,
    system_prompt,
    question_col,
    answer_col,
):
    """Turn question/answer rows into chat-format fine-tuning examples.

    Args:
        df_gsheet: DataFrame holding one question/answer pair per row.
        system_prompt: Content of the "system" message added to every example.
        question_col: Column whose value becomes the "user" message.
        answer_col: Column whose value becomes the "assistant" message.

    Returns:
        A list with one dict per row, in row order, each of the form
        {"messages": [system, user, assistant]}.
    """
    def _message(role, content):
        # Single chat message in the role/content layout
        return {"role": role, "content": content}

    return [
        {
            "messages": [
                _message("system", system_prompt),
                _message("user", record[question_col]),
                _message("assistant", record[answer_col]),
            ]
        }
        for _, record in df_gsheet.iterrows()
    ]

# Build fine-tuned dataset
dataset = build_dataset(df_gsheet, system_prompt, question_col, answer_col)

# Save dataset as JSON Lines: one JSON object per line
# open() does not create missing parent folders, so make sure the output directory exists
os.makedirs(output_dir, exist_ok=True)
file_path = os.path.join(output_dir, dataset_file)
with open(file_path, 'w', encoding='utf-8') as file:
    for data in dataset:
        # ensure_ascii=False keeps non-ASCII characters readable in the file
        json.dump(data, file, ensure_ascii=False)
        file.write('\n')

### Upload dataset to OpenAI

In [None]:
openai.api_key = openai_api_key
# Upload through a context manager so the dataset file handle is closed
# once the request returns, instead of staying open for the kernel's lifetime
with open(file_path, "rb") as dataset_handle:
    uploaded_file = openai.File.create(
        file=dataset_handle,
        purpose='fine-tune'
    )
# The uploaded file id is what the fine-tuning job takes as training_file
file_id = uploaded_file.id

### Fine tune model

In [None]:
# Start the fine-tuning job on the uploaded dataset
fine_tuning_job = openai.FineTuningJob.create(
    training_file=file_id,
    model="gpt-3.5-turbo",
)
fine_tuning_job

# Statuses after which the job will not change anymore
terminal_statuses = ('succeeded', 'failed', 'canceled')

# Poll the job, refreshing the cell output on each pass, until it reaches a terminal status
finished = False
while not finished:
    clear_output()
    print('Waiting for fine-tuning to complete ...')
    state = openai.FineTuningJob.retrieve(fine_tuning_job.id)
    print(state)
    finished = state['status'] in terminal_statuses
    if not finished:
        # Wait between two polls
        time.sleep(5)

state

## Output

### Save model name

In [None]:
model_name = state.fine_tuned_model
# The polling loop also stops on 'failed' and 'canceled'; in that case there is no
# model name to save, so fail with an explicit message instead of an opaque TypeError
if not model_name:
    raise ValueError(f"Fine-tuning did not produce a model (status: {state['status']})")

# Record the model name as an empty marker file named after the model
file_path = os.path.join(output_dir, model_name)
with open(file_path, 'w'):
    pass  # nothing to write: the file name itself carries the information
print("Model name saved:", file_path)