In [1]:
import json
import os
import random
import time
from datetime import datetime
from pprint import pprint

import numpy as np
import openai
import pandas as pd
import requests

## Credentials

In [2]:
# Credentials are read from environment variables so that real secrets never
# have to be typed into (and saved with) the notebook. The original
# placeholders are kept as fallbacks, so the cell still runs unconfigured.
open_ai_api_key = os.environ.get("OPENAI_API_KEY", "{YOUR-OPENAI-API-KEY}")
org_id = os.environ.get("OPENAI_ORG_ID", "{YOUR-ORGANIZATION-ID}")

# Hand the key to the openai module used by every helper below.
openai.api_key = open_ai_api_key

## Preprocessing

In [3]:
# Expected CSV content (the first row is the header; a row whose first field is ### separates two conversations):
#    Human;AI
#    Hello;Hi, I'm an AI
#    Open the pod bay doors please;No
#    Open the pod bay doors!;Well, if you insist
#    <human_questions>;<ai_answers>
#    ###;###
#    Good morning;Hey
#    Can you read me the news?;No, I can't
#    Is there something you can do?;You should know
#    Oh okey;Yeah
#    <human_questions>;<ai_answers>

def csv_to_df_list(csv_path, csv_sep=';', df_sep="###", encoding='utf-8'):
    """Read a conversation CSV and split it into one DataFrame per conversation.

    Only the first two columns of the CSV are used. A row whose first-column
    value equals ``df_sep`` is not kept as data: it closes the conversation
    collected so far and starts a new one.

    Args:
        csv_path: Path of the CSV file to read.
        csv_sep: Field separator passed to ``pd.read_csv``.
        df_sep: First-column value that marks a conversation boundary.
        encoding: Text encoding passed to ``pd.read_csv``.

    Returns:
        A list of DataFrames, each holding the first two CSV columns for one
        conversation. A separator row always emits a DataFrame (possibly
        empty); trailing rows are emitted only when there is at least one.
    """
    df = pd.read_csv(csv_path, sep=csv_sep, encoding=encoding)
    first_col, second_col = df.columns[0], df.columns[1]
    columns = [first_col, second_col]

    splitted_df = []
    # Rows are gathered as plain dicts and turned into a DataFrame once per
    # conversation: DataFrame.append no longer exists in pandas >= 2.0 and
    # growing a frame row by row is quadratic anyway.
    records = []
    for first_value, second_value in zip(df.iloc[:, 0], df.iloc[:, 1]):
        if first_value == df_sep:
            splitted_df.append(pd.DataFrame(records, columns=columns))
            records = []
        else:
            records.append({first_col: first_value, second_col: second_value})

    # Rows after the last separator (or the whole file when there is none).
    if records:
        splitted_df.append(pd.DataFrame(records, columns=columns))

    return splitted_df

    
def df_to_buffered_jsons_list(dataframe, jsons_list, include_stop=True, buffer_size=3, show=False):
    """Append one prompt/completion JSON string per sliding window of rows.

    The first two columns of ``dataframe`` are treated as the two speakers and
    their column labels are used as speaker prefixes. For every row index
    ``i >= buffer_size - 1`` the prompt contains the previous
    ``buffer_size - 1`` full exchanges followed by the first-column text of
    row ``i`` and the bare second-column label; the completion is the
    second-column text of row ``i`` prefixed with a space.

    Args:
        dataframe: Conversation frame; the first two columns must hold strings.
        jsons_list: List that is extended in place with the JSON strings.
        include_stop: When true, ``"\\n<first label>: "`` is appended to each
            completion (presumably to serve as a stop sequence).
        buffer_size: Number of exchanges that make up one prompt window.
        show: When true, print every prompt/completion pair with newlines
            escaped.

    Raises:
        ValueError: If ``buffer_size`` is smaller than 1.
        TypeError: If a cell or column label is not a string.
    """
    if buffer_size < 1:
        raise ValueError("buffer_size must be at least 1")

    first_label, second_label = dataframe.columns[0], dataframe.columns[1]
    # Plain lists avoid the deprecated positional Series lookup
    # (``df.iloc[i][1]``) and repeated row materialisation.
    first_turns = dataframe.iloc[:, 0].tolist()
    second_turns = dataframe.iloc[:, 1].tolist()

    for last in range(buffer_size - 1, len(first_turns)):
        pieces = []
        # Earlier exchanges of the window are included in full ...
        for turn in range(last - buffer_size + 1, last):
            pieces += [first_label, ': ', first_turns[turn], '\n',
                       second_label, ': ', second_turns[turn], '\n']
        # ... the final one stops right after the second speaker's label.
        pieces += [first_label, ': ', first_turns[last], '\n', second_label, ':']
        # str.join rejects non-string parts, so bad cells fail loudly instead
        # of being silently formatted (e.g. NaN becoming "nan").
        prompt = ''.join(pieces)

        completion = ' ' + second_turns[last]

        if include_stop:
            completion += '\n' + first_label + ': '

        if show:
            str_prompt = prompt.replace("\n", "\\n")
            str_completion = completion.replace("\n", "\\n")
            print(f"PROMPT:'{str_prompt}' COMPLETION:'{str_completion}'")

        buffered_json = json.dumps({"prompt": prompt, "completion": completion}, ensure_ascii=False)
        jsons_list.append(buffered_json)


def jsons_list_to_jsonl(jsons_list, jsonl_path, encoding='utf-8', clear_first=True):
    """Write every string of ``jsons_list`` to ``jsonl_path``, one per line.

    Args:
        jsons_list: Iterable of already-serialised JSON strings.
        jsonl_path: Destination file path.
        encoding: Text encoding used for the file.
        clear_first: When true, existing content is discarded; otherwise the
            new lines are appended to it.
    """
    # Opening in 'w' truncates the file, which is what clearing it and then
    # appending amounts to.
    mode = 'w' if clear_first else 'a'
    with open(jsonl_path, mode, encoding=encoding) as sink:
        sink.writelines(entry + '\n' for entry in jsons_list)
            
            
def csv_to_jsonl(csv_path, jsonl_train_path, jsonl_val_path,
                 csv_sep=';', df_sep="###", encoding='utf-8',
                 include_stop=True, buffer_size=3, train_partition=0.8, seed=None):
    """Convert a conversation CSV into shuffled train/validation JSONL files.

    The CSV is split into conversations with ``csv_to_df_list``, each
    conversation is expanded into prompt/completion JSON strings with
    ``df_to_buffered_jsons_list``, the strings are shuffled, and the first
    ``int(len * train_partition)`` of them go to the training file while the
    rest go to the validation file. Both output files are overwritten.

    Args:
        csv_path: Input CSV path.
        jsonl_train_path: Output path for the training examples.
        jsonl_val_path: Output path for the validation examples.
        csv_sep: CSV field separator.
        df_sep: First-column value that separates conversations in the CSV.
        encoding: Encoding used for reading the CSV and writing both files.
        include_stop: Forwarded to ``df_to_buffered_jsons_list``.
        buffer_size: Forwarded to ``df_to_buffered_jsons_list``.
        train_partition: Fraction of the examples written to the training file.
        seed: Optional seed for the shuffle; ``None`` keeps the previous
            behaviour of shuffling with the global ``random`` state.
    """
    df_list = csv_to_df_list(csv_path, csv_sep=csv_sep, df_sep=df_sep, encoding=encoding)
    print(f'df_list:{len(df_list)}')
    jsons_list = []
    for df in df_list:
        # buffer_size is forwarded from the caller; it used to be pinned to 3
        # here, which silently ignored the parameter.
        df_to_buffered_jsons_list(df, jsons_list, include_stop=include_stop,
                                  buffer_size=buffer_size, show=False)
    print(f'jsons_list:{len(jsons_list)}')

    if seed is None:
        random.shuffle(jsons_list)
    else:
        # A private generator makes the split reproducible without touching
        # the global random state.
        random.Random(seed).shuffle(jsons_list)

    # Plain slicing yields the same two partitions as np.split(list, [k])
    # without converting the strings to a NumPy array first.
    split_at = int(len(jsons_list) * train_partition)
    train, val = jsons_list[:split_at], jsons_list[split_at:]

    jsons_list_to_jsonl(train, jsonl_train_path, encoding=encoding, clear_first=True)
    jsons_list_to_jsonl(val, jsonl_val_path, encoding=encoding, clear_first=True)


In [4]:
# Paths are relative to the notebook's working directory. Forward slashes are
# used because a backslash is only a path separator on Windows; elsewhere
# '.\\train.jsonl' would name a file literally called ".\train.jsonl".
csv_path = './sample_chat.csv'
jsonl_train_path = './train.jsonl'
jsonl_val_path = './val.jsonl'

# Preprocessing options forwarded to csv_to_jsonl.
include_stop = True
buffer_size = 3
partition = 0.8

# Writes the train/validation JSONL files that are uploaded further down.
csv_to_jsonl(csv_path, jsonl_train_path, jsonl_val_path,
             include_stop=include_stop, buffer_size=buffer_size, train_partition=partition)

df_list:3
jsons_list:5


## Files

In [None]:
def upload_file(file_path, purpose='fine-tune', encoding='utf-8', show=False):
    """Upload a local file through ``openai.File.create`` and return the response.

    Args:
        file_path: Path of the file to upload.
        purpose: Value forwarded as the ``purpose`` argument.
        encoding: Encoding used to open the file (text mode).
        show: When true, pretty-print the response.

    Returns:
        Whatever ``openai.File.create`` returns.
    """
    # https://beta.openai.com/docs/api-reference/files/upload
    # The context manager closes the handle once the call returns; it used to
    # be left open for the lifetime of the kernel.
    with open(file_path, encoding=encoding) as file_handle:
        resp = openai.File.create(purpose=purpose, file=file_handle)
    if show:
        pprint(resp)
    return resp


def list_files(show=False):
    """Return the result of ``openai.File.list()``, pretty-printing it when ``show`` is true."""
    # https://beta.openai.com/docs/api-reference/files/list
    listing = openai.File.list()
    if show:
        pprint(listing)
    return listing
    

def delete_file(file_id, show=False):
    """Call ``openai.File.delete`` for ``file_id`` and return its response.

    The response is pretty-printed first when ``show`` is true.
    """
    # https://beta.openai.com/docs/api-reference/files/delete
    outcome = openai.File.delete(file_id)
    if show:
        pprint(outcome)
    return outcome


## Fine Tuning

In [None]:
def create_ft_model(train_file_id, val_file_id=None, suffix=None, model='davinci', show=True):
    """Start a fine-tune through ``openai.FineTune.create`` and return the response.

    All four settings are always forwarded as keyword arguments, including
    ``validation_file`` and ``suffix`` when they are ``None``.

    Args:
        train_file_id: Forwarded as ``training_file``.
        val_file_id: Forwarded as ``validation_file``.
        suffix: Forwarded as ``suffix``.
        model: Forwarded as ``model``.
        show: When true (the default for this helper), pretty-print the response.
    """
    # https://beta.openai.com/docs/api-reference/fine-tunes/create
    request = {
        'training_file': train_file_id,
        'validation_file': val_file_id,
        'suffix': suffix,
        'model': model,
    }
    job = openai.FineTune.create(**request)
    if show:
        pprint(job)
    return job


def list_ft_models(show=False):
    """Return the result of ``openai.FineTune.list()``, pretty-printing it when ``show`` is true."""
    # https://beta.openai.com/docs/api-reference/fine-tunes/list
    listing = openai.FineTune.list()
    if show:
        pprint(listing)
    return listing
    
    
def retrieve_ft_model(ft_model_id, show=False):
    """Return ``openai.FineTune.retrieve(id=ft_model_id)``.

    The response is pretty-printed first when ``show`` is true.
    """
    # https://beta.openai.com/docs/api-reference/fine-tunes/retrieve
    job = openai.FineTune.retrieve(id=ft_model_id)
    if show:
        pprint(job)
    return job


def cancel_ft_job(ft_model_id, show=False):
    """Return ``openai.FineTune.cancel(id=ft_model_id)``.

    The response is pretty-printed first when ``show`` is true.
    """
    # https://beta.openai.com/docs/api-reference/fine-tunes/cancel
    outcome = openai.FineTune.cancel(id=ft_model_id)
    if show:
        pprint(outcome)
    return outcome


def delete_ft_model(ft_model_id, show=False):
    """Return ``openai.FineTune.delete(sid=ft_model_id)``.

    The response is pretty-printed first when ``show`` is true.
    """
    # https://beta.openai.com/docs/api-reference/fine-tunes/cancel
    # NOTE(review): the link above is the *cancel* endpoint, apparently copied
    # from cancel_ft_job; confirm which reference page documents this delete.
    outcome = openai.FineTune.delete(sid=ft_model_id)
    if show:
        pprint(outcome)
    return outcome
    
    
def list_ft_events(ft_model_id, show=False):
    """Return ``openai.FineTune.list_events(id=ft_model_id)``.

    The response is pretty-printed first when ``show`` is true.
    """
    # https://beta.openai.com/docs/api-reference/fine-tunes/events
    events = openai.FineTune.list_events(id=ft_model_id)
    if show:
        pprint(events)
    return events


## Models

In [None]:
def list_models(owned_by=None, show=False):
    """Return ``openai.Model.list()``, optionally restricted to one owner.

    Args:
        owned_by: When not ``None``, the response's ``'data'`` entry is
            replaced by only those models whose ``'owned_by'`` value equals it.
        show: When true, pretty-print the (possibly filtered) response.

    Returns:
        The response object, with ``'data'`` filtered in place when
        ``owned_by`` was given.
    """
    # https://beta.openai.com/docs/api-reference/models/list
    resp = openai.Model.list()
    if owned_by is not None:
        resp['data'] = [model for model in resp['data'] if model['owned_by'] == owned_by]
    if show:
        pprint(resp)
    return resp


def retrieve_model(model_id, show=False):
    """Return ``openai.Model.retrieve(model_id)``.

    The response is pretty-printed first when ``show`` is true.
    """
    # https://beta.openai.com/docs/api-reference/models/retrieve
    model_info = openai.Model.retrieve(model_id)
    if show:
        pprint(model_info)
    return model_info


def delete_model(model_id, show=False):
    """Return ``openai.Model.delete(model_id)``.

    The response is pretty-printed first when ``show`` is true.
    """
    # https://beta.openai.com/docs/api-reference/fine-tunes/delete-model
    outcome = openai.Model.delete(model_id)
    if show:
        pprint(outcome)
    return outcome


## Utils

In [None]:
def check_content_policy(input_text, show=False):
    """Return ``openai.Moderation.create(input=input_text)``.

    The response is pretty-printed first when ``show`` is true.
    """
    # https://beta.openai.com/docs/api-reference/moderations/create
    verdict = openai.Moderation.create(input=input_text)
    if show:
        pprint(verdict)
    return verdict


In [None]:
def delete_all_files():
    """Delete every file reported by ``list_files``, logging each id as it goes."""
    for entry in list_files()['data']:
        print(f"DELETING FILE {entry['id']}")
        delete_file(entry['id'])
    print("ALL FILES DELETED")
    
    
def delete_all_models(org_id):
    """Delete every model that ``list_models`` reports as owned by ``org_id``.

    Each model id is logged before its ``delete_model`` call.
    """
    owned = list_models(owned_by=org_id)['data']
    for entry in owned:
        print(f"DELETING MODEL {entry['id']}")
        delete_model(entry['id'])
    print("ALL MODELS DELETED") 
        
        
def delete_all_ft_models(try_to_delete=False):
    """Cancel unfinished fine-tune jobs and optionally delete finished ones.

    Every job returned by ``list_ft_models`` whose ``'status'`` is neither
    ``'succeeded'`` nor ``'cancelled'`` is cancelled via ``cancel_ft_job``.
    Jobs in one of those two states are passed to ``delete_ft_model`` only
    when ``try_to_delete`` is true; a failed deletion is reported and skipped.

    Args:
        try_to_delete: Whether to attempt deleting succeeded/cancelled jobs.
    """
    ft_models = list_ft_models()['data']
    for ft in ft_models:
        if ft['status'] not in ('succeeded', 'cancelled'):
            print(f"TRYING TO CANCEL FT MODEL {ft['id']}")
            cancel_ft_job(ft['id'])
        elif try_to_delete:
            print(f"TRYING TO DELETE FT MODEL {ft['id']}")
            try:
                delete_ft_model(ft['id'])
            except Exception as err:
                # Best effort: keep going with the remaining jobs, but only
                # for ordinary errors. A bare ``except:`` would also swallow
                # KeyboardInterrupt/SystemExit and hide why the call failed.
                print(f"NOT ALLOWED: {err}")
    # NOTE(review): the summary is printed even when individual deletions
    # were rejected above.
    if try_to_delete:
        print("ALL FINE TUNINING JOBS DELETED")
    else:
        print("ALL FINE TUNINING JOBS CANCELED OR SUCCEEDED")
    

In [None]:
def listen_job_events(ft_model_id, every_seconds=1.0, timeout_seconds=None):
    """Poll a fine-tune job's events and print them until the job ends.

    Each cycle fetches ``openai.FineTune.list_events`` and prints the most
    recent event message; a new message starts a new output line, an
    unchanged one rewrites the current line. The function returns when

    * the latest event message contains ``"succeeded"``,
    * the job's ``'status'`` (from ``openai.FineTune.retrieve``) is
      ``'succeeded'``, ``'failed'`` or ``'cancelled'``, or
    * ``timeout_seconds`` have elapsed (when given).

    Without the status check a failed or cancelled job would be polled
    forever, because no "succeeded" message ever arrives.

    Args:
        ft_model_id: Id of the fine-tune job to watch.
        every_seconds: Pause between two polling cycles.
        timeout_seconds: Optional upper bound on the total waiting time;
            ``None`` waits until the job reaches a final state.
    """
    # NOTE(review): 'succeeded' and 'cancelled' are the status values used by
    # delete_all_ft_models; 'failed' is taken from the fine-tunes API
    # reference - confirm against the API version in use.
    final_statuses = ('succeeded', 'failed', 'cancelled')
    started_at = time.monotonic()
    latest_msg = ""
    while True:
        events = openai.FineTune.list_events(id=ft_model_id)['data']
        str_time = datetime.now().strftime("%m/%d/%Y, %H:%M:%S")
        if events:
            latest_event = events[-1]
            if "succeeded" in latest_event['message']:
                print()
                print(f"[{str_time}] Job succeeded!")
                return
            if latest_msg != latest_event['message']:
                # New message: move to a fresh line before printing it.
                latest_msg = latest_event['message']
                print()
            # '\r' keeps the cursor on this line so the timestamp refreshes.
            print(f"[{str_time}] Event: {latest_msg}", end='\r')
        else:
            print('Waiting for new events...', end='\r')

        status = openai.FineTune.retrieve(id=ft_model_id)['status']
        if status in final_statuses:
            print()
            print(f"[{str_time}] Job finished with status '{status}'.")
            return

        if timeout_seconds is not None and time.monotonic() - started_at >= timeout_seconds:
            print()
            print(f"[{str_time}] Stopped listening after {timeout_seconds} seconds.")
            return

        time.sleep(every_seconds)


## Actions

In [None]:
# Fetch the current file listing; show=True pretty-prints the response.
file_list = list_files(show=True)
# Fetch the models whose 'owned_by' field equals org_id and pretty-print them.
model_list = list_models(owned_by=org_id, show=True)
# Left disabled: listing of fine-tune jobs.
#ft_models_list = list_ft_models()


In [None]:
# WARNING: Destructive operations!

#delete_all_ft_models()
#delete_all_files()
#delete_all_models(org_id)


In [None]:
# Upload the training JSONL and keep the 'id' from the response; it is passed
# to create_ft_model in the next cell.
train_file_info = upload_file(jsonl_train_path)
train_file_id = train_file_info['id']

# Same for the validation JSONL.
val_file_info = upload_file(jsonl_val_path)
val_file_id = val_file_info['id']


In [None]:
# Start the fine-tune with the two uploaded files (create_ft_model prints the
# response because its show parameter defaults to True).
ft_model_creation = create_ft_model(train_file_id, val_file_id=val_file_id, suffix='my_fine_tuned_model', model='davinci')
# The 'id' of the creation response identifies the job to watch.
ft_model_id = ft_model_creation['id']
# Blocks this cell, polling the job's events once per second.
listen_job_events(ft_model_id, every_seconds=1.0)
