In [1]:
# 複製openai使用教學
!git clone https://github.com/openai/openai-cookbook.git

Cloning into 'openai-cookbook'...
remote: Enumerating objects: 5265, done.[K
remote: Counting objects: 100% (1059/1059), done.[K
remote: Compressing objects: 100% (765/765), done.[K
remote: Total 5265 (delta 308), reused 1037 (delta 294), pack-reused 4206[K
Receiving objects: 100% (5265/5265), 240.73 MiB | 18.41 MiB/s, done.
Resolving deltas: 100% (2809/2809), done.
Updating files: 100% (1274/1274), done.


In [2]:
# 下載最新版的openai套件
!pip install openai==1.6.1  # origin using 1.12
# 下載token計算套件
!pip install tiktoken
!pip install datasets

Collecting openai==1.6.1
  Downloading openai-1.6.1-py3-none-any.whl (225 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/225.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━[0m [32m153.6/225.4 kB[0m [31m4.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m225.4/225.4 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
Collecting httpx<1,>=0.23.0 (from openai==1.6.1)
  Downloading httpx-0.26.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.9/75.9 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
Collecting httpcore==1.* (from httpx<1,>=0.23.0->openai==1.6.1)
  Downloading httpcore-1.0.3-py3-none-any.whl (77 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.0/77.0 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting h11<0.15,>=0.13 (from httpcore==1.*->httpx<1,>=0.23.0->openai==1.6.1)

In [3]:
import openai
import json
# for token counting
import tiktoken
import numpy as np
from collections import defaultdict
from datasets import load_dataset
# drive.mount('/content/drive/')

In [6]:
data_path = ""  #/toy_chat_fine_tuning.jsonl

# Load the dataset
with open(data_path, 'r', encoding='utf-8') as f:
    dataset = [json.loads(line) for line in f]

# Initial dataset stats
print("Num examples:", len(dataset))
print("First example:")
for message in dataset[0]["messages"]:
    print(message)



Num examples: 1783
First example:
{'role': 'system', 'content': '法條索引'}
{'role': 'user', 'content': '公司法第 340 條?'}
{'role': 'assistant', 'content': '公司對於其債務之清償，應依其債權額比例為之。但依法得行使優先受償權或別除權之債權，不在此限。 '}


In [7]:
# 檢查資料格式是否符合openai官方設定
format_errors = defaultdict(int)

for ex in dataset:
    if not isinstance(ex, dict):
        format_errors["data_type"] += 1
        continue

    messages = ex.get("messages", None)
    if not messages:
        format_errors["missing_messages_list"] += 1
        continue

    for message in messages:
        if "role" not in message or "content" not in message:
            format_errors["message_missing_key"] += 1

        if any(k not in ("role", "content", "name", "function_call") for k in message):
            format_errors["message_unrecognized_key"] += 1

        if message.get("role", None) not in ("system", "user", "assistant", "function"):
            format_errors["unrecognized_role"] += 1

        content = message.get("content", None)
        function_call = message.get("function_call", None)

        if (not content and not function_call) or not isinstance(content, str):
            format_errors["missing_content"] += 1

    if not any(message.get("role", None) == "assistant" for message in messages):
        format_errors["example_missing_assistant_message"] += 1

if format_errors:
    print("Found errors:")
    for k, v in format_errors.items():
        print(f"{k}: {v}")
else:
    print("No errors found")

No errors found


In [8]:
# 數訓練資料中中包含幾個token
encoding = tiktoken.get_encoding("cl100k_base")  #tiktoken is a fast BPE tokeniser for use with OpenAI's models.

# not exact!
# simplified from https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb
def num_tokens_from_messages(instruction, tokens_per_message=3, tokens_per_name=1):
    num_tokens = 0
    for message in messages:
        num_tokens += tokens_per_message
        for key, value in message.items():
            num_tokens += len(encoding.encode(value))
            if key == "name":
                num_tokens += tokens_per_name
    num_tokens += 3
    return num_tokens

def num_assistant_tokens_from_messages(messages):
    num_tokens = 0
    for message in messages:
        if message["role"] == "assistant":
            num_tokens += len(encoding.encode(message["content"]))
    return num_tokens

def print_distribution(values, name):
    print(f"\n#### Distribution of {name}:")
    print(f"min / max: {min(values)}, {max(values)}")
    print(f"mean / median: {np.mean(values)}, {np.median(values)}")
    print(f"p5 / p95: {np.quantile(values, 0.1)}, {np.quantile(values, 0.9)}")

In [9]:
# 預計訓練次數和價格
MAX_TOKENS_PER_EXAMPLE = 4096

TARGET_EPOCHS = 3
MIN_TARGET_EXAMPLES = 100
MAX_TARGET_EXAMPLES = 25000
MIN_DEFAULT_EPOCHS = 1
MAX_DEFAULT_EPOCHS = 25

n_epochs = TARGET_EPOCHS
n_train_examples = len(dataset)
if n_train_examples * TARGET_EPOCHS < MIN_TARGET_EXAMPLES:
    n_epochs = min(MAX_DEFAULT_EPOCHS, MIN_TARGET_EXAMPLES // n_train_examples)
elif n_train_examples * TARGET_EPOCHS > MAX_TARGET_EXAMPLES:
    n_epochs = max(MIN_DEFAULT_EPOCHS, MAX_TARGET_EXAMPLES // n_train_examples)

convo_lens = [len(conversation['messages']) for conversation in dataset]
n_billing_tokens_in_dataset = sum(min(MAX_TOKENS_PER_EXAMPLE, length) for length in convo_lens)
print(f"Dataset has ~{n_billing_tokens_in_dataset} tokens that will be charged for during training")
print(f"By default, you'll train for {n_epochs} epochs on this dataset")
print(f"By default, you'll be charged for ~{n_epochs * n_billing_tokens_in_dataset} tokens")

Dataset has ~5349 tokens that will be charged for during training
By default, you'll train for 3 epochs on this dataset
By default, you'll be charged for ~16047 tokens


In [None]:
training_file = ""
validation_file = ""

In [10]:
# 上傳訓練資料


from openai import OpenAI  #for verion 1.12
client = OpenAI(api_key="")

client.files.create(
  file=open("" , "rb"),
  purpose="fine-tune"
)

FileObject(id='file-yI9mGxdfZnFY8IK6avk0z8kX', bytes=814479, created_at=1708330010, filename='remain_lawdata.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)

In [None]:
from openai import OpenAI
client = OpenAI(api_key="")

client.files.create(
  file=open(validation_file, "rb"),
  purpose="fine-tune"
)

FileObject(id='file-ZVCGTcNrMzYDBHVYyfBc1PlB', bytes=238352, created_at=1708017588, filename='test-api.jsonl', object='file', purpose='fine-tune', status='processed', status_details=None)

In [None]:
# OpenAI.file.retrieve("file-zNnRhfyCXhtSq1lVtRVVylx6")  #trace the file status

AttributeError: type object 'OpenAI' has no attribute 'file'

In [12]:
# !pip install wandb  #sync the traning data with wandb
import wandb
wandb.login()
# !pip install requests

[34m[1mwandb[0m: Currently logged in as: [33m10622130[0m ([33mtw-nlp-law[0m). Use [1m`wandb login --relogin`[0m to force relogin


True

In [13]:
# 建立並執行finetune任務
from openai import OpenAI
from openai.types.fine_tuning import FineTuningJob, FineTuningJobEvent
client = OpenAI(api_key="")

response = client.fine_tuning.jobs.create(
  training_file="",
  model="gpt-3.5-turbo",
  suffix="lawdata01"
  # hyperparameters={
  #   "n_epochs":2
  # }  # the default epoch is 3
)

job_id = response.id
print("Job ID:", response.id)
print("Status:", response.status)


BadRequestError: Error code: 400 - {'error': {'message': 'invalid training_file: /content/remain_lawdata.jsonl', 'type': 'invalid_request_error', 'param': 'training_file', 'code': None}}

In [14]:
import os
from wandb.integration.openai.fine_tuning import WandbLogger  #sync the data

# Finetuning logic
os.environ["OPENAI_API_KEY"] =""


WandbLogger.sync(fine_tune_job_id="")

[34m[1mwandb[0m: Retrieving fine-tune job...


[34m[1mwandb[0m: Waiting for the OpenAI fine-tuning job to be finished...


'🎉 wandb sync completed successfully'

In [None]:
# 此區塊用於對訓練任務進行操作
from openai import OpenAI
client = OpenAI(api_key="")

# 列出fine tune 任務
client.fine_tuning.jobs.list(limit=10)

# 檢索fine tune 狀態
client.fine_tuning.jobs.retrieve("")

# 取消finetune
# client.fine_tuning.jobs.cancel("ftjob-lhVYdXMCkMKIODq1GjAzThze")

# List up to 10 events from a fine-tuning job
client.fine_tuning.jobs.list_events(fine_tuning_job_id="", limit=10)

# Delete a fine-tuned model (must be an owner of the org the model was created in)
# client.models.delete("ft:gpt-3.5-turbo:acemeco:suffix:test1")

SyncCursorPage[FineTuningJobEvent](data=[FineTuningJobEvent(id='ftevent-lq3HgqDGrQvkao3gyfKzQFQe', created_at=1704179766, level='info', message='Fine-tuning job started', object='fine_tuning.job.event', data=None, type='message'), FineTuningJobEvent(id='ftevent-v0EdVu47HiNR9Y8Qc1bgGhS6', created_at=1704179765, level='info', message='Files validated, moving job to queued state', object='fine_tuning.job.event', data={}, type='message'), FineTuningJobEvent(id='ftevent-4IH9jeWqJf98My2R3NZopGo0', created_at=1704179764, level='warn', message='File file-0w4HebAfYP54nBaExzTaLORe contains examples greater than the supported context size for model `gpt-3.5-turbo-0613` (4096 tokens)', object='fine_tuning.job.event', data={}, type='message'), FineTuningJobEvent(id='ftevent-qLnczeTqrohvRKTuqMpHgAG7', created_at=1704179763, level='info', message='Validating training file: file-0w4HebAfYP54nBaExzTaLORe', object='fine_tuning.job.event', data={}, type='message'), FineTuningJobEvent(id='ftevent-CCbyvTv3

In [None]:
# 使用模型
from openai import OpenAI

client = OpenAI(organization = '', api_key='')

response = client.chat.completions.create(
  model="",
  messages=[
    {"role": "system", "content": "你是一位對每個問題給予客觀建議的專業法律顧問。"},
    {"role": "user", "content": "車禍該怎麼辦"},
  ]
)

print((response.choices[0].message.content))

車禍該怎麼辦依照所造成的損害（動物財物車輛傷亡等），請洽詢各級法院簡易事件調解室程序依程序依以下步驟進行：案件的受理：由申請人檢具名冊照片、原車照片（全車照片、傷害部位各張）、被申請人之駕照複印本或事實稱說之相關證據，及iType追車單（必要時）、交通事故原因分析書（必要時）；提交調解申請書；本院按日受理調解案件；不具備前揭條款規定資格之申請書，本院得要求申請人補正之。本院另不予進行案件受理之之人身傷害給付說明書或其他機關調解同一之請求。不予複受理。日曆調解期日之確定：本院自受理之次日起，會同申請人、被申請人，先期約定日曆調解談判期日一次。談判期日確定後，遇到調解員突發因素（例：受傷急難醫治、家庭緊急事故、天災）等受不可抗力因素，或其他申請人、被申請人遇調解員約定日曆調解時間先前無原因通知本院復日曆調解，致本院聲請元日曆調解之通知失效，致當日未能依期抵達調解地點者，其所受理之民事日曆調解案件，第一次約定調解期日不予補約，除其他社會情緒上更有幫助調解結果與目的者外，在依任何一方之請求，或經本院之求等，得適當裁定讓請求之一方指定日曆調解期日；若已符合天然人民公正訴訟法第56條之要件者予以獄同第三項前段之處分。原則上檢附資料越清楚越有助為你們分辨糾紛之結果。合議調解因數有理事實埋怨民眾及各式爭執械受理昆明市當鋪被告陳則燊之高等法院暨其訴訟及非訟代理人除中華民國律師或訴訟代理人外，不得參與案件執行資料なりわれわれは、自分たちはまだ自分の正当な権利を維持する是非に反省して、対立を生む可能性がある場所に入ることがあります。私たちの対話にはいったん、私たちは新しい出会いをどこまでも暖かく精力的にディテクシヨンのディテクター、トップ、Rulesへのアクセスが必要です.
