Create the fine-tuning job and start fine-tuning

In [37]:
import openai
import json
import logging
import time

# Read the OpenAI API key from the local credentials file.
with open('./env/key.json') as key_file:
    auth_key = json.load(key_file)

# The key is stored under the 'gpt' entry of the JSON document.
openai.api_key = auth_key['gpt']

# Logging configuration: records at INFO level and above are written to
# fine_tuning.log, each prefixed with a timestamp and the level name.
logging.basicConfig(
    level=logging.INFO,
    filename="fine_tuning.log",
    datefmt="%Y-%m-%d %H:%M:%S",
    format="%(asctime)s - %(levelname)s - %(message)s",
)


# Logging helper
def log_and_print(message):
    """Show ``message`` in the cell output and record it at INFO level.

    Args:
        message: Text to print to stdout and pass to the root logger.
    """
    print(message)
    logging.log(logging.INFO, message)

# Upload the training data.
# The file is opened in a `with` block so the handle is closed as soon as the
# upload call returns, instead of being left open for the rest of the kernel
# session.
with open("tunedata.jsonl", "rb") as training_file:
    response = openai.files.create(
        file=training_file,
        purpose="fine-tune"
    )
file_id = response.id
log_and_print(f"Uploaded file ID: {file_id}")

# Launch the fine-tuning job on the file that was just uploaded.
tune_response = openai.fine_tuning.jobs.create(
    model="gpt-4o-2024-08-06",
    training_file=file_id,
)

# Keep the job ID around: it is needed to poll the job's status.
job_id = tune_response.id
log_and_print(f"Fine-tuning job ID: {job_id}")


# Poll the job until it reaches a terminal state, logging every status check.
POLL_INTERVAL_SECONDS = 30  # pause between checks to avoid hitting API rate limits
TERMINAL_STATUSES = ("succeeded", "failed", "cancelled")

try:
    while True:
        job_status = openai.fine_tuning.jobs.retrieve(job_id)
        log_and_print(f"Fine-tuning status: {job_status.status}")

        if job_status.status in TERMINAL_STATUSES:
            break  # the job is finished (successfully or not); stop polling

        time.sleep(POLL_INTERVAL_SECONDS)
except KeyboardInterrupt:
    # Interrupting the cell only stops this local polling loop; the job can
    # still be looked up later by its ID. Record that in the log, then re-raise
    # so the interrupt behaves exactly as it would without this handler.
    log_and_print(f"Polling interrupted; check job {job_id} again later.")
    raise

# Report the outcome so the log shows more than the bare status string.
if job_status.status == "succeeded":
    log_and_print(f"Fine-tuned model: {job_status.fine_tuned_model}")
elif job_status.status == "failed":
    log_and_print(f"Fine-tuning failed: {job_status.error}")


Uploaded file ID: file-GL4Dph8hEPjLTz4hdLPgiw
Fine-tuning job ID: ftjob-OuonNJHsEjO1TeN5aaxlMyAB
Fine-tuning status: validating_files
Fine-tuning status: validating_files
Fine-tuning status: validating_files
Fine-tuning status: validating_files
Fine-tuning status: validating_files
Fine-tuning status: validating_files
Fine-tuning status: validating_files
Fine-tuning status: validating_files
Fine-tuning status: validating_files
Fine-tuning status: running


KeyboardInterrupt: 

Check the fine-tuning job information (e.g., the model name)

In [50]:
# ID of the job to inspect (example value; replace it with your own job ID).
job_id = 'ftjob-OuonNJHsEjO1TeN5aaxlMyAB'

# Look up the fine-tuning job and show everything the API reports about it.
job_info = openai.fine_tuning.jobs.retrieve(job_id)
print(job_info)

# Keep only the model name, which is what later inference calls need.
# NOTE(review): per the OpenAI API reference this field is None until the job
# has finished successfully -- confirm before using it.
fine_tuned_model_id = job_info.fine_tuned_model
print(f"Fine-tuned model ID: {fine_tuned_model_id}")

FineTuningJob(id='ftjob-OuonNJHsEjO1TeN5aaxlMyAB', created_at=1739413873, error=Error(code=None, message=None, param=None), fine_tuned_model='ft:gpt-4o-2024-08-06:personal::B0KHkpXT', finished_at=1739417675, hyperparameters=Hyperparameters(batch_size=1, learning_rate_multiplier=2.0, n_epochs=3), model='gpt-4o-2024-08-06', object='fine_tuning.job', organization_id='org-5PCocf2RmwF68HWSPyxRPqwE', result_files=['file-HsXN7a7nqTbWtgpdi2r4Fg'], seed=412734729, status='succeeded', trained_tokens=882159, training_file='file-GL4Dph8hEPjLTz4hdLPgiw', validation_file=None, estimated_finish=None, integrations=[], method=Method(dpo=None, supervised=MethodSupervised(hyperparameters=MethodSupervisedHyperparameters(batch_size=1, learning_rate_multiplier=2.0, n_epochs=3)), type='supervised'), user_provided_suffix=None)


Check training step and loss

In [None]:
# Fetch the job's event log and print one line per event:
# creation timestamp, severity level, and the message text.
log_response = openai.fine_tuning.jobs.list_events(job_id)
for event in log_response.data:
    timestamp, severity, text = event.created_at, event.level, event.message
    print(f"{timestamp} - {severity}: {text}")

1739417691 - info: The job has successfully completed
1739417677 - info: New fine-tuned model created
1739417677 - info: Checkpoint created at step 1020
1739417677 - info: Checkpoint created at step 510
1739417663 - info: Step 1530/1530: training loss=0.20
1739417661 - info: Step 1529/1530: training loss=0.31
1739417659 - info: Step 1528/1530: training loss=1.73
1739417657 - info: Step 1527/1530: training loss=0.48
1739417655 - info: Step 1526/1530: training loss=0.00
1739417653 - info: Step 1525/1530: training loss=0.23
1739417651 - info: Step 1524/1530: training loss=0.32
1739417649 - info: Step 1523/1530: training loss=0.43
1739417645 - info: Step 1522/1530: training loss=0.98
1739417643 - info: Step 1521/1530: training loss=0.60
1739417640 - info: Step 1520/1530: training loss=0.78
1739417638 - info: Step 1519/1530: training loss=0.13
1739417636 - info: Step 1518/1530: training loss=0.30
1739417634 - info: Step 1517/1530: training loss=0.09
1739417632 - info: Step 1516/1530: traini

Randomly downsample the dataset (there is too much data, and too many examples are invalid because they violate the OpenAI usage policy)

In [26]:
import json
import random

input_file = "fine_tune_data.jsonl"   # original JSONL file
output_file = "tunedata.jsonl"        # sampled JSONL file
SAMPLE_SEED = 42                      # fixed seed so re-running the cell selects the same subset

# Read the JSONL file: one JSON document per line.
# Blank lines (such as an extra empty line at the end of the file) are skipped
# because json.loads would raise on them.
with open(input_file, "r", encoding="utf-8") as f:
    lines = [json.loads(line) for line in f if line.strip()]

# Randomly sample one third of the records, without replacement.
# A dedicated seeded generator keeps the result reproducible without touching
# the global random state.
sample_size = len(lines) // 3
sampled_lines = random.Random(SAMPLE_SEED).sample(lines, sample_size)

# Write the sample to a new JSONL file; ensure_ascii=False keeps non-ASCII
# text readable instead of escaping it.
with open(output_file, "w", encoding="utf-8") as f:
    for item in sampled_lines:
        f.write(json.dumps(item, ensure_ascii=False) + "\n")

print(f"샘플링된 {sample_size}개의 줄을 {output_file}에 저장했습니다.")


샘플링된 515개의 줄을 tunedata.jsonl에 저장했습니다.
