In [3]:
import os
import re
from pprint import pprint


def extract_info(file_path):
    # Use regex to match the first two digits and the title
    match = re.match(r"(\d{2})-([\w-]+)", file_path)

    # If a match is found, extract the digits and the title
    if match:
        digits = int(match.group(1))  # Convert digits to integer
        title = match.group(2)
        return digits, title
    else:
        return None, None


# 1. 각 파일을 읽고
def read_readme_from_directories():
    dir_pattern = r"^\d{2}-"

    result = {}
    for dir_name in os.listdir():
        if os.path.isdir(os.path.join(dir_name)) and re.match(dir_pattern, dir_name):
            readme_path = os.path.join(dir_name, "translations/ko/README.md")
            if os.path.exists(readme_path):
                with open(readme_path, "r", encoding="utf-8") as readme_file:
                    content = readme_file.read()
                digits, title = extract_info(dir_name)
                result[digits] = {"title": title, "sections": split_markdown(content)}
    return result


# 2. 헤딩 (#, ##, ### …) 별로 나눈 뒤
def split_markdown(content):
    pattern = r"^(#+\s)"
    sections = re.split(pattern, content, flags=re.MULTILINE)[1:]
    return [sections[i] + sections[i + 1] for i in range(0, len(sections), 2)]


readme_files = read_readme_from_directories()
pprint(readme_files)

In [4]:
%pip install python-dotenv openai

Collecting python-dotenv
  Downloading python_dotenv-1.0.1-py3-none-any.whl.metadata (23 kB)
Collecting openai
  Downloading openai-1.48.0-py3-none-any.whl.metadata (24 kB)
Collecting anyio<5,>=3.5.0 (from openai)
  Downloading anyio-4.6.0-py3-none-any.whl.metadata (4.6 kB)
Collecting distro<2,>=1.7.0 (from openai)
  Downloading distro-1.9.0-py3-none-any.whl.metadata (6.8 kB)
Collecting httpx<1,>=0.23.0 (from openai)
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting jiter<1,>=0.4.0 (from openai)
  Downloading jiter-0.5.0-cp312-cp312-macosx_11_0_arm64.whl.metadata (3.6 kB)
Collecting pydantic<3,>=1.9.0 (from openai)
  Downloading pydantic-2.9.2-py3-none-any.whl.metadata (149 kB)
Collecting sniffio (from openai)
  Downloading sniffio-1.3.1-py3-none-any.whl.metadata (3.9 kB)
Collecting tqdm>4 (from openai)
  Downloading tqdm-4.66.5-py3-none-any.whl.metadata (57 kB)
Collecting typing-extensions<5,>=4.11 (from openai)
  Downloading typing_extensions-4.12.2-py3-none-an

In [7]:
def save_to_file(file_path, content):
    os.makedirs(os.path.dirname(file_path), exist_ok=True)

    with open(file_path, "w") as file:
        file.write(content)

In [8]:
import asyncio
from openai import AsyncOpenAI
from dotenv import load_dotenv

load_dotenv()
client = AsyncOpenAI()


# 3. GPT-4o 를 사용하여 번역
async def improve_text(text):
    completion = await client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "system",
                "content": "You are an AI expert. Your job is to translate English markdown to Korean markdown.",
            },
            {"role": "user", "content": text},
        ],
    )
    print(completion)
    return completion.choices[0].message.content


async def improve(file, i):
    tasks = [improve_text(section) for section in file["sections"]]
    translated_sections = await asyncio.gather(*tasks)
    content = "\n\n".join(translated_sections)
    save_to_file(f"{i:02d}-{file['title']}/translations/ko/README.md", content)


tasks = [improve(file, i) for i, file in list(readme_files.items())]
await asyncio.gather(*tasks)

ChatCompletion(id='chatcmpl-ABcR3AXCXNFzdqsQBRXERwAh7iDDK', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='# LLM 미세 조정하기\n\n대규모 언어 모델을 사용하여 생성적 AI 애플리케이션을 구축하는 데에는 새로운 도전 과제가 따릅니다. 주요 문제는 주어진 사용자 요청에 대해 모델이 생성한 콘텐츠의 응답 품질(정확도 및 관련성)을 보장하는 것입니다. 이전 수업에서는 기존 모델에 대한 프롬프트 입력을 _수정하는_ 프롬프트 엔지니어링 및 검색 증강 생성과 같은 기술에 대해 논의했습니다.\n\n오늘 수업에서는 세 번째 기술인, **미세 조정**에 대해 논의합니다. 이는 추가 데이터를 사용하여 _모델 자체를 다시 학습_ 시킴으로써 문제를 해결하려고 합니다. 자세히 알아보도록 하겠습니다.', refusal=None, role='assistant', function_call=None, tool_calls=None))], created=1727332717, model='gpt-4o-2024-05-13', object='chat.completion', service_tier=None, system_fingerprint='fp_f82f5b050c', usage=CompletionUsage(completion_tokens=169, prompt_tokens=149, total_tokens=318, completion_tokens_details=CompletionTokensDetails(reasoning_tokens=0)))
ChatCompletion(id='chatcmpl-ABcR6oWRT7MWNLg6QM5YaNndZeEtU', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(conte