In [3]:
import os
import re
from pprint import pprint


def extract_info(file_path):
    # Use regex to match the first two digits and the title
    match = re.match(r"(\d{2})-([\w-]+)", file_path)

    # If a match is found, extract the digits and the title
    if match:
        digits = int(match.group(1))  # Convert digits to integer
        title = match.group(2)
        return digits, title
    else:
        return None, None


# 1. 각 파일을 읽고
def read_readme_from_directories():
    dir_pattern = r"^\d{2}-"

    result = {}
    for dir_name in os.listdir():
        if os.path.isdir(os.path.join(dir_name)) and re.match(dir_pattern, dir_name):
            readme_path = os.path.join(dir_name, "README.md")
            readme_path_ko = os.path.join(dir_name, "translations/ko/README.md")
            if os.path.exists(readme_path_ko):
                with open(readme_path, "r", encoding="utf-8") as readme_file:
                    content = readme_file.read()
                with open(readme_path_ko, "r", encoding="utf-8") as readme_file_ko:
                    content_ko = readme_file_ko.read()
                digits, title = extract_info(dir_name)
                result[digits] = {
                    "title": title,
                    "sections": split_markdown(content),
                    "sections_ko": split_markdown(content_ko),
                }
    return result


# 2. 헤딩 (#, ##, ### …) 별로 나눈 뒤
def split_markdown(content):
    pattern = r"^(#+\s)"
    sections = re.split(pattern, content, flags=re.MULTILINE)[1:]
    return [sections[i] + sections[i + 1] for i in range(0, len(sections), 2)]


readme_files = read_readme_from_directories()
pprint(readme_files)

{0: {'sections': ['# Getting Started with this course\n'
                  '\n'
                  'We are very excited for you to start this course and see '
                  'what you get inspired to build with Generative AI!\n'
                  '\n'
                  'To ensure your success, this page outlines setup steps, '
                  'technical requirements, and where to get help if needed.\n'
                  '\n',
                  '## Setup Steps\n'
                  '\n'
                  'To start taking this course, you will need to complete the '
                  'following steps.\n'
                  '\n',
                  '### 1. Fork this Repo\n'
                  '\n'
                  '[Fork this entire '
                  'repo](https://github.com/microsoft/generative-ai-for-beginners/fork?WT.mc_id=academic-105485-koreyst) '
                  'to your own GitHub account to be able to change any code '
                  'and complete the challenges. You can 

In [4]:
%pip install python-dotenv openai pinecone

Note: you may need to restart the kernel to use updated packages.


In [5]:
def save_to_file(file_path, content):
    os.makedirs(os.path.dirname(file_path), exist_ok=True)

    with open(file_path, "w") as file:
        file.write(content)

In [8]:
import asyncio
from openai import AsyncOpenAI
from dotenv import load_dotenv
from pinecone import Pinecone
import time
import os

load_dotenv()
client = AsyncOpenAI()
pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))


# 1. text-embedding-3-small 로 텍스트 임베딩 벡터로 변환
async def to_embedding(text):
    return (
        (
            await client.embeddings.create(
                model="text-embedding-3-small",
                input=text,
                encoding_format="float",
            )
        )
        .data[0]
        .embedding
    )


index_name = "gen-ai-ko"
# Wait for the index to be ready
while not pc.describe_index(index_name).status["ready"]:
    time.sleep(1)

index = pc.Index(index_name)


# 2. Pinecone DB 에 임베딩 벡터 + 한/영 원문 metadata 로 저장
def save_to_vector_db(id, embeddings, data_en, data_ko):
    vectors = []
    for e, de, dk, i in zip(embeddings, data_en, data_ko, range(len(embeddings))):
        vectors.append(
            {"id": f"{id}-{i}", "values": e, "metadata": {"en": de, "ko": dk}}
        )
        print(f"Embedding for {i:02d} saved: {vectors[-1]}")

    index.upsert(vectors=vectors, namespace="ns1")


for i, file in list(readme_files.items()):
    print(f"Embedding for {i:02d}...\n\n")
    en = file["sections"]
    ko = file["sections_ko"]
    embedding_tasks = [to_embedding(section_en) for section_en in en]
    embeddings = await asyncio.gather(*embedding_tasks)
    print(f"Embedding length: {len(embeddings)}")
    save_to_vector_db(i, embeddings, en, ko)

Embedding for 10...


Embedding length: 15
Embedding for 00 saved: {'id': '10-0', 'values': [-0.029905092, 0.018880535, 0.04378399, 0.023044204, 0.05383964, -0.037734885, -0.016628489, -0.0017266787, -0.025990196, 0.05166615, -0.001705402, -0.070651434, -0.0070703807, -0.0369231, -0.0005036828, 0.008019645, 0.0035679236, -0.023515563, 0.013433724, 0.021800341, -0.0019541746, -0.029564666, -0.005548285, 0.00879215, -0.008065471, -0.018042564, 0.03553521, 0.05399676, -0.019980373, 0.013983642, 0.019168587, -0.019993465, -0.044360094, -0.029800346, 0.023921454, -0.014376441, 0.009381348, 0.0013183315, -0.002219314, 0.00023383812, -0.008137484, -0.036556486, 0.0054500853, 0.009427174, -0.015895264, 0.05577745, 0.020543383, -0.0087266825, 0.012687406, 0.03370215, -0.12611464, 0.0051031127, 0.011666128, -0.009237321, -0.049492665, 0.044176787, -0.0035482836, 0.026094943, -0.027050752, 0.025348624, 0.028779069, -0.014481187, 0.040196422, 0.07096567, -0.019927999, 0.006873981, -0.04991165, 0.0