### LLM 연결

In [None]:
!pip install openai

In [None]:
import os
from openai import OpenAI

# Build the API client. The key is read from the environment so it never
# appears in the notebook; a missing OPENAI_API_KEY raises KeyError here.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

# Smoke-test conversation: a system instruction plus one user question.
messages = [
    {
        "role": "system",
        "content": "You are a helpful assistant. You must answer in Korean.",
    },
    {
        "role": "user",
        "content": "대한민국의 수도는 어디인가요?",
    },
]

# stream=True makes the call return an iterator of incremental chunks
# instead of a single completed response.
stream = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=messages,
    stream=True,
)

# Echo each text fragment as it arrives; chunks whose delta carries no
# content (content is None) are skipped.
for chunk in stream:
    piece = chunk.choices[0].delta.content
    if piece is None:
        continue
    print(piece, end="")

### Vector DB 연결

In [None]:
!pip install chromadb

In [None]:
import chromadb

# Chroma client created with default settings.
chroma_client = chromadb.Client()

# One collection per side of the knowledge base: question embeddings and
# answer embeddings are stored separately under the same ids.
# get_or_create_collection is used instead of create_collection so that
# re-running this cell in the same kernel reuses the existing collections
# rather than failing because the names are already taken.
question_collection = chroma_client.get_or_create_collection(name="question")
answer_collection = chroma_client.get_or_create_collection(name="answer")

# collection = chroma_client.create_collection(name="collection_name")
# chroma_client.delete_collection(name="collection_name")

### Knowledge base 전처리

In [None]:
import pickle

# Load the pickled knowledge base from the working directory into `data`.
# The following cells iterate `data` by key and treat each value as a string
# (presumably a dict of question -> answer text; confirm against whatever
# produced the pickle).
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only load 'final_result.pkl' if it comes from a trusted source.
with open('final_result.pkl', 'rb') as file:
    data = pickle.load(file)

In [None]:
# Truncate every knowledge-base entry to its first line, in place: everything
# from the first newline onward is dropped, and entries without a newline are
# left unchanged.
for key in data:
    data[key] = data[key].partition('\n')[0]

In [None]:
import pandas as pd

# Build the (question, answer) table in a single constructor call.
# Appending one row at a time with `df.loc[idx] = ...` makes pandas enlarge
# the frame on every insert, which gets slow as the knowledge base grows.
# Rows keep the iteration order of `data` and are labelled 0..n-1, as before;
# an empty `data` still yields an empty frame with both columns.
df = pd.DataFrame(
    [(key, data[key]) for key in data],
    columns=['question', 'answer'],
)

### Knowledge base embedding

In [None]:
from tqdm import tqdm

def get_embedding(text, model="text-embedding-3-small"):
    """Return the embedding of *text* from the OpenAI embeddings endpoint.

    Args:
        text: The string to embed. It is sent as a one-element batch.
        model: Name of the embedding model to request.

    Returns:
        The ``embedding`` attribute of the first (and only) item in the
        response data.
    """
    # Uses the module-level `client` created in the LLM connection cell.
    embedded_items = client.embeddings.create(model=model, input=[text]).data
    return embedded_items[0].embedding

# Embed every question/answer pair and index both vectors under the same id
# (the stringified DataFrame row label), so a hit in either collection can be
# traced back to its row in `df`. Only ids and embeddings are stored.
for idx, row in tqdm(df.iterrows(), total=len(df)):
    entry_ids = [str(idx)]

    # Both embeddings are requested before anything is written, so a failed
    # API call never leaves a question stored without its answer.
    question_vector = get_embedding(row["question"])
    answer_vector = get_embedding(row["answer"])

    question_collection.add(embeddings=[question_vector], ids=entry_ids)
    answer_collection.add(embeddings=[answer_vector], ids=entry_ids)

### User Question embedding

### Similarity Search

### Prompt 작성

In [None]:
# Prompt template for the answer-generation step. It contains a single
# `{pdf_extract}` placeholder, presumably filled in with the retrieved context
# (e.g. via str.format) before the prompt is sent to the LLM.
# NOTE(review): the wording refers to PDF extracts with 'filename' and 'page'
# metadata, while the embedding cell stores only ids and embeddings in the
# collections; confirm the prompt matches the data that will be supplied.
prompt_template = """
    You are a helpful Assistant who answers to users questions based on multiple contexts given to you.

    Keep your answer short and to the point.
    
    The evidence are the context of the pdf extract with metadata. 
    
    Carefully focus on the metadata specially 'filename' and 'page' whenever answering.
    
    Make sure to add filename and page number at the end of sentence you are citing to.
        
    Reply "Not applicable" if text is irrelevant.
     
    The PDF content is:
    {pdf_extract}
"""

### LLM Answer 받기