In [None]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain.vectorstores import MongoDBAtlasVectorSearch
from pymongo import MongoClient
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains import RetrievalQA
from dotenv import load_dotenv
import os
import pandas as pd

In [None]:
# Load variables from a local .env file into the process environment so that
# later os.getenv(...) lookups (e.g. MONGO_DB_URI) can resolve.
# NOTE(review): the OpenAI clients presumably read their API key from the
# environment as well — confirm the key is present in .env.
load_dotenv() 

In [None]:
# Embedding model used to vectorize text for the vector store
embedding_model = OpenAIEmbeddings(model="text-embedding-3-small")

# Chat model that writes the final answers; temperature 0 keeps sampling
# as deterministic as the API allows
llm = ChatOpenAI(
    model_name="gpt-4o",
    temperature=0,
)

In [None]:
# Connect to MongoDB
mongo_uri = os.getenv("MONGO_DB_URI")
if not mongo_uri:
    # Fail fast: MongoClient(None) would silently fall back to localhost
    # instead of the intended cluster, and the error would only surface later.
    raise RuntimeError(
        "MONGO_DB_URI is not set; add it to the environment or the .env file"
    )

client = MongoClient(mongo_uri)
# Database and collection that hold the embedded document chunks
db = client["rag_db"]
collection = db["documents"]

In [None]:
# Raw corpus texts: the list starts with one empty string, followed by the
# contents of p1.txt ... p63.txt in numeric order.
docs = [""]

for page_no in range(1, 64):
    page_path = f"data/quydinhdaotaothacsi/p{page_no}.txt"
    with open(page_path, mode="r", encoding="utf-8") as page_file:
        docs.append(page_file.read())

# Split the texts into small chunks (chunk_size=100, chunk_overlap=20)
splitter = CharacterTextSplitter(chunk_overlap=20, chunk_size=100)
documents = splitter.create_documents(docs)

In [None]:
# Build the vector store on top of the MongoDB collection.
#
# The chunks in `documents` are embedded and inserted only when the collection
# is still empty; otherwise the store simply attaches to what is already there.
# This keeps the cell safe to re-run (no duplicate copies of the corpus) while
# still populating the collection on a fresh run.
# NOTE: if the source files or splitter settings change, clear the collection
# first so that the chunks get re-embedded.
if collection.count_documents({}, limit=1) == 0:
    vector_store = MongoDBAtlasVectorSearch.from_documents(
        documents=documents,
        embedding=embedding_model,
        collection=collection,
        index_name="default",
    )
else:
    vector_store = MongoDBAtlasVectorSearch(
        collection=collection,
        embedding=embedding_model,
        index_name="default",
    )

In [None]:
# Similarity retriever configured with k=3 (three chunks per query)
retriever = vector_store.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)

# Retrieval-augmented QA chain. return_source_documents=True asks the chain to
# hand back the retrieved chunks as well, so sources can be inspected with e.g.
#   [doc.metadata for doc in response["source_documents"]]
qa_chain = RetrievalQA.from_chain_type(
    retriever=retriever,
    llm=llm,
    return_source_documents=True,
)

In [None]:
# Smoke-test the chain with a single question and print the generated answer
query = "Các hình thức thi kết thúc môn học là gì?"
response = qa_chain(query)

answer_text = response["result"]
print(answer_text)

### Generate Answer

In [None]:
def generate_rag_answers_from_csv(csv_path: str, output_path: str, chain=None) -> pd.DataFrame:
    """Answer every question in a CSV file and save the answers next to them.

    Reads ``csv_path``, passes each value of its ``question`` column to the
    chain, stores the chain's ``"result"`` in a new ``rag_answer`` column and
    writes the frame to ``output_path`` (without the index).

    A failure on an individual question does not abort the run: the answer
    for that row is recorded as ``"ERROR: <exception message>"``.

    Args:
        csv_path: Path of the input CSV; must contain a ``question`` column.
        output_path: Path of the CSV to write.
        chain: Callable taking a question and returning a mapping with a
            ``"result"`` key. Defaults to the notebook-level ``qa_chain``.

    Returns:
        The input DataFrame with the added ``rag_answer`` column.

    Raises:
        KeyError: If the input CSV has no ``question`` column.
    """
    if chain is None:
        # Resolved at call time so the function picks up the current notebook chain.
        chain = qa_chain

    df = pd.read_csv(csv_path)

    # Validate up front: otherwise the per-row handler below would swallow the
    # KeyError and fill the whole output with identical error strings.
    if "question" not in df.columns:
        raise KeyError(
            f"'question' column not found in {csv_path}; columns are {list(df.columns)}"
        )

    rag_answers = []
    for question in df["question"]:
        try:
            rag_answers.append(chain(question)["result"])
        except Exception as e:
            # Best effort: keep going and make the failure visible in the output.
            rag_answers.append(f"ERROR: {e}")

    df["rag_answer"] = rag_answers
    df.to_csv(output_path, index=False)
    return df

In [None]:
# Run the chain over the evaluation questions and save the generated answers
generate_rag_answers_from_csv(
    csv_path="evaluation_data/Quy_dinh_dao_tao_Thac_si.csv",
    output_path="evaluation_data/out_put_Quy_dinh_dao_tao_Thac_si_rag1.csv",
)