# Imports

In [3]:
import os
import getpass
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain.vectorstores import Chroma
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains import LLMChain
from langchain_core.prompts import PromptTemplate
from langchain_openai import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI
from langchain_core.messages import HumanMessage

# Code

### Loading documents


In [None]:
# Load the source PDFs into LangChain documents, normalise their whitespace
# and report how much text was loaded.
# Other PDFs that can be added to the list: "store/doc1.pdf", "store/doc2.pdf".
pdf_files = ["store/DeepSeek_V3.pdf"]
# csv_files = ["store/cards.csv"]

pdf_docs = []
for pdf in pdf_files:
    loader = PyPDFLoader(pdf)
    pdf_docs.extend(loader.load())

# CSV ingestion is currently disabled; re-enable it together with `csv_files`.
# csv_docs = []
# for csv in csv_files:
#     loader = CSVLoader(file_path=csv, encoding='utf-8')
#     csv_docs.extend(loader.load())
# all_docs = pdf_docs + csv_docs
all_docs = pdf_docs

for doc in all_docs:
    # Turn line breaks into spaces rather than deleting them: deleting would
    # glue the last word of a line to the first word of the next one.
    # Non-breaking spaces (\xa0) are replaced with ordinary spaces.
    doc.page_content = doc.page_content.replace("\n", " ").replace("\xa0", " ")

# Sum over every loaded document so the figure matches its "Total" label and
# an empty load reports 0 instead of raising IndexError.
total_chars = sum(len(doc.page_content) for doc in all_docs)
print(f"Total characters: {total_chars}")

### Splitting documents

In [None]:
# Break the loaded documents into overlapping chunks suitable for embedding.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,  # upper bound on the length of one chunk
    chunk_overlap=400,  # text shared between neighbouring chunks
    add_start_index=True,  # keep each chunk's offset in its metadata
)
all_splits = splitter.split_documents(all_docs)

# The input is the set of loaded documents, not a blog post.
print(f"Split {len(all_docs)} documents into {len(all_splits)} sub-documents.")

### Creating embeddings

In [None]:
# Embed the chunks with OpenAI and store them in a Chroma collection that is
# persisted under ./chroma_db_open.
if not os.environ.get("OPENAI_API_KEY"):
    # Prompt for the key without echoing it. Writing a placeholder string
    # here would only postpone the failure to the first API call.
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ")

embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
vectorstore = Chroma(persist_directory="./chroma_db_open", embedding_function=embeddings)
# NOTE(review): no explicit ids are passed, so re-running this cell against
# the same persist directory presumably stores the chunks again - confirm and
# clear the directory (or pass stable ids) if duplicates show up in results.
vectorstore.add_documents(all_splits)

In [27]:
# Fetch the five chunks most similar to the question and merge their text
# into one context string, with a blank line between chunks.
# NOTE: `input` shadows the builtin of the same name; it is kept because the
# prompt-building cell reads the question from this variable.
input = "Какой сегодня день недели?"
retrieved_docs = vectorstore.similarity_search(input, k=5)
context = "\n\n".join(doc.page_content for doc in retrieved_docs)

### Model response

In [None]:
# Ask the chat model to answer the question from the retrieved context only.
model = ChatOpenAI(model="gpt-4o")

prompt_template = PromptTemplate(
    input_variables=["context", "question"],
    # Built from adjacent string literals rather than backslash continuations
    # inside one literal, which would embed the source indentation (long runs
    # of spaces) in the prompt sent to the model.
    template=(
        "Ты помощник, который отвечает на вопросы на английском.\n\n---\n\n"
        "Используя контекст ниже, ответь на вопрос. "
        "Если ответ не найден в документе, ответь 'Я не знаю.' "
        "Если нашёл больше одной информации, опиши каждую подробно.\n\n---\n\n"
        "Контекст:\n{context}\n\nВопрос: {question}\n\nОтвет:"
    ),
)

# Fill the template with the retrieved context and the user's question.
prompt = prompt_template.format(context=context, question=input)

# Send the whole prompt as a single human message and print the reply text.
response = model.invoke([HumanMessage(content=prompt)])

print(response.content)