# Imports

In [1]:
import os
import getpass
from langchain_community.llms import Ollama
from langchain.vectorstores import FAISS
from langchain.embeddings import OllamaEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain.vectorstores import Chroma
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains import LLMChain
from langchain_core.prompts import PromptTemplate
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_ollama import OllamaEmbeddings
from langchain_ollama import OllamaLLM
from langchain_openai import OpenAIEmbeddings
from langchain.chat_models import init_chat_model, ChatOpenAI
from langchain_core.messages import HumanMessage

# Code

### Loading documents


In [2]:
# pdf_files = ["store/doc1.pdf", "store/doc2.pdf","store/DeepSeek_V3.pdf"]
pdf_files = ["store/DeepSeek_V3.pdf"]
#csv_files = ["store/cards.csv"]

pdf_docs = []
for pdf in pdf_files:
    loader = PyPDFLoader(pdf)
    pdf_docs.extend(loader.load())
'''
csv_docs = []
for csv in csv_files:
    loader = CSVLoader(file_path=csv, encoding='utf-8')
    csv_docs.extend(loader.load())
'''
#all_docs = pdf_docs + csv_docs
all_docs = pdf_docs

for i in range(len(all_docs)):
    all_docs[i].page_content = all_docs[i].page_content.replace("\n", "").replace("\xa0", " ")



print(f"Total characters: {len(all_docs[0].page_content)}")

Total characters: 1705


### Splitting documents

In [5]:
splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000, 
    chunk_overlap=400, 
    add_start_index=True 
)   
all_splits = splitter.split_documents(all_docs)

print(f"Split blog post into {len(all_splits)} sub-documents.")

Split blog post into 103 sub-documents.


### Creating Embeddings

In [None]:
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
vectorstore = Chroma(persist_directory="./chroma_db_open", embedding_function=embeddings)
vectorstore.add_documents(all_splits)

['2eed7f51-3d0c-4348-a80d-dc4534f66075',
 '634f7d2e-d496-4a6f-bc9c-0ca5e47b8de7',
 '362fa1c9-bde1-470e-a522-dd1b80198061',
 'b9db6dad-73db-422c-a966-927530f79c5b',
 '8caed21a-dedc-4dbc-b062-43234d84b324',
 '08787650-12dc-4c92-a812-036c29f443e8',
 '55387876-eb72-4163-ae72-4b79ab9d9f0f',
 '47ff7840-f1c2-4835-8a8b-20495ce394bd',
 'a08bf25f-da52-4061-8c5d-56d66d106524',
 'b36dd642-1754-4630-b136-4a5651be4373',
 '49430d4c-0a45-4e75-b290-128edae839a8',
 'c19a5a75-c1f5-4e83-80a5-0b56c83e0f15',
 'bcd65f0c-5ea4-4f49-b067-3321a599ef91',
 '874947e2-aef6-4b27-808d-7155a54931f1',
 '1850089d-2142-4521-89e8-52421ac3907b',
 'fe0c201b-2ab7-4ce6-80a0-c5d69c48a7f6',
 '4fa4a8c2-f52d-42ad-afd9-cef6e9cd15d1',
 '773e5456-6a84-48b8-b65a-4a69bc7b9797',
 'c4e5948c-a6e6-48e5-a566-41491831b82d',
 '2170a0d7-a1e9-4888-9a26-d108cf4d37e8',
 'd3e1eb41-90e3-49ca-8dfd-77650765b3bc',
 '0b92f320-d95d-4407-b3f4-3a898784158a',
 'e6d826f3-ee04-4c60-bf4f-70c796a642e4',
 '84e3b009-9626-4de4-98dd-3183a3d1e41b',
 '3f1f3a52-48fe-

In [27]:
input = "Какой сегодня день недели?"
context = vectorstore.similarity_search(input, k = 5)
#for doc in context:
#   print(doc)
context="\n\n".join([doc.page_content for doc in context])

### Model response

In [28]:
model = ChatOpenAI(model="gpt-4o")

prompt_template = PromptTemplate(
    input_variables=["context", "question"],
    template="Ты помощник, который отвечает на вопросы на английском.\n\n---\n\nИспользуя контекст ниже, ответь на вопрос.\
              Если ответ не найден в документе, ответь 'Я не знаю.'\
              Если нашёл больше одной информации, опиши каждую подробно.\n\n---\n\nКонтекст:\n{context}\n\nВопрос: {question}\n\nОтвет:"
)

prompt = prompt_template.format(context=context, question=input)

response = model.invoke([HumanMessage(content=prompt)])

print(response.content)

Я не знаю.
