# Retrieval Augmented Generation

## Import packages

In [1]:
import os

In [2]:
import langchain
import rootutils
from huggingface_hub import hf_hub_download
from langchain.document_loaders import PyPDFLoader, WikipediaLoader
from langchain.embeddings import HuggingFaceEmbeddings, LlamaCppEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Qdrant

  from .autonotebook import tqdm as notebook_tqdm


## Settings

In [3]:
class debug_langchain:
    def __init__(self, enable: bool = True):
        self.enable = enable

    def __enter__(self):
        langchain.debug = True if self.enable else False

    def __exit__(self, exc_type, exc_val, exc_tb):
        langchain.debug = False

In [4]:
SEED = 42
SENTENCE_ENDINGS = [".", "!", "?"]
WORDS_BREAKS = list(reversed([",", ";", ":", " ", "(", ")", "[", "]", "{", "}", "\t", "\n"]))

In [5]:
path_to_root = rootutils.find_root(indicator=".project-root")
path_to_data = path_to_root / "data"
path_to_weights = path_to_root / "weights"

path_to_data.mkdir(exist_ok=True, parents=True)
path_to_weights.mkdir(exist_ok=True, parents=True)

In [6]:
embedding_model_name = "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2"
# embedding_model_name = "intfloat/multilingual-e5-large"
# embedding_model_name = "efederici/e5-base-multilingual-4096"
# create custom embedder: https://github.com/langchain-ai/langchain/issues/9486

In [7]:
# llm_repo_id = "TheBloke/deepseek-coder-6.7B-instruct-GGUF"
# llm_filename = "deepseek-coder-6.7b-instruct.Q5_K_M.gguf"

# llm_repo_id = "TheBloke/Llama-2-7B-Chat-GGUF"
# llm_filename = "llama-2-7b-chat.Q5_K_M.gguf"

llm_repo_id = "TheBloke/openchat-3.5-1210-GGUF"
llm_filename = "openchat-3.5-1210.Q5_K_M.gguf"

In [8]:
hf_hub_download(
    repo_id=llm_repo_id,
    filename=llm_filename,
    local_dir=path_to_weights,
)

'/Users/romankryvokhyzha/PycharmProjects/llm-simple-QnA-example/weights/openchat-3.5-1210.Q5_K_M.gguf'

## Prepare data

In [9]:
# Load PDF
loaders = (
    [
        # Duplicate documents on purpose - messy data
        PyPDFLoader(file_path=str(path_to_data / file_name))
        for file_name in os.listdir(path_to_data)
        if file_name.endswith(".pdf")
    ]
    + [
        WikipediaLoader(query="Розпізнавання іменованих сутностей", load_max_docs=2, lang="uk"),
        WikipediaLoader(query="Нейронні мережі", load_max_docs=2, lang="uk"),
        WikipediaLoader(query="Дід Панас", load_max_docs=1, lang="uk"),
    ]
    + [
        WikipediaLoader(query="Messi", load_max_docs=2, lang="en"),
        WikipediaLoader(query="Дід Панас", load_max_docs=1, lang="en"),
    ]
)

docs = []
for loader in loaders:
    docs.extend(loader.load())

In [10]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=768,  # 2048
    chunk_overlap=128,  # 128
    separators=SENTENCE_ENDINGS + WORDS_BREAKS,
)

In [11]:
splits = text_splitter.split_documents(docs)

In [12]:
len(splits)

440

## Create embeddings and fill vector store

In [13]:
embeddings = HuggingFaceEmbeddings(
    model_name=embedding_model_name,
    encode_kwargs={"normalize_embeddings": True},
    # model_kwargs = {'device': 'cpu'}.
)
# intfloat/multilingual-e5-small
len(embeddings.embed_query("This is a test query."))

384

In [14]:
# embeddings = GPT4AllEmbeddings()
# # len(embeddings.embed_query("This is a test query."))

In [15]:
# embeddings = LlamaCppEmbeddings(
#     model_path=str(path_to_weights / "llama-2-7b-chat.Q2_K.gguf"),
#     n_ctx=2048,
#     seed=SEED,
#     verbose=False,
# )
# # len(embeddings.embed_query("This is a test query."))

In [16]:
url = os.getenv("QDRANT_URL")
# collection_name = os.getenv("QDRANT_COLLECTION_NAME")
collection_name = "my_custom_documents"

qdrant = Qdrant.from_documents(
    splits,
    embeddings,
    url=url,
    collection_name=collection_name,
)

## Try out the search

In [17]:
query = "What is Bias-Variance Tradeoff?"
found_docs = qdrant.similarity_search(query)
found_docs[0]

Document(page_content='CS229 Bias-Variance and Error Analysis\nYoann Le Calonnec\nOctober 2, 2017\n1 The Bias-Variance Tradeoﬀ\nAssume you are given a well ﬁtted machine learning model ˆfthat you want to apply on\nsome test dataset. For instance, the model could be a linear regression whose parameters\nwere computed using some training set diﬀerent from your test set. For each point xin your\ntest set, you want to predict the associated target y∈R, and compute the mean squared\nerror (MSE)\nE(x,y)∼test set|ˆf(x)−y|2\nYou now realize that this MSE is too high, and try to ﬁnd an explanation to this result:\n•Overﬁtting: the model is too closely related to the examples in the training set and\ndoesn’t generalize well to other examples', metadata={'page': 0, 'source': '/Users/romankryvokhyzha/PycharmProjects/llm-simple-QnA-example/data/bias-variance-error-analysis.pdf'})

In [18]:
query = "Що таке розпізнавання іменованих сутностей?"
found_docs = qdrant.similarity_search(query)
found_docs[0]

Document(page_content="Розпізнавання іменованих сутностей (РІС) (також відоме як ідентифікація об'єктної сутності, фрагментація об'єктної сутності та видобуток об'єктної сутності) — це підзадача видобування інформації, яка намагається знайти і класифікувати іменовані сутності в неструктурованому тексті в заздалегідь визначені категорії, такі як імена людей, організації, місця, медичні коди, час, кількості, грошові значення, відсотки тощо.\n\nБільшість досліджень у системах РІС було структуровано як отримання не коментованого блоку тексту, такого як:  І створення коментованого блоку тексту, який виділяє імена об'єктів:\n\nУ цьому прикладі було виявлено та класифіковано ім'я особи, що складається з одного токену, назва компанії з двох токенів та часового виразу", metadata={'source': 'https://uk.wikipedia.org/wiki/%D0%A0%D0%BE%D0%B7%D0%BF%D1%96%D0%B7%D0%BD%D0%B0%D0%B2%D0%B0%D0%BD%D0%BD%D1%8F_%D1%96%D0%BC%D0%B5%D0%BD%D0%BE%D0%B2%D0%B0%D0%BD%D0%B8%D1%85_%D1%81%D1%83%D1%82%D0%BD%D0%BE%D1%81%

In [19]:
query = "Хто такий дід Панас?"
found_docs = qdrant.max_marginal_relevance_search(query, k=2, fetch_k=10)
found_docs[0]

Document(page_content=".1964—1986 — у ролі діда Панаса вів на Українському телебаченні програму «На добраніч, діти».\n\n\n== Вшанування пам'яті ==\nПохований у колумбарії Байкового кладовища. Вдова виїхала до США, перед тим передавши кіноплівки із записами «діда Панаса» братам Капрановим.\n\n2019 року на приміщенні тальнівської школи, на місці якої стояв будинок, де народився Петро Вескляров, встановлено пам'ятну дошку.\nУ 2022 році у Тальному на Черкащині вулиця Крилова стала вулицею Весклярова.\n\n\n== Ролі в театрі ==\nМикола Задорожний («Украдене щастя» Івана Франка).\nКомандор («Камінний господар» Лесі Українки).\nГородничий («Ревізор» Миколи Гоголя).\nМонтанеллі («Овід» Етель Л. Войнич)", metadata={'source': 'https://uk.wikipedia.org/wiki/%D0%92%D0%B5%D1%81%D0%BA%D0%BB%D1%8F%D1%80%D0%BE%D0%B2_%D0%9F%D0%B5%D1%82%D1%80%D0%BE_%D0%AE%D1%85%D0%B8%D0%BC%D0%BE%D0%B2%D0%B8%D1%87', 'summary': "Петро́ Юхи́мович Вескляро́в, ім'я при народженні Пінхас Хаїмович Весклер (9 червня 1911(19110609

## Create simple RAG chain using LlamaCpp

In [20]:
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.chains import RetrievalQA
from langchain.llms import LlamaCpp
from langchain.prompts import PromptTemplate
from qdrant_client import QdrantClient

In [21]:
# template = """Дай відповідь, використовуючи виключно українську мову для написання всіх слів: {question}"""
#
# prompt = PromptTemplate(template=template, input_variables=["question"])

In [22]:
doc_store = Qdrant(
    client=QdrantClient(url=url),
    collection_name=collection_name,
    embeddings=embeddings,
)

In [23]:
# Callbacks support token-wise streaming
callback_manager = CallbackManager([StreamingStdOutCallbackHandler()])

In [24]:
llm = LlamaCpp(
    model_path=str(path_to_weights / llm_filename),
    temperature=0.0,
    max_tokens=2048,
    n_ctx=2048,  # 8192, The max sequence length to use - note that longer sequence lengths require much more resources
    # n_threads=8,            # The number of CPU threads to use, tailor to your system and the resulting performance
    # n_gpu_layers=35         # The number of layers to offload to GPU, if you have GPU acceleration available
    seed=SEED,
    callback_manager=callback_manager,
    verbose=True,  # Verbose is required to pass to the callback manager
)

llama_model_loader: loaded meta data with 23 key-value pairs and 291 tensors from /Users/romankryvokhyzha/PycharmProjects/llm-simple-QnA-example/weights/openchat-3.5-1210.Q5_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = openchat_openchat-3.5-1210
llama_model_loader: - kv   2:                       llama.context_length u32              = 8192
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32            

In [25]:
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=doc_store.as_retriever(),
    # retriever=qdrant.as_retriever(search_type="mmr"),
    return_source_documents=False,
    # chain_type_kwargs={"prompt": custom_prompt_template},
)

In [26]:
with debug_langchain():
    question = "What is Gaussian kernel?"
    # qa_chain({"query": question})
    qa_chain.run(question)

  warn_deprecated(


[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA] Entering Chain run with input:
[0m{
  "query": "What is Gaussian kernel?"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 3:chain:StuffDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 3:chain:StuffDocumentsChain > 4:chain:LLMChain] Entering Chain run with input:
[0m{
  "question": "What is Gaussian kernel?",
  "context": "...,x(n)}, and let a square, n-by-n\nmatrixKbe deﬁned so that its ( i,j)-entry is given by Kij=K(x(i),x(j)).\nThis matrix is called the kernel matrix . Note that we’ve overloaded the\nnotation and used Kto denote both the kernel function K(x,z) and the\nkernel matrix K, due to their obvious close relationship.\nNow, ifKis a valid kernel, then Kij=K(x(i),x(j)) =φ(x(i))Tφ(x(j)) =\nφ(x(j))Tφ(x(i)) =K(x(j),x(i)) =Kji, andhence Kmustbesymmetric. More-\nover, letting φk(x) denote the k-th coordinate of the vector φ(x), we ﬁnd that\nfor 


llama_print_timings:        load time =   10191.76 ms
llama_print_timings:      sample time =      64.68 ms /   144 runs   (    0.45 ms per token,  2226.45 tokens per second)
llama_print_timings: prompt eval time = 1399081.07 ms /  1103 tokens ( 1268.43 ms per token,     0.79 tokens per second)
llama_print_timings:        eval time =  572170.59 ms /   143 runs   ( 4001.19 ms per token,     0.25 tokens per second)
llama_print_timings:       total time = 1972721.16 ms /  1246 tokens


In [27]:
with debug_langchain():
    question = "Who is Grandpa Panas?"
    # qa_chain({"query": question})
    qa_chain.run(question)

[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA] Entering Chain run with input:
[0m{
  "query": "Who is Grandpa Panas?"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 3:chain:StuffDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 3:chain:StuffDocumentsChain > 4:chain:LLMChain] Entering Chain run with input:
[0m{
  "question": "Who is Grandpa Panas?",
  "context": "Petro Yukhymovych Vesklyarov (Ukrainian: Вескляров Петро Юхимович) (June 10 [O.S. May 28] 1911 in Talne, Ukraine – January 5, 1994 in Kyiv) was a Ukrainian theater and television actor. He was also known by the nickname Did Panas (Grandpa Panas, Ukrainian: дід Панас).\nBetween 1932 and 1940, Vesklyarov was an actor in a travelling workers' theatre, and between 1946 and 1959 he performed at the Taras Shevchenko Musical-Drama Theatre in Lutsk, Volyn. Between 1959 and 1982 Veslklyarov worked in the Dovzhenko Film Studios, appearing in a nu

Llama.generate: prefix-match hit


 Grandpa Panas (Ukrainian: дід Панас) was a character played by Petro Yukhymovych Vesklyarov, a Ukrainian theater and television actor. He portrayed the character in the Ukrainian television series "На добраніч, діти" (Goodnight, children) from 1964 to 1986.
### Na dobranich, dity
The Ukrainian television series "На добраніч, діти" (Goodnight, children) aired from 1964 to 1986 and featured the character Grandpa Panas, played by Petro Yukhymovych Vesklyarov. The show was aimed at children and was broadcast live in the evenings.
### Petro Yukhymovych Vesklyarov
Petro Yukhymovych Vesklyarov (June 10 [O.S. May 28] 1911 in Talne, Ukraine – January 5, 1994 in Kyiv) was a Ukrainian theater and television actor. He was also known by the nickname Did Panas (Grandpa Panas, Ukrainian: дід Панас). Between 1932 and 1940, Vesklyarov was an actor in a travelling workers' theatre, and between 1946 and 1959 he performed at the Taras Shevchenko Musical-Drama Theatre in Lutsk, Volyn. Between 1959 and 198


llama_print_timings:        load time =    9586.71 ms
llama_print_timings:      sample time =     176.58 ms /   736 runs   (    0.24 ms per token,  4168.04 tokens per second)
llama_print_timings: prompt eval time = 1225338.22 ms /  1019 tokens ( 1202.49 ms per token,     0.83 tokens per second)
llama_print_timings:        eval time = 4376018.16 ms /   735 runs   ( 5953.77 ms per token,     0.17 tokens per second)
llama_print_timings:       total time = 5608568.63 ms


In [25]:
with debug_langchain():
    question = "Хто такий дід Панас?"
    # qa_chain({"query": question})
    qa_chain.run(question)

[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA] Entering Chain run with input:
[0m{
  "query": "Хто такий дід Панас?"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 3:chain:StuffDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 3:chain:StuffDocumentsChain > 4:chain:LLMChain] Entering Chain run with input:
[0m{
  "question": "Хто такий дід Панас?",
  "context": "Petro Yukhymovych Vesklyarov (Ukrainian: Вескляров Петро Юхимович) (June 10 [O.S. May 28] 1911 in Talne, Ukraine – January 5, 1994 in Kyiv) was a Ukrainian theater and television actor. He was also known by the nickname Did Panas (Grandpa Panas, Ukrainian: дід Панас).\nBetween 1932 and 1940, Vesklyarov was an actor in a travelling workers' theatre, and between 1946 and 1959 he performed at the Taras Shevchenko Musical-Drama Theatre in Lutsk, Volyn. Between 1959 and 1982 Veslklyarov worked in the Dovzhenko Film Studios, appearing in a numb


llama_print_timings:        load time =    9241.55 ms
llama_print_timings:      sample time =      58.98 ms /   251 runs   (    0.23 ms per token,  4255.75 tokens per second)
llama_print_timings: prompt eval time = 1330775.37 ms /  1067 tokens ( 1247.21 ms per token,     0.80 tokens per second)
llama_print_timings:        eval time = 1229269.06 ms /   250 runs   ( 4917.08 ms per token,     0.20 tokens per second)
llama_print_timings:       total time = 2561868.21 ms


## Modify the chain to use custom prompt in English

+ Mistral prompt example [link](https://medium.com/@scholarly360/mistral-7b-complete-guide-on-colab-129fa5e9a04d)

In [33]:
# # llama
# custom_prompt = """
# Use the following pieces of context to answer the question at the end. Please provide
# a short single-sentence summary answer only. If you don't know the answer or if it's
# not present in given context, don't try to make up an answer.
# Context: {context}
# Question: {question}
# Helpful Answer:
# """
# custom_prompt_template = PromptTemplate(template=custom_prompt, input_variables=["context", "question"])

In [34]:
# openchat
custom_prompt = """<s>[INST] You are a helpful, respectful and honest assistant. Use the following pieces of context to answer the question at the end. Please provide a short single-sentence summary answer only. If you don't know the answer or if it's not present in given context, don't try to make up an answer.
Context: {context}
Question: {question} [/INST] </s>
"""
custom_prompt_template = PromptTemplate(template=custom_prompt, input_variables=["context", "question"])

In [35]:
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=doc_store.as_retriever(),
    # retriever=qdrant.as_retriever(search_type="mmr"),
    return_source_documents=False,
    chain_type_kwargs={"prompt": custom_prompt_template},
)

In [36]:
with debug_langchain():
    question = "What is Gaussian kernel?"
    # qa_chain({"query": question})
    qa_chain.run(question)

[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA] Entering Chain run with input:
[0m{
  "query": "What is Gaussian kernel?"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 3:chain:StuffDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 3:chain:StuffDocumentsChain > 4:chain:LLMChain] Entering Chain run with input:
[0m{
  "question": "What is Gaussian kernel?",
  "context": "9\na feature map φsuch that the kernel Kdeﬁned above satisﬁes K(x,z) =\nφ(x)Tφ(z)? Inthisparticularexample, theanswerisyes. Thiskernel iscalled\ntheGaussian kernel , and corresponds to an inﬁnite dimensional feature\nmapping φ. We will give a precise characterization about what propert ies\na function Kneeds to satisfy so that it can be a valid kernel function that\ncorresponds to some feature map φ.\nNecessary conditions for valid kernels. Suppose for now that Kis\nindeed a valid kernel corresponding to some feature mapping φ, an

Llama.generate: prefix-match hit


The Gaussian kernel, also known as the radial basis function (RBF) kernel, is a valid kernel function that corresponds to an infinite-dimensional feature mapping φ.[36;1m[1;3m[llm/end][0m [1m[1:chain:RetrievalQA > 3:chain:StuffDocumentsChain > 4:chain:LLMChain > 5:llm:LlamaCpp] [2338.06s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": "The Gaussian kernel, also known as the radial basis function (RBF) kernel, is a valid kernel function that corresponds to an infinite-dimensional feature mapping φ.",
        "generation_info": null,
        "type": "Generation"
      }
    ]
  ],
  "llm_output": null,
  "run": null
}
[36;1m[1;3m[chain/end][0m [1m[1:chain:RetrievalQA > 3:chain:StuffDocumentsChain > 4:chain:LLMChain] [2338.20s] Exiting Chain run with output:
[0m{
  "text": "The Gaussian kernel, also known as the radial basis function (RBF) kernel, is a valid kernel function that corresponds to an infinite-dimensional feature mapping φ."
}
[36


llama_print_timings:        load time =    9521.16 ms
llama_print_timings:      sample time =      31.85 ms /    35 runs   (    0.91 ms per token,  1098.90 tokens per second)
llama_print_timings: prompt eval time = 2070182.83 ms /  1632 tokens ( 1268.49 ms per token,     0.79 tokens per second)
llama_print_timings:        eval time =  265483.40 ms /    35 runs   ( 7585.24 ms per token,     0.13 tokens per second)
llama_print_timings:       total time = 2337993.38 ms


In [27]:
with debug_langchain():
    question = "Who is Grandpa Panas?"
    # qa_chain({"query": question})
    qa_chain.run(question)

[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA] Entering Chain run with input:
[0m{
  "query": "Who is Grandpa Panas?"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 3:chain:StuffDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 3:chain:StuffDocumentsChain > 4:chain:LLMChain] Entering Chain run with input:
[0m{
  "question": "Who is Grandpa Panas?",
  "context": "Petro Yukhymovych Vesklyarov (Ukrainian: Вескляров Петро Юхимович) (June 10 [O.S. May 28] 1911 in Talne, Ukraine – January 5, 1994 in Kyiv) was a Ukrainian theater and television actor. He was also known by the nickname Did Panas (Grandpa Panas, Ukrainian: дід Панас).\nBetween 1932 and 1940, Vesklyarov was an actor in a travelling workers' theatre, and between 1946 and 1959 he performed at the Taras Shevchenko Musical-Drama Theatre in Lutsk, Volyn. Between 1959 and 1982 Veslklyarov worked in the Dovzhenko Film Studios, appearing in a nu

Llama.generate: prefix-match hit


Grandpa Panas is a character played by Petro Vesklyarov, a Ukrainian theater and television actor.[36;1m[1;3m[llm/end][0m [1m[1:chain:RetrievalQA > 3:chain:StuffDocumentsChain > 4:chain:LLMChain > 5:llm:LlamaCpp] [1494.84s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": "Grandpa Panas is a character played by Petro Vesklyarov, a Ukrainian theater and television actor.",
        "generation_info": null,
        "type": "Generation"
      }
    ]
  ],
  "llm_output": null,
  "run": null
}
[36;1m[1;3m[chain/end][0m [1m[1:chain:RetrievalQA > 3:chain:StuffDocumentsChain > 4:chain:LLMChain] [1494.85s] Exiting Chain run with output:
[0m{
  "text": "Grandpa Panas is a character played by Petro Vesklyarov, a Ukrainian theater and television actor."
}
[36;1m[1;3m[chain/end][0m [1m[1:chain:RetrievalQA > 3:chain:StuffDocumentsChain] [1494.85s] Exiting Chain run with output:
[0m{
  "output_text": "Grandpa Panas is a character played by Petro Veskly


llama_print_timings:        load time =    9645.99 ms
llama_print_timings:      sample time =       6.96 ms /    27 runs   (    0.26 ms per token,  3878.20 tokens per second)
llama_print_timings: prompt eval time = 1250661.38 ms /  1020 tokens ( 1226.14 ms per token,     0.82 tokens per second)
llama_print_timings:        eval time =  243663.65 ms /    26 runs   ( 9371.68 ms per token,     0.11 tokens per second)
llama_print_timings:       total time = 1494832.72 ms


In [28]:
with debug_langchain():
    question = "Хто такий дід Панас?"
    # qa_chain({"query": question})
    qa_chain.run(question)

Llama.generate: prefix-match hit


[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA] Entering Chain run with input:
[0m{
  "query": "Хто такий дід Панас?"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 3:chain:StuffDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 3:chain:StuffDocumentsChain > 4:chain:LLMChain] Entering Chain run with input:
[0m{
  "question": "Хто такий дід Панас?",
  "context": "Petro Yukhymovych Vesklyarov (Ukrainian: Вескляров Петро Юхимович) (June 10 [O.S. May 28] 1911 in Talne, Ukraine – January 5, 1994 in Kyiv) was a Ukrainian theater and television actor. He was also known by the nickname Did Panas (Grandpa Panas, Ukrainian: дід Панас).\nBetween 1932 and 1940, Vesklyarov was an actor in a travelling workers' theatre, and between 1946 and 1959 he performed at the Taras Shevchenko Musical-Drama Theatre in Lutsk, Volyn. Between 1959 and 1982 Veslklyarov worked in the Dovzhenko Film Studios, appearing in a numb


llama_print_timings:        load time =    9645.99 ms
llama_print_timings:      sample time =       9.07 ms /    40 runs   (    0.23 ms per token,  4411.12 tokens per second)
llama_print_timings: prompt eval time =   19459.08 ms /    16 tokens ( 1216.19 ms per token,     0.82 tokens per second)
llama_print_timings:        eval time =  372753.90 ms /    40 runs   ( 9318.85 ms per token,     0.11 tokens per second)
llama_print_timings:       total time =  392492.02 ms


In [29]:
with debug_langchain():
    question = "Що таке розпізнавання іменованих сутностей?"
    # qa_chain({"query": question})
    qa_chain.run(question)

[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA] Entering Chain run with input:
[0m{
  "query": "Що таке розпізнавання іменованих сутностей?"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 3:chain:StuffDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 3:chain:StuffDocumentsChain > 4:chain:LLMChain] Entering Chain run with input:
[0m{
  "question": "Що таке розпізнавання іменованих сутностей?",
  "context": "У обробці природної мови зв'язування сутностей (англ. Entity Linking), яке також називають зв'язування іменованих сутностей (ЗІС), розпізнавання іменованих сутностей (РІС), розпізнавання і неоднозначності іменованих сутностей (РНІС) або нормалізація іменованих сутностей (НІС), — це присвоєння унікальної ідентичності об'єктам (наприклад, відомим особам, місцям чи компаніям), що згадуються у тексті. Наприклад, розглянемо речення «Дніпро — річка України». Ідея полягає в тому, щоб визначити, що «Д

Llama.generate: prefix-match hit


Розпізнавання іменованих сутностей - це процес виявлення і класифікації імен у тексті.[36;1m[1;3m[llm/end][0m [1m[1:chain:RetrievalQA > 3:chain:StuffDocumentsChain > 4:chain:LLMChain > 5:llm:LlamaCpp] [2543.96s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": "\nРозпізнавання іменованих сутностей - це процес виявлення і класифікації імен у тексті.",
        "generation_info": null,
        "type": "Generation"
      }
    ]
  ],
  "llm_output": null,
  "run": null
}
[36;1m[1;3m[chain/end][0m [1m[1:chain:RetrievalQA > 3:chain:StuffDocumentsChain > 4:chain:LLMChain] [2543.97s] Exiting Chain run with output:
[0m{
  "text": "\nРозпізнавання іменованих сутностей - це процес виявлення і класифікації імен у тексті."
}
[36;1m[1;3m[chain/end][0m [1m[1:chain:RetrievalQA > 3:chain:StuffDocumentsChain] [2543.97s] Exiting Chain run with output:
[0m{
  "output_text": "\nРозпізнавання іменованих сутностей - це процес виявлення і класифікації імен у те


llama_print_timings:        load time =    9645.99 ms
llama_print_timings:      sample time =       8.35 ms /    36 runs   (    0.23 ms per token,  4312.93 tokens per second)
llama_print_timings: prompt eval time = 2216792.90 ms /  1818 tokens ( 1219.36 ms per token,     0.82 tokens per second)
llama_print_timings:        eval time =  326349.44 ms /    35 runs   ( 9324.27 ms per token,     0.11 tokens per second)
llama_print_timings:       total time = 2543950.84 ms


## Modify the chain to use custom prompt in English and context compression

In [37]:
# # llama
# custom_retriever_prompt = """Given the following question and context, extract any part of the context AS IS that is relevant to answer the question. If none of the context is relevant return NO_OUTPUT. Remember, DO NOT edit the extracted parts of the context.
# Question: {question}
# Context: {context}
# Answer:
# """
# custom_retriever_prompt_template = PromptTemplate(template=custom_retriever_prompt, input_variables=["question", "context"])

In [38]:
# openchat
custom_retriever_prompt = """<s>[INST] Given the following question and context, extract any part of the context AS IS that is relevant to answer the question. If none of the context is relevant return NO_OUTPUT. Remember, DO NOT edit the extracted parts of the context.
Question: {question}
Context: {context} [/INST] </s>
"""
custom_retriever_prompt_template = PromptTemplate(
    template=custom_retriever_prompt, input_variables=["question", "context"]
)

In [39]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

In [40]:
base_compressor = LLMChainExtractor.from_llm(llm, prompt=custom_retriever_prompt_template)
base_retriever = doc_store.as_retriever(search_kwargs={"k": 3})

compression_retriever = ContextualCompressionRetriever(
    base_compressor=base_compressor,
    base_retriever=base_retriever,
)

In [41]:
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=compression_retriever,
    return_source_documents=False,
    chain_type_kwargs={"prompt": custom_prompt_template},
)

In [42]:
with debug_langchain():
    question = "What is Gaussian kernel?"
    # qa_chain({"query": question})
    qa_chain.run(question)

[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA] Entering Chain run with input:
[0m{
  "query": "What is Gaussian kernel?"
}


Llama.generate: prefix-match hit


[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 2:retriever:Retriever > 4:chain:LLMChain] Entering Chain run with input:
[0m{
  "question": "What is Gaussian kernel?",
  "context": "9\na feature map φsuch that the kernel Kdeﬁned above satisﬁes K(x,z) =\nφ(x)Tφ(z)? Inthisparticularexample, theanswerisyes. Thiskernel iscalled\ntheGaussian kernel , and corresponds to an inﬁnite dimensional feature\nmapping φ. We will give a precise characterization about what propert ies\na function Kneeds to satisfy so that it can be a valid kernel function that\ncorresponds to some feature map φ.\nNecessary conditions for valid kernels. Suppose for now that Kis\nindeed a valid kernel corresponding to some feature mapping φ, and we will\nﬁrst see what properties it satisﬁes. Now, consider some ﬁnit e set ofnpoints\n(not necessarily the training set) {x(1),...,x(n)}, and let a square, n-by-n\nmatrixKbe deﬁned so that its ( i,j)-entry is given by Kij=K(x(i),x(j)).\nThis matrix is called the kern


llama_print_timings:        load time =    9521.16 ms
llama_print_timings:      sample time =       4.08 ms /     7 runs   (    0.58 ms per token,  1715.69 tokens per second)
llama_print_timings: prompt eval time =  717837.50 ms /   648 tokens ( 1107.77 ms per token,     0.90 tokens per second)
llama_print_timings:        eval time =   41534.06 ms /     6 runs   ( 6922.34 ms per token,     0.14 tokens per second)
llama_print_timings:       total time =  760276.17 ms
Llama.generate: prefix-match hit


[36;1m[1;3m[llm/end][0m [1m[1:chain:RetrievalQA > 2:retriever:Retriever > 4:chain:LLMChain > 5:llm:LlamaCpp] [760.32s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": "The answer is: Gaussian kernel",
        "generation_info": null,
        "type": "Generation"
      }
    ]
  ],
  "llm_output": null,
  "run": null
}
[36;1m[1;3m[chain/end][0m [1m[1:chain:RetrievalQA > 2:retriever:Retriever > 4:chain:LLMChain] [760.34s] Exiting Chain run with output:
[0m{
  "text": "The answer is: Gaussian kernel"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 2:retriever:Retriever > 6:chain:LLMChain] Entering Chain run with input:
[0m{
  "question": "What is Gaussian kernel?",
  "context": "11\nApplication of kernel methods: We’ve seen the application of kernels\nto linear regression. In the next part, we will introduce the support vector\nmachines to which kernels can be directly applied. dwell too much longer on\nithere. Infact, theideaofker


llama_print_timings:        load time =    9521.16 ms
llama_print_timings:      sample time =     119.38 ms /   133 runs   (    0.90 ms per token,  1114.09 tokens per second)
llama_print_timings: prompt eval time =  449647.25 ms /   414 tokens ( 1086.10 ms per token,     0.92 tokens per second)
llama_print_timings:        eval time =  970429.14 ms /   132 runs   ( 7351.74 ms per token,     0.14 tokens per second)
llama_print_timings:       total time = 1425022.64 ms
Llama.generate: prefix-match hit


Gaussian kernel is a valid kernel because it corresponds to some feature mapping φ.


llama_print_timings:        load time =    9521.16 ms
llama_print_timings:      sample time =      14.43 ms /    19 runs   (    0.76 ms per token,  1316.43 tokens per second)
llama_print_timings: prompt eval time =  121281.52 ms /   120 tokens ( 1010.68 ms per token,     0.99 tokens per second)
llama_print_timings:        eval time =  126218.96 ms /    18 runs   ( 7012.16 ms per token,     0.14 tokens per second)
llama_print_timings:       total time =  248226.64 ms
Llama.generate: prefix-match hit


[36;1m[1;3m[llm/end][0m [1m[1:chain:RetrievalQA > 2:retriever:Retriever > 8:chain:LLMChain > 9:llm:LlamaCpp] [248.25s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": "\nGaussian kernel is a valid kernel because it corresponds to some feature mapping φ.",
        "generation_info": null,
        "type": "Generation"
      }
    ]
  ],
  "llm_output": null,
  "run": null
}
[36;1m[1;3m[chain/end][0m [1m[1:chain:RetrievalQA > 2:retriever:Retriever > 8:chain:LLMChain] [248.26s] Exiting Chain run with output:
[0m{
  "text": "\nGaussian kernel is a valid kernel because it corresponds to some feature mapping φ."
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 10:chain:StuffDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 10:chain:StuffDocumentsChain > 11:chain:LLMChain] Entering Chain run with input:
[0m{
  "question": "What is Gaussian kernel?",
  "context": "The ans


llama_print_timings:        load time =    9521.16 ms
llama_print_timings:      sample time =      32.31 ms /    51 runs   (    0.63 ms per token,  1578.41 tokens per second)
llama_print_timings: prompt eval time =  241201.30 ms /   244 tokens (  988.53 ms per token,     1.01 tokens per second)
llama_print_timings:        eval time =  332291.40 ms /    50 runs   ( 6645.83 ms per token,     0.15 tokens per second)
llama_print_timings:       total time =  575048.01 ms


In [43]:
with debug_langchain():
    question = "Who is Grandpa Panas?"
    # qa_chain({"query": question})
    qa_chain.run(question)

[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA] Entering Chain run with input:
[0m{
  "query": "Who is Grandpa Panas?"
}


Llama.generate: prefix-match hit


[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 2:retriever:Retriever > 4:chain:LLMChain] Entering Chain run with input:
[0m{
  "question": "Who is Grandpa Panas?",
  "context": "Petro Yukhymovych Vesklyarov (Ukrainian: Вескляров Петро Юхимович) (June 10 [O.S. May 28] 1911 in Talne, Ukraine – January 5, 1994 in Kyiv) was a Ukrainian theater and television actor. He was also known by the nickname Did Panas (Grandpa Panas, Ukrainian: дід Панас).\nBetween 1932 and 1940, Vesklyarov was an actor in a travelling workers' theatre, and between 1946 and 1959 he performed at the Taras Shevchenko Musical-Drama Theatre in Lutsk, Volyn. Between 1959 and 1982 Veslklyarov worked in the Dovzhenko Film Studios, appearing in a number of films. He starred in the 1959 drama film Ivanna and appeared in the 1970 comedy film Two Days of Miracles. During this time (1964-1986) he appeared as the character \"Дід Панас\" (Grandpa Panas) in the Ukrainian television series \"На добраніч, діти\"  (Goodni


llama_print_timings:        load time =    9521.16 ms
llama_print_timings:      sample time =       7.40 ms /    18 runs   (    0.41 ms per token,  2433.42 tokens per second)
llama_print_timings: prompt eval time =  595934.51 ms /   540 tokens ( 1103.58 ms per token,     0.91 tokens per second)
llama_print_timings:        eval time =  167567.16 ms /    17 runs   ( 9856.89 ms per token,     0.10 tokens per second)
llama_print_timings:       total time =  764090.09 ms
Llama.generate: prefix-match hit


[36;1m[1;3m[llm/end][0m [1m[1:chain:RetrievalQA > 2:retriever:Retriever > 4:chain:LLMChain > 5:llm:LlamaCpp] [764.10s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": "\nAnswer: Petro Yukhymovych Vesklyarov",
        "generation_info": null,
        "type": "Generation"
      }
    ]
  ],
  "llm_output": null,
  "run": null
}
[36;1m[1;3m[chain/end][0m [1m[1:chain:RetrievalQA > 2:retriever:Retriever > 4:chain:LLMChain] [764.10s] Exiting Chain run with output:
[0m{
  "text": "\nAnswer: Petro Yukhymovych Vesklyarov"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 2:retriever:Retriever > 6:chain:LLMChain] Entering Chain run with input:
[0m{
  "question": "Who is Grandpa Panas?",
  "context": "== Цікаві факти ==\nЗначного поширення набула легенда про те, що, будучи ведучим дитячої програми «На добраніч, діти», яка йшла у прямому ефірі, дід Панас завершив програму такою реплікою: «Отака хуйня, малята…» Речових доказів про те, що таке 


llama_print_timings:        load time =    9521.16 ms
llama_print_timings:      sample time =       6.76 ms /    25 runs   (    0.27 ms per token,  3696.04 tokens per second)
llama_print_timings: prompt eval time =  341534.04 ms /   262 tokens ( 1303.57 ms per token,     0.77 tokens per second)
llama_print_timings:        eval time =  220092.81 ms /    24 runs   ( 9170.53 ms per token,     0.11 tokens per second)
llama_print_timings:       total time =  561985.67 ms
Llama.generate: prefix-match hit


[36;1m[1;3m[llm/end][0m [1m[1:chain:RetrievalQA > 2:retriever:Retriever > 6:chain:LLMChain > 7:llm:LlamaCpp] [561.99s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": "Answer: Grandpa Panas is a presenter of a children's program called \"Good night, kids\".",
        "generation_info": null,
        "type": "Generation"
      }
    ]
  ],
  "llm_output": null,
  "run": null
}
[36;1m[1;3m[chain/end][0m [1m[1:chain:RetrievalQA > 2:retriever:Retriever > 6:chain:LLMChain] [561.99s] Exiting Chain run with output:
[0m{
  "text": "Answer: Grandpa Panas is a presenter of a children's program called \"Good night, kids\"."
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 2:retriever:Retriever > 8:chain:LLMChain] Entering Chain run with input:
[0m{
  "question": "Who is Grandpa Panas?",
  "context": "Петро́ Юхи́мович Вескляро́в, ім'я при народженні Пінхас Хаїмович Весклер (9 червня 1911(19110609), Тальне, Уманський повіт, Київська губернія,


llama_print_timings:        load time =    9521.16 ms
llama_print_timings:      sample time =       4.49 ms /    14 runs   (    0.32 ms per token,  3117.35 tokens per second)
llama_print_timings: prompt eval time =  194892.79 ms /   157 tokens ( 1241.36 ms per token,     0.81 tokens per second)
llama_print_timings:        eval time =   94805.90 ms /    13 runs   ( 7292.76 ms per token,     0.14 tokens per second)
llama_print_timings:       total time =  289887.69 ms
Llama.generate: prefix-match hit


[36;1m[1;3m[llm/end][0m [1m[1:chain:RetrievalQA > 2:retriever:Retriever > 8:chain:LLMChain > 9:llm:LlamaCpp] [289.89s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": "Answer: Петро Юхимович Вескляров",
        "generation_info": null,
        "type": "Generation"
      }
    ]
  ],
  "llm_output": null,
  "run": null
}
[36;1m[1;3m[chain/end][0m [1m[1:chain:RetrievalQA > 2:retriever:Retriever > 8:chain:LLMChain] [289.90s] Exiting Chain run with output:
[0m{
  "text": "Answer: Петро Юхимович Вескляров"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 10:chain:StuffDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 10:chain:StuffDocumentsChain > 11:chain:LLMChain] Entering Chain run with input:
[0m{
  "question": "Who is Grandpa Panas?",
  "context": "\nAnswer: Petro Yukhymovych Vesklyarov\n\nAnswer: Grandpa Panas is a presenter of a children's program called \"Good


llama_print_timings:        load time =    9521.16 ms
llama_print_timings:      sample time =       3.20 ms /    14 runs   (    0.23 ms per token,  4376.37 tokens per second)
llama_print_timings: prompt eval time =  178376.20 ms /   144 tokens ( 1238.72 ms per token,     0.81 tokens per second)
llama_print_timings:        eval time =  128037.46 ms /    14 runs   ( 9145.53 ms per token,     0.11 tokens per second)
llama_print_timings:       total time =  306564.76 ms


In [44]:
with debug_langchain():
    question = "Хто такий дід Панас?"
    # qa_chain({"query": question})
    qa_chain.run(question)

[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA] Entering Chain run with input:
[0m{
  "query": "Хто такий дід Панас?"
}


Llama.generate: prefix-match hit


[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 2:retriever:Retriever > 4:chain:LLMChain] Entering Chain run with input:
[0m{
  "question": "Хто такий дід Панас?",
  "context": "Petro Yukhymovych Vesklyarov (Ukrainian: Вескляров Петро Юхимович) (June 10 [O.S. May 28] 1911 in Talne, Ukraine – January 5, 1994 in Kyiv) was a Ukrainian theater and television actor. He was also known by the nickname Did Panas (Grandpa Panas, Ukrainian: дід Панас).\nBetween 1932 and 1940, Vesklyarov was an actor in a travelling workers' theatre, and between 1946 and 1959 he performed at the Taras Shevchenko Musical-Drama Theatre in Lutsk, Volyn. Between 1959 and 1982 Veslklyarov worked in the Dovzhenko Film Studios, appearing in a number of films. He starred in the 1959 drama film Ivanna and appeared in the 1970 comedy film Two Days of Miracles. During this time (1964-1986) he appeared as the character \"Дід Панас\" (Grandpa Panas) in the Ukrainian television series \"На добраніч, діти\"  (Goodnig


llama_print_timings:        load time =    9521.16 ms
llama_print_timings:      sample time =       4.23 ms /    18 runs   (    0.24 ms per token,  4253.31 tokens per second)
llama_print_timings: prompt eval time =  680939.70 ms /   543 tokens ( 1254.03 ms per token,     0.80 tokens per second)
llama_print_timings:        eval time =  156622.63 ms /    17 runs   ( 9213.10 ms per token,     0.11 tokens per second)
llama_print_timings:       total time =  837859.71 ms
Llama.generate: prefix-match hit


[36;1m[1;3m[llm/end][0m [1m[1:chain:RetrievalQA > 2:retriever:Retriever > 4:chain:LLMChain > 5:llm:LlamaCpp] [837.87s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": "\nAnswer: Petro Yukhymovych Vesklyarov",
        "generation_info": null,
        "type": "Generation"
      }
    ]
  ],
  "llm_output": null,
  "run": null
}
[36;1m[1;3m[chain/end][0m [1m[1:chain:RetrievalQA > 2:retriever:Retriever > 4:chain:LLMChain] [837.87s] Exiting Chain run with output:
[0m{
  "text": "\nAnswer: Petro Yukhymovych Vesklyarov"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 2:retriever:Retriever > 6:chain:LLMChain] Entering Chain run with input:
[0m{
  "question": "Хто такий дід Панас?",
  "context": "== Цікаві факти ==\nЗначного поширення набула легенда про те, що, будучи ведучим дитячої програми «На добраніч, діти», яка йшла у прямому ефірі, дід Панас завершив програму такою реплікою: «Отака хуйня, малята…» Речових доказів про те, що таке с


llama_print_timings:        load time =    9521.16 ms
llama_print_timings:      sample time =       7.41 ms /    25 runs   (    0.30 ms per token,  3375.64 tokens per second)
llama_print_timings: prompt eval time =  323406.99 ms /   262 tokens ( 1234.38 ms per token,     0.81 tokens per second)
llama_print_timings:        eval time =  222544.72 ms /    24 runs   ( 9272.70 ms per token,     0.11 tokens per second)
llama_print_timings:       total time =  546238.28 ms
Llama.generate: prefix-match hit


[36;1m[1;3m[llm/end][0m [1m[1:chain:RetrievalQA > 2:retriever:Retriever > 6:chain:LLMChain > 7:llm:LlamaCpp] [546.24s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": "\nAnswer: ведучим дитячої програми «На добраніч, діти»",
        "generation_info": null,
        "type": "Generation"
      }
    ]
  ],
  "llm_output": null,
  "run": null
}
[36;1m[1;3m[chain/end][0m [1m[1:chain:RetrievalQA > 2:retriever:Retriever > 6:chain:LLMChain] [546.25s] Exiting Chain run with output:
[0m{
  "text": "\nAnswer: ведучим дитячої програми «На добраніч, діти»"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 2:retriever:Retriever > 8:chain:LLMChain] Entering Chain run with input:
[0m{
  "question": "Хто такий дід Панас?",
  "context": "Петро́ Юхи́мович Вескляро́в, ім'я при народженні Пінхас Хаїмович Весклер (9 червня 1911(19110609), Тальне, Уманський повіт, Київська губернія, Російська імперія — 5 січня 1994, Київ) — український актор і телеведу


llama_print_timings:        load time =    9521.16 ms
llama_print_timings:      sample time =       6.82 ms /    18 runs   (    0.38 ms per token,  2638.52 tokens per second)
llama_print_timings: prompt eval time =  195968.38 ms /   157 tokens ( 1248.21 ms per token,     0.80 tokens per second)
llama_print_timings:        eval time =  158877.72 ms /    17 runs   ( 9345.75 ms per token,     0.11 tokens per second)
llama_print_timings:       total time =  355051.98 ms
Llama.generate: prefix-match hit


[36;1m[1;3m[llm/end][0m [1m[1:chain:RetrievalQA > 2:retriever:Retriever > 8:chain:LLMChain > 9:llm:LlamaCpp] [355.06s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": "Answer: Петро́ Юхи́мович Вескляро́в",
        "generation_info": null,
        "type": "Generation"
      }
    ]
  ],
  "llm_output": null,
  "run": null
}
[36;1m[1;3m[chain/end][0m [1m[1:chain:RetrievalQA > 2:retriever:Retriever > 8:chain:LLMChain] [355.06s] Exiting Chain run with output:
[0m{
  "text": "Answer: Петро́ Юхи́мович Вескляро́в"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 10:chain:StuffDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 10:chain:StuffDocumentsChain > 11:chain:LLMChain] Entering Chain run with input:
[0m{
  "question": "Хто такий дід Панас?",
  "context": "\nAnswer: Petro Yukhymovych Vesklyarov\n\n\nAnswer: ведучим дитячої програми «На добраніч, діти»\n\nAnswer: Пет


llama_print_timings:        load time =    9521.16 ms
llama_print_timings:      sample time =       3.46 ms /    15 runs   (    0.23 ms per token,  4336.51 tokens per second)
llama_print_timings: prompt eval time =  185655.85 ms /   152 tokens ( 1221.42 ms per token,     0.82 tokens per second)
llama_print_timings:        eval time =  128087.76 ms /    14 runs   ( 9149.13 ms per token,     0.11 tokens per second)
llama_print_timings:       total time =  313893.66 ms


In [46]:
with debug_langchain():
    question = "Що таке розпізнавання іменованих сутностей?"
    # qa_chain({"query": question})
    qa_chain.run(question)

[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA] Entering Chain run with input:
[0m{
  "query": "Що таке розпізнавання іменованих сутностей?"
}


Llama.generate: prefix-match hit


[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 2:retriever:Retriever > 4:chain:LLMChain] Entering Chain run with input:
[0m{
  "question": "Що таке розпізнавання іменованих сутностей?",
  "context": "У обробці природної мови зв'язування сутностей (англ. Entity Linking), яке також називають зв'язування іменованих сутностей (ЗІС), розпізнавання іменованих сутностей (РІС), розпізнавання і неоднозначності іменованих сутностей (РНІС) або нормалізація іменованих сутностей (НІС), — це присвоєння унікальної ідентичності об'єктам (наприклад, відомим особам, місцям чи компаніям), що згадуються у тексті. Наприклад, розглянемо речення «Дніпро — річка України». Ідея полягає в тому, щоб визначити, що «Дніпро» належить до річки Дніпро, а не до Дніпро-арена чи будь-якої іншої організації, яку можна назвати «Дніпро». Зв'язування сутностей відрізняється від розпізнавання іменованих сутностей (РІС) тим, що РІС ідентифікує появу іменованої сутності в тексті, але не визначає, що саме це за сутн


llama_print_timings:        load time =    9521.16 ms
llama_print_timings:      sample time =       6.95 ms /    16 runs   (    0.43 ms per token,  2302.49 tokens per second)
llama_print_timings: prompt eval time =  554997.54 ms /   429 tokens ( 1293.70 ms per token,     0.77 tokens per second)
llama_print_timings:        eval time =  136614.16 ms /    15 runs   ( 9107.61 ms per token,     0.11 tokens per second)
llama_print_timings:       total time =  691933.03 ms
Llama.generate: prefix-match hit


[36;1m[1;3m[llm/end][0m [1m[1:chain:RetrievalQA > 2:retriever:Retriever > 4:chain:LLMChain > 5:llm:LlamaCpp] [691.95s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": "Answer: розпізнавання іменованих сутностей",
        "generation_info": null,
        "type": "Generation"
      }
    ]
  ],
  "llm_output": null,
  "run": null
}
[36;1m[1;3m[chain/end][0m [1m[1:chain:RetrievalQA > 2:retriever:Retriever > 4:chain:LLMChain] [691.95s] Exiting Chain run with output:
[0m{
  "text": "Answer: розпізнавання іменованих сутностей"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 2:retriever:Retriever > 6:chain:LLMChain] Entering Chain run with input:
[0m{
  "question": "Що таке розпізнавання іменованих сутностей?",
  "context": "Розпізнавання іменованих сутностей (РІС) (також відоме як ідентифікація об'єктної сутності, фрагментація об'єктної сутності та видобуток об'єктної сутності) — це підзадача видобування інформації, яка намагається зн


llama_print_timings:        load time =    9521.16 ms
llama_print_timings:      sample time =      41.41 ms /   177 runs   (    0.23 ms per token,  4274.54 tokens per second)
llama_print_timings: prompt eval time =  697702.76 ms /   570 tokens ( 1224.04 ms per token,     0.82 tokens per second)
llama_print_timings:        eval time = 1646934.24 ms /   176 runs   ( 9357.58 ms per token,     0.11 tokens per second)
llama_print_timings:       total time = 2346505.84 ms
Llama.generate: prefix-match hit


[36;1m[1;3m[llm/end][0m [1m[1:chain:RetrievalQA > 2:retriever:Retriever > 6:chain:LLMChain > 7:llm:LlamaCpp] [2346.52s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": "\nAnswer: Розпізнавання іменованих сутностей (РІС) (також відоме як ідентифікація об'єктної сутності, фрагментація об'єктної сутності та видобуток об'єктної сутності) — це підзадача видобування інформації, яка намагається знайти і класифікувати іменовані сутності в неструктурованому тексті в заздалегідь визначені категорії, такі як імена людей, організації, місця, медичні коди, час, кількості, грошові значення, відсотки тощо.",
        "generation_info": null,
        "type": "Generation"
      }
    ]
  ],
  "llm_output": null,
  "run": null
}
[36;1m[1;3m[chain/end][0m [1m[1:chain:RetrievalQA > 2:retriever:Retriever > 6:chain:LLMChain] [2346.52s] Exiting Chain run with output:
[0m{
  "text": "\nAnswer: Розпізнавання іменованих сутностей (РІС) (також відоме як ідентифікація о


llama_print_timings:        load time =    9521.16 ms
llama_print_timings:      sample time =       4.13 ms /    19 runs   (    0.22 ms per token,  4597.14 tokens per second)
llama_print_timings: prompt eval time =  344708.53 ms /   277 tokens ( 1244.44 ms per token,     0.80 tokens per second)
llama_print_timings:        eval time =  166553.14 ms /    18 runs   ( 9252.95 ms per token,     0.11 tokens per second)
llama_print_timings:       total time =  511488.12 ms
Llama.generate: prefix-match hit


Answer: Розпізнавання іменованих сутностей (РІС) - це підзадача видобування інформації, яка намагається знайти і класифікувати іменовані сутності в неструктурованому тексті в заздалегідь визначені категорії.[36;1m[1;3m[llm/end][0m [1m[1:chain:RetrievalQA > 10:chain:StuffDocumentsChain > 11:chain:LLMChain > 12:llm:LlamaCpp] [1134.04s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": "Answer: Розпізнавання іменованих сутностей (РІС) - це підзадача видобування інформації, яка намагається знайти і класифікувати іменовані сутності в неструктурованому тексті в заздалегідь визначені категорії.",
        "generation_info": null,
        "type": "Generation"
      }
    ]
  ],
  "llm_output": null,
  "run": null
}
[36;1m[1;3m[chain/end][0m [1m[1:chain:RetrievalQA > 10:chain:StuffDocumentsChain > 11:chain:LLMChain] [1134.04s] Exiting Chain run with output:
[0m{
  "text": "Answer: Розпізнавання іменованих сутностей (РІС) - це підзадача видобування інфор


llama_print_timings:        load time =    9521.16 ms
llama_print_timings:      sample time =      22.10 ms /    83 runs   (    0.27 ms per token,  3755.66 tokens per second)
llama_print_timings: prompt eval time =  374525.16 ms /   309 tokens ( 1212.06 ms per token,     0.83 tokens per second)
llama_print_timings:        eval time =  758689.84 ms /    82 runs   ( 9252.32 ms per token,     0.11 tokens per second)
llama_print_timings:       total time = 1134015.86 ms


## Modify the chain to use custom prompt in Ukrainian and context compression

In [47]:
# # llama
# custom_retriever_prompt = """Використай надані шматки контексту, щоб дістати будь-яку частину контексту, яка відповідає на питання. Якщо ніяка частина контексту не відповідає на питання, поверни NO_OUTPUT. Пам'ятай, НЕ редагуй шматки контексту.
# Питання: {question}
# Контекст: {context}
# Відповідь:
# """
# custom_retriever_prompt_template = PromptTemplate(template=custom_retriever_prompt, input_variables=["question", "context"])

In [48]:
# openchat
custom_retriever_prompt = """<s>[INST] Використай надані шматки контексту, щоб дістати будь-яку частину контексту, яка відповідає на питання. Якщо ніяка частина контексту не відповідає на питання, поверни NO_OUTPUT. Пам'ятай, НЕ редагуй шматки контексту. Відповідь надай українською мовою.
Питання: {question}
Контекст: {context} [/INST] </s>
"""
custom_retriever_prompt_template = PromptTemplate(
    template=custom_retriever_prompt, input_variables=["question", "context"]
)

In [49]:
# # llama
# custom_prompt = """Використай надані шматки контексту, щоб відповісти на питання в кінці. Напиши коротку відповідь декількома реченнями. Не намагайся вигадувати відповідь, якщо ти не знаєш або вона не присутня в даному контексті.
# Контекст: {context}
# Питання: {question}
# Відповідь:
# """
# custom_prompt_template = PromptTemplate(template=custom_prompt, input_variables=["context", "question"])

In [50]:
# openchat
custom_prompt = """<s>[INST] Використай надані шматки контексту, щоб відповісти на питання в кінці. Напиши коротку відповідь декількома реченнями. Не намагайся вигадувати відповідь, якщо ти не знаєш або вона не присутня в даному контексті. Відповідь надай українською мовою.
Контекст: {context}
Питання: {question} [/INST] </s>
"""
custom_prompt_template = PromptTemplate(template=custom_prompt, input_variables=["context", "question"])

In [51]:
k = 3

base_compressor = LLMChainExtractor.from_llm(llm, prompt=custom_retriever_prompt_template)
base_retriever = doc_store.as_retriever(search_kwargs={"k": k})

compression_retriever = ContextualCompressionRetriever(
    base_compressor=base_compressor,
    base_retriever=base_retriever,
    k=k,
)

In [52]:
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=compression_retriever,
    # retriever=qdrant.as_retriever(search_type="mmr"),
    return_source_documents=False,
    chain_type_kwargs={"prompt": custom_prompt_template},
)

In [53]:
with debug_langchain():
    question = "What is Gaussian kernel?"
    # qa_chain({"query": question})
    qa_chain.run(question)

[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA] Entering Chain run with input:
[0m{
  "query": "What is Gaussian kernel?"
}


Llama.generate: prefix-match hit


[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 2:retriever:Retriever > 4:chain:LLMChain] Entering Chain run with input:
[0m{
  "question": "What is Gaussian kernel?",
  "context": "9\na feature map φsuch that the kernel Kdeﬁned above satisﬁes K(x,z) =\nφ(x)Tφ(z)? Inthisparticularexample, theanswerisyes. Thiskernel iscalled\ntheGaussian kernel , and corresponds to an inﬁnite dimensional feature\nmapping φ. We will give a precise characterization about what propert ies\na function Kneeds to satisfy so that it can be a valid kernel function that\ncorresponds to some feature map φ.\nNecessary conditions for valid kernels. Suppose for now that Kis\nindeed a valid kernel corresponding to some feature mapping φ, and we will\nﬁrst see what properties it satisﬁes. Now, consider some ﬁnit e set ofnpoints\n(not necessarily the training set) {x(1),...,x(n)}, and let a square, n-by-n\nmatrixKbe deﬁned so that its ( i,j)-entry is given by Kij=K(x(i),x(j)).\nThis matrix is called the kern


llama_print_timings:        load time =    9521.16 ms
llama_print_timings:      sample time =       2.21 ms /     9 runs   (    0.25 ms per token,  4077.93 tokens per second)
llama_print_timings: prompt eval time =  881449.24 ms /   719 tokens ( 1225.94 ms per token,     0.82 tokens per second)
llama_print_timings:        eval time =   74247.93 ms /     8 runs   ( 9280.99 ms per token,     0.11 tokens per second)
llama_print_timings:       total time =  955996.66 ms
Llama.generate: prefix-match hit


[36;1m[1;3m[llm/end][0m [1m[1:chain:RetrievalQA > 2:retriever:Retriever > 4:chain:LLMChain > 5:llm:LlamaCpp] [955.91s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": "Отповідь: Gaussian kernel",
        "generation_info": null,
        "type": "Generation"
      }
    ]
  ],
  "llm_output": null,
  "run": null
}
[36;1m[1;3m[chain/end][0m [1m[1:chain:RetrievalQA > 2:retriever:Retriever > 4:chain:LLMChain] [955.91s] Exiting Chain run with output:
[0m{
  "text": "Отповідь: Gaussian kernel"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 2:retriever:Retriever > 6:chain:LLMChain] Entering Chain run with input:
[0m{
  "question": "What is Gaussian kernel?",
  "context": "11\nApplication of kernel methods: We’ve seen the application of kernels\nto linear regression. In the next part, we will introduce the support vector\nmachines to which kernels can be directly applied. dwell too much longer on\nithere. Infact, theideaofkernelshassig


llama_print_timings:        load time =    9521.16 ms
llama_print_timings:      sample time =      96.30 ms /   419 runs   (    0.23 ms per token,  4350.85 tokens per second)
llama_print_timings: prompt eval time =  515740.49 ms /   414 tokens ( 1245.75 ms per token,     0.80 tokens per second)
llama_print_timings:        eval time = 3896044.58 ms /   418 runs   ( 9320.68 ms per token,     0.11 tokens per second)
llama_print_timings:       total time = 4415628.89 ms
Llama.generate: prefix-match hit


[36;1m[1;3m[llm/end][0m [1m[1:chain:RetrievalQA > 2:retriever:Retriever > 6:chain:LLMChain > 7:llm:LlamaCpp] [4415.77s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": "[INST] 11\nApplication of kernel methods: We've seen the application of kernels\nto linear regression. In the next part, we will introduce the support vector\nmachines to which kernels can be directly applied. dwell too much longer on\nithere. Infact, theideaofkernelshassigniﬁcantlybroade rapplicabilitythan\nlinear regression and SVMs. Speciﬁcally, if you have any lear ning algorithm\nthat you can write in terms of only inner products ⟨x,z⟩between input\nattribute vectors, then by replacing this with K(x,z) whereKis a kernel,\nyou can “magically” allow your algorithm to work eﬃciently i n the high\ndimensional feature space corresponding to K. For instance, this kernel trick\ncan be applied with the perceptron to derive a kernel percept ron algorithm.\nMany of the algorithms that 


llama_print_timings:        load time =    9521.16 ms
llama_print_timings:      sample time =      61.71 ms /   262 runs   (    0.24 ms per token,  4245.87 tokens per second)
llama_print_timings: prompt eval time =  146517.10 ms /   120 tokens ( 1220.98 ms per token,     0.82 tokens per second)
llama_print_timings:        eval time = 2426309.51 ms /   261 runs   ( 9296.21 ms per token,     0.11 tokens per second)
llama_print_timings:       total time = 2575266.66 ms
Llama.generate: prefix-match hit


[36;1m[1;3m[llm/end][0m [1m[1:chain:RetrievalQA > 2:retriever:Retriever > 8:chain:LLMChain > 9:llm:LlamaCpp] [2575.28s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": "\n[INST] Використай надані шматки контексту, щоб дістати будь-яку частину контексту, яка відповідає на питання. Якщо ніяка частина контексту не відповідає на питання, поверни NO_OUTPUT. Пам'ятай, НЕ редагуй шматки контексту. Відповідь надай українською мовою.\nПитання: What is the Gaussian kernel?\nКонтекст: The second-to-last step uses the fact that∑\ni,jaiaj= (∑\niai)2forai=\nziφk(x(i)). Sincezwas arbitrary, this shows that Kis positive semi-deﬁnite\n(K≥0).\nHence, we’ve shown that if Kis a valid kernel (i.e., if it corresponds to\nsome feature mapping φ), then the corresponding kernel matrix K∈Rn×n\nis symmetric positive semideﬁnite. [/INST] \nВідповідь: Gaussian kernel",
        "generation_info": null,
        "type": "Generation"
      }
    ]
  ],
  "llm_output": null,
  "r


llama_print_timings:        load time =    9521.16 ms
llama_print_timings:      sample time =     133.66 ms /   563 runs   (    0.24 ms per token,  4212.12 tokens per second)
llama_print_timings: prompt eval time =  987930.85 ms /   799 tokens ( 1236.46 ms per token,     0.81 tokens per second)
llama_print_timings:        eval time = 5257461.48 ms /   562 runs   ( 9354.91 ms per token,     0.11 tokens per second)
llama_print_timings:       total time = 6250705.07 ms


In [54]:
with debug_langchain():
    question = "Who is Grandpa Panas?"
    # qa_chain({"query": question})
    qa_chain.run(question)

[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA] Entering Chain run with input:
[0m{
  "query": "Who is Grandpa Panas?"
}


Llama.generate: prefix-match hit


[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 2:retriever:Retriever > 4:chain:LLMChain] Entering Chain run with input:
[0m{
  "question": "Who is Grandpa Panas?",
  "context": "Petro Yukhymovych Vesklyarov (Ukrainian: Вескляров Петро Юхимович) (June 10 [O.S. May 28] 1911 in Talne, Ukraine – January 5, 1994 in Kyiv) was a Ukrainian theater and television actor. He was also known by the nickname Did Panas (Grandpa Panas, Ukrainian: дід Панас).\nBetween 1932 and 1940, Vesklyarov was an actor in a travelling workers' theatre, and between 1946 and 1959 he performed at the Taras Shevchenko Musical-Drama Theatre in Lutsk, Volyn. Between 1959 and 1982 Veslklyarov worked in the Dovzhenko Film Studios, appearing in a number of films. He starred in the 1959 drama film Ivanna and appeared in the 1970 comedy film Two Days of Miracles. During this time (1964-1986) he appeared as the character \"Дід Панас\" (Grandpa Panas) in the Ukrainian television series \"На добраніч, діти\"  (Goodni


llama_print_timings:        load time =    9521.16 ms
llama_print_timings:      sample time =       6.42 ms /    30 runs   (    0.21 ms per token,  4674.35 tokens per second)
llama_print_timings: prompt eval time =  722362.76 ms /   592 tokens ( 1220.21 ms per token,     0.82 tokens per second)
llama_print_timings:        eval time =  281221.45 ms /    30 runs   ( 9374.05 ms per token,     0.11 tokens per second)
llama_print_timings:       total time = 1003990.39 ms
Llama.generate: prefix-match hit


[36;1m[1;3m[llm/end][0m [1m[1:chain:RetrievalQA > 2:retriever:Retriever > 4:chain:LLMChain > 5:llm:LlamaCpp] [1004.00s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": "\nВідповідь: Петро Юхимович Вескляров, відомий також як Дід Панас",
        "generation_info": null,
        "type": "Generation"
      }
    ]
  ],
  "llm_output": null,
  "run": null
}
[36;1m[1;3m[chain/end][0m [1m[1:chain:RetrievalQA > 2:retriever:Retriever > 4:chain:LLMChain] [1004.00s] Exiting Chain run with output:
[0m{
  "text": "\nВідповідь: Петро Юхимович Вескляров, відомий також як Дід Панас"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 2:retriever:Retriever > 6:chain:LLMChain] Entering Chain run with input:
[0m{
  "question": "Who is Grandpa Panas?",
  "context": "== Цікаві факти ==\nЗначного поширення набула легенда про те, що, будучи ведучим дитячої програми «На добраніч, діти», яка йшла у прямому ефірі, дід Панас завершив програму такою реплікою:


llama_print_timings:        load time =    9521.16 ms
llama_print_timings:      sample time =       9.48 ms /    34 runs   (    0.28 ms per token,  3585.74 tokens per second)
llama_print_timings: prompt eval time =  322575.82 ms /   262 tokens ( 1231.21 ms per token,     0.81 tokens per second)
llama_print_timings:        eval time =  309998.52 ms /    33 runs   ( 9393.89 ms per token,     0.11 tokens per second)
llama_print_timings:       total time =  632980.16 ms
Llama.generate: prefix-match hit


Ответ: Дід Панас - це псевдонім Петра Юхимовича Весклярова, українського актора і телеведучого.[36;1m[1;3m[llm/end][0m [1m[1:chain:RetrievalQA > 2:retriever:Retriever > 8:chain:LLMChain > 9:llm:LlamaCpp] [564.59s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": "Ответ: Дід Панас - це псевдонім Петра Юхимовича Весклярова, українського актора і телеведучого.",
        "generation_info": null,
        "type": "Generation"
      }
    ]
  ],
  "llm_output": null,
  "run": null
}
[36;1m[1;3m[chain/end][0m [1m[1:chain:RetrievalQA > 2:retriever:Retriever > 8:chain:LLMChain] [564.59s] Exiting Chain run with output:
[0m{
  "text": "Ответ: Дід Панас - це псевдонім Петра Юхимовича Весклярова, українського актора і телеведучого."
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 10:chain:StuffDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 10:chain:StuffDocumentsChain > 11:c


llama_print_timings:        load time =    9521.16 ms
llama_print_timings:      sample time =       9.86 ms /    41 runs   (    0.24 ms per token,  4159.06 tokens per second)
llama_print_timings: prompt eval time =  196139.52 ms /   157 tokens ( 1249.30 ms per token,     0.80 tokens per second)
llama_print_timings:        eval time =  368077.40 ms /    40 runs   ( 9201.94 ms per token,     0.11 tokens per second)
llama_print_timings:       total time =  564586.07 ms
Llama.generate: prefix-match hit


Отповідь: Дід Панас - це псевдонім Петра Юхимовича Весклярова, українського актора і телеведучого, відомий також як ведучий дитячої програми «На добраніч, діти».[36;1m[1;3m[llm/end][0m [1m[1:chain:RetrievalQA > 10:chain:StuffDocumentsChain > 11:chain:LLMChain > 12:llm:LlamaCpp] [917.68s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": "\nОтповідь: Дід Панас - це псевдонім Петра Юхимовича Весклярова, українського актора і телеведучого, відомий також як ведучий дитячої програми «На добраніч, діти».",
        "generation_info": null,
        "type": "Generation"
      }
    ]
  ],
  "llm_output": null,
  "run": null
}
[36;1m[1;3m[chain/end][0m [1m[1:chain:RetrievalQA > 10:chain:StuffDocumentsChain > 11:chain:LLMChain] [917.68s] Exiting Chain run with output:
[0m{
  "text": "\nОтповідь: Дід Панас - це псевдонім Петра Юхимовича Весклярова, українського актора і телеведучого, відомий також як ведучий дитячої програми «На добраніч, діти»."
}
[36;1


llama_print_timings:        load time =    9521.16 ms
llama_print_timings:      sample time =      16.96 ms /    70 runs   (    0.24 ms per token,  4126.63 tokens per second)
llama_print_timings: prompt eval time =  272723.11 ms /   218 tokens ( 1251.02 ms per token,     0.80 tokens per second)
llama_print_timings:        eval time =  644263.62 ms /    69 runs   ( 9337.15 ms per token,     0.11 tokens per second)
llama_print_timings:       total time =  917671.95 ms


In [55]:
with debug_langchain():
    question = "Хто такий дід Панас?"
    # qa_chain({"query": question})
    qa_chain.run(question)

Llama.generate: prefix-match hit


[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA] Entering Chain run with input:
[0m{
  "query": "Хто такий дід Панас?"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 2:retriever:Retriever > 4:chain:LLMChain] Entering Chain run with input:
[0m{
  "question": "Хто такий дід Панас?",
  "context": "Petro Yukhymovych Vesklyarov (Ukrainian: Вескляров Петро Юхимович) (June 10 [O.S. May 28] 1911 in Talne, Ukraine – January 5, 1994 in Kyiv) was a Ukrainian theater and television actor. He was also known by the nickname Did Panas (Grandpa Panas, Ukrainian: дід Панас).\nBetween 1932 and 1940, Vesklyarov was an actor in a travelling workers' theatre, and between 1946 and 1959 he performed at the Taras Shevchenko Musical-Drama Theatre in Lutsk, Volyn. Between 1959 and 1982 Veslklyarov worked in the Dovzhenko Film Studios, appearing in a number of films. He starred in the 1959 drama film Ivanna and appeared in the 1970 comedy film Two Days of Miracles. During this time (1964-1


llama_print_timings:        load time =    9521.16 ms
llama_print_timings:      sample time =       5.87 ms /    24 runs   (    0.24 ms per token,  4086.50 tokens per second)
llama_print_timings: prompt eval time =  740212.46 ms /   596 tokens ( 1241.97 ms per token,     0.81 tokens per second)
llama_print_timings:        eval time =  214724.15 ms /    23 runs   ( 9335.83 ms per token,     0.11 tokens per second)
llama_print_timings:       total time =  955278.25 ms
Llama.generate: prefix-match hit


[36;1m[1;3m[llm/end][0m [1m[1:chain:RetrievalQA > 2:retriever:Retriever > 4:chain:LLMChain > 5:llm:LlamaCpp] [955.29s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": "\nВідповідь: Дід Панас - Петро Юхимович Вескляров",
        "generation_info": null,
        "type": "Generation"
      }
    ]
  ],
  "llm_output": null,
  "run": null
}
[36;1m[1;3m[chain/end][0m [1m[1:chain:RetrievalQA > 2:retriever:Retriever > 4:chain:LLMChain] [955.29s] Exiting Chain run with output:
[0m{
  "text": "\nВідповідь: Дід Панас - Петро Юхимович Вескляров"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 2:retriever:Retriever > 6:chain:LLMChain] Entering Chain run with input:
[0m{
  "question": "Хто такий дід Панас?",
  "context": "== Цікаві факти ==\nЗначного поширення набула легенда про те, що, будучи ведучим дитячої програми «На добраніч, діти», яка йшла у прямому ефірі, дід Панас завершив програму такою реплікою: «Отака хуйня, малята…» Речових док


llama_print_timings:        load time =    9521.16 ms
llama_print_timings:      sample time =       3.15 ms /    13 runs   (    0.24 ms per token,  4124.37 tokens per second)
llama_print_timings: prompt eval time =  324634.84 ms /   262 tokens ( 1239.06 ms per token,     0.81 tokens per second)
llama_print_timings:        eval time =  113405.51 ms /    12 runs   ( 9450.46 ms per token,     0.11 tokens per second)
llama_print_timings:       total time =  438214.57 ms
Llama.generate: prefix-match hit


[36;1m[1;3m[llm/end][0m [1m[1:chain:RetrievalQA > 2:retriever:Retriever > 6:chain:LLMChain > 7:llm:LlamaCpp] [438.23s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": "Отака хуйня, малята...",
        "generation_info": null,
        "type": "Generation"
      }
    ]
  ],
  "llm_output": null,
  "run": null
}
[36;1m[1;3m[chain/end][0m [1m[1:chain:RetrievalQA > 2:retriever:Retriever > 6:chain:LLMChain] [438.23s] Exiting Chain run with output:
[0m{
  "text": "Отака хуйня, малята..."
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 2:retriever:Retriever > 8:chain:LLMChain] Entering Chain run with input:
[0m{
  "question": "Хто такий дід Панас?",
  "context": "Петро́ Юхи́мович Вескляро́в, ім'я при народженні Пінхас Хаїмович Весклер (9 червня 1911(19110609), Тальне, Уманський повіт, Київська губернія, Російська імперія — 5 січня 1994, Київ) — український актор і телеведучий. Заслужений артист Української РСР (1973). Більш відомий під


llama_print_timings:        load time =    9521.16 ms
llama_print_timings:      sample time =       3.77 ms /    17 runs   (    0.22 ms per token,  4510.48 tokens per second)
llama_print_timings: prompt eval time =  194448.50 ms /   157 tokens ( 1238.53 ms per token,     0.81 tokens per second)
llama_print_timings:        eval time =  147978.12 ms /    16 runs   ( 9248.63 ms per token,     0.11 tokens per second)
llama_print_timings:       total time =  342592.23 ms
Llama.generate: prefix-match hit


[36;1m[1;3m[llm/end][0m [1m[1:chain:RetrievalQA > 2:retriever:Retriever > 8:chain:LLMChain > 9:llm:LlamaCpp] [342.60s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": "Ответ: Пінхас Хаїмович Весклер",
        "generation_info": null,
        "type": "Generation"
      }
    ]
  ],
  "llm_output": null,
  "run": null
}
[36;1m[1;3m[chain/end][0m [1m[1:chain:RetrievalQA > 2:retriever:Retriever > 8:chain:LLMChain] [342.60s] Exiting Chain run with output:
[0m{
  "text": "Ответ: Пінхас Хаїмович Весклер"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 10:chain:StuffDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 10:chain:StuffDocumentsChain > 11:chain:LLMChain] Entering Chain run with input:
[0m{
  "question": "Хто такий дід Панас?",
  "context": "\nВідповідь: Дід Панас - Петро Юхимович Вескляров\n\nОтака хуйня, малята...\n\nОтвет: Пінхас Хаїмович Весклер"
}
[32;1m


llama_print_timings:        load time =    9521.16 ms
llama_print_timings:      sample time =       7.00 ms /    19 runs   (    0.37 ms per token,  2713.12 tokens per second)
llama_print_timings: prompt eval time =  220658.08 ms /   170 tokens ( 1297.99 ms per token,     0.77 tokens per second)
llama_print_timings:        eval time =  170394.92 ms /    18 runs   ( 9466.38 ms per token,     0.11 tokens per second)
llama_print_timings:       total time =  391279.70 ms


In [56]:
with debug_langchain():
    question = "Що таке розпізнавання іменованих сутностей?"
    # qa_chain({"query": question})
    qa_chain.run(question)

Llama.generate: prefix-match hit


[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA] Entering Chain run with input:
[0m{
  "query": "Що таке розпізнавання іменованих сутностей?"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 2:retriever:Retriever > 4:chain:LLMChain] Entering Chain run with input:
[0m{
  "question": "Що таке розпізнавання іменованих сутностей?",
  "context": "У обробці природної мови зв'язування сутностей (англ. Entity Linking), яке також називають зв'язування іменованих сутностей (ЗІС), розпізнавання іменованих сутностей (РІС), розпізнавання і неоднозначності іменованих сутностей (РНІС) або нормалізація іменованих сутностей (НІС), — це присвоєння унікальної ідентичності об'єктам (наприклад, відомим особам, місцям чи компаніям), що згадуються у тексті. Наприклад, розглянемо речення «Дніпро — річка України». Ідея полягає в тому, щоб визначити, що «Дніпро» належить до річки Дніпро, а не до Дніпро-арена чи будь-якої іншої організації, яку можна назвати «Дніпро». Зв'язування сутностей в


llama_print_timings:        load time =    9521.16 ms
llama_print_timings:      sample time =       6.30 ms /    17 runs   (    0.37 ms per token,  2697.98 tokens per second)
llama_print_timings: prompt eval time =  597462.14 ms /   482 tokens ( 1239.55 ms per token,     0.81 tokens per second)
llama_print_timings:        eval time =  149263.63 ms /    16 runs   ( 9328.98 ms per token,     0.11 tokens per second)
llama_print_timings:       total time =  746991.47 ms
Llama.generate: prefix-match hit


[36;1m[1;3m[llm/end][0m [1m[1:chain:RetrievalQA > 2:retriever:Retriever > 4:chain:LLMChain > 5:llm:LlamaCpp] [747.00s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": "[Розпізнавання іменованих сутностей]",
        "generation_info": null,
        "type": "Generation"
      }
    ]
  ],
  "llm_output": null,
  "run": null
}
[36;1m[1;3m[chain/end][0m [1m[1:chain:RetrievalQA > 2:retriever:Retriever > 4:chain:LLMChain] [747.00s] Exiting Chain run with output:
[0m{
  "text": "[Розпізнавання іменованих сутностей]"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 2:retriever:Retriever > 6:chain:LLMChain] Entering Chain run with input:
[0m{
  "question": "Що таке розпізнавання іменованих сутностей?",
  "context": "Розпізнавання іменованих сутностей (РІС) (також відоме як ідентифікація об'єктної сутності, фрагментація об'єктної сутності та видобуток об'єктної сутності) — це підзадача видобування інформації, яка намагається знайти і класи


llama_print_timings:        load time =    9521.16 ms
llama_print_timings:      sample time =      18.96 ms /    83 runs   (    0.23 ms per token,  4378.56 tokens per second)
llama_print_timings: prompt eval time =  709372.04 ms /   570 tokens ( 1244.51 ms per token,     0.80 tokens per second)
llama_print_timings:        eval time =  766671.65 ms /    82 runs   ( 9349.65 ms per token,     0.11 tokens per second)
llama_print_timings:       total time = 1476923.28 ms
Llama.generate: prefix-match hit


[36;1m[1;3m[llm/end][0m [1m[1:chain:RetrievalQA > 2:retriever:Retriever > 6:chain:LLMChain > 7:llm:LlamaCpp] [1476.93s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": "\nВідповідь: Розпізнавання іменованих сутностей - це підзадача видобування інформації, яка намагається знайти та класифікувати іменовані сутності в неструктурованому тексті за заздалегідь визначеними категоріями.",
        "generation_info": null,
        "type": "Generation"
      }
    ]
  ],
  "llm_output": null,
  "run": null
}
[36;1m[1;3m[chain/end][0m [1m[1:chain:RetrievalQA > 2:retriever:Retriever > 6:chain:LLMChain] [1476.93s] Exiting Chain run with output:
[0m{
  "text": "\nВідповідь: Розпізнавання іменованих сутностей - це підзадача видобування інформації, яка намагається знайти та класифікувати іменовані сутності в неструктурованому тексті за заздалегідь визначеними категоріями."
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 2:retriever:Retriever > 8:


llama_print_timings:        load time =    9521.16 ms
llama_print_timings:      sample time =      13.37 ms /    59 runs   (    0.23 ms per token,  4411.87 tokens per second)
llama_print_timings: prompt eval time =  343833.79 ms /   277 tokens ( 1241.28 ms per token,     0.81 tokens per second)
llama_print_timings:        eval time =  534208.95 ms /    58 runs   ( 9210.50 ms per token,     0.11 tokens per second)
llama_print_timings:       total time =  878556.81 ms
Llama.generate: prefix-match hit


[36;1m[1;3m[llm/end][0m [1m[1:chain:RetrievalQA > 2:retriever:Retriever > 8:chain:LLMChain > 9:llm:LlamaCpp] [878.56s] Exiting LLM run with output:
[0m{
  "generations": [
    [
      {
        "text": "Одним словом, розпізнавання іменованих сутностей - це процес виявлення імен у тексті та їх класифікації за типом (особи, організації, місця тощо).",
        "generation_info": null,
        "type": "Generation"
      }
    ]
  ],
  "llm_output": null,
  "run": null
}
[36;1m[1;3m[chain/end][0m [1m[1:chain:RetrievalQA > 2:retriever:Retriever > 8:chain:LLMChain] [878.56s] Exiting Chain run with output:
[0m{
  "text": "Одним словом, розпізнавання іменованих сутностей - це процес виявлення імен у тексті та їх класифікації за типом (особи, організації, місця тощо)."
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 10:chain:StuffDocumentsChain] Entering Chain run with input:
[0m[inputs]
[32;1m[1;3m[chain/start][0m [1m[1:chain:RetrievalQA > 10:chain:StuffDocumentsChain


llama_print_timings:        load time =    9521.16 ms
llama_print_timings:      sample time =      37.74 ms /   155 runs   (    0.24 ms per token,  4107.37 tokens per second)
llama_print_timings: prompt eval time =  360367.32 ms /   282 tokens ( 1277.90 ms per token,     0.78 tokens per second)
llama_print_timings:        eval time = 1427573.62 ms /   154 runs   ( 9269.96 ms per token,     0.11 tokens per second)
llama_print_timings:       total time = 1789379.39 ms
