In [37]:
import os

# Run the rest of the notebook from the parent of the directory the kernel
# started in. The starting directory is remembered on first execution so that
# re-running this cell is idempotent; a bare os.chdir("..") would climb one
# more level on every re-run.
if "_NOTEBOOK_DIR" not in globals():
    _NOTEBOOK_DIR = os.getcwd()
os.chdir(os.path.join(_NOTEBOOK_DIR, os.pardir))

from dotenv import load_dotenv

# Load environment variables via python-dotenv.
load_dotenv()

# setdefault only assigns USER_AGENT when it is not already present in the
# environment, and returns the effective value (shown as the cell output).
# NOTE(review): presumably consumed by the web loader used later — confirm.
os.environ.setdefault(
    "USER_AGENT", "intellijupyter:llm-lab:0.1.0 (by github.com/kvdomingo)"
)

'intellijupyter:llm-lab:0.1.0 (by github.com/kvdomingo)'

In [74]:
import bs4
import chromadb
import ollama
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.documents import Document
from langchain_core.prompts import ChatPromptTemplate
from langchain_ollama import ChatOllama, OllamaEmbeddings

from common.settings import settings

## Indexing: Load

In [57]:
client = ollama.AsyncClient(host=str(settings.OLLAMA_URL))
embedding_model_name = "mxbai-embed-large"
chat_model_name = "llama3.2"
await client.pull(embedding_model_name)
await client.pull(chat_model_name)

{'status': 'success'}

In [40]:
# Whitelist of HTML tags handed to the SoupStrainer as its `name` filter.
content_tags = (
    *(f"h{level}" for level in range(1, 7)),  # h1 .. h6
    "p",
    "a",
    "ul",
    "ol",
    "li",
    "div",
    "code",
    "caption",
    "figcaption",
    "table",
    "thead",
    "tbody",
    "tr",
    "td",
    "th",
)
bs4_strainer = bs4.SoupStrainer(name=content_tags)

# Fetch the page and parse it with the strainer above.
loader = WebBaseLoader(
    web_path="https://kvd.studio/svip/ap186/image-types",
    bs_kwargs=dict(parse_only=bs4_strainer),
)
docs = loader.load()

# Sanity check: character count of the first loaded document.
len(docs[0].page_content)

6238

## Indexing: Split

In [44]:
# Splitter configuration: chunks of up to 1000 characters with a 200-character
# overlap. add_start_index is enabled; the "start_index" metadata it produces
# is read back when the answer's source is printed at the end of the notebook.
splitter_options = dict(
    chunk_size=1000,
    chunk_overlap=200,
    add_start_index=True,
)
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

all_splits = text_splitter.split_documents(docs)

# Number of chunks produced.
len(all_splits)

8

## Indexing: Store

In [52]:
# Connect to a Chroma server over HTTP.
db = chromadb.HttpClient(host="localhost", port=8000)

# Deterministic IDs (source URL, chunk start offset, position in the batch).
# Without explicit ids, every re-run of this cell stores the same chunks again
# under fresh random IDs, so duplicates pile up on the server and crowd the
# top-k retrieval results. With stable IDs a re-run overwrites the same entries.
# NOTE(review): entries inserted by earlier runs (random IDs) are not removed
# by this change — clear the collection once if it already holds duplicates.
split_ids = [
    f'{split.metadata.get("source")}:{split.metadata.get("start_index")}:{position}'
    for position, split in enumerate(all_splits)
]

# Embed every chunk with the Ollama embedding model and store it in Chroma.
vectorstore = Chroma.from_documents(
    documents=all_splits,
    client=db,
    embedding=OllamaEmbeddings(
        base_url=str(settings.OLLAMA_URL),
        model=embedding_model_name,
    ),
    ids=split_ids,
)

## Retrieval

In [55]:
# Similarity-search retriever configured with k=6.
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs=dict(k=6),
)

# Smoke-test the retriever with a sample question and show the first result.
sample_question = "What is the RAW image format used by Nikon?"
retrieved_docs = retriever.invoke(sample_question)
top_hit = retrieved_docs[0]
top_hit.page_content

'channels. In the RAW file format, the raw information captured by the sensor is stored without any\ncompression or manipulation. This technology is still used at present especially by professional camera\nmanufacturers—such as Nikon’s NEF format and Canon’s CR2 format—in order\nto preserve all the information received by the camera’s sensor, which provides an advantage when\npost-processing later on, especially when shooting in low light. As imaging technology developed, so has our\nability to reproduce larger images. Eventually, storage capacity became an issue, which birthed the challenge of\ndata compression.\nOne of the first formats to have taken up this challenge is the Graphics Interchange Format (GIF)\nin 1987 [1]. It uses the Lempel-Ziv-Welch (LZW) lossless compression algorithm, which provides up to\n25% compression. However, GIF is a pixel-dependent, 8-bit indexed format, so images could not show'

## Generation

In [81]:
# System instructions; {context} is filled with the retrieved documents.
base_prompt = """
You are a helpful assistant for question-answering tasks. Use the following pieces of
retrieved content to answer the question. If you don't know the answer, just say that
you don't know, don't try to make up an answer. Do not add any supplementary information.

Context: {context}
"""

# Two-message chat template: the system instructions above, then the user's
# question supplied as {input}.
prompt_messages = [
    ("system", base_prompt),
    ("human", "{input}"),
]
prompt = ChatPromptTemplate.from_messages(prompt_messages)

# Chat model served by the same Ollama host as the embeddings.
llm = ChatOllama(base_url=str(settings.OLLAMA_URL), model=chat_model_name)

# Display the assembled template.
prompt

ChatPromptTemplate(input_variables=['context', 'input'], input_types={}, partial_variables={}, messages=[SystemMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], input_types={}, partial_variables={}, template="\nYou are a helpful assistant for question-answering tasks. Use the following pieces of \nretrieved content to answer the question. If you don't know the answer, just say that \nyou don't know, don't try to make up an answer. Do not add any supplementary information.\n\nContext: {context}\n"), additional_kwargs={}), HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['input'], input_types={}, partial_variables={}, template='{input}'), additional_kwargs={})])

In [82]:
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains.retrieval import create_retrieval_chain


def format_docs(docs: list[Document]):
    """Return the ``page_content`` of every document, joined by blank lines.

    An empty list yields an empty string.
    """
    contents = [document.page_content for document in docs]
    separator = "\n\n"
    return separator.join(contents)


# Chain that passes the retrieved documents, the prompt and the question to the LLM.
question_answer_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)

# Full RAG chain: retrieve first, then hand the results to the chain above.
rag_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=question_answer_chain,
)

In [101]:
sources = []
async for out in rag_chain.astream(
    {"input": "What is the story behind the GIF format?"}
):
    if out.get("context"):
        sources.append(out["context"][0])

    if out.get("answer"):
        print(out["answer"], end="", flush=True)

print(
    "\n\nSource:"
    + sources[0].metadata.get("source")
    + f' (l. {sources[0].metadata.get("start_index")})'
)

The GIF (Graphics Interchange Format) was created in 1987. It uses the Lempel-Ziv-Welch (LZW) lossless compression algorithm, which provides up to 25% compression. However, GIF is a pixel-dependent, 8-bit indexed format, so images could not show their full color range in this format. 

The file format quickly fell out of favor when its compression technique was patented in 1995 by Unisys Corporation [2], who attempted to collect royalties from GIF users.

Source:https://kvd.studio/svip/ap186/image-types (l. 3266)
