# Installation
Set up the Colab kernel to work with Ollama on GPU and install the required Python libraries.
- faiss-cpu: a vector index
- langchain-community: required for the integration with Ollama

In [None]:
!sudo apt-get install -y lshw
!curl https://ollama.ai/install.sh | sh
!pip install llama-index qdrant-client sentence-transformers langchain \
        faiss-cpu langchain-community


Initialize the LLM setup: we work with a lightweight LLM, Mistral-7B (4-bit quantized).
First we start the Ollama service with `ollama serve`, then we pull the model locally.

In [None]:
import subprocess
def start_ollama_serving(host="localhost", port=11434, timeout=30.0):
  """Start `ollama serve` in a background process and wait until it is reachable.

  The server's stdout/stderr are discarded instead of being captured in pipes:
  nothing ever reads those pipes, so a chatty server would eventually fill the
  pipe buffer and block.

  Args:
    host: Host name the server is expected to listen on.
    port: TCP port the server is expected to listen on.
    timeout: Maximum number of seconds to wait for the port to accept
      connections. Use 0 to return immediately after spawning the process.

  Returns:
    The `subprocess.Popen` handle of the spawned process, so the caller can
    inspect or terminate it.

  Raises:
    FileNotFoundError: If the `ollama` executable cannot be found.
  """
  # Local imports keep this cell self-contained.
  import socket
  import time

  process = subprocess.Popen(
      ["ollama", "serve"],
      stdout=subprocess.DEVNULL,
      stderr=subprocess.DEVNULL,
  )

  # Poll the port so that cells executed right after this one (e.g. the model
  # pull) do not race against the server start-up.
  deadline = time.monotonic() + timeout
  while time.monotonic() < deadline:
    try:
      with socket.create_connection((host, port), timeout=1.0):
        break
    except OSError:
      # The port is closed: give up early if our process already exited,
      # otherwise wait a little and try again.
      if process.poll() is not None:
        break
      time.sleep(0.5)
  return process

In [None]:
# Launch the Ollama server in the background so the following cells can talk to it.
start_ollama_serving()

In [None]:
# Download the 4-bit quantized Mistral-7B instruct model (the same tag the LLM wrapper uses later).
!ollama pull mistral:7b-instruct-q4_0

# Data
We fetch some data to do RAG on.

In [None]:
!mkdir -p 'data/paul_graham/'
!wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/examples/data/paul_graham/paul_graham_essay.txt' -P 'data/paul_graham/'

In [None]:
# Location of the downloaded essay; this is the file loaded and indexed in the RAG section.
path = "data/paul_graham/paul_graham_essay.txt"

# RAG chain
We set up the RAG chain using LangChain.
We use a custom embedding model and a custom vector store to show how customizable
the chain is and to make its dependencies explicit.

In [None]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableLambda, RunnablePassthrough

from langchain.text_splitter import CharacterTextSplitter

from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Qdrant, FAISS
from langchain_community.llms import Ollama

In [None]:
# chunking strategy: character-based splitting, chunk_size=1000, no overlap
text_splitter = CharacterTextSplitter(chunk_overlap=0, chunk_size=1000)

# read the essay from disk
loader = TextLoader(path)
documents = loader.load()

# cut the loaded documents into chunks
docs = text_splitter.split_documents(documents)

In [None]:
# Embedding model. Download sizes of the two candidates considered:
#   all-MiniLM-L6-v2                      90MB   (used here)
#   distiluse-base-multilingual-cased-v2  539MB
embed_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Index the chunks in a Qdrant collection running in local mode,
# with in-memory storage only.
vectorstore = Qdrant.from_documents(
    documents=docs,
    embedding=embed_model,
    collection_name="my_documents",
    location=":memory:",
)

# Retriever view over the vector store, consumed by the chain.
retriever = vectorstore.as_retriever()

In [None]:
# Prompting scheme of the chain. `{context}` and `{question}` are the two
# placeholders filled in at run time; the wording tells the model to answer
# from the supplied context only.
template = """Answer the question based only on the following context:
{context}

Question: {question}
"""
# Turn the raw template string into a chat prompt object for the chain.
prompt = ChatPromptTemplate.from_template(template)

In [None]:
# LLM wrapper: talks to the local Ollama server and selects the Mistral tag pulled earlier
model = Ollama(
    model="mistral:7b-instruct-q4_0",
    base_url="http://localhost:11434",
)

In [None]:
def _format_docs(retrieved_docs):
  """Join the text of the retrieved chunks into one plain-text context string."""
  return "\n\n".join(doc.page_content for doc in retrieved_docs)


# THE CHAIN
# 1. the incoming question is sent to the retriever; the retrieved chunks are
#    flattened to plain text so the prompt receives readable context instead of
#    the Python repr of a list of Document objects
# 2. the question itself is passed through unchanged
# 3. both are injected into the prompt, sent to the model, and the model
#    output is parsed into a plain string
chain = (
    {
        "context": retriever | RunnableLambda(_format_docs),
        "question": RunnablePassthrough(),
    }
    | prompt
    | model
    | StrOutputParser()
)

# Serving
This is where you play with the chain. Remember to restart the Ollama server if the chain is not responding.

In [None]:
# Start the server again in case the process launched earlier is no longer responding.
start_ollama_serving()

In [None]:
# Sanity check of the bare LLM (no retrieval). Uses `.invoke`, the same Runnable
# entry point as the chain, instead of the deprecated direct call.
model.invoke("What is 2 + 2 ?")

In [None]:
# Query the RAG chain with a question about the indexed Paul Graham essay.
chain.invoke("What did the author work on before college?")