## Setup

### Install libraries

In [1]:
!pip install --upgrade --quiet langchain langchain_community pdfplumber chromadb tqdm streamlit ollama pyngrok giskard

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m701.6 kB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.3/67.3 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.4/50.4 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.4/60.4 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m32.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?

In [2]:
!curl https://ollama.ai/install.sh | sh

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0>>> Installing ollama to /usr/local
100 13320    0 13320    0     0  32988      0 --:--:-- --:--:-- --:--:-- 32970
>>> Downloading Linux amd64 bundle
############################################################################################# 100.0%
>>> Creating ollama user...
>>> Adding ollama user to video group...
>>> Adding current user to ollama group...
>>> Creating ollama systemd service...
>>> The Ollama API is now available at 127.0.0.1:11434.
>>> Install complete. Run "ollama" from the command line.


In [3]:
!ollama serve > rocama.log 2>&1 &
!ollama pull llama3.1 & ollama pull nomic-embed-text

[?25lpulling manifest ⠋ [?25h[?25lpulling manifest ⠋ [?25h[?25l[2K[1Gpulling manifest ⠙ [?25h[?25l[2K[1Gpulling manifest ⠙ [?25h[?25l[2K[1Gpulling manifest ⠹ [?25h[?25l[2K[1Gpulling manifest ⠹ [?25h[?25l[2K[1Gpulling manifest ⠸ [?25h[?25l[2K[1Gpulling manifest ⠸ [?25h[?25l[2K[1Gpulling manifest ⠼ [?25h[?25l[2K[1Gpulling manifest ⠼ [?25h[?25l[2K[1Gpulling manifest ⠴ [?25h[?25l[2K[1Gpulling manifest ⠴ [?25h[?25l[2K[1Gpulling manifest ⠦ [?25h[?25l[2K[1Gpulling manifest ⠦ [?25h[?25l[2K[1Gpulling manifest ⠧ [?25h[?25l[2K[1Gpulling manifest ⠧ [?25h[?25l[2K[1Gpulling manifest ⠇ [?25h[?25l[2K[1Gpulling manifest ⠇ [?25h[?25l[2K[1Gpulling manifest ⠏ [?25h[?25l[2K[1Gpulling manifest ⠏ [?25h[?25l[2K[1Gpulling manifest ⠋ [?25h[?25l[2K[1Gpulling manifest ⠋ [?25h[?25l[2K[1Gpulling manifest ⠙ [?25h[?25l[2K[1Gpulling manifest ⠙ [?25h[?25l[2K[1Gpulling manifest ⠹ [?25h[?25l[2K[1Gpulling manifest ⠹ [?25h[

In [4]:
!ollama list

NAME                   	ID          	SIZE  	MODIFIED               
llama3.1:latest        	42182419e950	4.7 GB	Less than a second ago	
nomic-embed-text:latest	0a109f422b47	274 MB	54 seconds ago        	


### Load libraries

In [5]:
import os
import datetime
from tqdm import tqdm
import tempfile
import streamlit as st
from IPython.display import Markdown

import ollama
from langchain.chains import RetrievalQA
from langchain.memory.chat_message_histories import StreamlitChatMessageHistory
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_community.chat_models import ChatOllama
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.document_loaders import PDFPlumberLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

### Constants

In [6]:
# Define the directory where your PDFs are stored
pdf_directory = "/content/sample_data/pdfs/"
save_dir = pdf_directory

vector_database_path = "/content/sample_data/vector_database"

for path in [pdf_directory, vector_database_path]:
  if not os.path.exists(path):
    os.makedirs(path, exist_ok=True)

## Workflow

### 1. Extract Texts from PDFs

In [33]:
# Get a list of all PDF files in the directory
pdf_files = [f for f in os.listdir(pdf_directory) if f.endswith('.pdf')]

pdf_pages = []
# Iterate through each PDF file and load it
for pdf_file in pdf_files:
    file_path = os.path.join(pdf_directory, pdf_file)
    print(f"Loading file: {pdf_file}")

    # Load the PDF and split it into pages
    loader = PDFPlumberLoader(file_path=file_path)
    pages = loader.load()
    pdf_pages.extend(pages)

Loading file: Advances in Emerging Memory Technologies - From Data Storage to Artificial Intelligence.pdf


### 2. Split Text

In [34]:
# Initialize the text splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
    length_function=len,
    is_separator_regex=False
)

text_chunks = []
# Split the pages into chunks
for page in pdf_pages:
  chunks = text_splitter.split_text(page.page_content)
  text_chunks.extend(chunks)

text_chunks

['applied\nsciences\nReview\nAdvances in Emerging Memory Technologies: From Data\nStorage to Artificial Intelligence\nGabrielMolas*andEtienneNowak\nCEA,LETI,UniversitéGrenobleAlpes,38000Grenoble,France;etienne.nowak@cea.fr\n* Correspondence:gabriel.molas@cea.fr\nAbstract: Thispaperpresentsanoverviewofemergingmemorytechnologies. Itbeginswiththe\npresentationofstand-aloneandembeddedmemorytechnologyevolution,sincetheappearance',
 'of Flash memory in the 1980s. Then, the progress of emerging memory technologies (based on\nfilamentary,phasechange,magnetic,andferroelectricmechanisms)ispresentedwithareviewofthe\nmajordemonstrationsintheliterature.Thepotentialofthesetechnologiesforstorageapplications\naddressingvariousmarketsandproductsisdiscussed.Finally,wediscusshowtheriseofartificial\nintelligenceandbio-inspiredcircuitsoffersanopportunityforemergingmemorytechnologyand',
 'shiftstheapplicationfrompuredatastoragetostorageandcomputingtasks,andalsoenlargesthe\nrangeofrequiredspecificationsatthe

In [47]:
# Metadata management
def add_metadata(chunks, doc_title):
  metadata_chunks = []
  for chunk in chunks:
    metadata = {
        "title": doc_title,
        "author": "Gabriel Molas, Etienne Nowak",
        "date": str(datetime.date.today())
    }
    metadata_chunks.append({"text": chunk, "metadata": metadata})
  return metadata_chunks

# Add metadata to chunks
chunks_with_metadata = add_metadata(text_chunks, "Advances in Emerging Memory Technologies - From Data Storage to Artificial Intelligence")
chunks_with_metadata

[{'text': 'applied\nsciences\nReview\nAdvances in Emerging Memory Technologies: From Data\nStorage to Artificial Intelligence\nGabrielMolas*andEtienneNowak\nCEA,LETI,UniversitéGrenobleAlpes,38000Grenoble,France;etienne.nowak@cea.fr\n* Correspondence:gabriel.molas@cea.fr\nAbstract: Thispaperpresentsanoverviewofemergingmemorytechnologies. Itbeginswiththe\npresentationofstand-aloneandembeddedmemorytechnologyevolution,sincetheappearance',
  'metadata': {'title': 'Advances in Emerging Memory Technologies - From Data Storage to Artificial Intelligence',
   'author': 'Gabriel Molas, Etienne Nowak',
   'date': '2024-09-17'}},
 {'text': 'of Flash memory in the 1980s. Then, the progress of emerging memory technologies (based on\nfilamentary,phasechange,magnetic,andferroelectricmechanisms)ispresentedwithareviewofthe\nmajordemonstrationsintheliterature.Thepotentialofthesetechnologiesforstorageapplications\naddressingvariousmarketsandproductsisdiscussed.Finally,wediscusshowtheriseofartificial\ninte

### 3. Create embeddings from text chunks

In [48]:
# Generate embeddings for text chunks
def generate_embeddings(text_chunks, model_name='nomic-embed-text'):
  embeddings = []
  for chunk in tqdm(text_chunks, desc="Embedding text chunks"):
    embedding = ollama.embeddings(model=model_name, prompt=chunk)
    embeddings.append(embedding)
  return embeddings

In [49]:
texts = [chunk["text"] for chunk in chunks_with_metadata]
embeddings = generate_embeddings(texts)

Embedding text chunks: 100%|██████████| 252/252 [00:25<00:00,  9.92it/s]


### 4. Store and use embeddings in ChromaDB

In [50]:
# Wrap texts with their respective metadata into Document objects
documents = [Document(page_content=chunk['text'], metadata=chunk['metadata']) for chunk in chunks_with_metadata]

vector_store = Chroma.from_documents(documents=documents,
                                    embedding=OllamaEmbeddings(model="nomic-embed-text", show_progress=False),
                                    persist_directory=vector_database_path,
                                    collection_name="local-rag"
)

vector_store.persist()

### 5. Query Processing Multi-Query Retriever

In [51]:
# LLM from Ollama
local_model = "llama3.1"
llm = ChatOllama(model=local_model)

In [52]:
QUERY_PROMPT = PromptTemplate(
  input_variables=["question"],
  template="""You are an AI language model assistant. Your task is to generate five
  different versions of the given user question to retrieve relevant documents from
  a vector database. By generating multiple perspectives on the user question, your
  goal is to help the user overcome some of the limitations of the distance-based
  similarity search. Provide these alternative questions separated by newlines.
  Original question: {question}""",
)

In [53]:
retriever = MultiQueryRetriever.from_llm(
  vector_store.as_retriever(search_type="mmr", search_kwargs={"k": 4, "fetch_k": 10}),
  ChatOllama(model=local_model),
  prompt=QUERY_PROMPT
)

# RAG prompt
template = """Answer the question based ONLY on the following context:
{context}
Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

In [54]:
chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [57]:
questions = '''What is NAND flash memory?'''

rows = []
for document_score in vector_store.similarity_search_with_score(questions):
  rows.append((document_score[0].page_content, document_score[1]))

import pandas as pd
df = pd.DataFrame(rows, columns=['Text', 'Score'])
df

Unnamed: 0,Text,Score
0,and all manner of flash card products. For sta...,363.202332
1,"DRAM and Flash, often referred to as storage c...",386.425232
2,"The memory is faster than NAND, has higher end...",407.259949
3,in the order of 30–50 ns and the one of flash-...,413.397888


In [58]:
import langchain
langchain.debug = True

display(Markdown(chain.invoke({"question": questions})))

[32;1m[1;3m[chain/start][0m [1m[chain:RunnableSequence] Entering Chain run with input:
[0m{
  "question": "What is NAND flash memory?"
}
[32;1m[1;3m[chain/start][0m [1m[chain:RunnableSequence > chain:RunnableParallel<context,question>] Entering Chain run with input:
[0m{
  "question": "What is NAND flash memory?"
}
[32;1m[1;3m[chain/start][0m [1m[chain:RunnableSequence > chain:RunnableParallel<context,question> > retriever:MultiQueryRetriever > chain:RunnableSequence] Entering Chain run with input:
[0m{
  "question": {
    "question": "What is NAND flash memory?"
  }
}
[32;1m[1;3m[chain/start][0m [1m[chain:RunnableSequence > chain:RunnableParallel<context,question> > retriever:MultiQueryRetriever > chain:RunnableSequence > prompt:PromptTemplate] Entering Prompt run with input:
[0m{
  "question": {
    "question": "What is NAND flash memory?"
  }
}
[36;1m[1;3m[chain/end][0m [1m[chain:RunnableSequence > chain:RunnableParallel<context,question> > retriever:MultiQue

Based on the provided context, it appears that NAND flash memory is not explicitly defined. However, I can provide some relevant information from the text.

NAND flash memory is mentioned in several documents as a type of nonvolatile memory (NVM) that has been scaled for decades to improve performance and bring innovation to the system memory architectures. It is stated that scaling of the NAND technology was pursued below grates the PCRAM with a backend selector in crosspoint arrays, and it continues to improve and is expected to continue as a leading technology for data storage.

In one document, NAND flash memory is mentioned alongside other nonvolatile memories such as DRAM and Flash, often referred to as storage class memory. The text also mentions that NAND technology was scaled for decades, retaining the same system memory architectures with improved performances, bringing innovation to the concept, stack, and architecture.

From this information, it can be inferred that NAND flash memory is a type of nonvolatile memory that has been widely used in data storage applications due to its scalability, performance improvements, and constant technology advancements. However, the exact definition of NAND flash memory is not explicitly provided in the context.

## Streamlit

### Build Streamlit app

In [64]:
%%writefile app.py
import os
import datetime
from tqdm import tqdm
import tempfile
import streamlit as st
from IPython.display import Markdown

import ollama
from langchain.chains import RetrievalQA
from langchain.memory.chat_message_histories import StreamlitChatMessageHistory
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_community.chat_models import ChatOllama
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.document_loaders import PDFPlumberLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

# Create a Streamlit app example
st.set_page_config(page_title="RAG with LLaMa 3.1", layout="wide")
with st.sidebar:
  st.write("**RAG with LLaMa 3.1**")

@st.cache_resource(ttl="1h")
def configure_retriever(uploaded_files):
  # read documents
  docs = []
  temp_dir = tempfile.TemporaryDirectory()
  for file in uploaded_files:
    temp_filepath = os.path.join(temp_dir.name, file.name)
    with open(temp_filepath, "wb") as f:
      f.write(file.getvalue())
    loader = PDFPlumberLoader(file_path=temp_filepath)
    docs.extend(loader.load())

  # split documents
  text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
  splits = text_splitter.split_documents(docs)

  # create embeddings and store in vector_store
  embeddings = OllamaEmbeddings(model="nomic-embed-text", show_progress=False)
  vector_store = Chroma.from_documents(splits, embeddings)

  # define retriever
  retriever = vector_store.as_retriever(
    search_type="mmr", search_kwargs={"k": 4, "fetch_k": 10}
  )

  return retriever

uploaded_files = st.sidebar.file_uploader(
  label="Upload PDF files", type=["pdf"], accept_multiple_files=True
)

if not uploaded_files:
  st.info("Please upload PDF documents to continue.")
  st.stop()

# LLM from Ollama
local_model = "llama3.1"
llm = ChatOllama(model=local_model)

# Prompt template
QUERY_PROMPT = PromptTemplate(
  input_variables=["question"],
  template="""You are an AI language model assistant. Your task is to generate five
  different versions of the given user question to retrieve relevant documents from
  a vector database. By generating multiple perspectives on the user question, your
  goal is to help the user overcome some of the limitations of the distance-based
  similarity search. Provide these alternative questions separated by newlines.
  Original question: {question}""",
)
# Retriever
retriever = MultiQueryRetriever.from_llm(configure_retriever(uploaded_files), llm, prompt=QUERY_PROMPT)

# RAG prompt
template = """Answer the question based ONLY on the following context:
{context}
Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

chain = (
  {"context": retriever, "question": RunnablePassthrough()}
  | prompt
  | llm
  | StrOutputParser()
)

msgs = StreamlitChatMessageHistory()

if len(msgs.messages) == 0 or st.sidebar.button("New Chat"):
  msgs.clear()
  msgs.add_ai_message("How can I help you?")

avatars = {"human": "user", "ai": "assistant"}
for msg in msgs.messages:
  st.chat_message(avatars[msg.type]).write(msg.content)

if user_query := st.chat_input(placeholder="Ask me anything!"):
  msgs.add_user_message(user_query)
  st.chat_message("user").write(user_query)

  with st.chat_message("assistant"):
    answer = chain.invoke({"question": user_query})
    msgs.add_ai_message(answer)
    st.write(answer)


about = st.sidebar.expander("About")
about.write("You can easily chat with a PDF using this AI chatbot. \
            It is build by [AI Geek Labs](https://aigeeklabs.com). Github Repo is [here](https://github.com/aigeek0x0/rag-with-langchain-colbert-and-ragatouille)")

Overwriting app.py


### Run Streamlit app with ngrok

In [65]:
from pyngrok import ngrok
from google.colab import userdata

# Set authentication token if you haven't already done so
ngrok.set_auth_token(userdata.get('NGROK_AUTH_KEY'))

# Start Streamlit server on a specific port
!nohup streamlit run app.py --server.port 8501 &

# Start ngrok tunnel to expose the Streamlit server
ngrok_tunnel = ngrok.connect(addr='8501', proto='http', bind_tls=True)

# Print the URL of the ngrok tunnel
print(' * Tunnel URL:', ngrok_tunnel.public_url)

nohup: appending output to 'nohup.out'
 * Tunnel URL: https://0e1e-34-145-21-212.ngrok-free.app
