<a href="https://colab.research.google.com/github/jessiechd/RAG_Model/blob/main/0205_agenticRAG_langchain.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Chunking (LangChain)
- Document/MD chunking [Guide](https://www.youtube.com/watch?v=pIGRwMjhMaQ)
- Agentic chunking [Guide](https://github.com/zenUnicorn/Agentic-RAG-LangChain/blob/main/Agentic-RAG.ipynb)

In [25]:
!pip install langchain langchain-community chromadb ollama rich --quiet
!pip install --upgrade langchain




In [30]:
from rich import print
from langchain.docstore.document import Document
from langchain_community.chat_models import ChatOllama
from langchain_community.vectorstores import Chroma
from langchain_community import embeddings
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate

# Module-level chat model ("mistral" via ChatOllama); rag() below pipes its prompt into it.
local_llm = ChatOllama(model="mistral")

def rag(documents, collection_name, question="What is the main topic?"):
  """Index ``documents`` in a Chroma collection and answer ``question`` from them.

  The documents are embedded with the Ollama ``nomic-embed-text`` model,
  stored in a Chroma vector store, and exposed through its default
  retriever. The retrieved context and the question are filled into a prompt
  that is sent to the module-level ``local_llm``.

  Args:
    documents: Documents handed to ``Chroma.from_documents``.
    collection_name: Collection name passed to Chroma.
    question: Question to ask. Defaults to the question that used to be
      hard-coded, so existing two-argument calls behave as before.

  Returns:
    The model's answer as parsed by ``StrOutputParser`` (also printed).
  """
  vectorstore = Chroma.from_documents(
      documents=documents,
      collection_name=collection_name,
      embedding=embeddings.ollama.OllamaEmbeddings(model="nomic-embed-text"),
  )
  retriever = vectorstore.as_retriever()

  prompt_template = """Answer the question based only on the following context:
  {context}
  Question: {question}"""
  prompt = ChatPromptTemplate.from_template(prompt_template)

  # The dict fans the incoming question out: the retriever turns it into
  # context, while RunnablePassthrough forwards it unchanged.
  rag_pipeline = (
      {"context": retriever, "question": RunnablePassthrough()}
      | prompt
      | local_llm
      | StrOutputParser()
  )
  result = rag_pipeline.invoke(question)
  print(result)
  # Return the answer as well so callers can use it, not just read the printout.
  return result

# Document Chunking

In [53]:
from langchain.text_splitter import MarkdownTextSplitter

# Markdown-aware splitter with small chunks (50 characters, 10 overlapping)
splitter = MarkdownTextSplitter(chunk_overlap=10, chunk_size=50)

# Read the whole Markdown file as a single UTF-8 string
md_file = "/content/17_qwen1.md"
with open(md_file, encoding="utf-8") as file:
    markdown_text = file.read()

# Show the chunk documents produced from that one input text
print(splitter.create_documents([markdown_text]))


# Agentic Chunking

In [46]:
!pip install langchain-groq langchain-pinecone pinecone --quiet

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/427.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m419.8/427.3 kB[0m [31m15.2 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m427.3/427.3 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.3 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.3/1.3 MB[0m [31m61.6 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m33.5 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/121.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m121.9/121.9 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━

In [47]:
# GLOBAL
import os
import pandas as pd
import numpy as np
import tiktoken
from uuid import uuid4
# from tqdm import tqdm
from dotenv import load_dotenv
from tqdm.autonotebook import tqdm


# LANGCHAIN
import langchain
from langchain.llms import OpenAI
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.chains.conversation.memory import ConversationBufferWindowMemory
from langchain.chains import RetrievalQA
from langchain_groq import ChatGroq
from langchain_pinecone import PineconeVectorStore
from langchain_core.prompts import PromptTemplate

# VECTOR STORE
import pinecone
from pinecone import Pinecone, ServerlessSpec
# AGENTS
from langchain_community.tools.tavily_search import TavilySearchResults
from langchain.agents import AgentExecutor, Tool, AgentType
from langchain.agents.react.agent import create_react_agent
from langchain import hub

In [48]:
# Load environment variables from a .env file
load_dotenv()
import os

# Read the OpenAI key from the environment; this is None when the variable is unset
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')
#OPENAI_API_KEY = os.environ["OPENAI_API_KEY"] = "***********"

In [60]:
# Read the whole Markdown source into memory as one string
md_file = "/content/17_qwen1.md"
with open(md_file, encoding="utf-8") as file:
    data = file.read()

# Number of characters read (shown as the cell's output)
len(data)

24931

In [54]:
# Tokenization

# Count the number of tokens in a given string
def num_tokens_from_string(question, encoding_name):
    """Tokenize ``question`` with the named tiktoken encoding.

    Despite the function name, the second return value is the encoded token
    sequence itself, not a count; take ``len()`` of it for the number of
    tokens.

    Args:
        question: Text to tokenize.
        encoding_name: Name passed to ``tiktoken.get_encoding``
            (e.g. ``"cl100k_base"``).

    Returns:
        A ``(encoding, tokens)`` tuple: the tiktoken encoding object and the
        result of ``encoding.encode(question)``.
    """
    tokenizer = tiktoken.get_encoding(encoding_name)
    return tokenizer, tokenizer.encode(question)


question = "What is this text about?"

# NOTE: `num_tokens` receives the encoded token sequence (not a count); its
# length is what gets reported as the number of tokens below. `encoding` is
# kept so a later cell can decode tokens back to text.
encoding, num_tokens = num_tokens_from_string(question, "cl100k_base")

# Compare three size measures of the same string: words, characters, tokens.
print(f'Number of Words: {len(question.split())}')
print(f'Number of Characters: {len(question)}')
print(f'List of Tokens: {num_tokens}')
print(f'Nr of Tokens: {len(num_tokens)}')

In [59]:
# Decoding tokenizer

# Round-trip check: decode a hard-coded list of token ids back to text using
# the `encoding` object obtained in the tokenization cell.
encoding.decode([3923, 374, 420, 1495, 922, 30])

'What is this text about?'

In [61]:
# Define cosine similarity function

def cosine_similarity(query_emb, document_emb):
    """Return the cosine similarity between two embedding vectors.

    Args:
        query_emb: Query embedding (intended for a 1-D sequence or array of
            numbers).
        document_emb: Document embedding of the same length.

    Returns:
        ``dot(query, document) / (||query|| * ||document||)``. If either
        vector has zero magnitude the cosine is undefined; ``0.0`` is
        returned instead of dividing by zero (which would produce ``nan``
        together with a RuntimeWarning).
    """
    # Dot product of the query and document embeddings
    dot_product = np.dot(query_emb, document_emb)

    # Product of the L2 norms (magnitudes) of both embeddings
    denominator = np.linalg.norm(query_emb) * np.linalg.norm(document_emb)

    # Guard the undefined case of a zero-length vector
    if denominator == 0:
        return 0.0

    return dot_product / denominator

In [67]:
!pip install faiss-cpu sentence-transformers sklearn --quiet

  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Preparing metadata (setup.py) ... [?25l[?25herror
[1;31merror[0m: [1mmetadata-generation-failed[0m

[31m×[0m Encountered error while generating package metadata.
[31m╰─>[0m See above for output.

[1;35mnote[0m: This is an issue with the package mentioned above, not pip.
[1;36mhint[0m: See above for details.


In [68]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Load a pre-trained sentence-embedding model (all-MiniLM-L6-v2, a small
# MiniLM-based model -- not DistilBERT)
model = SentenceTransformer('all-MiniLM-L6-v2')

# Define the question and document
question = "What is this text about?"
document = "Views of clinical trainers and trainees"

# Generate embeddings for the question and document.
# Each text is wrapped in a one-element list, so each result is indexed as
# [0] below to get the single embedding row.
query_emb = model.encode([question])
document_emb = model.encode([document])

# Compute cosine similarity.
# NOTE: this is sklearn's pairwise cosine_similarity imported at the top of
# this cell; it replaces the NumPy helper of the same name defined in an
# earlier cell. The result is indexed as a matrix (cosine_sim[0][0]).
cosine_sim = cosine_similarity(query_emb, document_emb)

# Output the results
print(f'Query Embedding Dimensions: {len(query_emb[0])}')
print(f'Document Embedding Dimensions: {len(document_emb[0])}')
print("Cosine Similarity:", cosine_sim[0][0])


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [71]:
# Recursive splitter built via the tiktoken-based constructor for the named
# model: 512-unit chunks with 20 units of overlap, trying paragraph, line,
# word and finally character boundaries in that order.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    separators=["\n\n", "\n", " ", ""],
    chunk_overlap=20,
    chunk_size=512,
    model_name="gpt-3.5-turbo-0125",
)

In [73]:
# Indexing

from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Preparing metadata (setup.py) ... [?25l[?25herror
[1;31merror[0m: [1mmetadata-generation-failed[0m

[31m×[0m Encountered error while generating package metadata.
[31m╰─>[0m See above for output.

[1;35mnote[0m: This is an issue with the package mentioned above, not pip.
[1;36mhint[0m: See above for details.


In [75]:
# Sample documents to index
documents = [
    "The views of clinical trainers and trainees",
    "Machine learning models for medical image analysis",
    "Overview of AI in healthcare"
]

# Generate unit-length (L2-normalized) embeddings for the documents.
# Inner product only equals cosine similarity for normalized vectors, so the
# normalization is requested explicitly instead of being assumed.
document_embeddings = model.encode(documents, normalize_embeddings=True)

# FAISS works with float32 NumPy arrays
document_embeddings = np.asarray(document_embeddings, dtype="float32")

# Create a flat (exhaustive) inner-product index. The width is read from the
# data rather than hard-coded (all-MiniLM-L6-v2 produces 384-dimensional vectors).
dimension = document_embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)


In [77]:
# Add the document embeddings only while the index is still empty, so that
# re-running this cell does not insert the same vectors a second time.
if index.ntotal == 0:
    index.add(document_embeddings)

# Encode the query as a unit-length float32 vector (FAISS requires float32;
# normalizing keeps the inner-product scores from scaling with query magnitude).
query = "What are the views of trainers and trainees?"
query_emb = model.encode([query], normalize_embeddings=True).astype("float32")

# Search for up to the top 3 most similar documents. Asking for more
# neighbours than the index holds makes FAISS pad the result with index -1,
# which would silently print documents[-1]; cap k to the index size.
k = min(3, index.ntotal)
distances, indices = index.search(query_emb, k)

# Output the results (row 0 belongs to the single query)
print("Top 3 most similar documents:")
for doc_idx, score in zip(indices[0], distances[0]):
    if doc_idx < 0:
        # Padding entry: no further neighbours available
        continue
    print(f"Document: {documents[doc_idx]} - Similarity: {score}")
