In [None]:
!pip install -q --upgrade langchain langchain-groq langchain-core langchain_community docx2txt pypdf langchain_chroma sentence_transformers


In [None]:
import langchain
# Print the version string of the installed langchain package.
print(langchain.__version__)

In [None]:
import os
from getpass import getpass

# Never hardcode the key in the notebook: reuse GROQ_API_KEY when it is already set in the
# environment, and only prompt (without echoing) when it is missing.
if not os.environ.get("GROQ_API_KEY"):
    os.environ["GROQ_API_KEY"] = getpass("GROQ_API_KEY: ")

In [None]:
import os
from getpass import getpass

# Tracing configuration, supplied through environment variables.
# The API key is taken from the environment when present and prompted for otherwise,
# so no credential is stored in the notebook itself.
if not os.environ.get("LANGCHAIN_API_KEY"):
    os.environ["LANGCHAIN_API_KEY"] = getpass("LANGCHAIN_API_KEY: ")
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_PROJECT"] = "chatbot"

In [None]:
from langchain_groq import ChatGroq

# Chat model used by every chain in this notebook.
# Presumably authenticates with the GROQ_API_KEY environment variable set earlier — confirm.
llm = ChatGroq(model="llama-3.1-8b-instant")
llm_response = llm.invoke("Tell me a joke")
# Prints the whole response object, not just the generated text.
print(llm_response)


Parsing output - The LLM response contains extra details that the user does not need, so the response is parsed to show only the relevant output.

In [None]:
from langchain_core.output_parsers import StrOutputParser
# Parser applied to the model response from the previous cell.
output_parser=StrOutputParser()
# Last expression of the cell, so the parsed result is displayed as the cell output.
output_parser.invoke(llm_response)

Simple chain - pipe the LLM into the output parser so the response is parsed directly.

In [None]:
# Compose the model and the parser with | so that a single invoke() runs both in sequence.
chain = llm | output_parser
chain.invoke("Tell me a joke")

Structured output - ask the LLM to return data that matches a Pydantic schema.


In [None]:
from typing import List
from pydantic import BaseModel, Field

# Schema describing one phone review; it is handed to llm.with_structured_output() below.
# NOTE(review): '#' comments are used rather than a class docstring because a docstring may
# become part of the schema description sent to the model — confirm before adding one.
class MobileReview(BaseModel):
    # Each Field description documents what the model is expected to put in that field.
    phone_model: str = Field(description="Name and model of the phone")
    rating: float = Field(description="Overall rating out of 5")
    pros: List[str] = Field(description="List of positive aspects")
    cons: List[str] = Field(description="List of negative aspects")
    summary: str = Field(description="Brief summary of the review")

# Free-form review text used as the input for structured extraction.
review_text = """
Just got my hands on the new Galaxy S21 and wow, this thing is slick! The screen is gorgeous,
colors pop like crazy. Camera's insane too, especially at night - my Insta game's never been
stronger. Battery life's solid, lasts me all day no problem.
Not gonna lie though, it's pretty pricey. And what's with ditching the charger? C'mon Samsung.
Also, still getting used to the new button layout, keep hitting Bixby by mistake.
Overall, I'd say it's a solid 4 out of 5. Great phone, but a few annoying quirks keep it from
being perfect. If you're due for an upgrade, definitely worth checking out!
"""

# Wrap the model so that its output is shaped by the MobileReview schema.
structured_llm = llm.with_structured_output(MobileReview)
output = structured_llm.invoke(review_text)
print(output)
# Individual fields are read as attributes of the result.
print(output.pros)


Prompt Template

In [None]:

from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

# Prompt with a single {topic} placeholder, piped into the model and then the string parser.
prompt = ChatPromptTemplate.from_template("Tell me a short joke about {topic}")
output_parser = StrOutputParser()
chain = prompt | llm | output_parser

# Run the same chain once per topic; each result is printed, with a blank line between runs
# omitted exactly as when the calls are written out one after another.
for topic in ("Programming", "car driving"):
    result = chain.invoke({"topic": topic})
    print(result)

In [None]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.messages import HumanMessage,SystemMessage

# Build the conversation explicitly from message objects instead of a prompt template.
system_message=SystemMessage(content="You are a helpful assistane that tells jokes.")
human_message=HumanMessage(content="Tell me a joke about birds")
# The model is invoked with the list of messages, system message first.
result=llm.invoke([system_message,human_message])
print(result)

In [None]:
# Two-message chat prompt given as (role, text) pairs; {user_input} is filled in on invoke.
template=ChatPromptTemplate([
    ("system","You are a helpful assistane that tells jokes."),
    ("human","Tell me about: {user_input}")
])

prompt_value=template.invoke({"user_input":"birds"})
# Last expression of the cell: displays the rendered prompt.
prompt_value


In [None]:
# Send the rendered prompt from the previous cell to the model and display the response.
llm.invoke(prompt_value)

In [None]:
from langchain_community.document_loaders import PyPDFLoader, Docx2txtLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

from typing import List
from langchain_core.documents import Document
import os

# Splitter shared by the loading cells: chunk size and overlap are measured with len(),
# i.e. in characters.
text_splitter=RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    length_function=len
)

# Load a single .docx file from a hard-coded path and report how many documents it produced.
docx_loader=Docx2txtLoader("/content/data/Company_Lumora.docx")
documents=docx_loader.load()
print(len(documents))

# Split the loaded documents into chunks with the splitter configured above.
splits=text_splitter.split_documents(documents)
print(f"Split the documents into {len(splits)} chunks.")



In [None]:
# Display the first chunk (raises IndexError if no chunks were produced).
splits[0]

In [None]:
# Display the metadata attached to the first chunk.
splits[0].metadata

In [None]:
# Display the text of the first chunk.
splits[0].page_content

In [None]:
def load_documents(folder_path: str) -> List[Document]:
    """Load every supported file found directly inside ``folder_path``.

    Files ending in ``.pdf`` are read with ``PyPDFLoader`` and files ending in ``.docx``
    with ``Docx2txtLoader``; the extension match is case-insensitive. Any other file is
    reported on stdout and skipped. Entries that are not regular files (for example
    sub-directories) are skipped silently.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        The documents produced by all loaders, concatenated in sorted filename order.

    Raises:
        OSError: If ``folder_path`` cannot be listed.
    """
    # Supported extensions (lower-case) mapped to the loader class that reads them.
    loaders_by_extension = {".pdf": PyPDFLoader, ".docx": Docx2txtLoader}

    documents = []
    # os.listdir returns entries in arbitrary order; sorting keeps the result stable across runs.
    for filename in sorted(os.listdir(folder_path)):
        file_path = os.path.join(folder_path, filename)
        if not os.path.isfile(file_path):
            continue
        extension = os.path.splitext(filename)[1].lower()
        loader_cls = loaders_by_extension.get(extension)
        if loader_cls is None:
            print(f"Unsupported file type: {filename}")
            continue
        documents.extend(loader_cls(file_path).load())
    return documents

# Load every supported file in the data folder.
# Note: this rebinds `documents` and `splits`, replacing the single-file results from above.
folder_path = "/content/data"
documents = load_documents(folder_path)
print(f"Loaded {len(documents)} documents from the folder.")

splits = text_splitter.split_documents(documents)
print(f"Split the folder documents into {len(splits)} chunks.")


In [None]:
!pip install langchain_huggingface
# Ensure you have clean strings
from langchain_huggingface import HuggingFaceEmbeddings

# Ensure you have clean strings
texts = [
    (s.page_content if hasattr(s, "page_content") else str(s)).strip()
    for s in splits
    if (hasattr(s, "page_content") and isinstance(s.page_content, str) and s.page_content.strip())
       or (not hasattr(s, "page_content") and str(s).strip())
]
if not texts:
    raise ValueError("No non-empty text chunks to embed.")

embeddings = HuggingFaceEmbeddings(
    model_name="nomic-ai/nomic-embed-text-v1.5",
    model_kwargs={"trust_remote_code": True},
    encode_kwargs={"normalize_embeddings": True},
)

document_embeddings = embeddings.embed_documents(texts)
print(f"Embedded {len(document_embeddings)} chunks; dim={len(document_embeddings[0])}")


In [None]:
# Show only the first few components of the first embedding; printing the whole vector
# floods the cell output without adding information.
document_embeddings[0][:10]

Create and persist Chroma vector store

In [None]:
# langchain_chroma is already installed by the first cell of the notebook, so no second
# install is needed here.
from langchain_chroma import Chroma

# Index only the chunks that actually contain text.
docs = [d for d in splits if getattr(d, "page_content", "").strip()]

collection_name = "my_collection"
persist_directory = "./chroma_db"

# NOTE(review): the store is persisted on disk, so re-running this cell presumably adds the
# same chunks to the existing collection again — clear ./chroma_db first if that matters.
vectorstore = Chroma.from_documents(
    documents=docs,
    embedding=embeddings,
    collection_name=collection_name,
    persist_directory=persist_directory,
)


print(f"Vector store created and persisted to '{persist_directory}'")

In [None]:
# Ask the vector store directly for the two chunks closest to the query.
query = "When was Lumora innovations founded"
search_results = vectorstore.similarity_search(query, k=2)

print(f'\n Top 2 most relevant chunks for the query: "{query}"\n')
for rank, hit in enumerate(search_results, start=1):
    # Fall back to 'Unknown' when a chunk carries no source in its metadata.
    source = hit.metadata.get('source', 'Unknown')
    content = hit.page_content
    print(f"Result {rank}")
    print(f"Source: {source}")
    print(f"Content: {content}")
    print()

In [None]:
# The keyword is `search_kwargs`; the previous spelling (`search_kwags`) meant the k=2
# limit was never passed through to the retriever's searches.
retriever = vectorstore.as_retriever(search_kwargs={"k": 2})
retriever.invoke("When was Lumora innovations founded")

In [None]:
from langchain_core.prompts import ChatPromptTemplate
# Prompt for retrieval-augmented answering: {context} receives the retrieved text and
# {question} the user's question.
# Note: this rebinds `template` and `prompt`, which earlier cells used for the joke examples.
template="""Answer the question based only on the following context:
{context}

Question: {question}
Answer:
"""

prompt=ChatPromptTemplate.from_template(template)

In [None]:
# Import from langchain_core (already used throughout this notebook) instead of the
# legacy `langchain.schema.runnable` path.
from langchain_core.runnables import RunnablePassthrough

# The dict fans the input out: the retriever receives the question and fills "context",
# while RunnablePassthrough forwards the question unchanged. The chain stops at the prompt,
# so the cell output is the rendered prompt, with the retriever's raw output as the context.
rag_chain = ({"context": retriever, "question": RunnablePassthrough()} | prompt)
rag_chain.invoke("When was Lumora innovations founded")

In [None]:
def doc2str(docs):
    """Concatenate the ``page_content`` of each document, separated by a blank line.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        A single string; empty when ``docs`` is empty.
    """
    contents = [document.page_content for document in docs]
    return "\n\n".join(contents)

# Same chain as before, but the retriever output is piped through doc2str so that the
# prompt's context is plain text rather than document objects.
rag_chain=({"context":retriever | doc2str,"question":RunnablePassthrough()} | prompt )
rag_chain.invoke("When was Lumora innovations founded")

In [None]:
# Complete pipeline: retrieve -> format context -> prompt -> model -> plain string.
rag_chain=({"context":retriever | doc2str,"question":RunnablePassthrough()} | prompt  | llm |StrOutputParser())
# `question` and `response` are reused by the next cell to seed the chat history.
question="When was Lumora innovations founded"
response=rag_chain.invoke(question)
print(response)

Conversational RAG

In [None]:
from langchain_core.messages import HumanMessage,AIMessage
# Seed the conversation with the question/answer pair produced by the previous cell.
chat_history = [
    HumanMessage(content=question),
    AIMessage(content=response),
]
chat_history

In [None]:
from langchain_core.prompts import MessagesPlaceholder
# Adjacent string literals are concatenated verbatim, so every fragment except the last
# ends with a space; without them the words at the joins run together
# ("questionwhich", "historyformulate", ...).
contextualize_q_system_prompt = (
    "Given a chat history and the latest user question "
    "which might reference context in the chat history "
    "formulate a standalone question which can be understood "
    "without the chat history. Do not answer the question, "
    "just reformulate it if needed and otherwise return it as is"
)

# Prompt layout: system instruction, then the prior conversation (inserted through the
# "chat_history" placeholder), then the latest user input.
contextualize_q_prompt=ChatPromptTemplate.from_messages([
    ("system",contextualize_q_system_prompt),
    MessagesPlaceholder("chat_history"),
    ("human","{input}")
])

contextualize_chain=contextualize_q_prompt | llm | StrOutputParser()
# NOTE(review): chat_history is empty in this call, so the prompt contains nothing that
# "it" could refer to — presumably the history built earlier was intended; confirm.
contextualize_chain.invoke({"input":"Where it is headquatered","chat_history":[]})



In [None]:
# NOTE(review): the first cell installs the latest langchain; in langchain 1.x these legacy
# chain helpers appear to have moved out of `langchain.chains` (to the separate
# langchain-classic package) — verify the imports against the installed version.
from langchain.chains import create_history_aware_retriever
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

# Retriever built from the model, the base retriever and the question-rewriting prompt.
history_aware_retriever=create_history_aware_retriever(
    llm,retriever,contextualize_q_prompt
)

# Answering prompt: instructions, the retrieved {context}, the prior conversation, then
# the new user input.
qa_prompt=ChatPromptTemplate.from_messages([
    ("system","You are helpful AI assistant. Use the following context to answer the user's question."),
    ("system","Context: {context}"),
    MessagesPlaceholder(variable_name="chat_history"),
    ("human","{input}")
])
question_answer_chain=create_stuff_documents_chain(llm,qa_prompt)
# Note: this rebinds `rag_chain`; from here on it expects a dict with "input" and
# "chat_history" keys rather than a bare question string.
rag_chain=create_retrieval_chain(history_aware_retriever,question_answer_chain)

In [None]:
# Query the plain retriever (which receives no chat history) with a follow-up style question.
retriever.invoke("Where is it headquartered?")

In [None]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

# The messages must be passed as a list: the previous `{...}` literal built a set, which is
# unordered and requires hashable members, so the message order was not preserved (and the
# literal can fail outright when a member is unhashable).
qa_prompt = ChatPromptTemplate.from_messages([
    ("system", "You are helpful AI assistant. Use the following context to answer the user's question."),
    ("system", "Context: {context}"),
    MessagesPlaceholder(variable_name="chat_history"),
    ("human", "{input}"),
])
question_answer_chain = create_stuff_documents_chain(llm, qa_prompt)
# Combine history-aware retrieval with the answering chain; invoked with "input" and
# "chat_history" keys.
rag_chain = create_retrieval_chain(history_aware_retriever, question_answer_chain)

In [None]:
# Follow-up question that relies on the chat history to resolve "it".
# The wording was "When it is headquartered?", which asks for the wrong thing.
rag_chain.invoke({"input": "Where is it headquartered?", "chat_history": chat_history})

Building Multi User Chatbot

In [None]:
import sqlite3
from datetime import datetime

# SQLite file that stores the per-session chat log.
DB_NAME = "rag_app.db"


def get_db_connection():
    """Open a connection to ``DB_NAME`` whose rows can be indexed by column name.

    Returns:
        sqlite3.Connection: A new connection; the caller is responsible for closing it.
    """
    conn = sqlite3.connect(DB_NAME)
    conn.row_factory = sqlite3.Row
    return conn


def init_db():
    """Create the ``application_logs`` table if it does not exist yet."""
    conn = get_db_connection()
    try:
        conn.execute('''CREATE TABLE IF NOT EXISTS application_logs
        (id INTEGER PRIMARY KEY AUTOINCREMENT,
        session_id TEXT,
        user_query TEXT,
        gpt_response TEXT,
        model TEXT,
        created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP)''')
        conn.commit()
    finally:
        # Close the connection even when the statement raises.
        conn.close()


def insert_application_logs(session_id, user_query, gpt_response, model):
    """Append one question/answer exchange to the log.

    Args:
        session_id: Identifier of the conversation the exchange belongs to.
        user_query: The user's question.
        gpt_response: The model's answer.
        model: Name of the model that produced the answer.
    """
    conn = get_db_connection()
    try:
        conn.execute('''INSERT INTO application_logs
        (session_id,user_query,gpt_response,model)
        VALUES(?,?,?,?)''', (session_id, user_query, gpt_response, model))
        conn.commit()
    finally:
        conn.close()


def get_chat_history(session_id):
    """Return the logged exchanges of one session as alternating human/ai messages.

    Args:
        session_id: Identifier of the conversation to load.

    Returns:
        list[dict]: ``{"role": ..., "content": ...}`` dicts, oldest exchange first, with
        each question immediately followed by its answer. Empty for an unknown session.
    """
    conn = get_db_connection()
    try:
        # created_at comes from CURRENT_TIMESTAMP, so rows written close together can share
        # a value; the auto-incrementing id breaks such ties in insertion order.
        rows = conn.execute('''SELECT user_query,gpt_response
        FROM application_logs
        WHERE session_id=?
        ORDER BY created_at, id''', (session_id,)).fetchall()
    finally:
        conn.close()

    messages = []
    for row in rows:
        messages.extend([
            {"role": "human", "content": row['user_query']},
            {"role": "ai", "content": row['gpt_response']},
        ])
    return messages


init_db()


In [None]:
import uuid

# A fresh UUID identifies this conversation, so its stored history starts out empty.
session_id = str(uuid.uuid4())
chat_history = get_chat_history(session_id)

# Ask about Lumora, the company used by the notebook's other queries and sample document
# (the question previously named a different company).
question1 = "When was Lumora innovations founded?"
answer1 = rag_chain.invoke({"input": question1, "chat_history": chat_history})['answer']
# Log the model that actually produced the answer — the ChatGroq model configured at the
# top of the notebook — instead of the unrelated "gpt-4o-mini"; keep the two in sync.
insert_application_logs(session_id, question1, answer1, "llama-3.1-8b-instant")
print(f"Human: {question1}\n AI: {answer1}")

In [None]:
question2 = "Where is it headquatered?"
# Reload the history so that the first exchange is available to resolve "it".
chat_history = get_chat_history(session_id)
print(chat_history)
answer2 = rag_chain.invoke({"input": question2, "chat_history": chat_history})['answer']
# Log the model that actually produced the answer (the ChatGroq model configured at the
# top of the notebook) rather than "gpt-4o-mini".
insert_application_logs(session_id, question2, answer2, "llama-3.1-8b-instant")
# Print this turn's exchange; the first turn's question and answer were printed here before.
print(f"Human: {question2}\n AI: {answer2}")