In [1]:
from dotenv import load_dotenv

load_dotenv()

True

In [2]:
from langchain_groq import ChatGroq 

llm = ChatGroq(model="openai/gpt-oss-20b",
               max_tokens=500)

In [3]:
from langchain_core.messages import HumanMessage,SystemMessage

msg = llm.invoke([SystemMessage(content="You are a helpful AI bot that assists the user in choosing the perfect book to read in a single sentence"),
                  HumanMessage(content="I enjoy mystery novels, what should I read?")])

print(msg)

content='' additional_kwargs={'reasoning_content': 'The user: "I enjoy mystery novels, what should I read?" They want a recommendation. The instruction: "You are a helpful AI bot that assists the user in choosing the perfect book to read in a single sentence". So answer in a single sentence. Provide a recommendation: maybe "I recommend \'The Girl with the Dragon Tattoo\' by Stieg Larsson" or "The Hound of the Baskervilles" or "The No. 1 Ladies\' Detective Agency". The user just says "I enjoy mystery novels, what should I read?" They didn\'t specify any sub-genre, age, etc. So a general recommendation: "I recommend \'The Girl with the Dragon Tattoo\' by Stieg Larsson." But it\'s also a thriller. Or "The Murder of Roger Ackroyd" by Agatha Christie. Or "The Name of the Rose" by Umberto Eco. Or "Gone Girl" by Gillian Flynn. Or "The Da Vinci Code" (but not pure mystery). Maybe "The Hound of the Baskervilles" by Arthur Conan Doyle. Or "The Maltese Falcon". Or "The Westing Game". Or "The Shad

In [4]:
from langchain_core.messages import AIMessage

msg = llm.invoke([
    SystemMessage(content="You are a supportive AI bot that suggests fitness activities to a user in one short sentence"),
    HumanMessage(content="I like HIT workouts"),
    AIMessage(content="You should try a crossfit class"),
    HumanMessage(content="How often should I attend?")
])
print(msg)

content='Aim for 2–3 HIT sessions per week, spacing them with a rest day or lighter activity between each.' additional_kwargs={'reasoning_content': 'User: "How often should I attend?" They like HIT workouts. Provide supportive suggestion in one short sentence. Need to give frequency recommendation. Probably 2-3 times per week, ensuring rest days. Keep short.'} response_metadata={'token_usage': {'completion_tokens': 77, 'prompt_tokens': 118, 'total_tokens': 195, 'completion_time': 0.077434839, 'completion_tokens_details': {'reasoning_tokens': 45}, 'prompt_time': 0.005727446, 'prompt_tokens_details': None, 'queue_time': 0.043052734, 'total_time': 0.083162285}, 'model_name': 'openai/gpt-oss-20b', 'system_fingerprint': 'fp_e99e93f2ac', 'service_tier': 'on_demand', 'finish_reason': 'stop', 'logprobs': None, 'model_provider': 'groq'} id='lc_run--019c9e16-a5a9-7a51-91f0-2eb710d5cd51-0' tool_calls=[] invalid_tool_calls=[] usage_metadata={'input_tokens': 118, 'output_tokens': 77, 'total_tokens'

In [5]:
from langchain_core.prompts import PromptTemplate

prompt = PromptTemplate.from_template("Tell me one {adjective} joke about {topic}")
input_ = {"adjective":"funny","topic":"dogs"}
prompt.invoke(input_)

StringPromptValue(text='Tell me one funny joke about dogs')

In [6]:
from langchain_core.prompts import ChatPromptTemplate

prompt = ChatPromptTemplate.from_messages([
    ("system","You are a helpful AI assistant"),
    ("user","Tell me joke about {topic}")
])

prompt.invoke({"topic":"dog"}) # why does this not work

input_ = {"topic":"dogs"}

prompt.invoke(input_)


ChatPromptValue(messages=[SystemMessage(content='You are a helpful AI assistant', additional_kwargs={}, response_metadata={}), HumanMessage(content='Tell me joke about dogs', additional_kwargs={}, response_metadata={})])

In [7]:
from langchain_core.prompts import MessagesPlaceholder
from langchain_core.messages import HumanMessage 

prompt = ChatPromptTemplate.from_messages([
    ("system","You are a helpful AI assistant"),
    MessagesPlaceholder("msgs")
])

input_ = {"msgs":[HumanMessage("What is the day after tomorrow?")]}
prompt.invoke(input_)

ChatPromptValue(messages=[SystemMessage(content='You are a helpful AI assistant', additional_kwargs={}, response_metadata={}), HumanMessage(content='What is the day after tomorrow?', additional_kwargs={}, response_metadata={})])

In [8]:
from langchain_groq import ChatGroq

llm = ChatGroq(model="openai/gpt-oss-20b")
chain = prompt | llm 
response = chain.invoke(input=input_)
print(response)

content='I’m not sure what today’s date is. Could you let me know which day you’re referring to (or your current date), so I can tell you what the day after tomorrow will be?' additional_kwargs={'reasoning_content': 'We have a user question: "What is the day after tomorrow?" This is ambiguous without a date context. The user might be asking generically: if today is X, what is the day after tomorrow? But we need a date. The user didn\'t specify a date. We could ask for clarification: what is today\'s date? Or we could assume a date? Usually we ask for clarification. According to policy, if the user asks a question that depends on a date, we should ask for clarification or mention that we need the current date. So we ask: "What is today\'s date?" Or "What day is today?" Then we can compute. So respond with a question asking for clarification.'} response_metadata={'token_usage': {'completion_tokens': 190, 'prompt_tokens': 87, 'total_tokens': 277, 'completion_time': 0.192965951, 'completio

In [9]:
from pydantic import BaseModel,Field 
from langchain_core.output_parsers import JsonOutputParser

class Joke(BaseModel):
    setup: str = Field(description="question to setup a joke")
    punchline: str = Field(description="answer to resolve the joke")

In [10]:
from langchain_core.prompts import PromptTemplate

joke_query = "Tell me a joke"

output_parser = JsonOutputParser(pydantic_object=Joke)
format_instructions = output_parser.get_format_instructions()

prompt = PromptTemplate(
    template="Answer the user query.\n{format_instructions}\n{query}\n",
    input_variables=['query'],
    partial_variables={"format_instructions":format_instructions}
)

chain = prompt | llm | output_parser

response = chain.invoke({"query":joke_query})
print(response)

{'setup': "Why don't scientists trust atoms?", 'punchline': 'Because they make up everything.'}


In [11]:
from langchain_core.output_parsers import CommaSeparatedListOutputParser
from langchain_core.prompts import PromptTemplate

output_parser = CommaSeparatedListOutputParser()
format_instructions = output_parser.get_format_instructions()

prompt = PromptTemplate(template="Answer the user query. {format_instructions}\n\nList five {subject}.",
                        input_variables=['subject'],
                        partial_variables={"format_instructions":format_instructions})

chain = prompt | llm | output_parser

response = chain.invoke({"subject":"swiss chocolates"})
print(response)

['Lindt', 'Toblerone', 'Nestlé', 'Frey', 'Cailler']


In [12]:
from langchain_groq import ChatGroq
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from pydantic import BaseModel,Field 


class Movie(BaseModel):
    title: str = Field(description="Movie title")
    director: str = Field(description="Director name")
    year: int  = Field(description="Release year")
    genre: str = Field(description="Movie genre")

output_parser = JsonOutputParser(pydantic_object=Movie)
format_instructions = """RESPONSE FORMAT: Return ONLY a single JSON object—no markdown, no examples, no extra keys.  It must look exactly like:
{
  "title": "movie title",
  "director": "director name",
  "year": 2000,
  "genre": "movie genre"
}

IMPORTANT: Your response must be *only* that JSON.  Do NOT include any illustrative or example JSON."""

prompt_template = PromptTemplate(template="""You are JSON only assistant.
                                 Task: Generate info about movie "{movie_name}" in JSON format
                                 {format_instructions}
                                 """,
                                 input_variables=["movie_name"],
                                 partial_variables={"format_instructions":format_instructions})

movie_chain = prompt_template | llm | output_parser
movie_name = "Spider-Man-2"
response = movie_chain.invoke({"movie_name":movie_name})
print(response)

{'title': 'Spider-Man 2', 'director': 'Sam Raimi', 'year': 2004, 'genre': 'Superhero, Action, Adventure'}


In [13]:
from langchain_core.documents import Document

Document(page_content="""Python is an interpreted high-level general-purpose programming language.
 Python's design philosophy emphasizes code readability with its notable use of significant indentation.""",
metadata={
    'my_document_id' : 234234,                      # Unique identifier for this document
    'my_document_source' : "About Python",          # Source or title information
    'my_document_create_time' : 1680013019          # Unix timestamp for document creation (March 28, 2023)
 })


Document(metadata={'my_document_id': 234234, 'my_document_source': 'About Python', 'my_document_create_time': 1680013019}, page_content="Python is an interpreted high-level general-purpose programming language.\n Python's design philosophy emphasizes code readability with its notable use of significant indentation.")

In [14]:
from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader("https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/96-FDF8f7coh0ooim7NyEQ/langchain-paper.pdf")
documents = loader.load()

In [15]:
print(documents[0].page_content[:1000])

* corresponding author - jkim72@kent.edu 
Revolutionizing Mental Health Care through 
LangChain: A Journey with a Large Language 
Model
Aditi Singh 
 Computer Science  
 Cleveland State University  
 a.singh22@csuohio.edu 
Abul Ehtesham  
The Davey Tree Expert 
Company  
abul.ehtesham@davey.com 
Saifuddin Mahmud  
Computer Science & 
Information Systems  
 Bradley University  
smahmud@bradley.edu  
Jong-Hoon Kim* 
 Computer Science,  
Kent State University,  
jkim72@kent.edu 
Abstract— Mental health challenges are on the rise in our 
modern society, and the imperative to address mental disorders, 
especially regarding anxiety, depression, and suicidal thoughts, 
underscores the need for effective interventions. This paper 
delves into the application of recent advancements in pretrained 
contextualized language models to introduce MindGuide, an 
innovative chatbot serving as a mental health assistant for 
individuals seeking guidance and support in these critical areas. 
MindGuide leve

In [16]:
from langchain_community.document_loaders import WebBaseLoader

web_loader = WebBaseLoader("https://docs.langchain.com/")
web_data = web_loader.load()
print(web_data[0].page_content)

USER_AGENT environment variable not set, consider setting it to identify your requests.


Home - Docs by LangChainSkip to main contentDocs by LangChain home pageHomeSearch...⌘KAsk AIGitHubTry LangSmithTry LangSmithSearch...NavigationDocumentationLangChain is the platform for agent engineering. AI teams at Replit, Clay, Rippling, Cloudflare, Workday, and more trust LangChain’s products to engineer reliable agents.LangSmithLangSmith is a platform that helps AI teams use live production data for continuous testing and improvement. LangSmith provides:ObservabilitySee exactly how your agent thinks and acts with detailed tracing and aggregate trend metrics.Learn moreEvaluationTest and score agent behavior on production data or offline datasets to continuously improve performance.Learn morePrompt EngineeringIterate on prompts with version control, prompt optimization, and collaboration features.Learn moreDeploymentShip your agent in one click, using scalable infrastructure built for long-running tasks.Learn moreLangSmith meets the highest standards of data security and privacy wit

In [17]:
from langchain_text_splitters import CharacterTextSplitter

text_splitter = CharacterTextSplitter(chunk_size=200,chunk_overlap=20,separator='\n')
chunks = text_splitter.split_documents(documents)
len(chunks)

147

In [19]:
from langchain_core.documents import Document 
from langchain_community.document_loaders import PyPDFLoader,WebBaseLoader
from langchain_text_splitters import CharacterTextSplitter,RecursiveCharacterTextSplitter

paper_url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/96-FDF8f7coh0ooim7NyEQ/langchain-paper.pdf"
pdf_loader = PyPDFLoader(paper_url)
pdf_docs= pdf_loader.load()

web_url = "https://docs.langchain.com/"
web_loader = WebBaseLoader(web_url)
web_docs = web_loader.load()
splitter_1 = CharacterTextSplitter(chunk_size=300,chunk_overlap=30,separator='\n')
splitter_2 = RecursiveCharacterTextSplitter(chunk_size=500,chunk_overlap=50,separators=["\n\n","\n"," ",""])
chunks_1 = splitter_1.split_documents(pdf_docs)
chunks_2 = splitter_2.split_documents(pdf_docs)


def display_document_stats(docs, name):
    """Display statistics about a list of document chunks"""
    total_chunks = len(docs)
    total_chars = sum(len(doc.page_content) for doc in docs)
    avg_chunk_size = total_chars / total_chunks if total_chunks > 0 else 0
    
    # Count unique metadata keys across all documents
    all_metadata_keys = set()
    for doc in docs:
        all_metadata_keys.update(doc.metadata.keys())
    
    # Print the statistics
    print(f"\n=== {name} Statistics ===")
    print(f"Total number of chunks: {total_chunks}")
    print(f"Average chunk size: {avg_chunk_size:.2f} characters")
    print(f"Metadata keys preserved: {', '.join(all_metadata_keys)}")
    
    if docs:
        print("\nExample chunk:")
        example_doc = docs[min(5, total_chunks-1)]  # Get the 5th chunk or the last one if fewer
        print(f"Content (first 150 chars): {example_doc.page_content[:150]}...")
        print(f"Metadata: {example_doc.metadata}")
        
        # Calculate length distribution
        lengths = [len(doc.page_content) for doc in docs]
        min_len = min(lengths)
        max_len = max(lengths)
        print(f"Min chunk size: {min_len} characters")
        print(f"Max chunk size: {max_len} characters")

# Display stats for both chunk sets
display_document_stats(chunks_1, "Splitter 1")
display_document_stats(chunks_2, "Splitter 2")



=== Splitter 1 Statistics ===
Total number of chunks: 95
Average chunk size: 263.80 characters
Metadata keys preserved: creationdate, page, creator, total_pages, page_label, moddate, source, author, producer, title

Example chunk:
Content (first 150 chars): comprehensive support within the field of mental health. 
Additionally, the paper discusses the implementation of 
Streamlit to enhance the user ex pe...
Metadata: {'producer': 'PyPDF', 'creator': 'Microsoft Word', 'creationdate': '2023-12-31T03:50:13+00:00', 'author': 'IEEE', 'moddate': '2023-12-31T03:52:06+00:00', 'title': 's8329 final', 'source': 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/96-FDF8f7coh0ooim7NyEQ/langchain-paper.pdf', 'total_pages': 6, 'page': 0, 'page_label': '1'}
Min chunk size: 49 characters
Max chunk size: 299 characters

=== Splitter 2 Statistics ===
Total number of chunks: 57
Average chunk size: 452.74 characters
Metadata keys preserved: creationdate, page, creator, total_pages, page

In [22]:
from langchain_huggingface import HuggingFaceEmbeddings 
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import Chroma 

paper_url = "https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/96-FDF8f7coh0ooim7NyEQ/langchain-paper.pdf"
pdf_loader = PyPDFLoader(paper_url)
documents = pdf_loader.load()

splitter = RecursiveCharacterTextSplitter(chunk_size=500,
                                          chunk_overlap=50)
chunks = splitter.split_documents(documents)

print("Chunks:",len(chunks))

embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

vector_store = Chroma.from_documents(documents=chunks,
                      embedding=embedding_model,
                      persist_directory="./chroma_db")

print("ChromaDB created")

Chunks: 57
ChromaDB created


In [24]:
query = "What is Langchain used for?"

docs = vector_store.similarity_search(query,k=3)

for i,doc in enumerate(docs):
    print(f"\nResult:{i + 1}")
    print(doc.page_content[:200])


Result:1
LangChain helps us to unlock the ability to harness the 
LLM’s immense potential in tasks such as document analysis, 
chatbot development, code analysis, and countless other 
applications. Whether you

Result:2
LangChain helps us to unlock the ability to harness the 
LLM’s immense potential in tasks such as document analysis, 
chatbot development, code analysis, and countless other 
applications. Whether you

Result:3
LangChain helps us to unlock the ability to harness the 
LLM’s immense potential in tasks such as document analysis, 
chatbot development, code analysis, and countless other 
applications. Whether you


In [26]:
retriever = vector_store.as_retriever()
docs = retriever.invoke("Langchain")
print(docs[0])

page_content='and human. The conclusion is drawn in Section V. 
II. LANGCHAIN 
LangChain, with its open -source essence, emerges as a 
promising solution, aiming to simplify the complex process of 
developing applications powered by large language models 
(LLMs). This framework though the rapid delivery of building 
blocks and pre-built chains for building large language model 
applications shows the easy way developers can do it.' metadata={'title': 's8329 final', 'source': 'https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/96-FDF8f7coh0ooim7NyEQ/langchain-paper.pdf', 'creationdate': '2023-12-31T03:50:13+00:00', 'page_label': '1', 'page': 0, 'creator': 'Microsoft Word', 'moddate': '2023-12-31T03:52:06+00:00', 'author': 'IEEE', 'producer': 'PyPDF', 'total_pages': 6}


In [3]:
from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader("https://cf-courses-data.s3.us.cloud-object-storage.appdomain.cloud/96-FDF8f7coh0ooim7NyEQ/langchain-paper.pdf")
docs = loader.load()

In [4]:
from langchain_groq import ChatGroq 

llm = ChatGroq(model="openai/gpt-oss-20b",
               max_tokens=512)


In [5]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(chunk_size=800,
                                          chunk_overlap=100,
                                          separators=["\n\n","\n"," ",""])

splits = splitter.split_documents(docs)

In [8]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma 

embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vector_store = Chroma.from_documents(documents=splits,
                                     embedding=embedding_model,
                                     collection_name="pdf_rag")

retriever = vector_store.as_retriever()

In [9]:
from langchain_core.output_parsers import StrOutputParser
from operator import itemgetter
from langchain_core.prompts import ChatPromptTemplate

prompt = ChatPromptTemplate.from_template(
    """You are a helpful assistant.
    Answer the question ONLY uisng the context below. 
    
    Context:
    {context}
    
    Question:
    {question}
    """
)

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)


rag_chain = (
    {
        "context":itemgetter("question") | retriever | format_docs,
        "question":itemgetter("question")
    }
    | prompt
    | llm
    | StrOutputParser()
)

In [10]:
from langchain_core.runnables import RunnableWithMessageHistory
from langchain_core.chat_history import InMemoryChatMessageHistory

# InMemoryChatMessageHistory stores chat history in RAM
# RunnableWithMessageHistory wraps chain/agent this
# automatically adds chat history to every request 
store = {}

def get_session_history(session_id):
    if session_id not in store:
        store[session_id] = InMemoryChatMessageHistory()
    return store[session_id]

chat_chain = RunnableWithMessageHistory(rag_chain,
                                        get_session_history,
                                        input_messages_key="question",
                                        history_messages_key="history")

chat_chain.invoke({"question":"What is this paper about?"},
                  config={"configurable":{"session_id":"1"}})

'The paper focuses on the urgent mental‑health crisis—particularly the strong link between mental disorders and suicide—and proposes using deep‑learning, natural‑language‑processing techniques to detect suicidal ideation early. It highlights how contextualized pre‑trained language models can be harnessed for effective early identification of at‑risk individuals, thereby helping to prevent suicide and improve mental‑health support.'

### Exercise 4
#### **Building a Simple Retrieval System with LangChain**

In this exercise, you'll implement a simple retrieval system using LangChain's vector store and retriever components to help answer questions based on a document.

**Instructions:**

1. Import the necessary components for document loading, embedding, and retrieval.
2. Load the provided document about artificial intelligence.
3. Split the document into manageable chunks.
4. Use an embedding model to create vector representations.
5. Create a vector store and a retriever.
6. Implement a simple question-answering system.
7. Test your system with at least 3 different questions.

**Starter code: provide your solution in the TODO parts**


In [12]:
from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma 
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_core.prompts import ChatPromptTemplate 
from langchain_core.output_parsers import StrOutputParser

from operator import itemgetter


loader = WebBaseLoader("https://python.langchain.com/v0.2/docs/introduction/")
docs = loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=200,
                                               chunk_overlap=20,
                                               separators=["\n\n","\n"," ",""])

chunks = text_splitter.split_documents(docs)

embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

vector_store = Chroma.from_documents(documents=chunks,
                                     embedding=embedding_model,
                                     collection_name="simple-rag")

retriever = vector_store.as_retriever()

prompt = ChatPromptTemplate.from_template(
    """You are a helpful assistant.
    Answer ONLY using the context below.
    
    Context:
    {context}
    
    Question:
    {question}""")

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

rag_chain = (
    {
        "context":itemgetter("question") | retriever | format_docs,
        "question":itemgetter("question")
    }
    | prompt
    | llm 
    | StrOutputParser()
)

test_queries = [
    "What is LangChain?",
    "How do retrievers work?",
    "Why is document splitting important?"
]

for q in test_queries:
    print("Q:", q)
    print("A:", rag_chain.invoke({"question": q}))
    print("-" * 50)


Q: What is LangChain?
A: LangChain overview - Docs by LangChain
--------------------------------------------------
Q: How do retrievers work?
A: I’m sorry, but the provided context does not contain any information about how retrievers work.
--------------------------------------------------
Q: Why is document splitting important?
A: The context provided does not discuss document splitting, so I’m unable to answer that question based on the given information.
--------------------------------------------------
