#### Dependencies

In [None]:
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.agents import load_tools, initialize_agent, create_sql_agent
from langchain.agents.agent_types import AgentType
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import TextLoader
from langchain.chains import RetrievalQA, ConversationChain
# Memory class used by the "6. Memory" cell. The imported name must be exactly
# `ConversationBufferMemory`; a stray trailing "import" fused onto the name made
# this line raise ImportError and stopped the whole dependency cell.
from langchain.memory import ConversationBufferMemory
import sqlite3
from langchain.agents.agent_toolkits import SQLDatabaseToolkit
from langchain.sql_database import SQLDatabase
from langchain.chat_models import ChatOpenAI
from langchain.agents.agent_types import AgentType
import os




### 1. LLMs
- Abstraction around different model providers.

In [None]:
# Build the model wrapper once; the cells that follow reuse `llm`.
llm = OpenAI(
    model_name='gpt-4',
    temperature=0.8,
)

# Calling the wrapper directly sends one prompt; the result is printed below.
response = llm('What is the capital of France?')
print(response)

### 2. Prompt Templates
- Reusable templates for structured prompting.

In [None]:

# Template text with a single placeholder, {product}.
template = "What is a good name for a company that makes {product}?"
prompt = PromptTemplate.from_template(template)

# Fill the placeholder for one concrete product and show the rendered prompt.
template_values = {"product": "smart shoes"}
filled_prompt = prompt.format(**template_values)
print(filled_prompt)

### 3. Chains
- Sequence of calls (LLMs, tools, functions). Most basic: LLMChain.

In [None]:
# Tie the prompt template and the model together, then run the chain for one product.
chain = LLMChain(prompt=prompt, llm=llm)
company_name = chain.run(product="AI-powered drones")
print(company_name)

### 4. Agents
- Dynamically select tools based on user input using a "reasoning" loop.

In [None]:
# Tools the agent may pick from: a search tool and an LLM-backed calculator.
tool_names = ["serpapi", "llm-math"]
tools = load_tools(tool_names, llm=llm)

agent = initialize_agent(
    tools=tools,
    llm=llm,
    agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
    verbose=True,  # print the intermediate steps while the agent runs
)

# Last expression of the cell, so the answer is shown as the cell output.
agent.run("What is the square root of the population of Canada?")

### 5. Vector Stores + RAG
- Build question-answering systems over your own documents.

In [None]:
# Load the text file and cut it into overlapping chunks
loader = TextLoader("my_docs.txt")
splitter = RecursiveCharacterTextSplitter(chunk_overlap=50, chunk_size=500)
docs = loader.load()
chunks = splitter.split_documents(docs)

# Embed the chunks and index them in a FAISS store
embedding = OpenAIEmbeddings()
db = FAISS.from_documents(documents=chunks, embedding=embedding)

# Answer a question using the indexed chunks as the retrieval source
qa = RetrievalQA.from_chain_type(
    retriever=db.as_retriever(),
    llm=llm,
)
qa.run("Summarize the key points from the document")

### 6. Memory
- Preserves context between turns of conversation.

In [None]:
# The memory object is handed to the chain so both turns go through the same buffer.
memory = ConversationBufferMemory()
conv_chain = ConversationChain(memory=memory, llm=llm)

# Two consecutive turns; the second one asks about what was said in the first.
for user_turn in (
    "Hi, I'm working on an AI project.",
    "What did I say my project was about?",
):
    print(conv_chain.run(user_turn))

#### SQLite Integration Script

In [None]:


# STEP 1: Set your OpenAI API key
# Prefer exporting OPENAI_API_KEY in the environment so a real secret never has
# to be pasted into the notebook. The placeholder is written only when no key
# (or an empty one) is present, so a real key is no longer overwritten.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = "your-openai-key"

# STEP 2: Create and populate a SQLite database
# The table is dropped and recreated on every run, so re-running this cell
# always leaves exactly the seed rows listed below.
CREATE_EMPLOYEES_SQL = """
    CREATE TABLE employees (
        id INTEGER PRIMARY KEY,
        name TEXT,
        title TEXT,
        department TEXT,
        salary INTEGER
    )
"""
INSERT_EMPLOYEE_SQL = "INSERT INTO employees ( name, title, department, salary ) VALUES (?, ?, ?, ?)"
EMPLOYEE_ROWS = [
    ("Alice Johnson", "Engineer", "R&D", 90000),
    ("Bob Smith", "Manager", "HR", 85000),
    ("Charlie Kim", "Analyst", "Finance", 75000),
    ("Diana Lopez", "Engineer", "R&D", 95000),
]

conn = sqlite3.connect("people.db")
try:
    cursor = conn.cursor()
    cursor.execute("DROP TABLE IF EXISTS employees")
    cursor.execute(CREATE_EMPLOYEES_SQL)
    # Parameterised insert: one row per tuple; `id` is assigned by SQLite.
    cursor.executemany(INSERT_EMPLOYEE_SQL, EMPLOYEE_ROWS)
    conn.commit()
finally:
    # Close the connection even when a statement above raises, so a failed run
    # does not leave an open handle on people.db behind in the kernel.
    conn.close()

# STEP 3: Connect LangChain to the database
db = SQLDatabase.from_uri("sqlite:///people.db")

# STEP 4: Create the agent with SQL toolkit
# The same chat model is given to both the toolkit and the agent.
llm = ChatOpenAI(model="gpt-4", temperature=0)
toolkit = SQLDatabaseToolkit(llm=llm, db=db)

agent_executor = create_sql_agent(
    toolkit=toolkit,
    llm=llm,
    verbose=True,
    agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
)

# STEP 5: Ask natural language questions
question = "Which employees in R&D earn more than 90000?"
response = agent_executor.run(question)
print(response)


### Retrieval Script

In [None]:
import os
import sqlite3
from langchain.chat_models import ChatOpenAI
from langchain.agents import initialize_agent, Tool
from langchain.agents.agent_types import AgentType
from langchain.agents.agent_toolkits import SQLDatabaseToolkit
from langchain.sql_database import SQLDatabase
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
from langchain.chains import RetrievalQA
from langchain.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Set your OpenAI key
# Prefer exporting OPENAI_API_KEY in the environment so a real secret never has
# to be pasted into the notebook. The placeholder is written only when no key
# (or an empty one) is present, so a real key is no longer overwritten.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = "your-openai-key"

# STEP 1: Create SQLite database
# The table is dropped and recreated on every run, so re-running this cell
# always leaves exactly the seed rows listed below.
CREATE_EMPLOYEES_SQL = """
    CREATE TABLE employees (
        id INTEGER PRIMARY KEY,
        name TEXT,
        title TEXT,
        department TEXT,
        salary INTEGER
    )
"""
INSERT_EMPLOYEE_SQL = "INSERT INTO employees (name, title, department, salary) VALUES (?, ?, ?, ?)"
EMPLOYEE_ROWS = [
    ("Alice Johnson", "Engineer", "R&D", 90000),
    ("Bob Smith", "Manager", "HR", 85000),
    ("Charlie Kim", "Analyst", "Finance", 75000),
    ("Diana Lopez", "Engineer", "R&D", 95000),
]

conn = sqlite3.connect("people.db")
try:
    cursor = conn.cursor()
    cursor.execute("DROP TABLE IF EXISTS employees")
    cursor.execute(CREATE_EMPLOYEES_SQL)
    # Parameterised insert: one row per tuple; `id` is assigned by SQLite.
    cursor.executemany(INSERT_EMPLOYEE_SQL, EMPLOYEE_ROWS)
    conn.commit()
finally:
    # Close the connection even when a statement above raises, so a failed run
    # does not leave an open handle on people.db behind in the kernel.
    conn.close()

# STEP 2: Create SQL database object
# The chat model built here is reused by the retrieval chain below.
llm = ChatOpenAI(model="gpt-4", temperature=0)
sql_db = SQLDatabase.from_uri("sqlite:///people.db")

# STEP 3: Load documents and create vector store
loader = TextLoader("my_notes.txt")  # Replace with your own file
splitter = RecursiveCharacterTextSplitter(chunk_overlap=50, chunk_size=500)
documents = loader.load()
docs = splitter.split_documents(documents)

embedding = OpenAIEmbeddings()
vectordb = FAISS.from_documents(documents=docs, embedding=embedding)
retriever = vectordb.as_retriever()

# STEP 4: Create RetrievalQA chain
doc_qa = RetrievalQA.from_chain_type(
    retriever=retriever,
    llm=llm,
)

# STEP 5: Register tools for agent
from langchain.agents import create_sql_agent  # local import keeps this cell runnable on its own

# The SQL tool is backed by a dedicated SQL agent, so it accepts the same kind of
# natural-language question its description advertises.
# The earlier `SQLDatabaseToolkit(...).get_tools()[0].func` could not work: the
# toolkit returns `BaseTool` subclasses, which expose no `.func` attribute. It
# also depended on the order of `get_tools()`, and that first tool expects a raw
# SQL statement rather than a question.
sql_agent = create_sql_agent(
    llm=llm,
    toolkit=SQLDatabaseToolkit(db=sql_db, llm=llm),
    agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
    verbose=True,
)

tools = [
    Tool(
        name="SQL_Database",
        func=sql_agent.run,  # question in, answer text out
        description="Useful for answering questions about employees, departments, and salaries."
    ),
    Tool(
        name="Document_QA",
        func=doc_qa.run,
        description="Useful for answering questions about policy notes and general knowledge from documents."
    )
]

# STEP 6: Initialize hybrid agent
# One agent holds both tools and chooses between them per question.
agent = initialize_agent(
    llm=llm,
    tools=tools,
    verbose=True,
    agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
)

# STEP 7: Ask unified questions
sql_answer = agent.run("Who in R&D earns over $90k?")  # SQL tool
print(sql_answer)

notes_answer = agent.run("Summarize the key points from the notes")  # Document tool
print(notes_answer)

## LangChain


##### This document assumes you have some understanding of the following:
- Python and Object Oriented Programming
- NLP concepts such as embeddings
- A rough idea of how LLMs work
- Some familiarity with vector DBs

##### Explicitly used packages:
- Langchain
- Transformers
##### Implicitly used packages:

- ChromaDB
- openai
- sentence-transformers
> Implicit packages: These packages must be installed to allow for the explicitly defined packages to perform some functions. The explicitly used packages implement the functionality of these implicit packages and simplify it for the end user by abstracting many complicated lines of code into one simple function call (read: Wrapper Classes) but you do not need to understand the inner workings of the implicit packages if you are a beginner



##### Load Dependencies

In [None]:
import os
from langchain.llms import OpenAI
from langchain.document_loaders import TextLoader
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain.embeddings.openai import OpenAIEmbeddings


1. Start with an object of the OpenAI class and set some parameters.
2. Here, we set up the parameter called temperature.
3. A lower temperature value makes the LLM output more deterministic; a higher value (the cell below uses 0.9) makes it more random and "creative".
4. Call the instance (object) of the OpenAI class "llm"

#### API

In [None]:
# Read the key from the environment; the value is None when the variable is unset.
api_key = os.environ.get('OPENAI_API_KEY')

#### Create OpenAI object

In [None]:
# Model wrapper with a high temperature (0.9), followed by a single prompt.
llm = OpenAI(temperature=0.9)
text = 'What would be a good company name for a company that makes colorful socks?'
suggestion = llm(text)
print(suggestion)

##### The pipeline for this task is as follows:

1. Load doc
2. Split lengthy docs
3. Get doc embeddings
4. Store doc embeddings in vector db (chroma db)
5. Query over the db to obtain the correct chunk to answer from


In [None]:
# Empty list that later cells append loaded documents to.
data = list()

##### 1. Load the documents using TextLoader from LangChain.
##### 2. Define a variable to store the documents

- We can append the output of loader.load() into our list variable

In [None]:
# Path of the text file to index. Override it without editing the notebook by
# setting the LANGCHAIN_DOC_PATH environment variable; the original location is
# kept as the fallback so existing setups keep working.
doc_path = os.environ.get(
    "LANGCHAIN_DOC_PATH",
    r'C:\Users\JkReddy\Desktop\Weill Cornell Medicine\Subjects\Capstone\LangChain.txt',
)
loader = TextLoader(doc_path)

# Only the first Document returned by the loader is kept.
document = loader.load()[0]
# Guard the append so re-running this cell does not add the same document twice
# (a duplicate would be split and embedded again further down).
# NOTE(review): relies on Document equality comparing content and metadata — confirm.
if document not in data:
    data.append(document)

- Importing Vector DB - Chroma along with TextSplitter and QA related packages from LangChain.
- Import package for Embeddings from OpenAI

#### Split the lengthy document into smaller chunks.
- This is done because LLMs have a limit to the number of words they can take in as input, and they can retain more information if they have fewer words to work with.
- Chunk overlap parameter dictates how much overlap should exist between the chunks.
- Having more of an overlap ensures that important information is not lost during the splitting process.
- Split documents function takes in a list as an input.
- Each list element must contain a document loaded in by Langchain

In [None]:
# Chunks of up to 1000 characters, with 200 characters shared between neighbours.
text_splitter = CharacterTextSplitter(
    chunk_overlap=200,
    chunk_size=1000,
)
# `data` is the list of loaded documents; the result is the list of chunks.
texts = text_splitter.split_documents(data)

1. Creating database and getting the embedding functions ready.
2. Store the docs in the vector db.

- In the below cell, we mention a directory in which we want our vector db to reside.
- This is then followed by the creation of an embeddings object from the OpenAIEmbeddings class from the Langchain package.
- This can be used for creating the doc embeddings

In [None]:
# Embedding function used to embed the texts
embeddings = OpenAIEmbeddings()

# Directory for the vector store; supplying a persist_directory stores the
# embeddings on disk
persist_directory = 'myvectordb'

##### By default, the model used for embeddings is text-embedding-ada-002

- The vector db is Chroma DB which is integrated into LangChain
- The Chroma.from_documents function takes in these parameters:

- The split up texts
- Embedding instance from Langchain
- Directory in which we want the persistence of our db to be asserted

In [None]:
# Build the Chroma store from the chunks, using the embedding function and the
# on-disk directory defined earlier.
vectordb = Chroma.from_documents(
    documents=texts,
    embedding=embeddings,
    persist_directory=persist_directory,
)

- Write the embeddings to disk using `vectordb.persist()`, then clear the in-memory reference (`vectordb = None`). Reload afterwards to check that the store was persisted.

In [None]:
# Persist the store; it was created with `persist_directory`, which keeps the
# embeddings on disk.
vectordb.persist()
# Drop the in-memory reference so that a later cell has to rebuild `vectordb`
# rather than reuse this object.
vectordb = None

- Reload

In [None]:
# Re-open the store from the persisted directory, using the same embedding function.
vectordb = Chroma(
    embedding_function=embeddings,
    persist_directory=persist_directory,
)

- Create object with LLM model being passed as a parameter along with temperature parameter for controlling the nature of the LLM output.
- OpenAI class also takes in model_name as a parameter.


In [None]:
# Retrieval QA chain over the Chroma store; "stuff" is the chain type used to
# combine the retrieved chunks with the question.
# `text-davinci-003` has been retired by OpenAI and requests to it now fail;
# `gpt-3.5-turbo-instruct` is the completions-API replacement.
# NOTE(review): needs a LangChain release that sends `-instruct` models to the
# completions endpoint — confirm against the installed version.
gpt_qa = RetrievalQA.from_chain_type(
    llm=OpenAI(temperature=0.1, model_name="gpt-3.5-turbo-instruct"),
    chain_type="stuff",
    retriever=vectordb.as_retriever(),
)

- Our newly created `gpt_qa` object has a `run` method, which we use to run a query over the db.
- Once the right doc chunk has been retrieved, it is passed to the llm along with the query.

In [None]:

# Question put to the `gpt_qa` chain.
query = "What can I eat?"
# Last expression of the cell, so the returned value is shown as the cell output.
gpt_qa.run(query)

- Creating an instance of an Open Source Embedding

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.indexes import VectorstoreIndexCreator
from langchain.vectorstores import Chroma
from langchain import HuggingFacePipeline
from langchain.chains import RetrievalQA

# Open-source embedding model, run on the CPU.
model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = dict(device='cpu')
embeddings = HuggingFaceEmbeddings(
    model_kwargs=model_kwargs,
    model_name=model_name,
)

# Split the loaded documents again, this time with smaller chunks (500 / 50).
text_splitter = CharacterTextSplitter(chunk_overlap=50, chunk_size=500)
texts = text_splitter.split_documents(data)

# Separate directory for the store built with these embeddings.
persist_directory = 'myvectordb_opensource'
vectordb = Chroma.from_documents(
    documents=texts,
    embedding=embeddings,
    persist_directory=persist_directory,
)
vectordb.persist()

- Langchain HF pipeline only supports models in the hub which function as text2text gen or text gen models.

In [None]:
# Keyword arguments forwarded through `model_kwargs`.
hf_model_kwargs = {"temperature":0, "max_length":50, "min_length":10}

# Rebinds `llm` to a Hugging Face hub model run as a text2text-generation pipeline.
llm = HuggingFacePipeline.from_model_id(
    task="text2text-generation",
    model_id="declare-lab/flan-gpt4all-xl",
    model_kwargs=hf_model_kwargs,
)

In [None]:
# Retrieval QA chain over the current `vectordb`, using the "refine" chain type.
qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=vectordb.as_retriever(),
    chain_type="refine",
)

In [None]:
# Question put to the `qa` chain.
query = "Can I have fruits?"
# Last expression of the cell, so the returned value is shown as the cell output.
qa.run(query)


