# LangChain Workshop

In [9]:
# Install dependencies
!pip install -q openai langchain "unstructured[all-docs]" chromadb tiktoken

In [10]:
import os
from getpass import getpass

# Do not paste the API key into the notebook: it would be saved (and shared)
# with the file. Reuse OPENAI_API_KEY if the environment already provides it;
# otherwise prompt for it. getpass does not echo what is typed.
if not os.environ.get('OPENAI_API_KEY'):
    os.environ['OPENAI_API_KEY'] = getpass('OpenAI API key: ')

In [11]:
# Add your content here and specify the 'source' directory
!unzip -o -q wiki.zip -d wiki/

source = 'wiki'

In [12]:
# Load the content of the 'source' directory into langchain documents.
# DirectoryLoader is just one of the many loaders langchain offers.
from langchain.document_loaders import DirectoryLoader

loader = DirectoryLoader(
    source,
    glob="**/*",         # match files at any depth under 'source'
    silent_errors=True,  # don't abort the load on files that fail to parse
)
raw_docs = loader.load()

# Other loaders are listed at:
# https://python.langchain.com/docs/modules/data_connection/document_loaders/
# https://python.langchain.com/docs/integrations/document_loaders

In [13]:
# Split the loaded documents into manageable chunks before computing embeddings.

from langchain.text_splitter import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50,
)
docs = splitter.split_documents(raw_docs)

# Other splitters and document transformers are listed at:
# https://python.langchain.com/docs/modules/data_connection/document_transformers/
# https://python.langchain.com/docs/integrations/document_transformers

In [14]:
# The split documents are converted into embeddings (OpenAI here), which are
# kept in a ChromaDB vector store so they can be retrieved later.
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma

embedding = OpenAIEmbeddings()
store = Chroma(
    persist_directory="./chroma_db_oai",
    embedding_function=embedding,
)

# Other text embedding models:
# https://python.langchain.com/docs/integrations/text_embedding

# Other vector stores:
# https://python.langchain.com/docs/modules/data_connection/vectorstores/
# https://python.langchain.com/docs/integrations/vectorstores

In [15]:
# A SQLRecordManager tracks what has been indexed, so that embeddings are
# re-computed only for documents that changed.
from langchain.indexes import SQLRecordManager, index

namespace = f'chromadb-{source}'
record_manager = SQLRecordManager(
    namespace,
    db_url=f'sqlite:///{source}/.records.sql',
)
record_manager.create_schema()

# Index the chunks into the vector store. Kept as the last expression so the
# summary of added / updated / skipped / deleted counts is displayed.
index(
    docs,
    record_manager,
    store,
    cleanup="full",
    source_id_key='source',
)

# Reminder: this cell only needs to run when the documents may have changed.

{'num_added': 2531, 'num_updated': 0, 'num_skipped': 1, 'num_deleted': 0}

In [16]:
# The vector store can give us relevant documents. For example:
# The number of documents to return is controlled by the 'k' search kwarg;
# 'top_k' is not a search option the retriever recognises, so the intended
# limit of 3 would not be applied.
retriever = store.as_retriever(search_kwargs={'k': 3})
retriever.get_relevant_documents("What is our leave policy?")

# Here are more retrievers (ways of getting docs from a query)
# https://python.langchain.com/docs/modules/data_connection/retrievers/
# https://python.langchain.com/docs/integrations/retrievers

[Document(page_content='Elections Voting Guidelines: We encourage you to cast your vote. Email your manager and HR (hr@gramener.com) that you will be taking time out to vote. Ensure that your manager has planned for client needs. Don’t apply for leave on HRMS. This leave is not deducted.\n\nAuto-Approval of Leaves: When you apply for a leave in HRMS, it will be auto-approved in 2 business days. (Your Reporting Manager can approve/reject/cancel before that.)\n\nNight Shift Policy', metadata={'source': 'wiki/policy.md'}),
 Document(page_content='You will find your pending leave requests.\n\nOn your leave request, under the Options dropdown (...), select Edit Leave\n\nChange your leave request and click on Update leave\n\nCancel Pending Leave Request\n\nGo to Me > Leave\n\nYou will find your pending leave requests.\n\nOn your leave request, under the Options dropdown (...), select Cancel Leave\n\nView your Leave Policy\n\nGo to Me > Leave\n\nClick on the Leave Policy Explanation option.\n

In [17]:
# We'll pass these results and the question to ChatGPT
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA

llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)
qa_chain = RetrievalQA.from_chain_type(llm, retriever=retriever, return_source_documents=True)

# Here are more chat models
# https://python.langchain.com/docs/integrations/chat/
# Chains are a huge topic that I haven't explored yet

In [18]:
# Ask the question-answering chain. The displayed result holds the 'query',
# the generated answer under 'result', and the retrieved 'source_documents'.
qa_chain("What is our leave policy?")

{'query': 'What is our leave policy?',
 'result': 'To view the leave policy, please follow these steps:\n\n1. Go to Me > Leave.\n2. Click on the "Leave Policy Explanation" option.\n3. A detailed explanation of the leave policy will be presented to you.',
 'source_documents': [Document(page_content='Elections Voting Guidelines: We encourage you to cast your vote. Email your manager and HR (hr@gramener.com) that you will be taking time out to vote. Ensure that your manager has planned for client needs. Don’t apply for leave on HRMS. This leave is not deducted.\n\nAuto-Approval of Leaves: When you apply for a leave in HRMS, it will be auto-approved in 2 business days. (Your Reporting Manager can approve/reject/cancel before that.)\n\nNight Shift Policy', metadata={'source': 'wiki/policy.md'}),
  Document(page_content='You will find your pending leave requests.\n\nOn your leave request, under the Options dropdown (...), select Edit Leave\n\nChange your leave request and click on Update lea

In [19]:
# A more specific question, about the number of leaves.
qa_chain("How many leaves do we get?")

{'query': 'How many leaves do we get?',
 'result': 'The number of leaves you get depends on your length of service:\n\n- 0-2 years of service: 16 leaves per year\n- 2-4 years of service: 18 leaves per year\n- More than 4 years of service: 20 leaves per year\n\nPlease note that these leaves are allotted on a pro-rata basis from the date of joining and are effective from 1st April every year.',
 'source_documents': [Document(page_content="Types of Leaves\n\nPrivilege Leaves: These are allotted on pro-rata basis from DOJ & are effective from 1st April every year.\n0-2 years service = 16 leaves/year (If you join on 1-Jan, you'll accrue 16 x 3 / 12 = 4 days by 1-Apr)\n2-4 years service = 18 leaves/year\nMore than 4 years service = 20 leaves/year", metadata={'source': 'wiki/policy.md'}),
  Document(page_content="Carry Forward Leaves: Please take leave rather than carry it forward.\nYou can carry forward up to 50% of annual PL eligibility (where PL + CF = max 15 days) to the next year (Apr-Ma

In [20]:
# A question on a different topic: coaching and training.
qa_chain("Do we have a coaching or training plan?")

{'query': 'Do we have a coaching or training plan?',
 'result': "Yes, we do have a coaching or training plan. To conduct any training, you can email hr@gramener.com with the details mentioned in the Trainer's Tutorial section. Additionally, there are different types of training available, such as training for developers (Python + Gramex, Gramex, D3) and data visualization for non-programmers. You can discuss with your manager and identify the areas for improvement and create a plan for training. There is also existing material available on the Internal Wiki, GEARS, books, videos, and other resources that can be used for training purposes.",
 'source_documents': [Document(page_content="Trainer's Tutorial\n\nTo conduct any training (online/offline), email hr@gramener.com with these details:\n\nTrainer: who will be mentoring this training?\n\nTitle: of the Training Program\n\nFormat: is this an Online/Offline Session? Lecture? Videos? Workshop?\n\nAudience: list of participants, or Self-e