In [1]:
from langchain_community.document_loaders import DirectoryLoader, UnstructuredFileLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os
import time
import pickle



from langchain.embeddings import OllamaEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
import pinecone

from langchain_community.vectorstores import Pinecone
from langchain_pinecone import PineconeVectorStore
from dotenv import load_dotenv
from langchain_community.document_loaders import WebBaseLoader

from langchain_community.chat_message_histories import ChatMessageHistory
from langchain_core.chat_history import BaseChatMessageHistory
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_core.runnables.history import RunnableWithMessageHistory


from langchain_community.llms import Ollama

from langchain.schema.output_parser import StrOutputParser

from langchain import hub
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain

  from tqdm.autonotebook import tqdm
USER_AGENT environment variable not set, consider setting it to identify your requests.


In [2]:
# --- Notebook-wide settings -------------------------------------------------
# Filesystem locations
current_dir = os.getcwd()        # working directory of the kernel
directory_path = "./source"      # folder that holds the input documents

# Ollama models: one turns text into vectors, the other generates text
embedder = OllamaEmbeddings(model="nomic-embed-text")
embedding_dimension = 768   # vector size handed to Pinecone when the index is created
llm = Ollama(model="llama3")

In [3]:
embedder

OllamaEmbeddings(base_url='http://localhost:11434', model='nomic-embed-text', embed_instruction='passage: ', query_instruction='query: ', mirostat=None, mirostat_eta=None, mirostat_tau=None, num_ctx=None, num_gpu=None, num_thread=None, repeat_last_n=None, repeat_penalty=None, temperature=None, stop=None, tfs_z=None, top_k=None, top_p=None, show_progress=False, headers=None, model_kwargs=None)

In [4]:
pwd

'S:\\Github\\RAGChatWithMemory'

In [5]:
################## Go to the ./source directory and read all the files except the urllist.txt which is used solely for website loading #####################

In [6]:
# Loader for everything under `directory_path`; each matched file is parsed
# with UnstructuredFileLoader. urllist.txt is skipped here because it only
# lists web pages, which are fetched separately.
loader = DirectoryLoader(
    directory_path,
    loader_cls=UnstructuredFileLoader,
    glob="**/*",                  # match every file
    exclude=["**/urllist.txt"],   # ...except urllist.txt, in any subdirectory
    use_multithreading=True,
    show_progress=True,
)

In [7]:
# Run the loader: every matched file is parsed into a Document object.
# This is the slow step of the notebook (the recorded run took close to three minutes).
documents = loader.load()

 80%|███████████████████████████████████████████████████████████████████▏                | 4/5 [02:48<00:42, 42.23s/it]


In [8]:
documents[3]

Document(page_content='Juan dela Cruz\n\n• jcruz@gmail.com • www.linkedin.com/in/jcruz • 1-(823)-444-5275\n\nSUMMARY OF QUALIFICATION\n\nOperated an immigration consultancy business for 4 years \uf0b7 Achieved a 100% success rate in PR applications for Immigration, Refugees, and Citizenship in Canada \uf0b7 20+ years of experience in information service delivery \uf0b7 Seasoned IT professional exposed to highly regulated industries like banking and finance, telecommunication, and immigration\n\nA growth mindset with the ability and curiosity needed to enter new fields, work in integrate or lead teams and find\n\nopportunities to improve processes and create innovative solutions\n\nTECHNICAL SKILLS\n\nOperating Systems Applications & Tools Tools Project Management Languages Content Creation : WordPress, OBS Studio, Restream, Adobe Audition, Premiere, Photoshop, Canva Content Management\n\n: Windows 7/8/10/11 : Microsoft Office Suite, Google Suite, Azure Sandbox : Google Workspace, Nitro

In [9]:
len(documents)

4

In [10]:

# Read URLs from urllist.txt (one per line, blank lines ignored) and load the web pages.
# The path is derived from `directory_path` so it stays in sync with the
# DirectoryLoader configuration, which excludes this same file.
# "utf-8-sig" reads plain UTF-8 and also strips a leading byte-order mark;
# with the platform default encoding a BOM would be glued onto the first URL.
urllist_path = os.path.join(directory_path, "urllist.txt")
with open(urllist_path, "r", encoding="utf-8-sig") as file:
    urls = [line.strip() for line in file if line.strip()]

web_loader = WebBaseLoader(urls)
web_documents = web_loader.load()

# Combine directory documents and web documents
all_documents = documents + web_documents

In [11]:
len(all_documents)

6

In [None]:
# Show one (source, character count) pair per document instead of echoing the
# full text of every document, which would flood the notebook output.
[(doc.metadata.get("source"), len(doc.page_content)) for doc in all_documents]

In [12]:
# Cache the loaded documents in an external file so that later sessions can
# bypass the long-winded process of reading and loading the source files.

# Cache location: inside the notebook's working directory
file_path = os.path.join(current_dir, 'processed_documents.pkl')

# Serialize the combined document list
with open(file_path, 'wb') as f:
    pickle.dump(all_documents, f)

print(f"Documents saved to {file_path}")

Documents saved to S:\Github\RAGChatWithMemory\processed_documents.pkl


In [6]:
####################### START HERE TO LOAD THE DOCUMENTS INITIALLY EXTRACTED FROM PICKUP DIRECTORY ########################

In [4]:
# Restore the cached documents written by the save cell.
# NOTE: unpickling executes code embedded in the file, so only load a cache
# that this notebook produced itself.
file_path = os.path.join(current_dir, 'processed_documents.pkl')

if not os.path.exists(file_path):
    # Nothing cached yet: `all_documents` is left untouched (or undefined).
    print("No saved documents found.")
else:
    with open(file_path, 'rb') as f:
        all_documents = pickle.load(f)
    print(f"Loaded {len(all_documents)} documents from {file_path}")

Loaded 6 documents from S:\aProjects\Lab12\RAGChatWithMemory\processed_documents.pkl


In [13]:
all_documents[1]

Document(page_content='Madam Speaker, Madam Vice President, our First Lady and Second Gentleman. Members of Congress and the Cabinet. Justices of the Supreme Court. My fellow Americans.\n\nLast year COVID-19 kept us apart. This year we are finally together again.\n\nTonight, we meet as Democrats Republicans and Independents. But most importantly as Americans.\n\nWith a duty to one another to the American people to the Constitution.\n\nAnd with an unwavering resolve that freedom will always triumph over tyranny.\n\nSix days ago, Russia’s Vladimir Putin sought to shake the foundations of the free world thinking he could make it bend to his menacing ways. But he badly miscalculated.\n\nHe thought he could roll into Ukraine and the world would roll over. Instead he met a wall of strength he never imagined.\n\nHe met the Ukrainian people.\n\nFrom President Zelenskyy to every Ukrainian, their fearlessness, their courage, their determination, inspires the world.\n\nGroups of citizens blocking

In [14]:
# Step : Split documents
# Cut every document into chunks of at most 800 characters, with neighbouring
# chunks sharing 100 characters.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],   # paragraph, line, word, then character
    length_function=len,                   # chunk size is counted with len()
    chunk_overlap=100,
    chunk_size=800,
)
split_documents = text_splitter.split_documents(all_documents)

In [15]:
len(split_documents)

429

In [None]:
############################ START HERE TO CREATE/ACCESS PINECONE #####################

In [22]:
# Step : Initialize Pinecone
#
# Connects to Pinecone, creates the serverless index on first use, waits until
# the index reports ready, and ends by displaying the index statistics.
# `is_new_database` records whether the index had to be created by this run.

load_dotenv()  # load variables from a local .env file, if one exists
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_API_ENV = os.getenv("PINECONE_API_ENV")  # not referenced by the cells shown here
if not PINECONE_API_KEY:
    # Fail early with a clear message instead of a client error further down.
    raise RuntimeError("PINECONE_API_KEY is not set; add it to the environment or a .env file.")

# NOTE: this rebinds the name `Pinecone`, which the first cell imported from
# langchain_community.vectorstores; from here on it refers to the Pinecone client.
from pinecone import Pinecone, ServerlessSpec


database = Pinecone(api_key=PINECONE_API_KEY)
serverless_spec = ServerlessSpec(cloud="aws", region="us-east-1")
INDEX_NAME = "enterprise"
is_new_database = INDEX_NAME not in database.list_indexes().names()
if is_new_database:
    database.create_index(
        name=INDEX_NAME,
        dimension=embedding_dimension,
        metric="cosine",
        spec=serverless_spec,
    )

# A new index is not necessarily usable right after create_index returns, so
# poll its status (bounded to two minutes) rather than sleeping a fixed second.
_ready_deadline = time.monotonic() + 120
while not database.describe_index(INDEX_NAME).status["ready"]:
    if time.monotonic() > _ready_deadline:
        raise TimeoutError(f"Pinecone index '{INDEX_NAME}' was not ready within 120 seconds.")
    time.sleep(1)

pinecone_index = database.Index(INDEX_NAME)
pinecone_index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [19]:
# Confirm that the key was loaded without echoing the secret itself: cell
# output is saved with the notebook, and a key that has appeared in a saved
# output should be treated as exposed and rotated.
"PINECONE_API_KEY is set" if PINECONE_API_KEY else "PINECONE_API_KEY is missing"

'***REDACTED***'

In [23]:
is_new_database

True

In [24]:
# A freshly created index is populated by embedding and uploading every chunk;
# an index that already existed is simply opened.
vectorstore = (
    PineconeVectorStore.from_documents(split_documents, embedding=embedder, index_name=INDEX_NAME)
    if is_new_database
    else PineconeVectorStore(index_name=INDEX_NAME, embedding=embedder)
)

In [21]:
############### IF WE NEED TO DO OVER ###############
#database.delete_index(INDEX_NAME)

In [25]:
pinecone_index.describe_index_stats()

{'dimension': 768,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 429}},
 'total_vector_count': 429}

In [26]:
# Retriever view of the vector store: similarity search returning the top 2 chunks.
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 2},
)

In [27]:
# Smoke-test the retriever with a sample question.
results = retriever.invoke("what is agile?")

In [28]:
results

[Document(page_content='Hands off approach\n\nBudget is set\n\nThe Waterfall method\n\nDisadvantages:\n\nLittle room for changes\n\nInflexible\n\nTime bound\n\nLess customer involvement\n\nNo product until the end\n\nWhat is Agile?\n\nAgile - The ability to create and respond to change in order to succeed in an uncertain and turbulent environment.\n\nAgile methodology is a structured and iterative approach to project management and product development, most commonly used for software.\n\nHow Agile Works\n\nIteration is the repetition of a process in order to generate an outcome.\n\nUnlike traditional Waterfall approach, the Agile methodologies follow an iterative approach. The agile approach basically involves a number of cycles usually called sprints that are designed, developed and tested individually.', metadata={'source': 'source\\Project Management Framework and Tools MASTER COPY.pptx'}),
 Document(page_content='Simply, consider each sprint to be a miniature project having its own

In [30]:
# Same kind of lookup, issued against the vector store directly (top 2 matches).
# Note that this overwrites `results` from the retriever test.
results = vectorstore.similarity_search(
    "who is carlsen?",
    k=2,
)

In [31]:
results

[Document(page_content='Retrieved from "https://en.wikipedia.org/w/index.php?title=Magnus_Carlsen&oldid=1230555529"', metadata={'language': 'en', 'source': 'https://en.wikipedia.org/wiki/Magnus_Carlsen', 'title': 'Magnus Carlsen - Wikipedia'}),
 Document(page_content='Carlsen\'s endgame prowess has been singled out for praise.[495][496][497] Jon Speelman, analysing several of Carlsen\'s endgames from the 2012 London Classic (in particular, his wins against McShane, Aronian, and Adams), described what he calls the "Carlsen effect": ...\xa0through the combined force of his skill and no less important his reputation, he drives his opponents into errors.\xa0... He plays on for ever, calmly, methodically and, perhaps most importantly of all, without fear: calculating superbly, with very few outright mistakes and a good proportion of the "very best" moves. This makes him a monster and makes many opponents wilt.[498]', metadata={'language': 'en', 'source': 'https://en.wikipedia.org/wiki/Magnu

In [None]:
# Quick check on handling greetings... Further tests reveal that this is not necessary.

In [32]:

# Prompt for a two-way intent classifier: the model is asked to answer with
# exactly one label, 'greeting' or 'non_greeting', for the given user input.
prompt = ChatPromptTemplate.from_template("""
Categorize the user input into the following category:
greeting - if it is related to greetings
non_greeting - if not a greeting

Output a single category only from the following types ('greeting','non_greeting')
user input: {user_input}
ai:
""")

In [33]:
# StrOutputParser is already imported in the first cell; this repeated import
# is redundant but harmless.
from langchain.schema.output_parser import StrOutputParser
# Pipeline: fill the prompt, send it to the LLM, return the reply as a string.
input_category_chain = prompt | llm | StrOutputParser()

In [34]:
input_category_chain.invoke("hi, how are you?")

'greeting'

In [35]:
input_category_chain.invoke("what's up!")

'non_greeting'

In [36]:
input_category_chain.invoke("when is the special day on winter festival?")

'non_greeting'