In [1]:
import os
import openai
import huggingface_hub
import sys
sys.path.append('../..')

from dotenv import load_dotenv, find_dotenv
_ = load_dotenv(find_dotenv()) # read local .env file


## Step 1 : By using specific loader, convert the data into a standard format ie into a langchain DOCUMENT

In [2]:
from langchain.document_loaders import PyPDFLoader #for reading documents from PDF
loader = PyPDFLoader("F:/Machine Learning/Resources/ML book.pdf")
pages = loader.load() #pages is the combined list of documents, each page is stored as a individual document 

In [3]:
len(pages)

431

In [4]:
type(pages[0])

langchain_core.documents.base.Document

In [5]:
page = pages[99] #THERE ARE 2 ATTRIBUTES to the document object, page_Content and metadata

In [6]:
page.page_content[:100]

'3.3 Other Considerations in the Regression Model 91\n50 100 150 20010 20 30 40 50HorsepowerMiles per '

In [7]:
page.metadata

{'source': 'F:/Machine Learning/Resources/ML book.pdf', 'page': 99}

In [8]:
#from web link (where web link has a text (kind of web scraping))

In [9]:
from langchain.document_loaders import WebBaseLoader #extract data from sites

In [10]:
loader = WebBaseLoader("https://en.wikipedia.org/wiki/Main_Page")

In [11]:
docs = loader.load()

In [12]:
docs[0].page_content[:100]

'\n\n\n\nWikipedia, the free encyclopedia\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\nJump to content\n\n\n\n\n\n\n\nMai'

## Step 2: Splitting Docs into chunks (2 parameters - chunk size and chunk overlap)

In [13]:
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
#splitters split in 2 methods, split_text () for splitting text, split_documents() for splitting documents, ie input is a document

In [14]:
chunk_size =450
chunk_overlap = 4

In [15]:
r_splitter = RecursiveCharacterTextSplitter(   #r_splitter counts space or any thing as a character and then sees the chunk size
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    separators = ["/n/n ",'/n ',"(?<=\. )",' '] #separator can be played with (it can have multiple separators as well) # it splits recursively jaise pehle \n\n par then uske andar even if chunk size is greater
)
c_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1000,
    chunk_overlap=150,
    length_function=len
)

In [16]:
some_text = """When writing documents, writers will use document structure to group content. \
This can convey to the reader, which idea's are related. For example, closely related ideas \
are in sentances. Similar ideas are in paragraphs. Paragraphs form a document. \n\n  \
Paragraphs are often delimited with a carriage return or two carriage returns. \
Carriage returns are the "backslash n" you see embedded in this string. \
Sentences have a period at the end, but also, have a space.\
and words are separated by space."""

In [17]:
r_splitter.split_text(some_text)

['When writing documents, writers will use document structure to group content. This can convey to the reader, which idea\'s are related. For example, closely related ideas are in sentances. Similar ideas are in paragraphs. Paragraphs form a document. \n\n  Paragraphs are often delimited with a carriage return or two carriage returns. Carriage returns are the "backslash n" you see embedded in this string. Sentences have a period at the end, but also,',
 'have a space.and words are separated by space.']

In [18]:
from langchain.document_loaders import PyPDFLoader
loader = PyPDFLoader("F:\Machine Learning\Internship SABUDH\Research (Knowledge graph)\Knowledge graph enhanced neural collaborative.pdf")
pages = loader.load()

In [19]:
len(pages)

13

In [20]:
from langchain.text_splitter import CharacterTextSplitter

In [21]:
c_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=1000,
    chunk_overlap=150,
    length_function=len
)

In [22]:
type(pages[0])

langchain_core.documents.base.Document

In [23]:
chunks = c_splitter.split_documents(pages)

In [24]:
len(chunks)

102

In [25]:
chunks_2 = r_splitter.split_documents(pages)

In [26]:
chunks_2[0].page_content

'Expert Systems With Applications 164 (2021) 113992\nAvailable online 12 September 2020\n0957-4174/© 2020 Published by Elsevier Ltd.\nContents lists available at ScienceDirect\nExpert Systems With Applications\njournal homepage: www.elsevier.com/locate/eswa\nKnowledge graph enhanced neural collaborative recommendation\nLei Sanga,b,c, Min Xub,∗, Shengsheng Qiand, Xindong Wua,c,e\naKey Laboratory of Knowledge Engineering with Big Data (Hefei University of'

In [27]:
type(chunks[0].page_content)

str

In [28]:
#chunks are in split format

In [29]:
#there are numerous types of splitters, there is spacy splitter, there is token splitter

In [30]:
from langchain.text_splitter import TokenTextSplitter

In [31]:
token_splitter = TokenTextSplitter(chunk_size=40,chunk_overlap = 5)

In [32]:
chunks = token_splitter.split_documents(pages)

In [33]:
print(len(pages))
print(len(chunks))

13
694


In [34]:
#Its a good practise if we keep on adding relevant metadata to the chunks

In [35]:
from langchain.text_splitter import MarkdownHeaderTextSplitter # this splitter maintains or updates the headers into the metadata of chunks

In [36]:
markdown_document = """# Title\n\n \
## Chapter 1\n\n \
Hi this is Jim\n\n Hi this is Joe\n\n \
### Section \n\n \
Hi this is Lance \n\n 
## Chapter 2\n\n \
Hi this is Molly"""

In [37]:
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
]

In [38]:
text_splitted = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on).split_text(markdown_document)

In [39]:
text_splitted[1].metadata

{'Header 1': 'Title', 'Header 2': 'Chapter 1', 'Header 3': 'Section'}

## Step 3: Converting Chunks into VECTOR Database (from which query would be made) Vectors would be made for each chunk

In [40]:
#  How to create embedding of the text (we can use Openai Embedding store) - use huugging face embeddings
# Vector Database is mostly created by leveraging Chroma

### Embeddings

In [3]:
from langchain.embeddings import HuggingFaceEmbeddings
embedding = HuggingFaceEmbeddings()



In [42]:
sentence1 = "i like dogs"
sentence2 = "i like canines"
sentence3 = "the weather is ugly outside"

In [43]:
import numpy as np

In [44]:
#Open AI has some cost so used hugging face embeddings
embed1 = embedding.embed_query(sentence1) #embed_query is the method
embed2 = embedding.embed_query(sentence2)
embed3 = embedding.embed_query(sentence3)

In [45]:
print(np.dot(embed1, embed2)) #will give similarity score (the more the similarity the more the similar functions it would be)
print(np.dot(embed1, embed3))
print(np.dot(embed2, embed3))

0.8981182100053602
0.036410254013381334
0.0058476421451507665


In [6]:
from langchain.vectorstores import Chroma #chroma is vector data base, there are various vector databases choice of which depends upon usage

In [4]:
persist_directory = 'F:/Machine Learning/rough/'

In [48]:
vectordb = Chroma.from_documents(
    documents=chunks,
    embedding=embedding,
    persist_directory=persist_directory
)

In [49]:
len(chunks)

694

In [50]:
print(vectordb._collection.count()) # should be same as chunk count

13289


In [51]:
vectordb.persist()

  warn_deprecated(


## Step4 : Retrieving required information from vector database (note info of relevant chunks is leveraged from vector database only)

In [52]:
#Now the Vector database has been created, now we should ask questions to get relevant chunks from the vector database

In [9]:
question = "what is knowledge graph"

In [54]:
# SEMANTIC similarity search

In [55]:
docs = vectordb.similarity_search(question,k=3) #k determines the no of chunks to return from similarity search (similarity search is a method for Chrome vector store)

In [56]:
vectordb._collection.count()   #there are 2082 chunks

13289

In [57]:
len(docs)

3

In [58]:
docs[0].page_content

'Malakoudis , 2019 ; Zheng et al. , 2018 ). Recently, Knowledge Graphs\n(KGs) have attracted increasing attention, which usually consist of\nfruitful connected facts about items'

In [59]:
for doc in docs:
    print(doc.metadata)

{'page': 1, 'source': 'F:\\Machine Learning\\Internship SABUDH\\Research (Knowledge graph)\\Knowledge graph enhanced neural collaborative.pdf'}
{'page': 1, 'source': 'F:\\Machine Learning\\Internship SABUDH\\Research (Knowledge graph)\\Knowledge graph enhanced neural collaborative.pdf'}
{'page': 1, 'source': 'F:\\Machine Learning\\Internship SABUDH\\Research (Knowledge graph)\\Knowledge graph enhanced neural collaborative.pdf'}


## Retrieving information from existing vector database

In [7]:
vectordb = Chroma(persist_directory=persist_directory, embedding_function=embedding) # retrieving vector database

In [8]:
vectordb._collection.count()

#13289

13289

In [10]:
#max marginal relevance helps in removing duplicacy of results. 1st normal similarity search is made and then max marginal relevance (outputs with max variance)
docs = vectordb.max_marginal_relevance_search(query=question,fetch_k=10,k=3)

In [11]:
print(docs[0].page_content)
print(docs[1].page_content)

Malakoudis , 2019 ; Zheng et al. , 2018 ). Recently, Knowledge Graphs
(KGs) have attracted increasing attention, which usually consist of
fruitful connected facts about items
Malakoudis , 2019 ; Zheng et al. , 2018 ). Recently, Knowledge Graphs
(KGs) have attracted increasing attention, which usually consist of
fruitful connected facts about items


In [16]:
#splitting the question into a filter on specifics by another LLM and then applying the similarity search

## Self Query Retriever  ###( applying filter leveraging another llm) (some technical issue)

In [12]:
from langchain.llms import OpenAI
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo

In [17]:
metadata_field_info = [
    AttributeInfo(
        name="source",
        description="The chunk is from knowledge graph",
        type = "string",
    ),
    AttributeInfo(
        name="page",
        description="The page from the reasearch paper",
        type="string",
    )
]

In [18]:
document_content_description = "Research Paper"

In [19]:
from langchain import HuggingFaceHub

In [None]:
# Leveraging LLMs to retrieve information from vector database 
# Note leveraging llms for retrieval is different from llms asking information from vector database. In the 1st one the LLMs processes on the intial set of outputs from vector database
# and compresses it & also use sql to find the relevant informaion (filter operation  etc)

In [20]:
from langchain.llms import HuggingFaceHub
from langchain.retrievers import SelfQueryRetriever

In [68]:
from transformers import AutoModelForCausalLM, AutoTokenizer
model_name = "gpt2"  # Replace 'your_model_name' with the desired model

In [69]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

In [70]:
question = "What is there one the 1st page?"

In [76]:
inputs = tokenizer(question,return_tensors="pt")

In [83]:
llm = HuggingFaceHub(repo_id="google/flan-t5-xxl")  # Flant5 model

In [84]:
document_content_description = "Research notes"
retriever = SelfQueryRetriever.from_llm(
    model,
    vectordb,
    document_content_description,
    metadata_field_info,
    verbose=True
)

In [85]:
question = "what is there on 1st page?"

In [86]:
docs = retriever.get_relevant_documents(question)

AttributeError: 'StringPromptValue' object has no attribute 'size'

In [67]:
# docs = retriever.get_relevant_documents(inputs)

In [None]:
docs[0].page_content[:100]