In [1]:
from google.colab import drive

# Mount Google Drive under a path relative to the kernel's working directory.
# NOTE(review): the PDF path used later ('/content/content/MyDrive/...') only
# resolves if the working directory is /content -- presumably the Colab
# default; confirm before running this notebook elsewhere.
MOUNT_POINT = './content/'
drive.mount(MOUNT_POINT)


Mounted at ./content/


In [2]:
## installing required modules

!pip install PdfReader
!pip install langchain
!pip install PyPDF2
!pip install InstructorEmbedding
!pip install sentence_transformers
!pip install faiss
!pip install faiss-gpu

Collecting PdfReader
  Downloading pdfreader-0.1.12.tar.gz (2.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.9/2.9 MB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting bitarray>=1.1.0 (from PdfReader)
  Downloading bitarray-2.8.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (287 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m287.4/287.4 kB[0m [31m38.7 MB/s[0m eta [36m0:00:00[0m
Collecting pycryptodome>=3.9.9 (from PdfReader)
  Downloading pycryptodome-3.19.0-cp35-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m89.1 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: PdfReader
  Building wheel for PdfReader (setup.py) ... [?25l[?25hdone
  Created wheel for PdfReader: filename=pdfreader-0.1.12-py3-none-any.whl size=134538 sha256=d42f53bbe1658461a

In [3]:
## import required libraries

import getpass
import os

from PyPDF2 import PdfReader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings
from langchain.vectorstores import FAISS
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFaceHub
from langchain.prompts.prompt import PromptTemplate

In [4]:
## extracting text from pdf files
def get_pdf_text(pdf_docs):
    """Concatenate the extracted text of every page of every given PDF.

    Args:
        pdf_docs: An iterable of PDF sources, each passed as-is to
            ``PdfReader``. A single path (``str`` or ``os.PathLike``) is
            also accepted and treated as a one-element list.

    Returns:
        One string holding the text of all pages, in document and page
        order, joined without any separator. Pages that yield no text
        contribute an empty string.
    """
    # A lone path string would otherwise be iterated character by character,
    # handing every single character to PdfReader as if it were a file name.
    if isinstance(pdf_docs, (str, os.PathLike)):
        pdf_docs = [pdf_docs]

    page_texts = []
    for pdf in pdf_docs:
        pdf_reader = PdfReader(pdf)
        for page in pdf_reader.pages:
            # Guard against a page whose extraction yields None (or nothing):
            # it must not abort the whole run.
            page_texts.append(page.extract_text() or "")

    # A single join avoids repeatedly rebuilding an ever-growing string.
    return "".join(page_texts)

## creating overlapping text chunks
def get_text_chunks(text, chunk_size=1000, chunk_overlap=200, separator="\n"):
    """Split ``text`` into overlapping chunks with ``CharacterTextSplitter``.

    Args:
        text: The full text to split.
        chunk_size: Target maximum chunk length, measured with ``len``
            (i.e. in characters). Defaults to 1000.
        chunk_overlap: Number of characters shared between neighbouring
            chunks. Defaults to 200.
        separator: String on which the text is split before the pieces are
            merged back into chunks. Defaults to a newline.

    Returns:
        Whatever ``CharacterTextSplitter.split_text`` returns for ``text``
        (the list of chunks).
    """
    text_splitter = CharacterTextSplitter(
        separator=separator,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    return text_splitter.split_text(text)

## creating embeddings for chunks of text
def get_vectorstore(text_chunks, model_name="hkunlp/instructor-xl"):
    """Embed the text chunks and index them in a FAISS vector store.

    Args:
        text_chunks: Iterable of strings to embed and index.
        model_name: Name of the instructor embedding model handed to
            ``HuggingFaceInstructEmbeddings``. Defaults to
            ``"hkunlp/instructor-xl"``.

    Returns:
        The object returned by ``FAISS.from_texts`` for the given chunks.

    Raises:
        ValueError: If ``text_chunks`` is empty.
    """
    text_chunks = list(text_chunks)
    # Reject empty input before the (large) embedding model is instantiated;
    # otherwise the problem only shows up later, inside index construction.
    if not text_chunks:
        raise ValueError(
            "text_chunks is empty: there is no text to embed "
            "(was any text extracted from the PDF?)"
        )

    embeddings = HuggingFaceInstructEmbeddings(model_name=model_name)
    return FAISS.from_texts(texts=text_chunks, embedding=embeddings)

## creating a retrieval llm chain
def retrieval_qa_chain(db,return_source_documents):
    """Build a ``RetrievalQA`` chain backed by the hosted Falcon-7B-instruct model.

    Args:
        db: Object passed to ``RetrievalQA.from_chain_type`` as its
            ``retriever`` argument.
        return_source_documents: Forwarded unchanged to
            ``RetrievalQA.from_chain_type`` under the same name.

    Returns:
        The chain produced by ``RetrievalQA.from_chain_type`` with
        ``chain_type='stuff'``.
    """
    generation_settings = {"temperature":0.6,"max_length":500, "max_new_tokens":700}
    falcon = HuggingFaceHub(
        repo_id="tiiuae/falcon-7b-instruct",
        model_kwargs=generation_settings,
    )
    return RetrievalQA.from_chain_type(
        llm=falcon,
        chain_type='stuff',
        retriever=db,
        return_source_documents=return_source_documents,
    )


In [11]:
## DATA VECTORIZATION AND INDEX CREATION

# The Hugging Face token is a secret and must not be written into the notebook.
# Use the value already present in the environment if there is one; otherwise
# ask for it interactively (the input is not echoed).
if not os.environ.get("HUGGINGFACEHUB_API_TOKEN"):
    os.environ["HUGGINGFACEHUB_API_TOKEN"] = getpass.getpass("Hugging Face Hub API token: ")

# NOTE(review): absolute path into the mounted Drive; it assumes the mount
# cell was run with /content as the working directory -- confirm.
path_to_pdf = ['/content/content/MyDrive/2/Lecture01.pdf']

raw_text = get_pdf_text(path_to_pdf)

# get the text chunks
text_chunks = get_text_chunks(raw_text)

# Stop here with a readable message if nothing was extracted, instead of
# failing later while the index is being built.
assert text_chunks, "No text could be extracted from the PDF(s); nothing to index."

# create vector store
vectorstore = get_vectorstore(text_chunks)

## retriever over the vector index; search_kwargs asks for k=3 results per query
db = vectorstore.as_retriever(search_kwargs={'k': 3})

## passing the retriever to the bot and initializing it
## (True -> the chain also returns the source documents)
bot = retrieval_qa_chain(db,True)

  from tqdm.autonotebook import trange


.gitattributes:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/270 [00:00<?, ?B/s]

2_Dense/config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/3.15M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/66.3k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/4.96G [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.40k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/461 [00:00<?, ?B/s]

load INSTRUCTOR_Transformer
max_seq_length  512




In [12]:
## passing query to llm

query = "What is a Database?"
sol = bot(query)

## answer given by llm, followed by the source documents returned with it
for field in ('result', 'source_documents'):
    print(sol[field])


A database system is a combination of the database and the DBMS.
[Document(page_content='1 \n \n \nWhat is a Database?  \n• Collection of related  data organized to facilitate retrieval, management and updating  \no Data – known facts that can be recorded and that have implicit meaning  \n \n• Properties  \no Represents some aspects of the real world – miniworld (UoD)  \n\uf0a7 Changes in UoD are reflected in the database  \no Logically coherent collection of data with inherent meaning  \no Designed, built and populated for some specific purpose  \n\uf0a7 Intended user group  \n\uf0a7 Set of applications  \n \nDatabase Management System(DBMS)  \n• A collec tion of programs that enables a user to  create  and maintain  the database  \no Defining  \n\uf0a7 Specifying the types of data, structures, constraints  \no Constructing  \n\uf0a7 Storing the data on a storage medium  \no Manipulating  \n\uf0a7 Retrieving specific data , updating , generating reports  \n \nDBMS classification  \n•

In [13]:
# plain falcon without any retrieved context, queried with the same `query`
# NOTE(review): temperature is 0.7 here, while retrieval_qa_chain uses 0.6 --
# confirm whether the difference is intended for this comparison.
falcon_kwargs = {"temperature":0.7,"max_length":500, "max_new_tokens":700}
llm = HuggingFaceHub(
    repo_id="tiiuae/falcon-7b-instruct",
    model_kwargs=falcon_kwargs,
)

# last expression of the cell, so the notebook displays the raw completion
llm(query)



'\nA database is a collection of information stored on a computer system. A database consists of a set of databases, which are used to store different types of data. The database consists of a set of tables, which are used to store and retrieve data. A database is used to store, retrieve, and manipulate large amounts of data in an efficient manner.'

In [16]:
# Questions put to the retrieval bot, asked in this order.
ques=['what is Database Management System',
      'what are the DBMS classifications',
      'what is Database system',
      'what are the Advantages of a DBMS?',
      ' what are the Characteristics of database approach',
      'How to Design databases']

# One loop replaces the six copy-pasted ask/print stanzas: for each question,
# print the question, the chain's answer, and a separator line.
# `sol` ends up holding the result for the last question.
for question in ques:
    sol = bot(question)
    print(question)
    print(sol['result'])
    print(" ")


what is Database Management System

The answer is "A Database Management System (DBMS) is a software application that is used to store, retrieve, and manipulate data. It is designed to provide a user-friendly interface to the user, allowing them to easily create, update, and maintain their own data without having to build their own database from scratch. DBMSs are commonly used in businesses, healthcare, and government to store and manage their data. They can also be used in personal applications, such as web development, to store and retrieve data from a web server. In summary, a DBMS is a software application that is used to store, retrieve, and manipulate data."
 
what are the DBMS classifications

The DBMS classifications are relational databases, object databases, hierarchical databases, and network databases.
 
what is Database system


The correct answer to the question "What is a Database?" is "A database is a collection of related data organized to facilitate retrieval, manage