In [None]:
!pip install -q langchain
!pip install -q torch
!pip install -q transformers
!pip install -q sentence-transformers
!pip install -q datasets
!pip install -q faiss-cpu

In [None]:
from langchain.document_loaders import HuggingFaceDatasetLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from transformers import AutoTokenizer, pipeline
from langchain import HuggingFacePipeline
from langchain.chains import RetrievalQA

In [None]:
# Load the Dolly-15k dataset from the Hugging Face Hub as LangChain documents.
# Specify the dataset name and the column containing the content
dataset_name = "databricks/databricks-dolly-15k"
page_content_column = "context"  # or any other column you're interested in

# Create a loader instance
loader = HuggingFaceDatasetLoader(dataset_name, page_content_column)

# Load the data
data = loader.load()

# Display the first 2 entries
data[:2]

In [None]:
# Create an instance of the RecursiveCharacterTextSplitter class with specific parameters.
# It splits text into chunks of 1000 characters each with a 150-character overlap.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=150)

# 'data' holds the text you want to split, split the text into documents using the text splitter.
docs = text_splitter.split_documents(data)
docs[0]

In [None]:
# Document(page_content="Virgin Australia, the trading name of Virgin Australia
#  Airlines Pty Ltd, is an Australian-based airline. It is the largest airline
# by fleet size to use the Virgin brand. It commenced services on 31 August 2000
#  as Virgin Blue, with two aircraft on a single route.
# It suddenly found itself as a major airline in Australia's domestic market
# after the collapse of Ansett Australia in September 2001.
# The airline has since grown to directly serve 32 cities in Australia,
# from hubs in Brisbane, Melbourne and Sydney.",
# metadata={'instruction': 'When did Virgin Australia start operating?',
# 'response': 'Virgin Australia commenced services on 31 August 2000 as Virgin
# Blue, with two aircraft on a single route.',
# 'category': 'closed_qa'})

In [None]:
# Define the path to the pre-trained model you want to use
modelPath = "sentence-transformers/all-MiniLM-l6-v2"

# Create a dictionary with model configuration options, specifying to use the CPU for computations
model_kwargs = {'device':'cpu'}

# Create a dictionary with encoding options, specifically setting 'normalize_embeddings' to False
encode_kwargs = {'normalize_embeddings': False}

# Initialize an instance of HuggingFaceEmbeddings with the specified parameters
embeddings = HuggingFaceEmbeddings(
    model_name=modelPath,     # Provide the pre-trained model's path
    model_kwargs=model_kwargs, # Pass the model configuration options
    encode_kwargs=encode_kwargs # Pass the encoding options
)


In [None]:

text = "This is a test document."
query_result = embeddings.embed_query(text)
query_result[:3]

In [None]:
# [-0.038338545709848404, 0.1234646886587143, -0.02864295244216919]

In [None]:

db = FAISS.from_documents(docs, embeddings)

In [None]:
question = "What is cheesemaking?"
searchDocs = db.similarity_search(question)
print(searchDocs[0].page_content)

In [None]:
# The goal of cheese making is to control the spoiling of milk into cheese.
# The milk is traditionally from a cow, goat, sheep or buffalo, although,
# in theory, cheese could be made from the milk of any mammal.
# Cow's milk is most commonly used worldwide.
# The cheesemaker's goal is a consistent product with specific characteristics
# (appearance, aroma, taste, texture). The process used to make a Camembert will
# be similar to, but not quite the same as, that used to make Cheddar.

# Some cheeses may be deliberately left to ferment from naturally airborne
# spores and bacteria; this approach generally leads to a less consistent
# product but one that is valuable in a niche market.

In [None]:
# Create a tokenizer object by loading the pretrained "Intel/dynamic_tinybert" tokenizer.
tokenizer = AutoTokenizer.from_pretrained("Intel/dynamic_tinybert")

# Create a question-answering model object by loading the pretrained "Intel/dynamic_tinybert" model.
model = AutoModelForQuestionAnswering.from_pretrained("Intel/dynamic_tinybert")

In [None]:
# Build a Hugging Face question-answering pipeline and wrap it for use as a LangChain LLM.
# Specify the model name you want to use
model_name = "Intel/dynamic_tinybert"

# Load the tokenizer associated with the specified model
# NOTE(review): padding/truncation/max_length are normally call-time tokenizer options;
# passing them to from_pretrained may not have the intended effect — confirm.
tokenizer = AutoTokenizer.from_pretrained(model_name, padding=True, truncation=True, max_length=512)

# Define a question-answering pipeline using the model and tokenizer.
# The model is given by name, so the pipeline loads its own copy of the weights.
question_answerer = pipeline(
    "question-answering",
    model=model_name,
    tokenizer=tokenizer,
    return_tensors='pt'
)

# Create an instance of the HuggingFacePipeline, which wraps the question-answering pipeline
# with additional model-specific arguments (temperature and max_length)
# NOTE(review): LangChain's HuggingFacePipeline appears to support only generation-style
# tasks ("text-generation", "text2text-generation", "summarization", "translation");
# an extractive "question-answering" pipeline may be rejected when the chain runs —
# confirm against the installed LangChain version.
llm = HuggingFacePipeline(
    pipeline=question_answerer,
    model_kwargs={"temperature": 0.7, "max_length": 512},
)

In [None]:
# Build a retriever on top of the FAISS index 'db' using the default search settings.
# It is queried in the next cell through 'get_relevant_documents'.
retriever = db.as_retriever()

In [None]:
# Use a dedicated name for the retrieved results so that 'docs' (the chunked corpus
# that feeds the FAISS index) is not overwritten; re-running the index cell would
# otherwise index only these few documents.
relevant_docs = retriever.get_relevant_documents("What is Cheesemaking?")
print(relevant_docs[0].page_content)



In [None]:
# output:

# The goal of cheese making is to control the spoiling of milk into cheese.
# The milk is traditionally from a cow, goat, sheep or buffalo, although,
# in theory, cheese could be made from the milk of any mammal. Cow's milk
# is most commonly used worldwide. The cheesemaker's goal is a consistent
# product with specific characteristics (appearance, aroma, taste, texture).
# The process used to make a Camembert will be similar to, but not quite the
# same as, that used to make Cheddar.

# Some cheeses may be deliberately left to ferment from naturally airborne
# spores and bacteria; this approach generally leads to a less consistent
# product but one that is valuable in a niche market.

In [None]:
# Create a retriever object from the 'db' with a search configuration where it retrieves up to 4 relevant splits/documents.
retriever = db.as_retriever(search_kwargs={"k": 4})

# Create a question-answering instance (qa) using the RetrievalQA class.
# It's configured with a language model (llm), a chain type "refine," the retriever we created, and an option to not return source documents.
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="refine", retriever=retriever, return_source_documents=False)

In [None]:
question = "Who is Thomas Jefferson?"
# Call the chain directly: this returns the full output dict (including the "result" key).
# 'qa.run(...)' returns only the answer string, which cannot be indexed with "result".
result = qa({"query": question})
print(result["result"])

In [None]:
# Thomas Jefferson (April 13, 1743 – July 4, 1826) was an American statesman,
# diplomat, lawyer, architect, philosopher, and Founding Father who served as
# the third president of the United States from 1801 to 1809.
# Among the Committee of Five charged by the Second Continental Congress with
# authoring the Declaration of Independence, Jefferson was the Declaration's
# primary author. Following the American Revolutionary War and prior to becoming
# the nation's third president in 1801, Jefferson was the first United States
# secretary of state under George Washington and then the nation's second vice
# president under John Adams.

In [None]:
!pip install PyPDF2

In [None]:
!pip install pdfplumber langchain gpt4all numba

In [None]:
from PyPDF2 import PdfReader
from pdfplumber import pdf
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer, util
import time
from langchain.llms.gpt4all import GPT4All

from numba import jit, cuda
import numpy as np


import sys,time,random

def progressBar(count_value, total, suffix=''):
    """Draw a single-line text progress bar on stdout.

    Args:
        count_value: Number of steps completed so far.
        total: Total number of steps; it is used as a divisor, so it must be non-zero.
        suffix: Optional text appended after the percentage.

    The line ends with a carriage return rather than a newline, so the next
    write to the terminal starts at the beginning of the same line.
    """
    width = 100
    done = int(round(width * count_value / float(total)))
    percent = round(100.0 * count_value / float(total), 1)
    bar = ''.join(('=' * done, '-' * (width - done)))
    sys.stdout.write(f'[{bar}] {percent!s}% ...{suffix!s}\r')
    sys.stdout.flush()

# Loading the llm model here.
# The LangChain GPT4All wrapper accepts keyword arguments only; passing the model file
# positionally raises "TypeError: Serializable.__init__() takes 1 positional argument
# but 2 were given".
llm = GPT4All(model="orca-mini-3b.ggmlv3.q4_0.bin")

def get_query():
    """Prompt for a question on stdin, report step 1 of 7, and return the typed text."""
    user_question = input("Enter your question\n")
    progressBar(1, 7)
    return user_question


def load_split_pdf(pdf_path):
    """Extract the text of every page of a PDF into one string.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages concatenated in page order.
    """
    # The context manager guarantees the file handle is closed. Extraction stays
    # inside the block because the reader works on the still-open stream.
    with open(pdf_path, "rb") as pdf_file:
        pdf_loader = PdfReader(pdf_file)
        # `or ""` guards against a page whose extraction yields None/empty text.
        pdf_text = "".join(page.extract_text() or "" for page in pdf_loader.pages)
    progressBar(2, 7)
    return pdf_text


def split_text_using_RCTS(pdf_text):
    """Split text into chunks, then break every chunk into its individual lines.

    Args:
        pdf_text: The full text to split.

    Returns:
        A flat list holding each newline-separated line of each chunk, in order.
        Chunks are produced with chunk_size=2048 and chunk_overlap=64.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=2048, chunk_overlap=64)
    paragraphs = [
        line
        for chunk in splitter.split_text(pdf_text)
        for line in chunk.split('\n')
    ]
    progressBar(3, 7)
    return paragraphs


def Initialize_sentence_transformer():
    """Load the all-MiniLM-L6-v2 SentenceTransformer, report step 4 of 7, and return it."""
    encoder = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
    progressBar(4, 7)
    return encoder


def encode_each_paragraph(paragraphs, embeddings):
    """Encode every paragraph on its own and pair it with its embedding.

    Args:
        paragraphs: Iterable of text strings.
        embeddings: Object exposing `encode(list_of_texts, convert_to_tensor=True)`.

    Returns:
        A list of `(paragraph, embedding)` tuples in input order.
    """
    encoded = [
        (text, embeddings.encode([text], convert_to_tensor=True))
        for text in paragraphs
    ]
    progressBar(5, 7)
    return encoded


def choose_most_relevant_sentence(embeddings, responses, query, threshold=0.6):
    """Collect every paragraph whose cosine similarity to the query meets a threshold.

    Args:
        embeddings: Object exposing `encode(list_of_texts, convert_to_tensor=True)`.
        responses: Iterable of `(paragraph, embedding)` pairs.
        query: The user's question.
        threshold: Minimum cosine similarity for a paragraph to be kept
            (defaults to 0.6, the previously hard-coded value).

    Returns:
        The selected paragraphs joined with newlines, in their original order;
        an empty string when nothing reaches the threshold.
    """
    query_embedding = embeddings.encode([query], convert_to_tensor=True)

    answers = [
        paragraph
        for paragraph, response in responses
        if util.pytorch_cos_sim(query_embedding, response).item() >= threshold
    ]

    progressBar(6, 7)
    return "\n".join(answers)


def query_the_llm(answer, llm_model, query):
    """Send the retrieved context followed by the question to the language model.

    Args:
        answer: Context text placed before the question.
        llm_model: Either a LangChain-style model exposing `invoke(prompt)` or a
            client exposing `generate(prompt=...)`.
        query: The user's question.

    Returns:
        Whatever the model call returns for the combined prompt.
    """
    prompt_message = answer + "\n" + query

    # LangChain LLM wrappers (such as the GPT4All class imported in this notebook)
    # take a single prompt through `invoke`; their `generate` expects a list of
    # prompts and has no `prompt=` keyword. Clients without `invoke` keep using
    # the `generate(prompt=...)` call.
    if hasattr(llm_model, "invoke"):
        return llm_model.invoke(prompt_message)

    return llm_model.generate(prompt=prompt_message)



def main(llm, pdf_path="/content/221070041_Labhansh_DBMS6.pdf"):
    """Run the PDF question-answering pipeline end to end.

    Asks for a question, extracts and splits the PDF text, embeds each line,
    keeps the lines similar to the question, and sends them with the question
    to the language model. Prints the model's answer and the elapsed time.

    Args:
        llm: The language model handed to `query_the_llm`.
        pdf_path: PDF to answer questions about (defaults to the path
            previously hard-coded here).
    """
    # Read the question before starting the clock so the reported execution
    # time does not include however long the user spent typing.
    query = get_query()
    start_time = time.time()

    pdf_text = load_split_pdf(pdf_path)
    paragraphs = split_text_using_RCTS(pdf_text)
    embeddings = Initialize_sentence_transformer()
    responses = encode_each_paragraph(paragraphs=paragraphs, embeddings=embeddings)
    answer = choose_most_relevant_sentence(embeddings=embeddings, responses=responses, query=query)
    final_response = query_the_llm(answer=answer, llm_model=llm, query=query)

    print ("The answer from model is\n", final_response)

    elapsed_time = time.time() - start_time
    print(f"Execution time: {elapsed_time/60} minutes \n")

    progressBar(7, 7)

main(llm)


TypeError: Serializable.__init__() takes 1 positional argument but 2 were given

In [None]:
!pip install -U transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig,AutoModel, BitsAndBytesConfig, AutoModelForSeq2SeqLM
!pip install bitsandbytes
!pip install torch
!pip install accelerate

In [None]:
import torch

print("Loading model and tokenizer...")
model_id = "HuggingFaceH4/zephyr-7b-beta"

# Reset first so a model left over from an earlier cell is never mistaken for this one.
model = None
if torch.cuda.is_available():
    config = AutoConfig.from_pretrained(model_id)
    config.pretraining_tp = 1
    # Pass the adjusted config explicitly; otherwise the override above is never applied.
    model = AutoModelForCausalLM.from_pretrained(model_id, config=config, load_in_4bit=True)
else:
    print("CUDA is not available; skipping the 4-bit model load.")

tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True, use_cache=True)
print("Loaded model and tokenizer" if model is not None else "Loaded tokenizer only")

Loading model and tokenizer...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loaded model and tokenizer


In [None]:
!pip install streamlit google-generativeai python-dotenv langchain PyPDF2 chromadb faiss-cpu langchain_google_genai fitz pymupdf sentence_transformers einops -q

In [None]:
# !pip install transformers==4.30
# !pip install accelerate
# !pip install -i https://pypi.org/simple/ bitsandbytes
import fitz
from langchain.embeddings import HuggingFaceEmbeddings
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os
from langchain_google_genai import GoogleGenerativeAIEmbeddings
import google.generativeai as genai
from langchain.vectorstores import FAISS
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.chains.question_answering import load_qa_chain
from langchain.prompts import PromptTemplate
from dotenv import load_dotenv

# Load variables from a local .env file, then look the key up once and pass it
# straight to the Google Generative AI client configuration.
load_dotenv()
google_api_key = os.getenv("GOOGLE_API_KEY")
genai.configure(api_key=google_api_key)



def get_pdf_text(pdf_docs):
    """Concatenate the extracted text of every page of every given PDF.

    Args:
        pdf_docs: Iterable of PDF sources, each accepted by `PdfReader`.

    Returns:
        One string with all page texts in document order, then page order.
    """
    page_texts = [
        page.extract_text()
        for pdf in pdf_docs
        for page in PdfReader(pdf).pages
    ]
    return "".join(page_texts)



def get_text_chunks(text):
    """Split text into chunks using chunk_size=10000 and chunk_overlap=1000.

    Args:
        text: The text to split.

    Returns:
        The chunks produced by `RecursiveCharacterTextSplitter.split_text`.
    """
    return RecursiveCharacterTextSplitter(
        chunk_size=10000,
        chunk_overlap=1000,
    ).split_text(text)


def get_vector_store(text_chunks):
    """Embed the text chunks and persist them as a local FAISS index.

    Args:
        text_chunks: List of strings to embed and index.

    The index is saved to the "faiss_index" directory; nothing is returned.
    """
    embeddings = HuggingFaceEmbeddings(
        model_name="nomic-ai/nomic-embed-text-v1",
        # The model ships custom code, so remote code is enabled and the
        # revision is pinned to a fixed commit.
        model_kwargs={"trust_remote_code": True, "revision": "289f532e14dbbbd5a04753fa58739e9ba766f3c7"},
    )
    vector_store = FAISS.from_texts(text_chunks, embedding=embeddings)
    vector_store.save_local("faiss_index")


def get_conversational_chain():
    """Build a "stuff" question-answering chain backed by zephyr-7b-beta.

    The transformers model is wrapped in a text-generation pipeline and then in
    LangChain's HuggingFacePipeline, because `load_qa_chain` needs a LangChain
    language model rather than a raw transformers model.

    Returns:
        The chain; it takes "input_documents" and "question" as inputs.
    """
    # Local imports keep this function self-contained; both packages are
    # already used elsewhere in this notebook.
    from transformers import pipeline
    from langchain import HuggingFacePipeline

    prompt_template = """
    Answer the question as detailed as possible from the provided context, make sure to provide all the details, if the answer is not in
    provided context just say, "answer is not available in the context", don't provide the wrong answer\n\n
    Context:\n {context}?\n
    Question: \n{question}\n

    Answer:
    """
    model_id = "HuggingFaceH4/zephyr-7b-beta"
    config = AutoConfig.from_pretrained(model_id)
    config.pretraining_tp = 1
    # Pass the adjusted config explicitly; otherwise the override above is never applied.
    model = AutoModelForCausalLM.from_pretrained(model_id, config=config, load_in_4bit=True)
    tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True)
    print("Loaded model and tokenizer")

    # Wrap model + tokenizer so the chain receives a LangChain-compatible LLM.
    text_generator = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        max_new_tokens=512,
    )
    llm = HuggingFacePipeline(pipeline=text_generator)

    prompt = PromptTemplate(template=prompt_template, input_variables=["context", "question"])
    chain = load_qa_chain(llm, chain_type="stuff", prompt=prompt)

    return chain



def user_input(user_question):
    """Answer a question from the saved FAISS index and print the chain's response.

    Args:
        user_question: The question to answer.
    """
    # Queries must be embedded with the same model that built the index in
    # get_vector_store(); vectors from a different embedding model are not
    # comparable with the stored ones.
    embeddings = HuggingFaceEmbeddings(
        model_name="nomic-ai/nomic-embed-text-v1",
        model_kwargs={"trust_remote_code": True, "revision": "289f532e14dbbbd5a04753fa58739e9ba766f3c7"},
    )

    # NOTE: load_local unpickles data from disk. The opt-in flag is acceptable
    # only because "faiss_index" is written locally by this notebook; never
    # point it at an index from an untrusted source.
    new_db = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)
    docs = new_db.similarity_search(user_question)

    chain = get_conversational_chain()

    response = chain(
        {"input_documents": docs, "question": user_question},
        return_only_outputs=True,
    )

    print(response)




def main(pdf_path='/content/Uma_Shankar_Gopalka_And_Anr_vs_State_Of_Jharkhand_And_Anr_on_13_May_2004.PDF'):
    """Index a PDF and then answer one question about it.

    Args:
        pdf_path: PDF to index (defaults to the path previously hard-coded here).
    """
    # Build the index first: user_input() loads "faiss_index" from disk, so the
    # index has to exist before any question can be answered.
    # The context manager closes the document once its text has been read.
    with fitz.open(pdf_path) as doc:
        text = "".join(page.get_text() for page in doc)

    text_chunks = get_text_chunks(text)
    get_vector_store(text_chunks)

    user_question = input("Ask a Question from the PDF Files")

    if user_question:
        user_input(user_question)



if __name__ == "__main__":
    main()

Collecting transformers==4.30
  Using cached transformers-4.30.0-py3-none-any.whl (7.2 MB)
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.30)
  Using cached tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[31mERROR: Operation cancelled by user[0m[31m
[31mERROR: Operation cancelled by user[0m[31m
[0mLooking in indexes: https://pypi.org/simple/
[31mERROR: Operation cancelled by user[0m[31m
[0m

KeyboardInterrupt: 

In [None]:
!curl ipinfo.io

{
  "ip": "35.237.107.164",
  "hostname": "164.107.237.35.bc.googleusercontent.com",
  "city": "North Charleston",
  "region": "South Carolina",
  "country": "US",
  "loc": "32.8546,-79.9748",
  "org": "AS396982 Google LLC",
  "postal": "29415",
  "timezone": "America/New_York",
  "readme": "https://ipinfo.io/missingauth"
}

In [None]:
!streamlit run /usr/local/lib/python3.10/dist-packages/colab_kernel_launcher.py


Collecting usage statistics. To deactivate, set browser.gatherUsageStats to False.
[0m
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://34.74.8.197:8501[0m
[0m
[34m  Stopping...[0m
^C


In [None]:
!npm install localtunnel

[K[?25h[37;40mnpm[0m [0m[30;43mWARN[0m [0m[35msaveError[0m ENOENT: no such file or directory, open '/content/package.json'
[0m[37;40mnpm[0m [0m[34;40mnotice[0m[35m[0m created a lockfile as package-lock.json. You should commit this file.
[0m[37;40mnpm[0m [0m[30;43mWARN[0m [0m[35menoent[0m ENOENT: no such file or directory, open '/content/package.json'
[0m[37;40mnpm[0m [0m[30;43mWARN[0m[35m[0m content No description
[0m[37;40mnpm[0m [0m[30;43mWARN[0m[35m[0m content No repository field.
[0m[37;40mnpm[0m [0m[30;43mWARN[0m[35m[0m content No README data
[0m[37;40mnpm[0m [0m[30;43mWARN[0m[35m[0m content No license field.
[0m
+ localtunnel@2.0.2
added 22 packages from 22 contributors and audited 22 packages in 2.793s

3 packages are looking for funding
  run `npm fund` for details

found 1 [93mmoderate[0m severity vulnerability
  run `npm audit fix` to fix them, or `npm audit` for details
[K[?25h

In [None]:
!streamlit run app.py &>/content/logs.txt &

In [None]:
!npx localtunnel --port 8501 & wget -q -O - https://loca.lt/mytunnelpassword

[K[?25hnpx: installed 22 in 1.518s
your url is: https://modern-bars-judge.loca.lt


In [None]:
34.74.8.197

