In [1]:
# python 3.10
! pip install faiss-cpu==1.8.0 gpt4all==2.2.1.post1 pdfminer-six==20231228
! pip install -U sentence-transformers==2.5.1

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [2]:
! pip freeze

absl-py==1.4.0
aiohttp==3.9.3
aiosignal==1.3.1
alembic==1.13.0
annotated-types==0.6.0
anyio==4.0.0
appdirs==1.4.4
argon2-cffi==23.1.0
argon2-cffi-bindings==21.2.0
array-record==0.5.0
arrow==1.3.0
asgiref==3.8.0
asttokens==2.4.1
astunparse==1.6.3
async-lru==2.0.4
async-timeout==4.0.3
attrs==23.1.0
Babel==2.13.0
backcall==0.2.0
backoff==2.2.1
bcrypt==4.1.2
beautifulsoup4==4.12.2
beniget==0.4.1
bleach==6.1.0
blinker==1.7.0
Brlapi==0.8.3
Brotli==1.0.9
build==1.1.1
cachetools==5.3.2
catboost==1.2.2
certifi==2020.6.20
cffi==1.16.0
chardet==4.0.0
charset-normalizer==3.3.0
chroma-hnswlib==0.7.3
click==8.1.7
cloudpickle==3.0.0
colorama==0.4.4
coloredlogs==15.0.1
comm==0.2.1
command-not-found==0.3
contourpy==1.1.1
cryptography==42.0.5
cupshelpers==1.0
cycler==0.12.1
databricks-cli==0.18.0
dataclasses-json==0.6.4
dbus-python==1.2.18
debugpy==1.8.1
decorator==5.1.1
defer==1.0.6
defusedxml==0.7.1
Deprecated==1.2.14
distro==1.7.0
dm-tree==0.1.8
docker==6.1.3
entrypoints==0.4
etils==1.7.0
exceptiongr

In [3]:
import re
import copy
import faiss
import numpy as np
from gpt4all import GPT4All
from pdfminer.high_level import extract_pages
from sentence_transformers import SentenceTransformer
from pdfminer.layout import LTTextContainer, LTChar,LTTextLine

In [4]:
# Location of the PDF to index. This is a machine-specific absolute path and
# has to be adapted before running the notebook elsewhere.
#data_path = "C:/Users/Admin/Downloads/ATT-F700U-EN-UM-TN-TAH-020420-FINAL-WEB.pdf"
data_path="/home/flores-cuba/Documents/Gith_Cod/LLM/RAG/Planet_Earth_Topic_3.pdf"
# Upper bound, in characters, for each merged chunk built by
# preprocess_extracted_data (lengths are compared with len() on the raw text).
max_len = 500
# Passed to GPT4All.generate as the generation cap AND substituted into the
# prompt as a word count ("in {max_tokens} words").
# NOTE(review): tokens and words are not the same unit - confirm this is intended.
max_tokens=200

In [5]:
def retrieve_data(path):
    """Extract the text of every text container in a PDF, in layout order.

    Args:
        path: Location of the PDF; passed unchanged to pdfminer's
            ``extract_pages``.

    Returns:
        list[str]: One entry per ``LTTextContainer`` encountered while walking
        the pages, holding that container's unmodified ``get_text()`` result.
    """
    # Font metadata is deliberately not collected: nothing downstream reads it,
    # and gathering it per character fails (unbound or stale values) for text
    # containers that hold no LTChar children.
    return [
        element.get_text()
        for page_layout in extract_pages(path)
        for element in page_layout
        if isinstance(element, LTTextContainer)
    ]

In [6]:
def merge_texts(texts, max_len):
    """Concatenate the leading run of ``texts`` that fits within ``max_len``.

    Fragments are taken in order. As soon as adding the next fragment would
    push the total length past ``max_len``, that fragment and everything after
    it are ignored (later, shorter fragments are NOT considered).

    Args:
        texts: Iterable of strings to concatenate.
        max_len: Maximum total length, in characters, of the result.

    Returns:
        str: The concatenation of the accepted prefix; ``''`` when even the
        first fragment is too long or ``texts`` is empty.
    """
    accepted = []
    total_chars = 0
    for fragment in texts:
        total_chars += len(fragment)
        if total_chars > max_len:
            # Stop at the first fragment that does not fit.
            break
        accepted.append(fragment)
    return ''.join(accepted)

In [7]:
def preprocess_extracted_data(extracted_text, chunk_len=None):
    """Pack consecutive text fragments into whitespace-normalised chunks.

    Fragments are appended to the current chunk, in order, while the chunk's
    raw length stays within the limit; otherwise a new chunk is started.
    A fragment that is longer than the limit on its own is cut into
    limit-sized slices so that its content is kept instead of being dropped.

    Args:
        extracted_text: Iterable of strings (e.g. the output of
            ``retrieve_data``).
        chunk_len: Maximum raw length of a chunk, in characters, as a positive
            int. ``None`` (default) uses the module-level ``max_len``.

    Returns:
        list[str]: Non-empty chunks with every whitespace run collapsed to a
        single space and leading/trailing whitespace removed.

    Raises:
        ValueError: If the effective chunk length is not positive.
    """
    limit = max_len if chunk_len is None else chunk_len
    if limit <= 0:
        raise ValueError("chunk length must be a positive number of characters")

    chunks = []
    current_parts = []
    current_length = 0

    for text in extracted_text:
        # Slicing is a no-op for fragments that already fit (one slice) and
        # yields nothing for empty fragments; oversized fragments become
        # several slices, each of which is guaranteed to fit in a chunk.
        for start in range(0, len(text), limit):
            piece = text[start:start + limit]
            if current_length + len(piece) > limit:
                chunks.append(''.join(current_parts))
                current_parts = []
                current_length = 0
            current_parts.append(piece)
            current_length += len(piece)

    # Flush whatever is left in the last, partially filled chunk.
    chunks.append(''.join(current_parts))

    # r'\s+' already covers newlines, so one substitution normalises everything;
    # chunks that are empty or whitespace-only are discarded.
    return [re.sub(r'\s+', ' ', chunk.strip()) for chunk in chunks if chunk.strip()]

In [8]:
# Pull the raw text of every text container out of the PDF at data_path.
extracted_text = retrieve_data(data_path)
# NOTE(review): this prints the whole extracted list; consider printing a
# slice if the document is large.
print(extracted_text)


# Pack the raw fragments into chunks of at most max_len characters and
# normalise their whitespace; these chunks are what gets embedded later.
corpus_of_documents = preprocess_extracted_data(extracted_text)



In [9]:
# Display the merged chunks that will be embedded and indexed.
corpus_of_documents

['© Roger N. Anderson Planet Earth: An Introduction to Earth Sciences Topic 3: Plate Tectonics Roger N. Anderson Columbia University 1 © Roger N. Anderson Planet Earth Topic 3: Plate Tectonics Roger N. Anderson',
 'Figure 3-1. The global mid-ocean ridge system as seen on this bathymetry map created by scientists from Lamont Doherty Earth Observatory of Columbia. The topography of the ocean basins was determined from satellite measurements of the ocean surface, that is deflected slightly to mirror the sea floor topography. The mid-ocean ridge spreading centers form the extensional component of the',
 'theory of Plate Tectonics. But how is new sea floor continuously formed at the mid-ocean spreading centers such as at the Mid-Atlantic Ridge, that grows the Atlantic Ocean and separates North and South America farther and farther from Europe and Africa without expanding the Earth? 2 © Roger N. Anderson Figure 3-2. The Lithospheric Plates that define the surface dynamics of Planet Earth.',


In [10]:
# Embed every chunk with a pretrained sentence-transformers model.
model = SentenceTransformer('paraphrase-mpnet-base-v2')
vectors = model.encode(corpus_of_documents)
# The index is sized from the second axis of the encoded matrix.
vector_dimension = vectors.shape[1]
index = faiss.IndexFlatL2(vector_dimension)
# Normalise the embeddings in place before adding them. search() normalises
# the query the same way, so L2 distance on these unit vectors orders results
# the same way cosine similarity would.
faiss.normalize_L2(vectors)
index.add(vectors)

In [11]:
def search(user_input, k=None):
    """Rank the indexed chunks by similarity to a query.

    Args:
        user_input: Query text; embedded with the module-level ``model``.
        k: Number of nearest neighbours to return (positive int). ``None``
            (default) returns every vector in the module-level ``index``.
            Values larger than the index size are clamped to it.

    Returns:
        tuple: ``(distances, ids)`` for the single query - two 1-D arrays of
        equal length, ordered as returned by ``index.search``; callers treat
        position 0 as the best match.
    """
    search_vector = model.encode(user_input)
    # FAISS works on a 2-D float32 matrix, so wrap the single embedding in a batch of one.
    _vector = np.array([search_vector], dtype=np.float32)
    # Normalise the query the same way the indexed vectors were normalised.
    faiss.normalize_L2(_vector)
    # Never ask for more neighbours than exist; FAISS would otherwise pad the
    # result with placeholder ids (-1).
    top_k = index.ntotal if k is None else min(k, index.ntotal)
    distances, ann = index.search(_vector, k=top_k)
    return distances[0], ann[0]

In [12]:
# Question to answer; the commented-out line is an alternative query kept
# from an earlier document.
#user_input = "Tell me about camera"
user_input = "Tell me about how do earthquakes occur"

# search() returns distances and chunk ids for the query; position 0 is used
# as the closest match.
dis , ann = search(user_input)
# Only the single closest chunk is used as context for the LLM.
relevant_document = corpus_of_documents[ann[0]]
print("relevant data >>>>>>> ",relevant_document)


relevant data >>>>>>>  4 © Roger N. Anderson But what controls earthquakes so precisely as to align them into belts? Moreover,


In [13]:
# Instruction-style prompt template filled in with str.format using the
# placeholders {max_tokens}, {relevant_document} and {user_input}.
# NOTE(review): {max_tokens} is presented to the model as a number of *words*,
# while the same value is used as a token cap at generation time - confirm
# this is intended.
prompt = """
[INST] 
Answer the question based only on the following relevant document in {max_tokens} words:
{relevant_document}

Question: {user_input}
[/INST]
"""

In [14]:

# Fill the template with the question, the retrieved chunk and the length hint.
text = prompt.format(user_input=user_input, relevant_document=relevant_document,max_tokens=max_tokens)

print("input text to llm >> ", text)

# gpt_model from : https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF
# The model is loaded from a machine-specific absolute path, and
# allow_download=False means the file has to exist there already.
gpt_model = GPT4All("/home/flores-cuba/Documents/LLM_models/mistral-7b-instruct-v0.1.Q3_K_S.gguf", allow_download= False)
# max_tokens caps the generated length, so an answer can stop mid-sentence
# when the cap is reached; temp=0.1 is presumably chosen to keep sampling
# close to deterministic.
llm_output = gpt_model.generate(prompt=text, max_tokens=max_tokens, temp=0.1)
print("Mistral Output : ",llm_output)


input text to llm >>  
[INST] 
Answer the question based only on the following relevant document in 200 words:
4 © Roger N. Anderson But what controls earthquakes so precisely as to align them into belts? Moreover,

Question: Tell me about how do earthquakes occur
[/INST]

Mistral Output :  Earthquakes are a natural phenomenon that occurs when there is a sudden release of energy in the Earth's crust. This can happen due to various reasons such as volcanic activity, human activities like mining and drilling, or natural processes like plate tectonics. The movement of tectonic plates causes stress on the rocks beneath them, which eventually leads to the formation of fault lines. When these fault lines become overloaded with pressure, they can suddenly rupture, releasing energy in the form of seismic waves that cause earthquakes.

The exact mechanism behind how earthquakes occur is not fully understood, but scientists have identified several factors that contribute to their occurrence. The