In [None]:
!pip install pdfplumber
!pip install pinecone
!pip install langchain

In [None]:
# Source documents (these URLs appear to be where the PDFs under data/ were obtained; TODO confirm):
# "https://www.abudhabi.gov.ae/-/media/sites/adgov/gazettes/2023/en/first-edition-english-2023.ashx",
# "https://www.researchgate.net/profile/M-Dawoud/publication/337936386_GROUNDWATER_ATLAS_OF_ABU_DHABI_EMIRATE/links/5df5bb1f299bf10bc35c7b99/GROUNDWATER-ATLAS-OF-ABU-DHABI-EMIRATE.pdf"

In [1]:
import os
import re
import pdfplumber
import pinecone
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# API keys and Pinecone settings.
# Secrets are read from environment variables so that real keys never have to
# be written into (and committed with) the notebook. The "KEY" placeholder is
# kept as the fallback so the cell still runs when the variables are unset.
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "KEY")
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY", "KEY")
PINECONE_ENVIRONMENT = "us-east-1" # e.g., "us-west1-gcp"
# NOTE(review): the cells below hard-code the index name "doc-index" and do not
# read this constant; confirm which name is intended.
PINECONE_INDEX_NAME = "dense-index"


In [2]:

# Define a function to preprocess text
def preprocess_text(text):
    """Return ``text`` with every run of whitespace collapsed to one space.

    Spaces, tabs and newlines all count as whitespace. Leading and trailing
    runs are collapsed too, not stripped.
    """
    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub(' ', text)

def process_pdf(file_path):
    """Load a PDF, split it into chunks and return one record per chunk.

    Args:
        file_path: Path of the PDF file to load with ``PyPDFLoader``.

    Returns:
        A list of dicts, one per chunk, each with two keys:
        ``"page_content"`` (the chunk text) and ``"_id"`` (a string built from
        the chunk's ``source``, ``page`` and ``page_label`` metadata).

    Raises:
        KeyError: If a chunk's metadata lacks ``source``, ``page`` or
            ``page_label``.

    The splitter copies the parent document's metadata onto every chunk, so
    several chunks cut from the same page would otherwise share one ``_id``
    and overwrite each other when upserted. To keep ids unique, the first
    chunk of a page keeps the plain ``source_page_label`` id and every
    further chunk of that page gets a ``_<n>`` suffix (``_1``, ``_2``, ...).
    """
    # Load the PDF into Document objects.
    loader = PyPDFLoader(file_path)
    data = loader.load()
    # Split the loaded documents into smaller chunks (size 1000, no overlap).
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    documents = text_splitter.split_documents(data)

    # Convert the Document objects into plain dict records.
    # NOTE(review): preprocess_text is defined in this notebook but is not
    # applied to the chunk text here; confirm whether that is intended.
    texts = []
    chunks_seen = {}  # base id -> number of chunks already emitted for it
    for doc in documents:
        base_id = doc.metadata['source']+"_"+str(doc.metadata['page'])+"_"+doc.metadata['page_label']
        ordinal = chunks_seen.get(base_id, 0)
        chunks_seen[base_id] = ordinal + 1
        # First chunk keeps the original id; later ones are disambiguated.
        record_id = base_id if ordinal == 0 else base_id + "_" + str(ordinal)
        texts.append({"page_content": doc.page_content, "_id": record_id})
    return texts



In [3]:
import glob
# Only pick up PDFs: every discovered path is later handed to the PDF loader,
# so a stray non-PDF file in data/ would make the run fail. Sorting makes the
# processing order deterministic (glob returns paths in arbitrary order).
files = sorted(glob.glob("data/*.pdf"))
print(files)

['data/AbuDhabiGroundwaterAtlas2018.pdf', 'data/First Edition English 2023.pdf']


In [4]:
# Gather the chunk records of every discovered file into one flat list.
texts = []
for file in files:
    texts.extend(process_pdf(file))

In [7]:

from pinecone import Pinecone

# Pinecone client authenticated with the configured API key.
pc = Pinecone(api_key=PINECONE_API_KEY)

index_name = "doc-index"

# Create the index only when it does not exist yet, so re-running this cell
# does not try to create it a second time.
if not pc.has_index(index_name):
    pc.create_index_for_model(
        name=index_name,
        cloud="aws",
        region="us-east-1",
        # The field_map points the model's "text" input at the "page_content"
        # field, which is the key the chunk records in this notebook use.
        embed=dict(
            model="llama-text-embed-v2",
            field_map={"text": "page_content"},
        ),
    )

In [8]:
# Total number of chunk records collected from all processed files.
len(texts)

319

In [9]:
import itertools
def chunks(iterable, batch_size=200):
    """Lazily yield successive tuples of up to ``batch_size`` items.

    Items are taken from ``iterable`` in order. The final tuple may be shorter
    than ``batch_size``, and nothing is yielded for an empty iterable.
    """
    source = iter(iterable)
    # iter(callable, sentinel) keeps pulling batches until an empty tuple comes
    # back, i.e. until the underlying iterator is exhausted.
    yield from iter(lambda: tuple(itertools.islice(source, batch_size)), ())

In [None]:
index_name="doc-index"
dense_index = pc.Index(name=index_name)

# Send the records to the "abudhabi" namespace, two records per request.
# NOTE(review): a batch size of 2 means one request for every two records;
# check Pinecone's documented batch limits before raising it.
for single_chunk in chunks(texts, batch_size=2):
    dense_index.upsert_records(namespace="abudhabi", records=single_chunk)
