In [3]:
import re
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import pandas as pd
from langchain_core.documents import Document

In [4]:
# config for the RAG
from config_rag import (
    BOOK_LIST,
    CHUNK_SIZE,
    CHUNK_OVERLAP,
)


def post_process(splits):
    for split in splits:
        # replace newline with blank
        split.page_content = split.page_content.replace("\n", " ")
        split.page_content = re.sub("[^a-zA-Z0-9 \n\.]", " ", split.page_content)
        # remove duplicate blank
        split.page_content = " ".join(split.page_content.split())

    return splits


# load all pages from pdf books
def load_all_pages(book_list):
    all_pages = []

    for book in book_list:
        print(f"Loading book: {book}...")
        loader = PyPDFLoader(book)

        # loader split in pages
        pages = loader.load()

        all_pages.extend(pages)

        print(f"Loaded {len(pages)} pages...")

    return all_pages


#
# Split pages in chunk
#
def split_in_chunks(all_pages):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP
    )

    splits = text_splitter.split_documents(all_pages)

    # some post processing on text
    splits = post_process(splits)

    print(f"Splitted the pdf in {len(splits)} chunks...")
    print("Size of splits: " + str(text_splitter.__sizeof__()) + "bytes")

    return splits

  split.page_content = re.sub("[^a-zA-Z0-9 \n\.]", " ", split.page_content)


In [5]:
# 1. Load a list of pdf documents
all_pages = load_all_pages(BOOK_LIST)
print(f"all pages: \n {all_pages} \n {"*" * 20}")

# 2. Split pages in chunks
document_splits = split_in_chunks(all_pages)
print(f"document_splits: \n {document_splits} \n {"*" * 20}")

Loading book: pdfFiles/sharding-adg-addshard-cookbook-3610618.pdf...
Loaded 86 pages...
Loading book: pdfFiles/globally-distributed-autonomous-database.pdf...
Loaded 8 pages...
Loading book: pdfFiles/oracle-database-23c.pdf...
Loaded 120 pages...
Loading book: pdfFiles/oracle-globally-distributed-database-guide.pdf...
Loaded 378 pages...
Loading book: pdfFiles/Oracle True cache.pdf...
Loaded 58 pages...
Loading book: pdfFiles/Trobleshooting.pdf...
Loaded 17 pages...
all pages: 
 ********************
Splitted the pdf in 1561 chunks...
Size of splits: 16bytes
document_splits: 
 ********************


In [7]:
document_splits

[Document(metadata={'producer': 'Mac OS X 10.9.5 Quartz PDFContext', 'creator': 'Word', 'creationdate': '2017-09-21T21:02:58+00:00', 'author': 'Oracle Corporation', 'keywords': 'Oracle, Oracle Corporation, sharding, Oracle database, data, replication of data, cookbook, OLTP, distribution of data', 'moddate': '2021-04-22T14:17:39+03:00', 'subject': 'Oracle Sharding is a scalability and availability feature for custom-designed OLTP applications that enables distribution and replication of data across a pool of discrete Oracle databases that share no hardware or software.', 'title': 'Sharding ADG Addshard Cookbook', 'source': 'pdfFiles/sharding-adg-addshard-cookbook-3610618.pdf', 'total_pages': 86, 'page': 0, 'page_label': '1'}, page_content='System Managed Sharding with Active Data Guard using ADD SHARD Method Cookbook SEPTEMBER 2017'),
 Document(metadata={'producer': 'Mac OS X 10.9.5 Quartz PDFContext', 'creator': 'Word', 'creationdate': '2017-09-21T21:02:58+00:00', 'author': 'Oracle Co

In [8]:
# 1. Load the csv and format as documents
detailed = pd.read_csv('pdfFiles/preprocessed.csv')
todrop = ['text_processed', 'text_no_stopwords', 'text_stemmed', 'text_lemmatized', 'status', 'scraped_timestamp']
detailed.drop(columns=todrop, axis=1, inplace=True)
detailed = detailed.dropna().astype(str)

In [10]:
detailed.head(3)

Unnamed: 0,id,url,title,word_count,character_count,depth,links_found,domain,path,url_type,section,text
0,1,https://docs.oracle.com/en-us/iaas/Content/gen...,Overview of Generative AI Service,1130,7103,0,9,docs.oracle.com,/en-us/iaas/Content/generative-ai/overview.htm,Overview,Overview,Overview of Generative AI Service Generative A...
1,2,https://docs.oracle.com/en-us/iaas/Content/gen...,Overview of Generative AI Service,1130,7103,0,9,docs.oracle.com,/en-us/iaas/Content/generative-ai/overview.htm,Overview,Overview,Overview of Generative AI Service Generative A...
2,3,https://docs.oracle.com/en-us/iaas/Content/gen...,Concepts for Generative AI,1385,8471,0,3,docs.oracle.com,/en-us/iaas/Content/generative-ai/concepts.htm,Concepts,Concepts,Concepts for Generative AI To help you underat...


In [13]:
# Convert each row into a Document object
documents = [
    Document(
        page_content=row['text'],  # this will be chunked later
        metadata={k: v for k, v in row.items() if k != 'text'}
    )
    for _, row in detailed.iterrows()
]

# 2. Split pages in chunks
document_splits = split_in_chunks(documents)

Splitted the pdf in 708 chunks...
Size of splits: 16bytes


In [14]:
document_splits

[Document(metadata={'id': '1', 'url': 'https://docs.oracle.com/en-us/iaas/Content/generative-ai/overview.htm', 'title': 'Overview of Generative AI Service', 'word_count': '1130', 'character_count': '7103', 'depth': '0', 'links_found': '9', 'domain': 'docs.oracle.com', 'path': '/en-us/iaas/Content/generative-ai/overview.htm', 'url_type': 'Overview', 'section': 'Overview'}, page_content='Overview of Generative AI Service Generative AI is a fully managed Oracle Cloud Infrastructure service that provides a set of state of the art customizable large language models LLMs that cover a wide range of use cases including chat text generation summarization and creating text embeddings. Use the playground the API or the CLI to try out the ready to use pretrained models or create and host your own fine tuned custom models based on your own data on dedicated AI clusters. The OCI Generative AI service includes the following foundational models for chat rerank and text embeddings. Chat Ask questions a