In [1]:
# get file path every row
import os
os.listdir("pdfs")

['50uu014.pdf',
 '51uu012.pdf',
 '51uu013.pdf',
 '51uu014.pdf',
 '51uu015.pdf',
 '51uu016.pdf',
 '51uu017.pdf',
 '51uu019.pdf',
 '51uu020.pdf',
 '51uu023.pdf',
 '51uu024.pdf',
 '52uu011.pdf',
 '52uu012.pdf',
 '52uu013.pdf',
 '52uu014.pdf',
 '52uu015.pdf',
 '52uu016.pdf',
 '52uu017.pdf',
 '52uu020.pdf',
 '52uu021.pdf',
 '52uu023.pdf',
 '52uu024.pdf',
 '59uu002.pdf',
 '59uu005.pdf',
 '61uu010.pdf',
 '61uu016.pdf',
 '61uu017.pdf',
 '61uu022.pdf',
 '64uu001.pdf',
 '66uu011.pdf',
 'LN11-2017%286018%29uu2-2017.pdf',
 'ln113tln6684-2021.pdf',
 'ln155tln6697-2021.pdf',
 'ln234tln6728-2021.pdf',
 'ln295tln6752-2021.pdf',
 'ln296tln6753-2021.pdf',
 'ln297tln6754-2021.pdf',
 'Nomor_21_Tahun_2022.pdf',
 'Salinan+UU+Nomor+27+Tahun+2022.pdf',
 'Salinan_UU_No_22_Tahun_2022.pdf',
 'TLN6018-2017+REVISI.pdf',
 'UU++NO+8+TH+1992.pdf',
 'UU+11+Tahun+2008.pdf',
 'UU+12+Tahun+2008.pdf',
 'UU+13+2016+Indonesia.pdf',
 'UU+13+Tahun+2008.pdf',
 'UU+13-2003.pdf',
 'UU+13-2003pjl.pdf',
 'UU+14+Tahun+2008.pdf',
 '

In [2]:
import os
from dotenv import load_dotenv
from llama_index.core.readers import SimpleDirectoryReader
from llama_index.core.node_parser import SimpleNodeParser
from llama_index.embeddings.azure_openai import AzureOpenAIEmbedding
from llama_index.llms.azure_openai import AzureOpenAI
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core import (
    download_loader,
    VectorStoreIndex,
    StorageContext,
    Settings,
    load_index_from_storage,
)
from llama_index.readers.file import PDFReader, EpubReader
from llama_index.vector_stores.pinecone import PineconeVectorStore
from pinecone import Pinecone

load_dotenv()


pc = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))
pc_index = pc.Index(host=os.environ.get("PINECONE_HOST"))

Settings.embed_model = AzureOpenAIEmbedding(
    model="text-embedding-3-large",
    deployment_name="corpu-text-embedding-3-large",
    api_key=os.getenv("AZURE_API_KEY"),
    azure_endpoint=os.getenv("AZURE_API_BASE"),
    api_version="2023-05-15",
)

# Settings.llm = AzureOpenAI(
#     model="text-davinci-003",
#     deployment_name="corpu-text-davinci-003",
#     temperature=0.4,
#     api_key=os.getenv("AZURE_API_KEY"),
#     azure_endpoint=os.getenv("AZURE_API_BASE"),
#     api_version="2023-05-15",
# )
Settings.node_parser = SimpleNodeParser.from_defaults(chunk_size=300, chunk_overlap=10)    

In [16]:
from pathlib import Path
import os
from tqdm import tqdm

pdf_directory = Path("e:/law-ai/pdfs")

if not pdf_directory.exists():
    raise FileNotFoundError(f"Directory does not exist: {pdf_directory}")

try:
    pdf_files = list(pdf_directory.glob("*.pdf"))
    if not pdf_files:
        raise FileNotFoundError(f"No PDF files found in directory: {pdf_directory}")
    print("PDF files found:", pdf_files)
except PermissionError as e:
    raise PermissionError(f"PermissionError: {e}")

try:
    readers = []
    for file in tqdm(pdf_files, desc="Loading PDF files"):
        readers.append(PDFReader().load_data(file))
except Exception as e:
    raise RuntimeError(f"Error reading PDF files: {e}")

vector_store = PineconeVectorStore(pinecone_index='https://law-ai-kbgyolu.svc.aped-4627-b74a.pinecone.io')
storage_context = StorageContext.from_defaults(vector_store=vector_store)

index = VectorStoreIndex.from_documents(
    documents=readers,
    show_progress=True,
    storage_context=storage_context,
)

print("Index created successfully.")


PDF files found: [WindowsPath('e:/law-ai/pdfs/50uu014.pdf'), WindowsPath('e:/law-ai/pdfs/51uu012.pdf'), WindowsPath('e:/law-ai/pdfs/51uu013.pdf'), WindowsPath('e:/law-ai/pdfs/51uu014.pdf'), WindowsPath('e:/law-ai/pdfs/51uu015.pdf'), WindowsPath('e:/law-ai/pdfs/51uu016.pdf'), WindowsPath('e:/law-ai/pdfs/51uu017.pdf'), WindowsPath('e:/law-ai/pdfs/51uu019.pdf'), WindowsPath('e:/law-ai/pdfs/51uu020.pdf'), WindowsPath('e:/law-ai/pdfs/51uu023.pdf'), WindowsPath('e:/law-ai/pdfs/51uu024.pdf'), WindowsPath('e:/law-ai/pdfs/52uu011.pdf'), WindowsPath('e:/law-ai/pdfs/52uu012.pdf'), WindowsPath('e:/law-ai/pdfs/52uu013.pdf'), WindowsPath('e:/law-ai/pdfs/52uu014.pdf'), WindowsPath('e:/law-ai/pdfs/52uu015.pdf'), WindowsPath('e:/law-ai/pdfs/52uu016.pdf'), WindowsPath('e:/law-ai/pdfs/52uu017.pdf'), WindowsPath('e:/law-ai/pdfs/52uu020.pdf'), WindowsPath('e:/law-ai/pdfs/52uu021.pdf'), WindowsPath('e:/law-ai/pdfs/52uu023.pdf'), WindowsPath('e:/law-ai/pdfs/52uu024.pdf'), WindowsPath('e:/law-ai/pdfs/59uu002.

Loading PDF files:  46%|████▌     | 885/1916 [06:11<10:05,  1.70it/s]  EOF marker not found
Loading PDF files:  46%|████▌     | 885/1916 [06:11<07:13,  2.38it/s]


RuntimeError: Error reading PDF files: Stream has ended unexpectedly

In [29]:
readers = EpubReader().load_data("data_pdf\Wes McKinney - Python for Data Analysis_ Data Wrangling with pandas, NumPy, and Jupyter-O'Reilly Media, Inc. (2022).epub")
for reader in readers:
    reader.metadata["competency"] = "Data Wrangling"
    reader.metadata["competency_grouping"] = "Data Science"
    reader.metadata["file_name"] = "Python for Data Analysis_ Data Wrangling with pandas, NumPy, and Jupyter"
    reader.metadata["publication_year"] = "2022"
    reader.metadata["publisher"] = "O'Reilly Media, Inc."
    reader.metadata["ISBN"] = "0636920519829"
    reader.metadata["author"] = "Wes McKinney"
    reader.metadata["source_type"] = "Book"
    reader.metadata["DOI"] = "-"
vector_store = PineconeVectorStore(pinecone_index=pc_index)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

index = VectorStoreIndex.from_documents(
    documents= readers,
    show_progress=True,
    storage_context=storage_context,
)

Parsing nodes: 100%|██████████| 1/1 [00:03<00:00,  3.19s/it]
Generating embeddings: 100%|██████████| 1754/1754 [03:28<00:00,  8.40it/s]
Upserted vectors: 100%|██████████| 1754/1754 [02:25<00:00, 12.02it/s]


In [30]:
readers = EpubReader().load_data("data_pdf\Elmer, Gary_ Elmer, Gary - 2021 Beginners Guide to Python Programming Language_ A Crash Course to Mastering Python in One Hour (2020).epub")
for reader in readers:
    reader.metadata["competency"] = "Data Analyst"
    reader.metadata["competency_grouping"] = "Data Business Group"
    reader.metadata["file_name"] = "Beginners Guide to Python Programming Language: A Crash Course to Mastering Python Programming Language in One Hour"
    reader.metadata["publication_year"] = "2020"
    reader.metadata["publisher"] = "-"
    reader.metadata["ISBN"] = "-"
    reader.metadata["author"] = "Gary Elmer"
    reader.metadata["source_type"] = "Book"
    reader.metadata["DOI"] = "-"
vector_store = PineconeVectorStore(pinecone_index=pc_index)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

index = VectorStoreIndex.from_documents(
    documents= readers,
    show_progress=True,
    storage_context=storage_context,
)

Parsing nodes: 100%|██████████| 1/1 [00:00<00:00,  7.48it/s]
Generating embeddings: 100%|██████████| 89/89 [00:09<00:00,  9.10it/s]
Upserted vectors: 100%|██████████| 89/89 [00:07<00:00, 11.35it/s]


In [31]:
readers = EpubReader().load_data("data_pdf\Jake Knapp, John Zeratsky, Braden Kowitz - Sprint_ How to Solve Big Problems and Test New Ideas in Just Five Days-Simon & Schuster (2016).epub")
for reader in readers:
    reader.metadata["competency"] = "Digital Product Innovation"
    reader.metadata["competency_grouping"] = "Digital Product Management"
    reader.metadata["file_name"] = "SPRINT How to Solve Big Problems and Test New Ideas in Just Five Days"
    reader.metadata["publication_year"] = "2016"
    reader.metadata["publisher"] = "-"
    reader.metadata["ISBN"] = "978-1-5011-2177-7"
    reader.metadata["author"] = "Jake Knapp, John Zeratsky, Braden Kowitz"
    reader.metadata["source_type"] = "Book"
    reader.metadata["DOI"] = "-"
vector_store = PineconeVectorStore(pinecone_index=pc_index)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

index = VectorStoreIndex.from_documents(
    documents= readers,
    show_progress=True,
    storage_context=storage_context,
)

Parsing nodes: 100%|██████████| 1/1 [00:01<00:00,  1.09s/it]
Generating embeddings: 100%|██████████| 496/496 [01:10<00:00,  7.05it/s]
Upserted vectors: 100%|██████████| 496/496 [00:45<00:00, 10.97it/s]


In [32]:
readers = EpubReader().load_data("data_pdf\Start With Why by Simon Sinek.epub")
for reader in readers :
    reader.metadata["competency"] = "Research & Innovation Management"
    reader.metadata["competency_grouping"] = "Problem Solving"
    reader.metadata["file_name"] = "Start With Why"
    reader.metadata["publication_year"] = "2009"
    reader.metadata["publisher"] = "Penguin Group"
    reader.metadata["ISBN"] = "978-1-101-14903-4"
    reader.metadata["author"] = "Simon Sinek"
    reader.metadata["source_type"] = "Book"
    reader.metadata["DOI"] = "-"
    
vector_store = PineconeVectorStore(pinecone_index=pc_index)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

index = VectorStoreIndex.from_documents(
    documents= readers,
    show_progress=True,
    storage_context=storage_context,
)

Parsing nodes: 100%|██████████| 1/1 [00:00<00:00,  1.34it/s]
Generating embeddings: 100%|██████████| 507/507 [01:13<00:00,  6.93it/s]
Upserted vectors: 100%|██████████| 507/507 [00:26<00:00, 19.03it/s]


In [None]:
for document in documents:
    if document.metadata.get('last_accessed_date') is None:
        document.metadata['last_accessed_date'] = document.metadata['creation_date']
    if document.metadata
    
vector_store = PineconeVectorStore(pinecone_index=pc_index)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

index = VectorStoreIndex.from_documents(
    documents=documents,
    show_progress=True,
    storage_context=storage_context,
)