In [1]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Pinecone as PineconeLang
from pinecone import Pinecone, ServerlessSpec
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os
from langchain.document_loaders import PyPDFLoader
from dotenv import load_dotenv

  from tqdm.autonotebook import tqdm


In [2]:
load_dotenv()

True

In [3]:
def create_vector_index(index_name, dimensions, metric, cloud_provider, region):
    """
    Create a vector index in Pinecone.

    Args:
    index_name (str): Name of the index.
    dimensions (int): Dimensionality of the vectors for the index.
    metric (str): Similarity metric (e.g., 'cosine', 'euclidean').
    cloud_provider (str): Cloud service provider (e.g., 'aws').
    region (str): Cloud region (e.g., 'us-east-1').

    Returns:
    None
    """
    pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
    
    # Create the index with the specified parameters
    pc.create_index(
        name=index_name,
        dimension=dimensions,
        metric=metric,
        spec=ServerlessSpec(
            cloud=cloud_provider,
            region=region
        )
    )

index_name = "chatbot"  # Give the index a meaningful name
vector_dimensions = 1536  # Replace with your model's vector dimensions
similarity_metric = "cosine"  # Metric used for vector similarity
cloud_provider = "aws"  # Cloud service provider
region = "us-east-1"  # Cloud region

# Create the vector index
create_vector_index(index_name, vector_dimensions, similarity_metric, cloud_provider, region)


In [4]:

def load_pdf_document(file_path):
    """
    Loads a PDF document using PyPDFLoader.

    Args:
    file_path (str): Path to the PDF file.

    Returns:
    list: A list of LangChain Document objects if the file is a valid PDF.
    None: If the file format is not PDF or an error occurs during loading.
    """
    _, file_extension = os.path.splitext(file_path)

    if file_extension.lower() != '.pdf':
        print(f'Unsupported file format: {file_extension}')
        return None

    print(f'Loading PDF file: {file_path}')
    loader = PyPDFLoader(file_path)
    try:
        document_data = loader.load()
        return document_data
    except Exception as e:
        print(f"Error loading PDF: {e}")
        return None

In [5]:

def process_document(documents, chunk_size=1000, chunk_overlap=200):
    """
    Processes a list of documents by splitting them into chunks.

    Args:
    documents (list): List of LangChain Document objects.
    chunk_size (int): Size of each document chunk (default 1000).
    chunk_overlap (int): Overlap between chunks (default 200).

    Returns:
    tuple: A tuple containing the document contents and their metadata.
    """
    # Split documents into chunks
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    chunks = text_splitter.split_documents(documents)

    # Extract contents and metadata
    document_contents = [chunk.page_content for chunk in chunks]
    document_metadata = [chunk.metadata for chunk in chunks]

    return document_contents, document_metadata


In [6]:

def uploaddoc(file_path):
    """
    Uploads and processes a PDF document, then indexes it using PineconeLang.

    Args:
    file_path (str): Path to the PDF file.

    Returns:
    None
    """
    # Load the PDF document
    documents = load_pdf_document(file_path)
    if not documents:
        print("No documents to process.")
        return

    # Log the number of pages
    print(f'You have {len(documents)} pages in your data')

    # Initialize OpenAI embeddings
    openai_api_key = os.getenv("OPENAI_API_KEY")  # Replace with actual key
    embedding_model = OpenAIEmbeddings(openai_api_key=openai_api_key)

    # Process the document into chunks
    document_contents, document_metadata = process_document(documents)

    # Log metadata and confirm processing
    print(f"Metadata for the document chunks: {document_metadata}")
    print(f"{file_path} processing is done")

    # Upload the document contents and metadata to PineconeLang
    try:
        PineconeLang.from_texts(
            texts=document_contents,
            metadatas=document_metadata,
            embedding=embedding_model,
            index_name="chatbot"  # Replace with your actual index name
        )
        print("Document successfully indexed.")
    except Exception as e:
        print(f"Error during indexing: {e}")

In [7]:
def process_pdfs_in_directory(directory_path):
    """
    Processes all PDF files in a given directory by calling `uploaddoc` on each file.

    Args:
    directory_path (str): Path to the directory containing PDF files.

    Returns:
    None
    """
    # Check if the directory exists
    if not os.path.isdir(directory_path):
        print(f"The directory {directory_path} does not exist.")
        return

    # Iterate over each file in the directory
    for file_name in os.listdir(directory_path):
        # Check if the file is a PDF
        if file_name.lower().endswith('.pdf'):
            file_path = os.path.join(directory_path, file_name)
            print(f"Processing file: {file_path}")

            # Call the function to upload and process the document
            try:
                uploaddoc(file_path)
            except Exception as e:
                print(f"Error processing file {file_path}: {e}")


In [9]:
if __name__ == "__main__":
    directory_path = "data"  # Replace with your directory path
    process_pdfs_in_directory(directory_path)

Processing file: data\jc306-un-staff-rev1_en.pdf
Loading PDF file: data\jc306-un-staff-rev1_en.pdf
You have 49 pages in your data
Metadata for the document chunks: [{'source': 'data\\jc306-un-staff-rev1_en.pdf', 'page': 0}, {'source': 'data\\jc306-un-staff-rev1_en.pdf', 'page': 1}, {'source': 'data\\jc306-un-staff-rev1_en.pdf', 'page': 1}, {'source': 'data\\jc306-un-staff-rev1_en.pdf', 'page': 2}, {'source': 'data\\jc306-un-staff-rev1_en.pdf', 'page': 3}, {'source': 'data\\jc306-un-staff-rev1_en.pdf', 'page': 4}, {'source': 'data\\jc306-un-staff-rev1_en.pdf', 'page': 4}, {'source': 'data\\jc306-un-staff-rev1_en.pdf', 'page': 4}, {'source': 'data\\jc306-un-staff-rev1_en.pdf', 'page': 5}, {'source': 'data\\jc306-un-staff-rev1_en.pdf', 'page': 5}, {'source': 'data\\jc306-un-staff-rev1_en.pdf', 'page': 5}, {'source': 'data\\jc306-un-staff-rev1_en.pdf', 'page': 6}, {'source': 'data\\jc306-un-staff-rev1_en.pdf', 'page': 6}, {'source': 'data\\jc306-un-staff-rev1_en.pdf', 'page': 6}, {'source'