In [88]:
import os
from dotenv import load_dotenv
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.embeddings import HuggingFaceEmbeddings
from pinecone import Pinecone, ServerlessSpec

In [89]:
# Load settings from a .env file into the process environment so that the
# os.getenv() lookups in the Pinecone configuration cell can find them.
load_dotenv()

# Step 1: Load and preprocess PDF data
def load_pdf(data_path):
    """Load the PDF files found in ``data_path``.

    Args:
        data_path: Directory that is searched with the ``*.pdf`` glob pattern.

    Returns:
        The result of ``DirectoryLoader.load()`` for the matched files, each
        of which is read with ``PyPDFLoader``.
    """
    pdf_loader = DirectoryLoader(
        data_path,
        glob="*.pdf",
        loader_cls=PyPDFLoader,
    )
    return pdf_loader.load()

# Load the PDFs from the "data/" directory (a relative path, so it is resolved
# against the kernel's current working directory).
extracted_data = load_pdf("data/")

def text_split(extracted_data, chunk_size=500, chunk_overlap=20):
    """Split loaded documents into overlapping text chunks.

    Args:
        extracted_data: Documents to split, in whatever form
            ``RecursiveCharacterTextSplitter.split_documents`` accepts.
        chunk_size: Maximum size of a chunk, forwarded to the splitter.
            Defaults to 500, the value previously hard-coded here.
        chunk_overlap: Overlap between consecutive chunks, forwarded to the
            splitter. Defaults to 20, the value previously hard-coded here.

    Returns:
        The chunks returned by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)

# Split the loaded documents into chunks and report how many were produced.
text_chunks = text_split(extracted_data)
print("Number of chunks:", len(text_chunks))

Number of chunks: 5860


In [90]:
# Step 2: Initialize embeddings
def download_hugging_face_embeddings(model_name="sentence-transformers/all-MiniLM-L6-v2"):
    """Create the HuggingFace embedding model used to vectorise text chunks.

    Args:
        model_name: Model identifier passed to ``HuggingFaceEmbeddings``.
            Defaults to the model this notebook originally hard-coded.
            NOTE(review): if you change the model, the ``DIMENSION`` setting
            used for the Pinecone index presumably has to match the new
            model's output size — confirm before upserting.

    Returns:
        The constructed ``HuggingFaceEmbeddings`` instance.
    """
    return HuggingFaceEmbeddings(model_name=model_name)

# Build the embedding model once; this object is passed to the upsert step below.
embeddings = download_hugging_face_embeddings()

In [91]:
# Step 3: Configure Pinecone
# Settings come from the environment (populated by load_dotenv() earlier).
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
INDEX_NAME = os.getenv("INDEX_NAME")
_dimension_raw = os.getenv("DIMENSION")

# Fail early with a readable message instead of an opaque TypeError from
# int(None) or a confusing client error further down.
_missing_settings = [
    setting_name
    for setting_name, setting_value in (
        ("PINECONE_API_KEY", PINECONE_API_KEY),
        ("INDEX_NAME", INDEX_NAME),
        ("DIMENSION", _dimension_raw),
    )
    if not setting_value
]
if _missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(_missing_settings)
    )

try:
    DIMENSION = int(_dimension_raw)
except ValueError as e:
    raise ValueError(f"DIMENSION must be an integer, got {_dimension_raw!r}") from e

# Initialize Pinecone client
pc = Pinecone(api_key=PINECONE_API_KEY)

# Check if the index exists and create it only if it doesn't
try:
    # Loop variable deliberately not called `index`, which is the name of the
    # handle created at the end of this cell.
    existing_indexes = [index_info.name for index_info in pc.list_indexes()]
    if INDEX_NAME in existing_indexes:
        print(f"Index '{INDEX_NAME}' already exists.")
    else:
        print(f"Index '{INDEX_NAME}' does not exist. Creating index...")
        pc.create_index(
            name=INDEX_NAME,
            dimension=DIMENSION,
            metric="cosine",
            spec=ServerlessSpec(
                cloud="aws",
                region="us-east-1",
            ),
        )
        print(f"Index '{INDEX_NAME}' created successfully.")
except Exception as e:
    print(f"Error during index management: {e}")
    # Re-raise so the notebook stops here; continuing would only postpone the
    # failure to the connect/upsert steps below.
    raise

# Connect to the existing or newly created index
try:
    index = pc.Index(INDEX_NAME)
    print(f"Connected to index '{INDEX_NAME}'.")
except Exception as e:
    print(f"Error connecting to index '{INDEX_NAME}': {e}")
    # Re-raise: if this were swallowed, `index` would be undefined and the
    # upsert cell would fail with a NameError instead of the real cause.
    raise

Index 'chatbot' already exists.
Connected to index 'chatbot'.


In [87]:
# Step 4: Upsert data into Pinecone
def upsert_embeddings_to_pinecone(index, text_chunks, embeddings, batch_size=100):
    """Embed every chunk's text and upsert the vectors into ``index`` in batches.

    Each vector is sent as an ``(id, values, metadata)`` tuple. The id is
    ``"id-<position>"``, where position is the chunk's position in
    ``text_chunks``, and the metadata is ``{"page_content": <chunk text>}``.
    Because ids are positional, re-running with the same chunk list reuses
    the same ids.

    Args:
        index: Object exposing ``upsert(vectors=...)`` (the Pinecone index handle).
        text_chunks: Sequence of objects with a ``page_content`` attribute.
        embeddings: Object exposing ``embed_documents(texts)``; it is expected
            to return one vector per input text, in order.
        batch_size: Number of vectors sent per ``upsert`` call. Must be positive.

    Raises:
        ValueError: If ``batch_size`` is not positive, or if the embedder
            returns a different number of vectors than texts supplied.
    """
    # A negative step would make range() empty (nothing upserted, yet
    # "Upsert complete." printed); zero would raise an obscure range() error.
    if batch_size <= 0:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    chunk_texts = [chunk.page_content for chunk in text_chunks]
    if not chunk_texts:
        # Nothing to embed or send; avoid calling the embedder on an empty list.
        print("No chunks to upsert.")
        return

    chunk_embeddings = embeddings.embed_documents(chunk_texts)
    if len(chunk_embeddings) != len(chunk_texts):
        raise ValueError(
            f"Expected {len(chunk_texts)} embeddings, got {len(chunk_embeddings)}"
        )

    for start in range(0, len(chunk_texts), batch_size):
        stop = start + batch_size
        vectors = [
            (f"id-{position}", vector, {"page_content": text})
            for position, (vector, text) in enumerate(
                zip(chunk_embeddings[start:stop], chunk_texts[start:stop]),
                start=start,
            )
        ]
        index.upsert(vectors=vectors)

    print("Upsert complete.")

# Requires `index`, `text_chunks` and `embeddings` from the cells above to be
# defined in the running kernel.
upsert_embeddings_to_pinecone(index, text_chunks, embeddings)

Upsert complete.
