# In [None]:
import os
import logging
import hashlib
import json
import boto3
from dotenv import load_dotenv
from tqdm import tqdm
import time
import re
import unicodedata
import string
%matplotlib inline


from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import (
    UnstructuredPDFLoader, UnstructuredWordDocumentLoader
)
from langchain_openai import OpenAIEmbeddings
from pinecone import Pinecone, ServerlessSpec

from langchain_ollama import OllamaEmbeddings

# ---------- Load env variables -----------
# Read secrets and tunables from a local .env file (if any) into the process
# environment before anything below looks them up.
load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
PINECONE_API_ENV = os.getenv("PINECONE_API_ENV", "us-east-1")
INDEX_NAME = "ut-rag-app-2"

# Maximum number of vectors sent in one Pinecone upsert request.
BATCH_SIZE = int(os.getenv("PINECONE_BATCH_SIZE", 100))
# Byte budget for one upsert request. Pinecone documents a 2MB cap per upsert
# request, so the budget must not exceed it (it was previously 4MB).
MAX_VECTOR_PAYLOAD_BYTES = 2 * 1024 * 1024  # 2MB
EMBED_BATCH_SIZE = 50  # number of texts sent per embedding request

AWS_REGION = "us-east-2"  # region from your ARN
DYNAMO_TABLE_NAME = "ScrapedPages"  # from your ARN

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# ---------- Pinecone init -----------
# Create the serverless index on first run, then open a handle to it.
pc = Pinecone(api_key=PINECONE_API_KEY)
if INDEX_NAME not in pc.list_indexes().names():
    pc.create_index(
        name=INDEX_NAME,
        dimension=1536,  # Must match your embedding dimension
        spec=ServerlessSpec(cloud='aws', region=PINECONE_API_ENV)
    )
pinecone_index = pc.Index(INDEX_NAME)

# ---------- Embeddings init -----------
# Use OllamaEmbeddings or OpenAIEmbeddings, but be sure the model's output
# dimension matches the `dimension` the Pinecone index was created with.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
# embeddings = OllamaEmbeddings(model="llama3")
# embeddings = OpenAIEmbeddings(model="text-embedding-3-large", openai_api_key=OPENAI_API_KEY)

# ---------- DynamoDB init -----------
# The key id is read from AWS_ACCESS_KEY first (the name this project has been
# using) and falls back to the standard AWS_ACCESS_KEY_ID, so a standard
# environment does not end up passing a secret without its key id.
dynamo = boto3.resource(
    "dynamodb",
    region_name=AWS_REGION,
    aws_access_key_id=os.getenv("AWS_ACCESS_KEY") or os.getenv("AWS_ACCESS_KEY_ID"),
    aws_secret_access_key=os.getenv("AWS_SECRET_ACCESS_KEY"),
)
table = dynamo.Table(DYNAMO_TABLE_NAME)

########################################
# Comprehensive Text Preprocessing
########################################

def preprocess_text(text: str) -> str:
    """
    Clean raw scraped text before it is embedded.

    Two steps are applied, in this order:
    1) Every HTML-like tag (``<...>``) is replaced by a single space.
    2) Each run of whitespace is collapsed to one space and the result is
       stripped at both ends.

    Letter case, unicode form and punctuation are left exactly as they came in.
    """
    # Tags become spaces (not empty strings) so adjacent words stay separated.
    without_tags = re.sub(r"<[^>]+>", " ", text)
    single_spaced = re.sub(r"\s+", " ", without_tags)
    return single_spaced.strip()


def fetch_items_from_dynamodb() -> list:
    """
    Scan the whole DynamoDB table and return every item found.

    The scan is repeated for as long as a response carries a
    'LastEvaluatedKey', resuming from that key each time, and the items of
    every page are gathered into one list.
    """
    logger.info("Fetching items from DynamoDB table: %s", DYNAMO_TABLE_NAME)

    page = table.scan()
    collected = list(page.get("Items", []))

    # Keep following the pagination marker until a page arrives without one.
    while 'LastEvaluatedKey' in page:
        resume_from = page['LastEvaluatedKey']
        page = table.scan(ExclusiveStartKey=resume_from)
        collected += page.get("Items", [])

    total = len(collected)
    logger.info(f"Fetched {total} items from DynamoDB.")
    return collected


def build_documents_from_items(items: list) -> list:
    """
    Turn DynamoDB items into Documents.

    For each item, the 'scraped_text' value is run through preprocess_text()
    and becomes the page content; the 'url' value (or "N/A" when missing or
    empty) is stored under metadata["url"]. Items whose 'scraped_text' is
    empty, absent or not a string are logged and skipped.
    """
    documents = []
    for record in items:
        raw_text = record.get("scraped_text")

        usable = raw_text and isinstance(raw_text, str)
        if not usable:
            logger.warning("Skipping item with no or non-string 'scraped_text': %s", record)
            continue

        source_url = record.get("url") or "N/A"
        cleaned = preprocess_text(raw_text)
        documents.append(
            Document(page_content=cleaned, metadata={"url": source_url})
        )

    logger.info(f"Built {len(documents)} Documents from Dynamo items.")
    return documents


def split_documents(docs: list) -> list:
    """
    Return the documents unchanged: each DynamoDB page is used as one chunk.

    No splitting is performed; the very same list object is handed back.
    """
    chunks = docs
    return chunks


def estimate_payload_size(vector: dict) -> int:
    """Return the byte length of `vector` serialised as JSON and UTF-8 encoded."""
    serialized = json.dumps(vector)
    encoded = serialized.encode("utf-8")
    return len(encoded)


def embed_in_batches(texts, batch_size=50):
    """
    Embed `texts` in consecutive slices of at most `batch_size` items.

    Each slice is passed to embeddings.embed_documents(); the per-slice
    results are concatenated in input order and returned as one list. The
    range covered and the time taken are logged for every slice.
    """
    total = len(texts)
    collected = []
    for offset in range(0, total, batch_size):
        window = texts[offset : offset + batch_size]
        began = time.time()
        logger.info(f"Embedding batch {offset} to {offset + len(window)} / {total} ...")
        window_embeds = embeddings.embed_documents(window)
        elapsed = time.time() - began
        logger.info(f"Batch embed took {elapsed:.2f} seconds")
        collected += window_embeds
    return collected


def index_documents(chunks: list, batch_size: int = BATCH_SIZE):
    """
    Embed every chunk and upsert the resulting vectors into Pinecone.

    Texts are embedded through embed_in_batches() in groups of
    EMBED_BATCH_SIZE. Each vector's id is the SHA-256 hex digest of its text,
    so identical text always maps to the same id. The text and its source url
    ("N/A" when the chunk carries none) are stored as metadata.

    Vectors are upserted in batches holding at most `batch_size` vectors and
    whose estimated JSON size stays within MAX_VECTOR_PAYLOAD_BYTES. A single
    vector that is larger than that budget is sent in a batch of its own.

    Args:
        chunks: Objects exposing `page_content` (str) and `metadata` (dict).
        batch_size: Maximum number of vectors per upsert request.

    Returns:
        None. Returns early, with a warning, when there is nothing to index.
    """
    if not chunks:
        logger.warning("No chunks provided for indexing.")
        return

    # 1) Batch embed
    texts = [chunk.page_content for chunk in chunks]
    embedded_vectors = embed_in_batches(texts, EMBED_BATCH_SIZE)

    # 2) Create upsert payload
    vectors = [
        {
            "id": hashlib.sha256(text.encode()).hexdigest(),
            "values": vector,
            "metadata": {
                "text": text,
                # Same fallback as build_documents_from_items, so a chunk
                # without a url does not abort the whole run.
                "url": chunk.metadata.get("url") or "N/A",
            },
        }
        for chunk, text, vector in zip(chunks, texts, embedded_vectors)
    ]

    if not vectors:
        logger.warning("No vectors to upsert.")
        return

    # 3) Upsert in safe-size batches
    logger.info(f"Upserting {len(vectors)} vectors to Pinecone in batches of {batch_size}...")
    batch = []
    current_size = 0

    for vec in tqdm(vectors, desc="Indexing chunks"):
        est_size = estimate_payload_size(vec)
        if est_size > MAX_VECTOR_PAYLOAD_BYTES:
            logger.warning(
                "Vector %s is ~%d bytes, above the %d byte batch budget; "
                "sending it in a batch of its own.",
                vec["id"], est_size, MAX_VECTOR_PAYLOAD_BYTES,
            )

        over_budget = current_size + est_size > MAX_VECTOR_PAYLOAD_BYTES
        batch_full = len(batch) >= batch_size
        # Only flush a non-empty batch: without this guard an oversized first
        # vector (or batch_size <= 0) would trigger an upsert of zero vectors.
        if batch and (over_budget or batch_full):
            pinecone_index.upsert(vectors=batch)
            batch = []
            current_size = 0
        batch.append(vec)
        current_size += est_size

    # final partial batch
    if batch:
        pinecone_index.upsert(vectors=batch)

    logger.info("Indexing complete.")


def rag_workflow_dynamodb():
    """
    Run the ingestion end to end: DynamoDB scan -> Documents -> chunks -> Pinecone.

    Stops early, with a warning, when the table yields no items or when none
    of the items produces a usable Document.
    """
    raw_items = fetch_items_from_dynamodb()
    if not raw_items:
        logger.warning("No items found in DynamoDB.")
        return

    documents = build_documents_from_items(raw_items)
    if not documents:
        logger.warning("No valid docs to index.")
        return

    pieces = split_documents(documents)
    logger.info(f"Created {len(pieces)} chunks from documents.")
    index_documents(pieces)

if __name__ == "__main__":
    # Entry point: run the DynamoDB -> Pinecone ingestion workflow.
    rag_workflow_dynamodb()



# ---------- Exploratory visualisation (notebook cells) ----------
# Plotting-only dependencies are imported here, next to the code that uses them.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px

# `chunks` is only a local variable inside rag_workflow_dynamodb(), so rebuild
# it here unless an earlier notebook cell already left one in the namespace.
if "chunks" not in globals():
    chunks = split_documents(build_documents_from_items(fetch_items_from_dynamodb()))

if not chunks:
    logger.warning("No chunks available; skipping visualisations.")
else:
    # 1) Compute chunk sizes in characters
    lengths = [len(chunk.page_content) for chunk in chunks]

    # 2) Plot the distribution
    plt.figure(figsize=(10, 6))
    plt.hist(lengths, bins=50)
    plt.title("Chunk Size Distribution")
    plt.xlabel("Chunk length (characters)")
    plt.ylabel("Frequency")
    plt.grid(True)
    plt.show()

    # 3) Embed every chunk (this calls the embedding model again)
    embedded_vectors = embed_in_batches([chunk.page_content for chunk in chunks])

    # 4) PCA to three components: centre the embeddings, then project them onto
    #    the leading right-singular vectors of the centred matrix.
    matrix = np.asarray(embedded_vectors, dtype=float)
    centered = matrix - matrix.mean(axis=0)
    _, _, principal_axes = np.linalg.svd(centered, full_matrices=False)
    coords = centered @ principal_axes[:3].T  # N×3 array (fewer columns if N < 3)
    if coords.shape[1] < 3:
        # With fewer than three chunks there are fewer than three components;
        # pad with zeros so the three-column frame below can still be built.
        coords = np.pad(coords, ((0, 0), (0, 3 - coords.shape[1])))

    # 5) Interactive 3D scatter (originally run in Colab)
    df = pd.DataFrame(coords, columns=["PC1","PC2","PC3"])

    fig = px.scatter_3d(
        df, x="PC1", y="PC2", z="PC3",
        title="3D PCA Projection of Document Embeddings"
    )
    fig.show()