# Crime Data PDF Processing Pipeline

This notebook processes FBI crime statistic PDFs through the following steps:
1. Download PDFs from S3
2. Convert PDFs to markdown using Mistral OCR
3. Upload generated markdown to S3
4. Extract year information
5. Create chunks for embedding
6. Upload chunks to S3
7. Create vector embeddings in Pinecone

## Import Libraries and Setup

In [None]:
import os, uuid, shutil, tempfile, re, json
from pathlib import Path
from datetime import datetime, timezone
from dotenv import load_dotenv
import boto3
from sentence_transformers import SentenceTransformer
import pinecone
from pinecone import Pinecone, ServerlessSpec

# Custom modules - update these paths if needed:
from mistralparsing_userpdf import process_pdf as mistral_process_pdf
from utils.chunking import KamradtModifiedChunker

# Load environment variables
load_dotenv()

True

## Helper Functions for Year Extraction

In [48]:
def extract_year_from_filename(filename):
    """Return the first 19xx/20xx four-digit run found in ``filename``.

    Args:
        filename: File name (or any other string) to scan.

    Returns:
        The matched year as a string, or None when no such run is present.
    """
    # First occurrence of "19" or "20" followed by two more digits.
    found = re.search(r'(?:19|20)\d{2}', filename)
    return found.group(0) if found else None

def extract_year_from_content(content):
    """Extract a report year from markdown content.

    Three strategies are tried in order:

    1. A title of the form "FBI Releases YYYY Crime Statistics".
    2. A four-digit number introduced by "in", "for", "during" or "of"
       (optionally followed by "the" and/or "year") that lies between 1900
       and the current year.
    3. Any standalone 19xx/20xx number.

    Args:
        content: Markdown text to scan. ``None`` or an empty string yields None.

    Returns:
        The year as a four-character string, or None if nothing matched.
    """
    if not content:
        return None

    # Pattern for "FBI Releases YYYY Crime Statistics" or similar
    title_match = re.search(r'FBI Releases (\d{4}) Crime Statistics', content)
    if title_match:
        return title_match.group(1)

    # Look for years in the text that are likely report years.
    # The trailing (?!\d) stops the pattern from slicing the first four digits
    # out of a longer number (e.g. "of 20123 offenses" must not yield "2012").
    current_year = datetime.now().year
    contextual_years = re.finditer(
        r'(?:in|for|during|of)(?: the)? (?:year )?(\d{4})(?!\d)',
        content.lower()
    )
    for match in contextual_years:
        year = match.group(1)
        # Validate year is between 1900 and current year
        if 1900 <= int(year) <= current_year:
            return year

    # Last resort: just find any 4-digit number that looks like a year
    general_year = re.search(r'\b((?:19|20)\d{2})\b', content)
    if general_year:
        return general_year.group(1)

    return None

## S3 and Pinecone Setup

In [49]:
# --- S3 Setup ---
# Credentials and resource names are read from the environment
# (populated by load_dotenv() in the import cell).
s3 = boto3.client(
    "s3",
    aws_access_key_id=os.getenv("AWS_SERVER_PUBLIC_KEY"),
    aws_secret_access_key=os.getenv("AWS_SERVER_SECRET_KEY")
)
bucket = os.getenv("BUCKET_NAME")
input_folder = "crime records summary/"  # The folder with PDFs in S3
markdown_folder = "processed_markdown/"  # Where to store processed markdown files
chunks_folder = "chunks/"  # Where to store chunks

# --- Pinecone Setup ---
index_name = os.getenv("PINECONE_INDEX_NAME")

# Fail fast BEFORE the destructive delete/recreate below: without this check a
# missing BUCKET_NAME would only surface after the existing index was wiped.
missing_settings = [
    env_name
    for env_name, value in (("BUCKET_NAME", bucket), ("PINECONE_INDEX_NAME", index_name))
    if not value
]
if missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): " + ", ".join(missing_settings)
    )

pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

# Vector size of the index; the embedding cells encode with "all-MiniLM-L6-v2",
# and the index must be created with a matching dimension.
EMBEDDING_DIMENSION = 384

# Delete index if it exists.
# WARNING: this drops every vector previously stored under index_name.
if index_name in pc.list_indexes().names():
    pc.delete_index(index_name)

# Create new index with ServerlessSpec
pc.create_index(
    name=index_name,
    dimension=EMBEDDING_DIMENSION,
    metric="cosine",
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"  # Adjust region as needed
    )
)
index = pc.Index(index_name)

# --- Temp Dir ---
# Scratch directory for downloaded PDFs and generated files; a new one is
# created every time this cell runs.
temp_dir = tempfile.mkdtemp(prefix="pdf_downloads_")

## 1. Download PDFs from S3

In [50]:
# Download every PDF under input_folder into temp_dir.
print("Downloading PDFs from S3...")
pdf_paths = []
original_filenames = {}  # To keep track of original filenames for year extraction

# A single list_objects_v2 call returns at most 1,000 keys; the paginator
# follows continuation tokens so larger folders are listed completely.
paginator = s3.get_paginator("list_objects_v2")
for page in paginator.paginate(Bucket=bucket, Prefix=input_folder):
    for obj in page.get("Contents", []):
        key = obj["Key"]
        # Case-insensitive so "REPORT.PDF" is not silently skipped.
        if not key.lower().endswith(".pdf"):
            continue

        filename = os.path.basename(key)
        local_pdf = os.path.join(temp_dir, filename)
        s3.download_file(bucket, key, local_pdf)
        pdf_paths.append(local_pdf)
        original_filenames[local_pdf] = filename
        print(f"Downloaded: {filename}")

if not pdf_paths:
    print("No PDFs found in the S3 folder.")

Downloading PDFs from S3...
Downloaded: 1995Summary.pdf
Downloaded: 1996Summary.pdf
Downloaded: 1997Summary.pdf
Downloaded: 1998Summary.pdf
Downloaded: 1999Summary.pdf
Downloaded: 2000Summary.pdf
Downloaded: 2001Summary.pdf
Downloaded: 2002Summary.pdf
Downloaded: 2003Summary.pdf
Downloaded: 2004Summary.pdf
Downloaded: 2006Summary.pdf
Downloaded: 2007 CIUS Summary.pdf
Downloaded: 2008 CIUS Summary.pdf
Downloaded: 2009Summary.pdf
Downloaded: 2010 CIUS Summary.pdf
Downloaded: 2011Summary.pdf
Downloaded: 2012 CIUS Summary.pdf
Downloaded: 2013 CIUS Summary _final.pdf
Downloaded: 2014 CIUS Summary_final.pdf
Downloaded: 2015 CIUS Summary_final.pdf
Downloaded: 2016 CIUS Summary.pdf
Downloaded: 2017 CIUS Summary.pdf
Downloaded: 2018 CIUS Summary.pdf
Downloaded: 2019 CIUS Summary.pdf


## 2. Convert PDFs to Markdown and Save Markdown File Paths for Later Processing

We'll save the markdown file paths to a JSON file so we can process them separately

In [51]:
# --- Convert PDFs to Markdown (Mistral) ---
# For each downloaded PDF: run the Mistral OCR helper, copy its output to
# "<year>.md", upload that file to S3 and remember the local path in md_files.
# PDFs that fail are recorded in failed_pdfs and are NOT added to md_files, so
# error text is never chunked or embedded by the later cells.
import traceback

print("Converting PDFs to Markdown...")
md_files = []
failed_pdfs = []  # PDFs whose conversion or upload raised / produced no output

# All conversions share one output directory. Creating it up front guarantees
# it is defined no matter where an iteration fails.
out_dir = os.path.join(temp_dir, "mistral_output")
os.makedirs(out_dir, exist_ok=True)

for pdf_path in pdf_paths:
    pdf_filename = os.path.basename(pdf_path)
    print(f"Processing {pdf_filename} ...")

    # First extract year from PDF filename (done outside the try so the value
    # always belongs to the current PDF, never to a previous iteration).
    year = extract_year_from_filename(pdf_filename)
    if not year:
        print(f"Warning: Could not extract year from filename {pdf_filename}")
        # Fall back to the current year if no year is found
        year = datetime.now().strftime('%Y')

    try:
        # Verify the PDF file exists
        if not os.path.exists(pdf_path):
            raise FileNotFoundError(f"PDF file not found: {pdf_path}")

        # Get absolute path to ensure no path issues
        abs_pdf_path = os.path.abspath(pdf_path)
        print(f"Using absolute PDF path: {abs_pdf_path}")

        # Process the PDF - this always outputs to output.md
        md_path = mistral_process_pdf(pdf_path=abs_pdf_path, output_dir=out_dir)

        if not os.path.exists(md_path):
            print(f"Warning: Markdown file not created at expected path {md_path}")
            failed_pdfs.append(pdf_path)
            continue

        # Create a simplified markdown filename using just the year
        simplified_md_filename = f"{year}.md"
        simplified_md_path = os.path.join(out_dir, simplified_md_filename)

        print(f"Renaming output to simplified filename: {simplified_md_filename}")

        # Copy the content to the simplified filename (output.md is reused by
        # the next PDF, so each result needs its own file).
        with open(md_path, 'r', encoding='utf-8') as src:
            content = src.read()

        with open(simplified_md_path, 'w', encoding='utf-8') as dest:
            dest.write(content)

        # Add the simplified path to our list
        md_files.append(simplified_md_path)

        # Upload markdown to S3 with the simplified name
        s3_md_key = f"{markdown_folder}{simplified_md_filename}"
        print(f"Uploading markdown to S3: {s3_md_key}")
        s3.upload_file(simplified_md_path, bucket, s3_md_key)

    except Exception as e:
        # Record the failure only. Previously an error placeholder was written
        # to "<year>.md", uploaded and queued for chunking, which polluted the
        # vector index and could overwrite good markdown after an upload error.
        print(f"Error processing PDF {pdf_filename}: {str(e)}")
        traceback.print_exc()
        failed_pdfs.append(pdf_path)

if not md_files:
    print("No Markdown files generated.")
else:
    print(f"Successfully generated {len(md_files)} markdown files")

if failed_pdfs:
    print(f"Failed to fully process {len(failed_pdfs)} PDF(s):")
    for failed_pdf in failed_pdfs:
        print(f"- {failed_pdf}")

# Save the markdown file paths for later processing
markdown_paths = {
    "md_files": md_files,
    "failed_pdfs": failed_pdfs,
    "timestamp": datetime.now(timezone.utc).isoformat()
}

with open("markdown_paths.json", "w") as f:
    json.dump(markdown_paths, f)

Converting PDFs to Markdown...
Processing 1995Summary.pdf ...
Using absolute PDF path: C:\Users\shush\AppData\Local\Temp\pdf_downloads_9tmj6okr\1995Summary.pdf
Processing 1995Summary.pdf ...
Markdown with embedded images generated in C:\Users\shush\AppData\Local\Temp\pdf_downloads_9tmj6okr\mistral_output\output.md
Renaming output to simplified filename: 1995.md
Uploading markdown to S3: processed_markdown/1995.md
Processing 1996Summary.pdf ...
Using absolute PDF path: C:\Users\shush\AppData\Local\Temp\pdf_downloads_9tmj6okr\1996Summary.pdf
Processing 1996Summary.pdf ...
Markdown with embedded images generated in C:\Users\shush\AppData\Local\Temp\pdf_downloads_9tmj6okr\mistral_output\output.md
Renaming output to simplified filename: 1996.md
Uploading markdown to S3: processed_markdown/1996.md
Processing 1997Summary.pdf ...
Using absolute PDF path: C:\Users\shush\AppData\Local\Temp\pdf_downloads_9tmj6okr\1997Summary.pdf
Processing 1997Summary.pdf ...
Markdown with embedded images generat

# Processing Complete

The PDFs have been downloaded and converted to markdown files. These files have been uploaded to S3.

To proceed with chunking and vector embedding, run the `embedding_and_chunking.ipynb` notebook.

## Markdown Chunking and Vector Embedding 
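Now that the PDF files have been converted to markdown, we split each markdown file into chunks, save the chunks as JSON files, create vector embeddings for them, and store the vectors in the Pinecone index named by `PINECONE_INDEX_NAME` (`crime-records` in the run shown below).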

In [54]:
# Locate the markdown files to chunk: prefer the paths saved by the conversion
# cell, and fall back to downloading them from S3 when none are usable.
print("Chunking and creating vector embeddings...")

md_files = []

# Check if we have markdown files
try:
    with open("markdown_paths.json", "r") as f:
        markdown_paths = json.load(f)

    # Verify these files still exist (the temp directory may have been removed)
    saved_paths = markdown_paths.get("md_files", [])
    md_files = [path for path in saved_paths if os.path.exists(path)]
    print(f"Loaded {len(md_files)} markdown files from saved paths")

except (FileNotFoundError, json.JSONDecodeError) as e:
    print(f"Could not load markdown paths: {e}")

# If no markdown files were loaded, list them from S3. This also covers the
# case where the JSON was readable but every saved path has disappeared.
if not md_files:
    print("Downloading markdown files from S3...")

    # Paginate: one list_objects_v2 call returns at most 1,000 keys.
    paginator = s3.get_paginator("list_objects_v2")
    for page in paginator.paginate(Bucket=bucket, Prefix=markdown_folder):
        for obj in page.get("Contents", []):
            if obj["Key"].endswith(".md"):
                filename = os.path.basename(obj["Key"])
                local_md = os.path.join(temp_dir, filename)
                s3.download_file(bucket, obj["Key"], local_md)
                md_files.append(local_md)
                print(f"Downloaded: {filename}")

    print(f"Downloaded {len(md_files)} markdown files from S3")



Chunking and creating vector embeddings...
Loaded 24 markdown files from saved paths


## Check That the Markdown Files Exist

In [55]:
# Report which markdown files will be processed and, when there are any,
# build the embedding model and the chunker used by the following cells.
if md_files:
    # Numbered listing of every input path, for debugging
    print("Found the following markdown files:")
    for position, md_path in enumerate(md_files, start=1):
        print(f"{position}. {md_path}")

    # Embedding model, plus the chunker that calls it through embedding_function
    model = SentenceTransformer("all-MiniLM-L6-v2")
    chunker = KamradtModifiedChunker(
        avg_chunk_size=300,
        min_chunk_size=50,
        embedding_function=lambda texts: [model.encode(text).tolist() for text in texts]
    )

    # Counters and file lists for overall statistics
    total_processed, total_failed = 0, 0
    processed_files, failed_files = [], []
else:
    print("No markdown files found!")

Found the following markdown files:
1. C:\Users\shush\AppData\Local\Temp\pdf_downloads_9tmj6okr\mistral_output\1995.md
2. C:\Users\shush\AppData\Local\Temp\pdf_downloads_9tmj6okr\mistral_output\1996.md
3. C:\Users\shush\AppData\Local\Temp\pdf_downloads_9tmj6okr\mistral_output\1997.md
4. C:\Users\shush\AppData\Local\Temp\pdf_downloads_9tmj6okr\mistral_output\1998.md
5. C:\Users\shush\AppData\Local\Temp\pdf_downloads_9tmj6okr\mistral_output\1999.md
6. C:\Users\shush\AppData\Local\Temp\pdf_downloads_9tmj6okr\mistral_output\2000.md
7. C:\Users\shush\AppData\Local\Temp\pdf_downloads_9tmj6okr\mistral_output\2001.md
8. C:\Users\shush\AppData\Local\Temp\pdf_downloads_9tmj6okr\mistral_output\2002.md
9. C:\Users\shush\AppData\Local\Temp\pdf_downloads_9tmj6okr\mistral_output\2003.md
10. C:\Users\shush\AppData\Local\Temp\pdf_downloads_9tmj6okr\mistral_output\2004.md
11. C:\Users\shush\AppData\Local\Temp\pdf_downloads_9tmj6okr\mistral_output\2006.md
12. C:\Users\shush\AppData\Local\Temp\pdf_downloa

In [56]:
# --- Create and upload chunks to S3 ---
# For every markdown file: determine its year, split the text with `chunker`,
# merge the result with any chunks file already stored in S3 under
# "<chunks_folder><year>_chunks.json", and upload the merged JSON.
# A summary of what was processed is written to chunk_data.json at the end.
import json, uuid, os
import traceback
from datetime import datetime, timezone

print("Processing markdown files and creating chunks...")

# Track overall statistics
total_processed = 0
total_failed = 0
processed_files = []
failed_files = []
all_chunk_data = {}  # Store chunk data for later vector embedding

# Process each markdown file for chunking
for md_file in md_files:
    try:
        # Validate file exists
        if not os.path.exists(md_file):
            print(f"WARNING: File not found: {md_file}")
            total_failed += 1
            failed_files.append(md_file)
            continue

        with open(md_file, "r", encoding="utf-8") as f:
            content = f.read()

        # Get the filename to extract year
        md_filename = os.path.basename(md_file)
        print(f"\n===== PROCESSING {md_filename} =====")

        # Extract year from the filename (should be the filename itself without extension)
        year = os.path.splitext(md_filename)[0]
        if not re.match(r'^(19|20)\d{2}$', year):
            # If filename isn't just a year, try to extract year
            year = extract_year_from_filename(md_filename)

            # If still no year, try from content
            if not year:
                year = extract_year_from_content(content)

        print(f"Year detected: {year}")

        # Process chunks
        chunks = chunker.split_text(content)
        print(f"Created {len(chunks)} chunks")

        # Create simplified chunk filename: year_chunks.json
        chunks_json_filename = f"{year}_chunks.json"
        chunks_json_path = os.path.join(temp_dir, chunks_json_filename)

        # S3 path for storing and referencing chunks
        s3_chunks_key = f"{chunks_folder}{chunks_json_filename}"
        print(f"Chunks will be stored at S3 path: {s3_chunks_key}")

        # Check if this chunks file already exists in S3
        existing_chunks = []
        try:
            response = s3.get_object(Bucket=bucket, Key=s3_chunks_key)
            existing_data = json.loads(response['Body'].read().decode('utf-8'))
            existing_chunks = existing_data.get("chunks", [])
            print(f"Found existing chunks file with {len(existing_chunks)} chunks")
        except s3.exceptions.NoSuchKey:
            # Normal on a first run: nothing has been stored for this year yet.
            pass
        except Exception as e:
            # Any other failure (permissions, malformed JSON, ...) is a real
            # problem; report it as such instead of as a missing file.
            # Processing continues best-effort with no existing chunks.
            print(f"WARNING: Could not read existing chunks file {s3_chunks_key}: {str(e)}")

        # Combine with new chunks if there were existing ones. Chunks already
        # present in the stored file are skipped so that re-running this cell
        # does not append the same chunks again on every run.
        # List membership is used on purpose: it makes no assumption about the
        # chunk type (presumably strings - TODO confirm against the chunker).
        new_chunks = [chunk for chunk in chunks if chunk not in existing_chunks]
        skipped_count = len(chunks) - len(new_chunks)
        if skipped_count:
            print(f"Skipped {skipped_count} chunks already present in the existing chunks file")
        all_chunks = existing_chunks + new_chunks

        chunks_data = {
            "source_file": md_filename,
            "chunks": all_chunks,
            "chunk_count": len(all_chunks),
            "year": year,
            "processed_date": datetime.now(timezone.utc).isoformat(),
            "document_id": year,  # Use year as document ID
            "s3_location": s3_chunks_key
        }

        # Save locally first
        with open(chunks_json_path, 'w', encoding='utf-8') as f:
            json.dump(chunks_data, f, ensure_ascii=False, indent=2)

        # Verify the file was created locally
        if not os.path.exists(chunks_json_path):
            print(f"ERROR: Failed to create local chunks file: {chunks_json_path}")
        else:
            print(f"Successfully created local chunks file: {chunks_json_path}")

        # Upload chunks JSON to S3
        print(f"Uploading chunks to S3: {s3_chunks_key}")
        s3.upload_file(chunks_json_path, bucket, s3_chunks_key)

        # Verify the S3 upload
        try:
            s3.head_object(Bucket=bucket, Key=s3_chunks_key)
            print(f"S3 upload successful: {s3_chunks_key}")
        except Exception as e:
            print(f"ERROR: S3 upload verification failed: {str(e)}")

        # Store data for later vector embedding
        all_chunk_data[md_file] = {
            "year": year,
            "chunks": chunks,
            "existing_chunks_count": len(existing_chunks),
            "s3_chunks_key": s3_chunks_key,
            "md_filename": md_filename
        }

        print(f"✅ Successfully processed chunks for '{md_filename}'")
        total_processed += 1
        processed_files.append(md_file)

    except Exception as e:
        print(f"❌ ERROR processing markdown file {md_file}: {str(e)}")
        traceback.print_exc()
        total_failed += 1
        failed_files.append(md_file)

# Print summary of chunk processing
print("\n===== CHUNK PROCESSING SUMMARY =====")
print(f"Total files processed successfully: {total_processed}")
print(f"Total files failed: {total_failed}")

if failed_files:
    print("\nFailed files:")
    for f in failed_files:
        print(f"- {f}")

print("\nAll markdown files chunked and uploaded to S3!")

# Save chunk data for the next cell
with open("chunk_data.json", "w") as f:
    # Convert to serializable format (counts and locations only, no chunk text)
    serializable_data = {}
    for md_file, data in all_chunk_data.items():
        serializable_data[md_file] = {
            "year": data["year"],
            "chunks_count": len(data["chunks"]),
            "existing_chunks_count": data["existing_chunks_count"],
            "s3_chunks_key": data["s3_chunks_key"],
            "md_filename": data["md_filename"]
        }
    json.dump(serializable_data, f)

Processing markdown files and creating chunks...

===== PROCESSING 1995.md =====
Year detected: 1995
Created 10 chunks
Chunks will be stored at S3 path: chunks/1995_chunks.json
Successfully created local chunks file: C:\Users\shush\AppData\Local\Temp\pdf_downloads_9tmj6okr\1995_chunks.json
Uploading chunks to S3: chunks/1995_chunks.json
S3 upload successful: chunks/1995_chunks.json
✅ Successfully processed chunks for '1995.md'

===== PROCESSING 1996.md =====
Year detected: 1996
Created 11 chunks
Chunks will be stored at S3 path: chunks/1996_chunks.json
Successfully created local chunks file: C:\Users\shush\AppData\Local\Temp\pdf_downloads_9tmj6okr\1996_chunks.json
Uploading chunks to S3: chunks/1996_chunks.json
S3 upload successful: chunks/1996_chunks.json
✅ Successfully processed chunks for '1996.md'

===== PROCESSING 1997.md =====
Year detected: 1997
Created 13 chunks
Chunks will be stored at S3 path: chunks/1997_chunks.json
Successfully created local chunks file: C:\Users\shush\AppD

## Vector Embedding and Pinecone Upload (Separate Cell)

In [57]:
# --- Vector Embedding and Pinecone Upload ---
# For every markdown file: re-chunk the text, embed each chunk with
# all-MiniLM-L6-v2 and upsert the vectors into Pinecone, using the year as the
# namespace. Vector IDs are deterministic, so re-running this cell overwrites
# the vectors of a file instead of adding duplicates.
from sentence_transformers import SentenceTransformer
import json, uuid
import traceback
from datetime import datetime, timezone
import os

print("Creating embeddings and uploading to Pinecone...")

# Initialize model
model = SentenceTransformer("all-MiniLM-L6-v2")

# Make sure we have the index initialized
if 'index' not in locals():
    pc = Pinecone(api_key=os.getenv("PINECONE_API_KEY"))
    # Use the same index name as the setup cell; "crime-reports" remains the
    # fallback only when PINECONE_INDEX_NAME is not configured.
    index = pc.Index(os.getenv("PINECONE_INDEX_NAME") or "crime-reports")

# Track embedding progress
total_vectors = 0
total_files_embedded = 0
embedding_errors = 0

# Process each markdown file
for md_file in md_files:
    try:
        # Get the filename and content
        md_filename = os.path.basename(md_file)
        print(f"\n===== CREATING EMBEDDINGS FOR {md_filename} =====")

        with open(md_file, "r", encoding="utf-8") as f:
            content = f.read()

        # Get year: the file stem if it is a bare year, otherwise search the
        # filename and then the content.
        year = os.path.splitext(md_filename)[0]
        if not re.match(r'^(19|20)\d{2}$', year):
            year = extract_year_from_filename(md_filename)
            if not year:
                year = extract_year_from_content(content)

        print(f"Year for embedding: {year}")

        # Recreate chunks (we don't load from chunk_data.json to avoid losing chunk text)
        chunks = chunker.split_text(content)
        print(f"Creating embeddings for {len(chunks)} chunks")

        # S3 path for retrieving chunks
        chunks_json_filename = f"{year}_chunks.json"
        s3_chunks_key = f"{chunks_folder}{chunks_json_filename}"

        # Create vector embeddings
        vectors = []

        for i, chunk in enumerate(chunks):
            # Create embedding
            emb = model.encode(chunk).tolist()

            # Create metadata for retrieval
            metadata = {
                "file": md_filename,
                "document_id": year,  # Use year as document ID
                "chunk_index": i,
                "text_preview": chunk[:100] + ("..." if len(chunk) > 100 else ""),
                "chunks_s3_path": s3_chunks_key,
                "s3_bucket": bucket,
                "year": year,
                "chunk_length": len(chunk),
                "time_processed": datetime.now(timezone.utc).isoformat()
            }

            # Name-based UUID derived from (file, chunk index): stable across
            # runs, so an upsert replaces the earlier vector for this chunk.
            stable_suffix = uuid.uuid5(uuid.NAMESPACE_URL, f"{md_filename}#{i}")

            vectors.append((
                f"{year}_{i}_{stable_suffix}",  # ID includes year and chunk index
                emb,
                metadata
            ))

        # Upsert in batches
        print(f"Upserting {len(vectors)} vectors to Pinecone namespace: {year}")

        for batch_start in range(0, len(vectors), 100):
            batch = vectors[batch_start:batch_start + 100]
            # Use year as namespace for easy filtering
            index.upsert(vectors=batch, namespace=year)

        total_vectors += len(vectors)
        total_files_embedded += 1
        print(f"✅ Successfully embedded {len(vectors)} chunks for '{md_filename}'")

    except Exception as e:
        print(f"❌ ERROR creating embeddings for {md_file}: {str(e)}")
        traceback.print_exc()
        embedding_errors += 1

# Print embedding summary
print("\n===== EMBEDDING SUMMARY =====")
print(f"Total files embedded: {total_files_embedded}")
print(f"Total vectors created: {total_vectors}")
print(f"Embedding errors: {embedding_errors}")
print("\nVector embedding complete!")

Creating embeddings and uploading to Pinecone...

===== CREATING EMBEDDINGS FOR 1995.md =====
Year for embedding: 1995
Creating embeddings for 10 chunks
Upserting 10 vectors to Pinecone namespace: 1995
✅ Successfully embedded 10 chunks for '1995.md'

===== CREATING EMBEDDINGS FOR 1996.md =====
Year for embedding: 1996
Creating embeddings for 11 chunks
Upserting 11 vectors to Pinecone namespace: 1996
✅ Successfully embedded 11 chunks for '1996.md'

===== CREATING EMBEDDINGS FOR 1997.md =====
Year for embedding: 1997
Creating embeddings for 13 chunks
Upserting 13 vectors to Pinecone namespace: 1997
✅ Successfully embedded 13 chunks for '1997.md'

===== CREATING EMBEDDINGS FOR 1998.md =====
Year for embedding: 1998
Creating embeddings for 10 chunks
Upserting 10 vectors to Pinecone namespace: 1998
✅ Successfully embedded 10 chunks for '1998.md'

===== CREATING EMBEDDINGS FOR 1999.md =====
Year for embedding: 1999
Creating embeddings for 10 chunks
Upserting 10 vectors to Pinecone namespace:

## Verify That the Pinecone Index Exists

In [58]:
# --- Comprehensive Pinecone Setup ---
# Read-only check: connect to Pinecone, list the indexes in the account and
# bind `index` to the one named by PINECONE_INDEX_NAME if it exists.
# This cell never creates or deletes an index.
import os
from pinecone import Pinecone, ServerlessSpec
from dotenv import load_dotenv
import time
import traceback

print("Setting up Pinecone connection...")

# Load environment variables if not already loaded
if "PINECONE_API_KEY" not in os.environ:
    load_dotenv()

pinecone_api_key = os.getenv("PINECONE_API_KEY")
if not pinecone_api_key:
    print("ERROR: PINECONE_API_KEY not found in environment variables")
    print("Please check your .env file or set it manually.")
else:
    # Do not echo any part of the key: notebook outputs are saved and shared.
    print("Found Pinecone API key.")

    # Initialize Pinecone client (only attempted when a key is available)
    try:
        pc = Pinecone(api_key=pinecone_api_key)

        # List all available indexes (fetched once and reused below)
        all_indexes = pc.list_indexes()
        available_names = list(all_indexes.names())
        print("\nCurrent Pinecone indexes:")
        if not available_names:
            print("No indexes found in this Pinecone account.")
        else:
            for i, name in enumerate(available_names):
                print(f"{i+1}. {name}")

        # Set our target index name
        index_name = os.getenv("PINECONE_INDEX_NAME")

        # Check if the index exists
        if index_name in available_names:
            print(f"\nSuccess: Index '{index_name}' exists in Pinecone.")
            index = pc.Index(index_name)
            print(f"Connected to index '{index_name}'")
        else:
            # Nothing is created here; the old message claimed otherwise.
            print(f"\nIndex '{index_name}' does not exist in Pinecone. "
                  "Run the 'S3 and Pinecone Setup' cell to create it.")

    except Exception as e:
        print(f"ERROR connecting to Pinecone: {str(e)}")
        traceback.print_exc()

Setting up Pinecone connection...
Found Pinecone API key: pcsk...

Current Pinecone indexes:
1. embedding-cosine
2. crime-records
3. gpt-4o-research-agent
4. pinecone-embeddings
5. nvidia-reports

Success: Index 'crime-records' exists in Pinecone.
Connected to index 'crime-records'


In [63]:
# Ask Pinecone for the statistics of the connected index and print them in full.
stats = index.describe_index_stats()
print(f"\nIndex stats: {stats}")
# Print the 'total_vector_count' entry of the returned stats on its own line.
print(f"Total vectors in index: {stats['total_vector_count']}")


Index stats: {'dimension': 384,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'1995': {'vector_count': 10},
                '1996': {'vector_count': 11},
                '1997': {'vector_count': 13},
                '1998': {'vector_count': 10},
                '1999': {'vector_count': 10},
                '2000': {'vector_count': 10},
                '2001': {'vector_count': 13},
                '2002': {'vector_count': 9},
                '2003': {'vector_count': 1},
                '2004': {'vector_count': 658},
                '2006': {'vector_count': 4},
                '2007': {'vector_count': 3},
                '2008': {'vector_count': 3},
                '2009': {'vector_count': 4},
                '2010': {'vector_count': 3},
                '2011': {'vector_count': 4},
                '2012': {'vector_count': 3},
                '2013': {'vector_count': 4},
                '2014': {'vector_count': 4},
                '2015': {'vector_count': 1},
             