# Texas GLO Action Plan - Data Exploration

This notebook explores the DRGR (Disaster Recovery Grant Reporting) reports from the Texas General Land Office.

**Contents:**
1. Data overview
2. PDF text extraction test
3. Table extraction test
4. Run full processing
5. Sample extracted content

In [None]:
# Standard library imports
import sqlite3
import sys
from pathlib import Path

# Third-party imports
import fitz  # PyMuPDF
import matplotlib.pyplot as plt
import pandas as pd
import pdfplumber
import seaborn as sns

# Add src to path (must happen before the project imports below)
sys.path.insert(0, str(Path.cwd().parent / 'src'))

# Project imports
from config import DRGR_REPORTS_DIR, DATA_DIR, DATABASE_PATH
from utils import get_all_pdfs, parse_filename, get_category_from_path, init_database
from pdf_processor import PDFProcessor

# Echo the configured locations so a wrong path is visible right away.
for label, location in (("Reports", DRGR_REPORTS_DIR), ("Data", DATA_DIR)):
    print(f"{label} directory: {location}")

## 1. Data Overview

Let's see what PDF files we have available.

In [None]:
# List the PDFs returned by the project helper and report how many there are
pdf_files = get_all_pdfs()
pdf_count = len(pdf_files)
print(f"Total PDF files: {pdf_count}")

# Add up the on-disk size (bytes) and show it in megabytes
BYTES_PER_MB = 1024 ** 2
total_size = sum(path.stat().st_size for path in pdf_files)
print(f"Total size: {total_size / BYTES_PER_MB:.1f} MB")

In [None]:
# Create a dataframe of all files with metadata.
# The column list is pinned so that df_files keeps its schema even when
# pdf_files is empty; otherwise pd.DataFrame([]) has no columns and the
# later groupby('category') / df_files['year'] lookups raise KeyError.
FILE_COLUMNS = ['filename', 'category', 'disaster_code', 'year', 'quarter', 'size_mb']

file_data = []
for pdf in pdf_files:
    # Metadata comes from the filename; the category is derived from the path
    meta = parse_filename(pdf.name)
    category = get_category_from_path(pdf)
    file_data.append({
        'filename': pdf.name,
        'category': category,
        'disaster_code': meta.get('disaster_code'),
        'year': meta.get('year'),
        'quarter': meta.get('quarter'),
        'size_mb': pdf.stat().st_size / (1024**2),  # bytes -> MB
    })

df_files = pd.DataFrame(file_data, columns=FILE_COLUMNS)
df_files.head(10)

In [None]:
# Per-category file count and total size, largest categories first
category_summary = (
    df_files
    .groupby('category')
    .agg(file_count=('filename', 'count'), size_mb=('size_mb', 'sum'))
    .sort_values('file_count', ascending=False)
)

print("Files by Category:")
print(category_summary.to_string())

In [None]:
# Two side-by-side horizontal bar charts driven by one spec per panel:
# (summary column, bar colour, x-axis label, title)
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

panel_specs = [
    ('file_count', 'steelblue', 'Number of Files', 'PDF Count by Category'),
    ('size_mb', 'coral', 'Size (MB)', 'Total Size by Category'),
]

for ax, (column, color, xlabel, title) in zip(axes, panel_specs):
    category_summary[column].plot(kind='barh', ax=ax, color=color)
    ax.set_xlabel(xlabel)
    ax.set_title(title)

plt.tight_layout()
plt.show()

In [None]:
# Count reports per year, ignoring files whose year is missing
reports_with_year = df_files.dropna(subset=['year'])
yearly = reports_with_year.groupby('year').size()

print("\nReports by Year:")
print(yearly.to_string())

## 2. PDF Text Extraction Test

Let's test the text extraction on a sample document.

In [None]:
# Pick a sample file (medium-sized recent report)
sample_pdf = DRGR_REPORTS_DIR / "2019_Disasters_ActionPlan" / "drgr-2019-disasters-2025-q4.pdf"

if not sample_pdf.exists():
    # Fall back to the first available PDF. Fail with an explicit message
    # when there is nothing to fall back to, instead of a bare IndexError.
    if not pdf_files:
        raise FileNotFoundError(
            f"Sample file {sample_pdf.name} is missing and no PDFs were found under {DRGR_REPORTS_DIR}"
        )
    sample_pdf = pdf_files[0]

print(f"Testing with: {sample_pdf.name}")
print(f"Size: {sample_pdf.stat().st_size / 1024:.1f} KB")

In [None]:
# Open the sample with PyMuPDF; the handle stays open for the cells below
doc = fitz.open(sample_pdf)

page_total = len(doc)
pdf_metadata = doc.metadata
print(f"Page count: {page_total}")
print(f"PDF metadata: {pdf_metadata}")

In [None]:
# Show the beginning of the text on the first pages (at most three)
divider = '=' * 60
pages_to_preview = min(3, len(doc))

for page_num in range(pages_to_preview):
    page = doc[page_num]
    text = page.get_text()
    # Banner with the 1-based page number, then the first 1000 characters
    print("\n" + divider)
    print(f"PAGE {page_num + 1} (first 1000 chars):")
    print(divider)
    print(text[:1000])

In [None]:
# Release the PyMuPDF handle opened above. The guard keeps this cell safe to
# re-run: calling close() on an already-closed document raises.
if not doc.is_closed:
    doc.close()

## 3. Table Extraction Test

Let's test table extraction using pdfplumber.

In [None]:
# Use an expenditure report which should have clear tables
table_pdf = DRGR_REPORTS_DIR / "Expenditure_Reports" / "cdbg-dr-and-mit-timely-expenditure-report-2020-4q.pdf"

if not table_pdf.exists():
    # Fall back to the sample document chosen for the text-extraction test
    table_pdf = sample_pdf

print(f"Testing tables with: {table_pdf.name}")

In [None]:
# Walk the first five pages and preview every table pdfplumber detects
with pdfplumber.open(table_pdf) as pdf:
    print(f"Page count: {len(pdf.pages)}")

    for page_num, page in enumerate(pdf.pages[:5], start=1):
        tables = page.extract_tables()
        print(f"\nPage {page_num}: Found {len(tables)} table(s)")

        for table_num, table in enumerate(tables, start=1):
            if not table:
                # Nothing to show for an empty table
                continue

            # First row is used as the header (when non-empty), the rest as data
            header, *body_rows = table
            table_df = pd.DataFrame(body_rows, columns=header or None)
            print(f"\n  Table {table_num}: {len(table_df)} rows x {len(table_df.columns)} columns")
            print(table_df.head().to_string())

## 4. Run Full Processing

Use the PDFProcessor to extract all documents.

In [None]:
# Create the processor and report what it has recorded so far
processor = PDFProcessor()
stats = processor.get_document_stats()

# (label shown to the reader, key in the stats mapping)
status_fields = [
    ("Total registered", 'total_documents'),
    ("Processed", 'processed_documents'),
    ("Total pages", 'total_pages'),
    ("Total tables", 'total_tables'),
]

print("Current Processing Status:")
for label, key in status_fields:
    print(f"  {label}: {stats[key]}")

In [None]:
# Process a small batch to test (change PROCESS_LIMIT or set it to None to process all)
# WARNING: Processing all 442 PDFs may take 30-60 minutes

# Explicit switch instead of commented-out code: the cell stays a no-op on
# "Run All" until RUN_PROCESSING is flipped to True.
RUN_PROCESSING = False
PROCESS_LIMIT = 10  # Test with 10 files first; None processes all files

if RUN_PROCESSING:
    if PROCESS_LIMIT is None:
        processor.process_all()  # Process all files
    else:
        processor.process_all(limit=PROCESS_LIMIT)

In [None]:
# Refresh the stats and show the per-category breakdown when there is one
stats = processor.get_document_stats()
category_rows = stats['by_category']

if category_rows:
    df_stats = pd.DataFrame(category_rows)
    print("\nProcessed by Category:")
    print(df_stats.to_string(index=False))

In [None]:
# Done with the processor for this notebook; close it before the database is
# opened directly in the next section.
# NOTE(review): presumably this releases the processor's database connection - confirm in pdf_processor.
processor.close()

## 5. Sample Extracted Content

Once processing is complete, explore the extracted content.

In [None]:
# Fail early when the database file is missing: sqlite3.connect() would
# otherwise silently create an empty database there, and the query below
# would fail with a less helpful "no such table" error.
if not Path(DATABASE_PATH).exists():
    raise FileNotFoundError(f"Database not found at {DATABASE_PATH}")

# Connect to database (the connection is reused and closed by later cells)
conn = sqlite3.connect(DATABASE_PATH)

# Query sample documents, most recent year/quarter first
df_docs = pd.read_sql_query('''
    SELECT filename, category, year, quarter, page_count, text_extracted
    FROM documents
    ORDER BY year DESC, quarter DESC
    LIMIT 20
''', conn)

print("Recent Documents:")
df_docs

In [None]:
# Pull a 500-character preview from up to five pages with more than 100 characters
sample_text = pd.read_sql_query('''
    SELECT d.filename, t.page_number, t.char_count, 
           SUBSTR(t.text_content, 1, 500) as text_preview
    FROM document_text t
    JOIN documents d ON t.document_id = d.id
    WHERE t.char_count > 100
    LIMIT 5
''', conn)

divider = '=' * 60
for _, row in sample_text.iterrows():
    # Banner identifying the source file and page, then the preview text
    heading = f"{row['filename']} - Page {row['page_number']} ({row['char_count']} chars)"
    print("\n" + divider)
    print(heading)
    print(divider)
    print(row['text_preview'])

In [None]:
# Close the SQLite connection opened at the start of this section.
conn.close()

## Next Steps

After running the PDF extraction:

1. **Phase 2**: Run NLP entity extraction (see `02_entity_analysis.ipynb`)
2. **Phase 3**: Analyze financial tables (see `03_financial_analysis.ipynb`)
3. **Phase 4**: Set up semantic search with embeddings
4. **Phase 5**: Build interactive dashboard