# Texas GLO - Entity Analysis

This notebook explores the entities that were extracted with NLP from the DRGR disaster recovery reports.

**Contents:**
1. Entity extraction overview
2. Domain-specific entities (disasters, FEMA declarations, programs)
3. Geographic analysis
4. Financial analysis
5. Damage metrics
6. Entity co-occurrence analysis
7. Entities by document category
8. Export entities

In [None]:
import sys
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sqlite3

# Put the sibling src/ directory (relative to the current working directory)
# at the front of the import path; the two project imports below rely on it.
src_dir = Path.cwd().parent / 'src'
sys.path.insert(0, str(src_dir))

from config import DATABASE_PATH, EXPORTS_DIR
from nlp_processor import NLPProcessor

# Plot appearance: matplotlib style sheet first, then the seaborn palette
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')

print("Database: {}".format(DATABASE_PATH))

## 1. Entity Extraction Overview

In [None]:
# Open the SQLite database; `conn` is shared by every query cell below
conn = sqlite3.connect(DATABASE_PATH)

# Single-row summary of the entities table: row count plus distinct
# documents, entity types, and entity texts
stats = pd.read_sql_query(
    sql='''
    SELECT 
        COUNT(*) as total_entities,
        COUNT(DISTINCT document_id) as documents_with_entities,
        COUNT(DISTINCT entity_type) as entity_types,
        COUNT(DISTINCT entity_text) as unique_values
    FROM entities
''',
    con=conn,
)

# The aggregate query always yields exactly one row (index 0)
print("\n".join([
    "Entity Extraction Summary:",
    f"  Total entities: {stats.at[0, 'total_entities']:,}",
    f"  Documents with entities: {stats.at[0, 'documents_with_entities']}",
    f"  Entity types: {stats.at[0, 'entity_types']}",
    f"  Unique entity values: {stats.at[0, 'unique_values']:,}",
]))

In [None]:
# Per entity type: total rows and number of distinct texts, most frequent type first
df_types = pd.read_sql_query(
    sql='''
    SELECT entity_type, COUNT(*) as count, COUNT(DISTINCT entity_text) as unique_values
    FROM entities
    GROUP BY entity_type
    ORDER BY count DESC
''',
    con=conn,
)

print("\nEntities by Type:")
df_types

In [None]:
# Two side-by-side horizontal bar charts for the 15 most frequent entity types
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
df_top = df_types.head(15)

panels = [
    # (value column, bar color, x-axis label, panel title)
    ('count', 'steelblue', 'Count', 'Top 15 Entity Types by Count'),
    ('unique_values', 'coral', 'Unique Values', 'Unique Values per Entity Type'),
]
for ax, (column, color, xlabel, title) in zip(axes, panels):
    ax.barh(df_top['entity_type'], df_top[column], color=color)
    ax.set_xlabel(xlabel)
    ax.set_title(title)
    # Flip the axis so the first row of df_top is drawn at the top
    ax.invert_yaxis()

plt.tight_layout()
plt.show()

## 2. Domain-Specific Entities

Let's examine the custom disaster recovery entities we extracted.

In [None]:
# The 20 most frequently mentioned DISASTER entity texts
df_disasters = pd.read_sql_query(
    sql='''
    SELECT entity_text, COUNT(*) as mentions
    FROM entities
    WHERE entity_type = 'DISASTER'
    GROUP BY entity_text
    ORDER BY mentions DESC
    LIMIT 20
''',
    con=conn,
)

print("Top Disaster Mentions:")
df_disasters

In [None]:
# The 20 most frequently mentioned FEMA_DECLARATION entity texts
df_fema = pd.read_sql_query(
    sql='''
    SELECT entity_text, COUNT(*) as mentions
    FROM entities
    WHERE entity_type = 'FEMA_DECLARATION'
    GROUP BY entity_text
    ORDER BY mentions DESC
    LIMIT 20
''',
    con=conn,
)

print("FEMA Declarations Found:")
df_fema

In [None]:
# The 20 most frequently mentioned PROGRAM entity texts
df_programs = pd.read_sql_query(
    sql='''
    SELECT entity_text, COUNT(*) as mentions
    FROM entities
    WHERE entity_type = 'PROGRAM'
    GROUP BY entity_text
    ORDER BY mentions DESC
    LIMIT 20
''',
    con=conn,
)

print("Recovery Programs Mentioned:")
df_programs

## 3. Geographic Analysis

Analyze Texas counties and locations mentioned in the reports.

In [None]:
# Load the 30 most-mentioned TX_COUNTY entity texts; only the first 20 are displayed here
df_counties = pd.read_sql_query(
    sql='''
    SELECT entity_text, COUNT(*) as mentions
    FROM entities
    WHERE entity_type = 'TX_COUNTY'
    GROUP BY entity_text
    ORDER BY mentions DESC
    LIMIT 30
''',
    con=conn,
)

print("Most Mentioned Texas Counties:")
df_counties.head(20)

In [None]:
# Horizontal bar chart of the 20 most-mentioned counties; skipped when the
# county query returned no rows
if not df_counties.empty:
    top_counties = df_counties.head(20)
    fig, ax = plt.subplots(figsize=(10, 8))
    ax.barh(
        y=top_counties['entity_text'],
        width=top_counties['mentions'],
        color='teal',
    )
    ax.set(
        xlabel='Number of Mentions',
        title='Top 20 Texas Counties in Disaster Reports',
    )
    # Flip the axis so the most-mentioned county is drawn at the top
    ax.invert_yaxis()
    plt.tight_layout()
    plt.show()

In [None]:
# Load the 30 most-mentioned GPE (geopolitical entity) texts; the first 20 are displayed
df_gpe = pd.read_sql_query(
    sql='''
    SELECT entity_text, COUNT(*) as mentions
    FROM entities
    WHERE entity_type = 'GPE'
    GROUP BY entity_text
    ORDER BY mentions DESC
    LIMIT 30
''',
    con=conn,
)

print("Top Geographic Entities (GPE):")
df_gpe.head(20)

## 4. Financial Analysis

Analyze money amounts mentioned in the reports.

In [None]:
# Load the 30 most frequently repeated MONEY entity texts; the first 20 are displayed
df_money = pd.read_sql_query(
    sql='''
    SELECT entity_text, COUNT(*) as mentions
    FROM entities
    WHERE entity_type = 'MONEY'
    GROUP BY entity_text
    ORDER BY mentions DESC
    LIMIT 30
''',
    con=conn,
)

print("Most Frequently Mentioned Dollar Amounts:")
df_money.head(20)

In [None]:
# Parse and analyze money amounts
import re

def parse_money(text):
    """Parse a money string such as ``'$1.5 million'`` into a numeric value.

    Dollar signs, thousands separators, and surrounding whitespace are
    ignored. The first number found is scaled by a magnitude word
    (``thousand``/``million``/``billion``/``trillion``, any case) or by a
    trailing upper-case suffix (``K``/``M``/``B``).

    Args:
        text: Raw money text, e.g. ``'$1,234.56'`` or ``'$2.5 billion'``.

    Returns:
        The amount as a float, or ``None`` when ``text`` is not a string or
        contains no parsable number.
    """
    if not isinstance(text, str):
        return None

    # Strip whitespace so a trailing space cannot hide a 'M'/'B' suffix
    cleaned = text.replace(',', '').replace('$', '').strip()
    lowered = cleaned.lower()

    # Largest magnitude first, so the first match wins
    scales = (
        ('trillion', None, 1e12),
        ('billion', 'B', 1e9),
        ('million', 'M', 1e6),
        ('thousand', 'K', 1e3),
    )
    multiplier = 1
    for word, suffix, factor in scales:
        if word in lowered or (suffix is not None and cleaned.endswith(suffix)):
            multiplier = factor
            break

    # Match one well-formed decimal number. A looser pattern like [\d.]+ also
    # matches '.' or '1.2.3', which float() rejects with ValueError.
    match = re.search(r'\d+(?:\.\d+)?|\.\d+', cleaned)
    if match:
        return float(match.group()) * multiplier
    return None

# Add a numeric 'value' column to df_money and keep only rows that parsed.
# NOTE: df_money holds at most the 30 most frequently mentioned amounts
# (its query uses LIMIT 30), so "largest" below is relative to those rows.
df_money['value'] = df_money['entity_text'].apply(parse_money)
df_money_valid = df_money.dropna(subset=['value']).copy()

print(f"\nParsed {len(df_money_valid)} valid money amounts")
print("\nLargest amounts mentioned:")
df_money_valid.nlargest(10, 'value').loc[:, ['entity_text', 'value', 'mentions']]

## 5. Damage Metrics

Analyze damage-related metrics extracted from reports.

In [None]:
# The 30 most frequently mentioned DAMAGE_METRIC entity texts
df_damage = pd.read_sql_query(
    sql='''
    SELECT entity_text, COUNT(*) as mentions
    FROM entities
    WHERE entity_type = 'DAMAGE_METRIC'
    GROUP BY entity_text
    ORDER BY mentions DESC
    LIMIT 30
''',
    con=conn,
)

print("Damage Metrics Extracted:")
df_damage

In [None]:
# The 20 most frequently mentioned RAINFALL entity texts
df_rain = pd.read_sql_query(
    sql='''
    SELECT entity_text, COUNT(*) as mentions
    FROM entities
    WHERE entity_type = 'RAINFALL'
    GROUP BY entity_text
    ORDER BY mentions DESC
    LIMIT 20
''',
    con=conn,
)

print("Rainfall Amounts Mentioned:")
df_rain

In [None]:
# The 20 most frequently mentioned WIND_SPEED entity texts
df_wind = pd.read_sql_query(
    sql='''
    SELECT entity_text, COUNT(*) as mentions
    FROM entities
    WHERE entity_type = 'WIND_SPEED'
    GROUP BY entity_text
    ORDER BY mentions DESC
    LIMIT 20
''',
    con=conn,
)

print("Wind Speeds Mentioned:")
df_wind

## 6. Entity Co-occurrence Analysis

Analyze which entities appear together in documents.

In [None]:
# Find documents mentioning specific disasters
def get_docs_for_entity(entity_text):
    """Get the IDs of documents that have an entity containing ``entity_text``.

    The match is a substring match done with SQL ``LIKE`` on the notebook's
    open ``conn``. ``%`` and ``_`` in ``entity_text`` are escaped so they are
    matched literally instead of acting as wildcards.

    Args:
        entity_text: Text to look for inside ``entities.entity_text``.

    Returns:
        A set of the distinct ``document_id`` values of matching rows.
    """
    # Escape the escape character first, then the two LIKE wildcards
    escaped = (
        entity_text.replace('\\', '\\\\')
        .replace('%', '\\%')
        .replace('_', '\\_')
    )
    df = pd.read_sql_query('''
        SELECT DISTINCT document_id
        FROM entities
        WHERE entity_text LIKE ? ESCAPE '\\'
    ''', conn, params=(f'%{escaped}%',))
    return set(df['document_id'])

# Documents that have at least one entity mentioning Harvey
harvey_docs = get_docs_for_entity('Harvey')
print(f"Documents mentioning Harvey: {len(harvey_docs)}")

# Top counties mentioned in those documents. The document IDs are bound as
# query parameters rather than formatted into the SQL text, so the query
# stays valid whatever type the IDs are (e.g. text IDs need no quoting).
# NOTE: SQLite caps the number of bound parameters per statement; if the ID
# set ever gets very large, switch to a subquery or a temporary table.
if harvey_docs:
    harvey_doc_ids = list(harvey_docs)
    id_placeholders = ','.join('?' * len(harvey_doc_ids))
    df_harvey_counties = pd.read_sql_query(f'''
        SELECT entity_text, COUNT(*) as mentions
        FROM entities
        WHERE entity_type = 'TX_COUNTY'
        AND document_id IN ({id_placeholders})
        GROUP BY entity_text
        ORDER BY mentions DESC
        LIMIT 15
    ''', conn, params=harvey_doc_ids)
else:
    # No matching documents: keep an empty frame so the check below is skipped
    df_harvey_counties = pd.DataFrame()

if len(df_harvey_counties) > 0:
    print("\nTop Counties in Harvey Documents:")
    print(df_harvey_counties)

## 7. Entities by Document Category

In [None]:
# Entity counts per (document category, entity type); entities are joined to
# their document to pick up the category
df_by_category = pd.read_sql_query('''
    SELECT d.category, e.entity_type, COUNT(*) as count
    FROM entities e
    JOIN documents d ON e.document_id = d.id
    GROUP BY d.category, e.entity_type
    ORDER BY d.category, count DESC
''', conn)

# Pivot for heatmap: one row per category, one column per entity type.
# pivot_table aggregates with the mean by default, which turns the counts
# into floats; sum them instead and cast to int so they stay whole numbers
# (the heatmap annotates cells with the integer format 'd').
df_pivot = df_by_category.pivot_table(
    index='category',
    columns='entity_type',
    values='count',
    aggfunc='sum',
    fill_value=0,
).astype(int)

print("Entity counts by document category:")
df_pivot.head()

In [None]:
# Heatmap of entity distribution across document categories
if len(df_pivot) > 0:
    # Use the overall top-10 entity types, keeping only those that actually
    # appear as pivot columns, so one missing type does not discard the rest
    top_types = df_types.head(10)['entity_type'].tolist()
    available_types = [t for t in top_types if t in df_pivot.columns]
    if available_types:
        df_heatmap = df_pivot[available_types]
    else:
        # None of the top types is present: fall back to the first 10 columns
        df_heatmap = df_pivot.iloc[:, :10]

    # fmt='d' only formats integers; the pivot can hold whole-number floats
    df_heatmap = df_heatmap.astype(int)

    fig, ax = plt.subplots(figsize=(12, 8))
    sns.heatmap(df_heatmap, annot=True, fmt='d', cmap='YlOrRd', ax=ax)
    ax.set_title('Entity Distribution by Document Category')
    plt.tight_layout()
    plt.show()

## 8. Export Entities

In [None]:
# Export all entities to CSV, one row per entity with its document's metadata
df_all_entities = pd.read_sql_query('''
    SELECT
        e.entity_type,
        e.entity_text,
        d.filename,
        d.category,
        d.year,
        d.quarter,
        e.page_number
    FROM entities e
    JOIN documents d ON e.document_id = d.id
    ORDER BY e.entity_type, e.entity_text
''', conn)

# to_csv does not create missing directories, so make sure the target exists.
# Assumes EXPORTS_DIR is a pathlib.Path (it is joined with `/` below).
EXPORTS_DIR.mkdir(parents=True, exist_ok=True)
output_path = EXPORTS_DIR / 'entities.csv'
df_all_entities.to_csv(output_path, index=False)
print(f"Exported {len(df_all_entities):,} entities to {output_path}")

In [None]:
# Export the per-type summary (df_types) to CSV.
# to_csv does not create missing directories, so make sure the target exists.
# Assumes EXPORTS_DIR is a pathlib.Path (it is joined with `/` below).
EXPORTS_DIR.mkdir(parents=True, exist_ok=True)
summary_path = EXPORTS_DIR / 'entity_summary.csv'
df_types.to_csv(summary_path, index=False)
print(f"Exported summary to {summary_path}")

In [None]:
# Close the SQLite connection opened at the start of the notebook; any query
# cell re-run after this needs the connection cell to be run again first.
conn.close()
print("\nAnalysis complete!")

## Next Steps

After entity analysis:

1. **Phase 3**: Analyze financial tables (see `03_financial_analysis.ipynb`)
2. **Phase 4**: Build semantic search with embeddings
3. **Phase 5**: Create interactive dashboard