# PostgreSQL Database Demo for Experiment Results

This notebook demonstrates how to connect to the centralized PostgreSQL database and query experiment results.

## Setup and Connection

In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sqlalchemy

# Create database connection
# The DSN is read from the UNLEARNING_DB_DSN environment variable so that real
# credentials never need to be committed with the notebook; the fallback is the
# local demo database used throughout this walkthrough.
DEFAULT_DSN = "postgresql+psycopg2://unlearning:unlearning@localhost/unlearning_db"
engine = sqlalchemy.create_engine(os.environ.get("UNLEARNING_DB_DSN", DEFAULT_DSN))

# create_engine() is lazy and does not open a connection, so run a trivial
# query to surface bad credentials or an unreachable server in this cell
# instead of in the first analysis cell.
with engine.connect() as connection:
    connection.execute(sqlalchemy.text("SELECT 1"))

print("Connected to PostgreSQL database!")

## Basic Data Exploration

In [None]:
# Row count of the central fact table, as a quick sense of data volume
count_query = "SELECT COUNT(*) as total_rows FROM fact_event;"
result = pd.read_sql(sql=count_query, con=engine)

# The query returns exactly one row; pull its single value out for printing.
first_total = result['total_rows'].iat[0]
print(f"Total rows in database: {first_total}")

In [None]:
# Get unique experiment configurations
# GROUP BY already yields one row per (accountant, gamma_bar, gamma_split),
# so no DISTINCT is needed on top of it.
# NOTE(review): num_runs is COUNT(*) over fact_event rows; if a single run
# contributes several rows this counts rows rather than runs - confirm the
# table grain before relying on the number.
config_query = """
SELECT
    accountant,
    gamma_bar,
    gamma_split,
    COUNT(*) AS num_runs
FROM fact_event
WHERE accountant IS NOT NULL
GROUP BY accountant, gamma_bar, gamma_split
ORDER BY accountant, gamma_bar, gamma_split;
"""

configs = pd.read_sql(config_query, engine)
print("Experiment configurations:")
print(configs)

## Analysis Examples

### Regret Analysis by Accountant Type

In [None]:
# Load the per-seed regret metrics for every accountant so they can be compared.
# Rows without an accountant or without an empirical regret value are skipped.
regret_query = """
SELECT 
    accountant,
    seed,
    avg_regret_empirical,
    N_star_emp,
    m_emp,
    final_acc
FROM fact_event 
WHERE accountant IS NOT NULL 
    AND avg_regret_empirical IS NOT NULL
ORDER BY accountant, seed;
"""

regret_data = pd.read_sql(sql=regret_query, con=engine)

# Report the volume first, then preview the leading rows.
print(f"Loaded {regret_data.shape[0]} regret observations")
print(regret_data.head(n=5))

In [None]:
# Scatter empirical average regret against seed, one series per accountant
if len(regret_data) == 0:
    print("No regret data available for plotting")
else:
    fig, ax = plt.subplots(figsize=(10, 6))

    # One scatter series per accountant; the legend entry carries its row count.
    for accountant in regret_data['accountant'].unique():
        rows = regret_data.loc[regret_data['accountant'] == accountant]
        ax.scatter(
            rows['seed'],
            rows['avg_regret_empirical'],
            label=f'{accountant} (n={len(rows)})',
            alpha=0.7,
        )

    ax.set_xlabel('Seed')
    ax.set_ylabel('Average Regret (Empirical)')
    ax.set_title('Regret Performance by Accountant Type')
    ax.legend()
    ax.grid(True, alpha=0.3)
    plt.show()

### Deletion Capacity Analysis

In [None]:
# Analyze deletion capacity (m_emp) vs theoretical predictions
# PostgreSQL folds unquoted identifiers to lowercase, so a bare N_star_emp
# comes back as the column "n_star_emp". The quoted aliases keep the
# mixed-case names that the summary cell looks up in the DataFrame.
deletion_query = """
SELECT
    run_id,
    seed,
    accountant,
    m_emp,
    m_theory_live,
    N_star_emp AS "N_star_emp",
    N_star_theory AS "N_star_theory",
    eps_spent,
    eps_remaining
FROM fact_event
WHERE m_emp IS NOT NULL
    AND accountant IS NOT NULL
ORDER BY accountant, seed;
"""

deletion_data = pd.read_sql(deletion_query, engine)
print(f"Loaded {len(deletion_data)} deletion capacity observations")
print(deletion_data.head())

In [None]:
# Per-accountant summary of deletion capacity, sample complexity and budget spent
if len(deletion_data) == 0:
    print("No deletion data available for analysis")
else:
    # Which statistics to compute for each metric column.
    # NOTE(review): this needs a DataFrame column literally named 'N_star_emp';
    # confirm the query returns that exact casing.
    metric_aggs = {
        'm_emp': ['mean', 'std', 'min', 'max'],
        'N_star_emp': ['mean', 'std'],
        'eps_spent': ['mean', 'std'],
    }
    grouped = deletion_data.groupby('accountant')
    summary = grouped.agg(metric_aggs).round(3)

    print("Summary statistics by accountant type:")
    print(summary)

## Advanced Queries

### Joining with Lookup Tables

In [None]:
# Aggregate regret and deletion counts per accountant via the lookup table.
# The inner join keeps only fact rows whose accountant matches a name in
# lut_accountant.
lookup_query = """
SELECT 
    la.accountant_name,
    COUNT(*) as num_experiments,
    AVG(fe.avg_regret_empirical) as avg_regret,
    AVG(fe.m_emp) as avg_deletions
FROM fact_event fe
JOIN lut_accountant la ON fe.accountant = la.accountant_name
WHERE fe.avg_regret_empirical IS NOT NULL
GROUP BY la.accountant_name
ORDER BY avg_regret;
"""

lookup_results = pd.read_sql(sql=lookup_query, con=engine)

print("Aggregated results by accountant:")
print(lookup_results)

## Database Schema Information

In [None]:
# List every table and view in the public schema, sorted by name
tables_query = """
SELECT table_name, table_type 
FROM information_schema.tables 
WHERE table_schema = 'public'
ORDER BY table_name;
"""

tables = pd.read_sql(sql=tables_query, con=engine)

print("Available tables:")
print(tables)

In [None]:
# Describe the columns of public.fact_event in their declared order
columns_query = """
SELECT column_name, data_type, is_nullable 
FROM information_schema.columns 
WHERE table_name = 'fact_event' 
    AND table_schema = 'public'
ORDER BY ordinal_position;
"""

columns = pd.read_sql(sql=columns_query, con=engine)

print("fact_event table structure:")
print(columns)

## Comparison: SQL vs CSV Loading

The traditional CSV loading approach requires:
1. Manually finding and parsing multiple CSV files
2. Concatenating data from different experiments  
3. Handling inconsistent schemas across files
4. Re-processing data for each analysis

The PostgreSQL approach provides:
1. **Centralized storage** - All experiment data in one place
2. **Efficient queries** - SQL optimizations and indexing
3. **Consistent schema** - Normalized tables with referential integrity
4. **Concurrent access** - Multiple analysts can query simultaneously
5. **Data validation** - Type checking and constraints
6. **Incremental loading** - Add new experiments without reprocessing old data

### Loading new data

To load new CSV files into the database:

```bash
python etl_load.py --csv-dir experiments/deletion_capacity/results/new_grid \
    --dsn postgresql://unlearning:unlearning@localhost/unlearning_db
```

The ETL script handles:
- Automatic discovery of CSV files in directory structure
- Generation of unique run IDs
- Batch insertion for performance
- Conflict resolution (ON CONFLICT DO NOTHING)