# Bronze Layer Exploration

Explore the data quality and signal patterns of the raw bronze layer in the RF Asset Discovery data lake.

**Purpose:** Understand the raw signals data before transforming to silver/gold layers.

**Data Source:** `data/unified.duckdb` - signals table (14,202 rows)

## 1. Setup

In [None]:
import duckdb
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path

# Open the unified DuckDB file. The path is relative, so it is resolved
# against the current working directory of the kernel. read_only=True keeps
# this exploration from modifying the database.
DB_PATH = Path('../data/unified.duckdb')
con = duckdb.connect(database=str(DB_PATH), read_only=True)

# Echo what we connected to and which DuckDB build is in use.
print(
    f"Connected to: {DB_PATH}",
    f"DuckDB version: {duckdb.__version__}",
    sep="\n",
)

## 2. Bronze Layer Overview

In [None]:
# Enumerate every table registered under the 'main' schema; each fetched row
# is a 1-tuple holding the table name.
tables = con.execute("""
    SELECT table_name 
    FROM information_schema.tables 
    WHERE table_schema = 'main'
""").fetchall()

print("Bronze Layer Tables:", "=" * 50, sep="\n")
for row in tables:
    table_name = row[0]
    # The name is double-quoted so unusual identifiers still parse as a
    # single table reference.
    count_row = con.execute(f'SELECT COUNT(*) FROM "{table_name}"').fetchone()
    row_count = count_row[0]
    print(f"{table_name:30} {row_count:>10,} rows")

In [None]:
# Print the column names and data types of the signals table, in the order
# the columns are declared (ordinal_position).
print("\nSignals Table Schema:", "=" * 50, sep="\n")
cols = con.execute("""
    SELECT column_name, data_type 
    FROM information_schema.columns 
    WHERE table_name = 'signals'
    ORDER BY ordinal_position
""").fetchall()

for column_name, data_type in cols:
    print(f"{column_name:25} {data_type}")

## 3. Signal Distribution

In [None]:
# Pull the signal columns analysed in the rest of this notebook into a
# pandas DataFrame named signals_df.
SIGNALS_QUERY = """
    SELECT signal_id, frequency_hz, power_db, bandwidth_hz, 
           freq_band, detection_count, state, 
           first_seen, last_seen, survey_id, segment_id
    FROM signals
"""
signals_df = con.execute(SIGNALS_QUERY).df()

print(f"Loaded {len(signals_df):,} signals")
# Trailing expression: the notebook renders the first rows as the cell output.
signals_df.head()

In [None]:
# Signal count per frequency band, most common first. The power-distribution
# cell reuses band_counts to pick its top bands.
band_counts = signals_df['freq_band'].value_counts()

# Add a MHz view of the frequency column to signals_df for plotting.
signals_df['frequency_mhz'] = signals_df['frequency_hz'] / 1e6

fig, (band_ax, spectrum_ax) = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: horizontal bars for the 15 most populated bands.
band_counts.head(15).plot.barh(ax=band_ax, color='steelblue')
band_ax.set(
    xlabel='Signal Count',
    ylabel='Frequency Band',
    title='Top 15 Frequency Bands by Signal Count',
)

# Right panel: 100-bin histogram of signal frequencies in MHz.
spectrum_ax.hist(signals_df['frequency_mhz'], bins=100, color='steelblue', alpha=0.7)
spectrum_ax.set(
    xlabel='Frequency (MHz)',
    ylabel='Signal Count',
    title='Signal Distribution Across Spectrum',
)

fig.tight_layout()
plt.show()

In [None]:
# Signal strength: overall histogram (left) and per-band box plot (right).
fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(14, 5))

# Left panel: histogram of power_db with two dashed reference lines.
hist_ax.hist(signals_df['power_db'], bins=50, color='coral', alpha=0.7)
reference_lines = (
    (0, 'red', 'Noise floor (0 dB)'),
    (10, 'green', 'Strong signal (+10 dB)'),
)
for level, line_color, line_label in reference_lines:
    hist_ax.axvline(x=level, color=line_color, linestyle='--', label=line_label)
hist_ax.set(
    xlabel='Power (dB above noise)',
    ylabel='Signal Count',
    title='Signal Strength Distribution',
)
hist_ax.legend()

# Right panel: restrict to the ten most populated bands (band_counts comes
# from the frequency-distribution cell) and box-plot power per band.
top_bands = list(band_counts.index[:10])
in_top_bands = signals_df['freq_band'].isin(top_bands)
band_power = signals_df[in_top_bands]
band_power.boxplot(column='power_db', by='freq_band', ax=box_ax, rot=45)
# Set labels after boxplot() so they replace the ones pandas generates.
box_ax.set(
    xlabel='Frequency Band',
    ylabel='Power (dB)',
    title='Power Distribution by Band',
)
plt.suptitle('')  # Blank out the figure-level title added by boxplot(by=...)

fig.tight_layout()
plt.show()

## 4. Time Patterns

In [None]:
# Detection rate over time: count signals by the minute they were first seen.
signals_df['first_seen'] = pd.to_datetime(signals_df['first_seen'])

# Truncate each timestamp to the start of its minute and count rows per minute.
signals_df['minute'] = signals_df['first_seen'].dt.floor('min')
time_series = signals_df.groupby('minute').size()

fig, ax = plt.subplots(figsize=(14, 5))
time_series.plot(kind='line', color='steelblue', ax=ax)
# Labels are applied after plotting so they override the defaults from pandas.
ax.set_xlabel('Time')
ax.set_ylabel('Signals Detected')
ax.set_title('Signal Detection Rate Over Time')
ax.grid(True, alpha=0.3)
plt.show()

# Span between the earliest and latest first_seen timestamps.
scan_start = signals_df['first_seen'].min()
scan_end = signals_df['first_seen'].max()
print(f"\nScan Duration: {scan_start} to {scan_end}")
print(f"Total Duration: {scan_end - scan_start}")

## 5. Segment Coverage

In [None]:
# Load the survey segments ordered by start frequency, with the frequency
# bounds converted from Hz to MHz in SQL.
SEGMENTS_QUERY = """
    SELECT s.segment_id, s.start_freq_hz / 1e6 as start_mhz, 
           s.end_freq_hz / 1e6 as end_mhz, s.priority, s.status, s.signals_found
    FROM survey_segments s
    ORDER BY s.start_freq_hz
"""
segments_df = con.execute(SEGMENTS_QUERY).df()

print(f"Total Segments: {len(segments_df)}")
print("\nTop 10 Segments by Signal Count:")

# Trailing expression: the notebook renders the ten busiest segments.
summary_columns = ['start_mhz', 'end_mhz', 'priority', 'signals_found']
top_segments = segments_df.nlargest(10, 'signals_found')[summary_columns]
top_segments

In [None]:
# Signals found per segment as a horizontal bar chart, one bar per row of
# segments_df (in its existing order), coloured by segment priority.
from matplotlib.colors import Normalize

# Scale priorities onto [0, 1] for the colormap. Fall back to 1 when the
# maximum is zero or NaN so the normalisation never divides by zero.
max_priority = segments_df['priority'].max()
if not max_priority > 0:
    max_priority = 1
priority_norm = Normalize(vmin=0, vmax=max_priority)
bar_colors = plt.cm.viridis(priority_norm(segments_df['priority'].to_numpy()))

fig, ax = plt.subplots(figsize=(14, 6))
ax.barh(range(len(segments_df)), segments_df['signals_found'], color=bar_colors)
ax.set_xlabel('Signals Found')
ax.set_ylabel('Segment Index')
ax.set_title('Signals Found per Segment (color = priority)')

# The mappable shares the bars' norm, so the colourbar ticks show actual
# priority values rather than 0-1. Passing ax= is required because the
# mappable is not attached to any Axes; recent Matplotlib releases raise an
# error when they cannot determine which Axes to take space from.
fig.colorbar(
    plt.cm.ScalarMappable(norm=priority_norm, cmap='viridis'),
    ax=ax,
    label='Priority',
)
plt.show()

## 6. Data Quality

In [None]:
# Percentage of missing values in each column currently on signals_df.
null_rates = signals_df.isna().sum().div(len(signals_df)).mul(100)

print("NULL Rates:", "=" * 40, sep="\n")
for column, pct_null in null_rates.items():
    # Marker: no NULLs -> check, under half NULL -> warning, otherwise cross.
    if pct_null == 0:
        marker = "✓"
    elif pct_null < 50:
        marker = "⚠"
    else:
        marker = "✗"
    print(f"{marker} {column:25} {pct_null:6.2f}%")

In [None]:
# Min/max and cardinality summary for the main signal columns.
frequency_hz = signals_df['frequency_hz']
power_db = signals_df['power_db']
detections = signals_df['detection_count']

print("\nValue Ranges:", "=" * 60, sep="\n")
# Frequencies are stored in Hz; divide by 1e6 to report MHz.
print(f"Frequency: {frequency_hz.min()/1e6:.1f} - {frequency_hz.max()/1e6:.1f} MHz")
print(f"Power: {power_db.min():.1f} to {power_db.max():.1f} dB")
print(f"Detection count: {detections.min()} to {detections.max()}")
print(f"Unique bands: {signals_df['freq_band'].nunique()}")
print(f"Unique states: {signals_df['state'].unique().tolist()}")

In [None]:
# Bucket signals into quality tiers by power_db. Each tier is the half-open
# interval [lower, upper); None marks an unbounded side.
power = signals_df['power_db']
tier_bounds = [
    ('Excellent', 15, None),
    ('Strong', 10, 15),
    ('Good', 5, 10),
    ('Fair', 0, 5),
    ('Weak', -10, 0),
    ('Very Weak', None, -10),
]


def _tier_mask(lower, upper):
    """Return a boolean Series selecting rows with lower <= power_db < upper."""
    if lower is None:
        return power < upper
    if upper is None:
        return power >= lower
    return (power >= lower) & (power < upper)


# Same shape as before: a list of (tier name, boolean mask) pairs.
quality_bins = [(label, _tier_mask(lower, upper)) for label, lower, upper in tier_bounds]

print("\nSignal Quality Distribution:", "=" * 40, sep="\n")
total_signals = len(signals_df)
for name, mask in quality_bins:
    count = mask.sum()
    pct = count / total_signals * 100
    print(f"{name:15} {count:>6,} ({pct:5.1f}%)")

## 7. Known Transmitter Matching

In [None]:
# Expected FM stations (NYC area): station name -> frequency in MHz.
expected_fm = {
    'WNYC': 93.9,
    'WBGO': 88.3,
    'WQXR': 105.9,
    'WFAN': 101.9,
    'Z100': 100.3,
    'Hot 97': 97.1,
    'Power 105.1': 105.1,
    'WPLJ': 95.5,
}

# Work on an independent copy of the FM-band rows so the MHz column can be
# added without touching signals_df.
is_fm = signals_df['freq_band'] == 'fm_broadcast'
fm_signals = signals_df[is_fm].copy()
fm_signals['frequency_mhz'] = fm_signals['frequency_hz'] / 1e6

print("FM Station Matching:", "=" * 60, sep="\n")
matches = 0
for station, freq in expected_fm.items():
    # Candidates lie within 0.1 MHz of the expected frequency; between() is
    # inclusive at both ends.
    in_window = fm_signals['frequency_mhz'].between(freq - 0.1, freq + 0.1)
    match = fm_signals[in_window]
    if match.empty:
        print(f"✗ {station:15} {freq:6.1f} MHz - Not found")
        continue
    # Report the strongest candidate in the window.
    best = match.loc[match['power_db'].idxmax()]
    print(f"✓ {station:15} {freq:6.1f} MHz - Found at {best['power_db']:+.1f} dB")
    matches += 1

print(f"\nMatch Rate: {matches}/{len(expected_fm)} ({matches/len(expected_fm)*100:.0f}%)")

In [None]:
# Other expected transmitters: name -> (expected frequency in MHz, freq_band
# label to search within).
expected_other = {
    'ADS-B': (1090.0, 'adsb'),
    'NOAA Weather': (162.55, 'noaa_weather'),
    'Marine Ch16': (156.8, 'marine_vhf'),
    'FRS Ch1': (462.5625, 'frs_gmrs'),
}

print("\nOther Known Transmitters:")
print("=" * 60)
for name, (freq, band) in expected_other.items():
    # Take an explicit copy of the band's rows before adding a column.
    # Assigning onto a filtered slice of signals_df triggers pandas'
    # SettingWithCopyWarning; .copy() matches how fm_signals is built for the
    # FM station check.
    band_signals = signals_df[signals_df['freq_band'] == band].copy()
    band_signals['frequency_mhz'] = band_signals['frequency_hz'] / 1e6
    # Candidates lie within 0.5 MHz (inclusive) of the expected frequency.
    match = band_signals[(band_signals['frequency_mhz'] >= freq - 0.5) &
                         (band_signals['frequency_mhz'] <= freq + 0.5)]
    if len(match) > 0:
        # Report the strongest candidate in the window.
        best = match.loc[match['power_db'].idxmax()]
        print(f"✓ {name:20} {freq:8.3f} MHz - Found at {best['power_db']:+.1f} dB")
    else:
        print(f"✗ {name:20} {freq:8.3f} MHz - Not found in {band}")

## Summary

### Key Findings
1. **Data Volume:** 14,202 signals across the full RTL-SDR range (24–1766 MHz)
2. **Quality:** Most signals (84%) are weak (-10 to 0 dB); only 0.7% are high quality
3. **Coverage:** 20 frequency bands classified, 89 segments scanned
4. **Gaps:** bandwidth_hz is NULL for all signals; rf_protocol not classified

### Silver Layer Candidates
- Signals with power_db >= +10 dB: ~97 signals
- Known FM stations matched: 8/8 (100%)

### Next Steps
1. Run a second survey pass to build up detection_count
2. Transform bronze → silver with quality gates
3. Add protocol classification

In [None]:
# Cleanup: close the DuckDB connection opened in the Setup cell. Cells that
# query `con` must be re-run after reconnecting.
con.close()
print("Connection closed.")