# Analysis Helper Functions

Statistical analysis, visualization, and data enrichment functions.

## Setup

In [None]:
from pymaude import MaudeDatabase
import pandas as pd
import matplotlib.pyplot as plt

# Open the database file that the notebooks share.
db = MaudeDatabase(
    'notebooks.db',
    verbose=True,
)

# Register the 2020-2023 range for the device, patient, and text tables.
# download=True is passed through to the library (presumably it fetches any
# data not yet available locally -- confirm against the pymaude docs).
db.add_years(
    '2020-2023',
    tables=['device', 'patient', 'text'],
    download=True,
)

# Build the search index once the year data has been added.
db.create_search_index()

print("Setup complete!")

## Trend Analysis

Calculate year-by-year event counts:

In [None]:
# Search for pacemaker reports, using 2020-01-01 as the start date.
results = db.search_by_device_names('pacemaker', start_date='2020-01-01')

# Compute the year-by-year counts and show the resulting table.
trends = db.get_trends_by_year(results)
print(trends)

# Plot event_count against year through the Figure/Axes interface.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(trends['year'], trends['event_count'], marker='o', linewidth=2)
ax.set(
    xlabel='Year',
    ylabel='Adverse Events',
    title='Pacemaker Adverse Events Over Time',
)
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

### Trends with Grouped Search

In [None]:
# Group label -> search term for each device type being compared.
device_groups = {
    'pacemaker': 'pacemaker',
    'defibrillator': 'defibrillator',
}
results = db.search_by_device_names(device_groups, start_date='2020-01-01')

trends = db.get_trends_by_year(results)

# Draw one line per search group on a shared set of axes.
fig, ax = plt.subplots(figsize=(10, 6))
for group in results['search_group'].unique():
    # Keep only the trend rows that belong to the current group.
    in_group = trends['search_group'] == group
    group_trends = trends[in_group]
    ax.plot(
        group_trends['year'],
        group_trends['event_count'],
        marker='o',
        label=group.title(),
        linewidth=2,
    )

ax.set(
    xlabel='Year',
    ylabel='Adverse Events',
    title='Cardiac Device Trends by Type',
)
ax.legend()
ax.grid(True, alpha=0.3)
fig.tight_layout()
plt.show()

## Summary Statistics

Comprehensive statistics grouped by `search_group` or a custom column:

In [None]:
# Group label -> search term(s); a list supplies several terms for one label.
device_groups = {
    'pacemaker': 'pacemaker',
    'defibrillator': 'defibrillator',
    'icd': ['icd', 'implantable cardioverter'],
}
results = db.search_by_device_names(device_groups)

# No grouping column is passed: grouped search results are summarized
# by search_group automatically.
summary = db.summarize_by_brand(results)

# Each call prints a heading followed by the matching table on the next line.
print("Event counts by device type:", summary['counts'], sep="\n")
print("\nEvent types by device:", summary['event_types'], sep="\n")

## Event Type Comparison

Compare event type distributions with statistical testing:

In [None]:
# Two search groups whose event type distributions will be compared.
device_groups = {
    'insulin_pump': 'insulin pump',
    'glucose_monitor': 'glucose monitor',
}
results = db.search_by_device_names(device_groups)

comparison = db.event_type_comparison(results)

print("Event type comparison:")
print(comparison['summary'])

# Pull the test statistic and p-value out of the chi-square result.
chi_sq = comparison['chi_square_test']
statistic, p_value = chi_sq['statistic'], chi_sq['p_value']
print(f"\nChi-square: {statistic:.2f}")
print(f"P-value: {p_value:.4f}")

# Report significance at the 0.05 level.
verdict = (
    "\n✓ Statistically significant difference"
    if p_value < 0.05
    else "\n✗ No significant difference"
)
print(verdict)

## Patient Data Enrichment

Add patient outcome information:

In [None]:
# Search for defibrillator reports and show how many matched.
results = db.search_by_device_names('defibrillator')
print(f"Device events: {len(results)}")

# Enrich with patient data
enriched = db.enrich_with_patient_data(results)
print(f"Events with patient data: {len(enriched)}")

# Count unique outcomes
outcome_summary = db.count_unique_outcomes_per_report(enriched)

# Denominator for the percentages below. It is zero when the search or the
# enrichment returned no rows, so each percentage is guarded to avoid
# dividing by zero.
total_reports = len(outcome_summary)

# Count reports with death (outcome code 'D')
deaths = outcome_summary['unique_outcomes'].apply(lambda x: 'D' in x).sum()
death_pct = 100 * deaths / total_reports if total_reports else 0.0
print(f"\nReports with death: {deaths} ({death_pct:.1f}%)")

# Count hospitalizations (outcome code 'H')
hosp = outcome_summary['unique_outcomes'].apply(lambda x: 'H' in x).sum()
hosp_pct = 100 * hosp / total_reports if total_reports else 0.0
print(f"Reports with hospitalization: {hosp} ({hosp_pct:.1f}%)")

## Narrative Enrichment

Add event narrative text:

In [None]:
results = db.search_by_device_names('catheter')

# Fetch narratives for only the first five matching reports.
sample = results.head(5)
report_keys = sample['MDR_REPORT_KEY'].tolist()
narratives = db.get_narratives(report_keys)

if len(narratives) > 0:
    # Show the first returned narrative, cut to its first 300 characters.
    first = narratives.iloc[0]
    print("Sample narrative:")
    print(f"Report {first['MDR_REPORT_KEY']}:")
    excerpt = first['FOI_TEXT'][:300]
    print(excerpt, "...")

## Brand Name Standardization

Standardize brand name variations for cleaner analysis:

In [None]:
results = db.search_by_device_names('insulin pump')

print("Original brand variations (sample):")
brand_counts = results['BRAND_NAME'].value_counts()
print(brand_counts.head(5))

# Canonical brand -> raw BRAND_NAME spellings that should collapse into it.
canonical_brands = {
    'Tandem t:slim X2': [
        'T:SLIM X2 INSULIN PUMP WITH CONTROL-IQ TECHNOLOGY',
        'T:SLIM X2 INSULIN PUMP WITH BASAL-IQ TECHNOLOGY',
        'T:SLIM X2 WITH BASAL-IQ TECHNOLOGY',
    ],
    'Medtronic 670G': ['670G INSULIN PUMP MMT-1780KL'],
    'Medtronic 640G': ['640G INSULIN PUMP MMT-1712K'],
}

# Invert the table into the raw-name -> canonical-name mapping that
# standardize_brand_names is given.
mapping = {
    variant: canonical
    for canonical, variants in canonical_brands.items()
    for variant in variants
}

standardized = db.standardize_brand_names(results, mapping)

print("\nStandardized brands:")
standard_counts = standardized['standard_brand'].value_counts()
print(standard_counts.head(5))

## Event Key Deduplication

Handle duplicate reports (same event, multiple sources):

In [None]:
results = db.search_by_device_names('insulin pump')

# Measure how many reports describe the same underlying event.
dup_stats = db.count_unique_events(results)
duplication_rate = dup_stats['duplication_rate']

print(
    "Duplication analysis:",
    f"Total reports: {dup_stats['total_reports']}",
    f"Unique events: {dup_stats['unique_events']}",
    f"Duplication rate: {duplication_rate:.1f}%",
    sep="\n",
)

# Deduplicate only when the rate exceeds 5 (the value is printed as a
# percentage above), selecting reports with the 'first_received' strategy.
if duplication_rate > 5:
    deduped = db.select_primary_report(results, strategy='first_received')
    print(f"\nDeduplicated to {len(deduped)} unique events")

## Combined Workflow Example

Typical analysis combining multiple helpers:

In [None]:
# End-to-end example: grouped search -> deduplication -> summary statistics
# -> event type comparison -> patient outcomes -> CSV export.

# Step 1: Grouped search
results = db.search_by_device_names({
    'pacemaker': 'pacemaker',
    'defibrillator': 'defibrillator'
}, start_date='2020-01-01')

print(f"Total events: {len(results)}")

# Step 2: Check deduplication
dup_stats = db.count_unique_events(results)
print(f"Duplication rate: {dup_stats['duplication_rate']:.1f}%")

# Replace `results` with the deduplicated frame so every later step
# (statistics, outcomes, export) works on the deduplicated data.
if dup_stats['duplication_rate'] > 5:
    results = db.select_primary_report(results, strategy='first_received')
    print(f"Deduplicated to {len(results)} events")

# Step 3: Summary statistics
summary = db.summarize_by_brand(results)
print("\nEvent counts:")
print(summary['counts'])

# Step 4: Event type comparison
comparison = db.event_type_comparison(results)
if comparison['chi_square_test']['p_value'] < 0.05:
    print("\n✓ Significant difference in event types")

# Step 5: Patient outcomes
enriched = db.enrich_with_patient_data(results)
outcome_summary = db.count_unique_outcomes_per_report(enriched)
deaths = outcome_summary['unique_outcomes'].apply(lambda x: 'D' in x).sum()
# Guard the percentage: with no outcome rows the denominator is zero and
# the division would fail instead of reporting 0.0%.
total_reports = len(outcome_summary)
death_pct = 100 * deaths / total_reports if total_reports else 0.0
print(f"\nReports with death: {deaths} ({death_pct:.1f}%)")

# Step 6: Export
results.to_csv('cardiac_device_analysis.csv', index=False)
print("\nResults exported to cardiac_device_analysis.csv")

## Cleanup

In [None]:
# Close the shared database object opened in the Setup cell.
db.close()

## Summary

### Analysis Functions
- `get_trends_by_year(df)` - Year-by-year event counts
- `summarize_by_brand(df)` - Comprehensive statistics
- `event_type_comparison(df)` - Statistical comparison
- `chi_square_test(df, row, col)` - Custom statistical test

### Data Enrichment
- `enrich_with_patient_data(df)` - Add patient outcomes
- `enrich_with_narratives(df)` - Add event descriptions
- `enrich_with_problems(df)` - Add device problem codes

### Data Quality
- `count_unique_events(df)` - Check duplication
- `select_primary_report(df, strategy)` - Deduplicate
- `standardize_brand_names(df, mapping)` - Clean brand names

**Next**: [05_advanced_workflows.ipynb](05_advanced_workflows.ipynb) - Complete research workflows