# FDA CRL Exploratory Analysis

Interactive exploration of patterns in FDA Complete Response Letters (CRLs).

## Setup

In [None]:
import sys
sys.path.insert(0, '../src')

import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from pathlib import Path

# Our modules
from data_acquisition import CRLDataAcquisition
from pdf_parser import CRLParser
from analysis import CRLAnalyzer

# Plot appearance: apply the 'seaborn-v0_8-whitegrid' matplotlib style sheet
# first, then make 'husl' the active seaborn colour palette.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette(palette='husl')

# Printed last, so seeing it means every import and style call above ran.
print("Setup complete!")

## 1. Download and Parse CRL Data

In [None]:
# Download CRL data (run once).
# The acquisition helper is pointed at ../data. force=False presumably reuses
# files that are already present instead of fetching them again — TODO confirm
# against CRLDataAcquisition.
acq = CRLDataAcquisition(data_dir='../data')
results = acq.download_and_extract_all(force=False)
# Ask the helper for a manifest; its structure is defined in data_acquisition
# and is not used further in this cell.
manifest = acq.create_manifest()

In [None]:
# Parse PDFs (run once, takes time).
# One parser instance is shared by both directories below.
parser = CRLParser(extraction_method='auto')

# Parse approved CRLs: the second positional argument is the status label
# handed to the parser alongside the directory.
approved_docs = parser.parse_directory(
    Path('../data/raw/approved_crls'),
    'approved',
    limit=50  # Start small for testing
)

# Parse unapproved CRLs with the same limit so both groups are sampled equally.
unapproved_docs = parser.parse_directory(
    Path('../data/raw/unapproved_crls'),
    'unapproved',
    limit=50
)

# Combine both groups into one collection and report how many were parsed.
all_docs = approved_docs + unapproved_docs
print(f"Parsed {len(all_docs)} documents")

In [None]:
# Save parsed data.
# The combined documents are written to the JSON path that the loading cell
# in section 2 reads back.
# NOTE(review): whether save_parsed_data creates ../data/processed when it is
# missing is not visible here — confirm.
parser.save_parsed_data(all_docs, Path('../data/processed/parsed_crls.json'))

## 2. Load and Explore Data

In [None]:
# Load parsed data.
# The analyzer is built from the JSON file written by the save cell; its `df`
# attribute is the table every later cell works on.
analyzer = CRLAnalyzer(data_path=Path('../data/processed/parsed_crls.json'))
df = analyzer.df

print(f"Dataset shape: {df.shape}")
# Left as the final bare expression so the notebook renders the first rows.
df.head()

In [None]:
# Basic statistics: three short summaries, each printed under its own heading.
# Every summary is computed only after its heading has been printed, so the
# output order matches a plain sequence of prints.

print("\n=== Approval Status ===")
status_counts = df['approval_status'].value_counts()
print(status_counts)

print("\n=== Application Types ===")
application_type_counts = df['application_type'].value_counts()
print(application_type_counts)

print("\n=== Page Count Stats ===")
page_count_summary = df['page_count'].describe()
print(page_count_summary)

## 3. Deficiency Category Analysis

In [None]:
# Deficiency frequency: list every category with its count, largest first.
freq = analyzer.deficiency_frequency_analysis()

print("Overall deficiency counts:")
# A descending sort on the count; Python's sort stays stable with
# reverse=True, so categories with equal counts keep their original order.
ranked_categories = sorted(
    freq['overall'].items(),
    key=lambda pair: pair[1],
    reverse=True,
)
for cat, count in ranked_categories:
    print(f"  {cat}: {count}")

In [None]:
# Visualize deficiency frequency.
# save_path is passed so the analyzer can also store the figure under
# ../outputs; plt.show() then displays the current figure in the notebook.
analyzer.plot_deficiency_frequency(save_path='../outputs/deficiency_frequency.png')
plt.show()

In [None]:
# Co-occurrence heatmap.
# Presumably shows how often deficiency categories appear together in the
# same letter — confirm against CRLAnalyzer.plot_cooccurrence_heatmap.
analyzer.plot_cooccurrence_heatmap(save_path='../outputs/cooccurrence.png')
plt.show()

## 4. Rescue Rate Analysis

In [None]:
# Calculate rescue rates.
# NOTE(review): the exact definition of a "rescue rate" lives in
# CRLAnalyzer.calculate_rescue_rates and is not visible here.
rescue_rates = analyzer.calculate_rescue_rates()
# Left as the final bare expression so the notebook renders the result.
rescue_rates

In [None]:
# Visualize rescue rates.
# The analyzer draws the chart and is given a save_path under ../outputs;
# plt.show() displays it inline.
analyzer.plot_rescue_rates(save_path='../outputs/rescue_rates.png')
plt.show()

## 5. Predictive Modeling

In [None]:
# Build classifier.
# The returned mapping has a 'models' entry that maps each model name to a
# dict of metrics; 'test_accuracy', 'cv_mean' and 'cv_std' are read below.
classifier_results = analyzer.build_approval_classifier()

# Print performance.
# The loop variable is called `metrics` so that it does not overwrite the
# `results` value created by the download cell earlier in the notebook.
for name, metrics in classifier_results['models'].items():
    print(f"\n{name}:")
    print(f"  Test Accuracy: {metrics['test_accuracy']:.3f}")
    # "\u00b1" is the plus-minus sign. It is written as an escape so the
    # printed text no longer depends on how this file's bytes are decoded
    # (the literal had previously been corrupted into two characters).
    print(f"  CV Mean: {metrics['cv_mean']:.3f} (\u00b1{metrics['cv_std']:.3f})")

In [None]:
# Feature importance.
# NOTE(review): model_name presumably has to match one of the keys in
# classifier_results['models'] — confirm against plot_feature_importance.
analyzer.plot_feature_importance(
    classifier_results,
    model_name='Random Forest',
    save_path='../outputs/feature_importance.png'
)
plt.show()

In [None]:
# ROC curves.
# Uses the classifier results produced by the model-building cell and is
# given a save_path under ../outputs.
analyzer.plot_roc_curves(classifier_results, save_path='../outputs/roc_curves.png')
plt.show()

## 6. Statistical Tests

In [None]:
# Run the statistical tests; the result maps each feature name to a dict
# that is read below for 'significant' and 'p_value'.
stats_results = analyzer.statistical_tests()

# Report only the features flagged as significant. An entry without a
# 'significant' key is treated as not significant and skipped.
print("Statistically significant features (p < 0.05):")
for feature, result in stats_results.items():
    if not result.get('significant', False):
        continue
    print(f"  {feature}: p={result['p_value']:.4f}")

In [None]:
# Visualize statistical comparison.
# The analyzer draws the figure and is given a save_path under ../outputs;
# plt.show() displays it inline.
analyzer.plot_statistical_comparison(save_path='../outputs/statistical_comparison.png')
plt.show()

## 7. Deep Dive: Sample CRL Analysis

In [None]:
# Look at a specific CRL.
# Row 0 is used as the example; change the index to inspect another letter.
# NOTE(review): this raises if df has no rows.
sample = df.iloc[0]

# Optional fields go through .get() with a fallback value, while
# approval_status is indexed directly and therefore must be present.
print(f"Drug: {sample.get('drug_name', 'Unknown')}")
print(f"Application: {sample.get('application_type', 'Unknown')} {sample.get('application_number', 'Unknown')}")
print(f"Status: {sample['approval_status']}")
print(f"\nDeficiency Categories: {sample.get('deficiency_categories', [])}")
# Per-letter flags; each falls back to False when the field is absent.
print(f"\nSafety Concerns: {sample.get('has_safety_concerns', False)}")
print(f"Efficacy Concerns: {sample.get('has_efficacy_concerns', False)}")
print(f"CMC Issues: {sample.get('has_cmc_issues', False)}")
print(f"Requests New Trial: {sample.get('requests_new_trial', False)}")

## 8. Custom Analysis: Your Turn

In [None]:
# Add your custom analysis here.
# Example: filter the dataset by specific criteria.

# CRLs that flag safety concerns and efficacy concerns at the same time.
# Each flag is compared against True explicitly, so values that do not
# compare equal to True (missing ones included) count as "not flagged".
flagged_safety = df['has_safety_concerns'].eq(True)
flagged_efficacy = df['has_efficacy_concerns'].eq(True)
both_concerns = df.loc[flagged_safety & flagged_efficacy]

# The approval rate is the share of the selected rows whose status is 'approved'.
print(f"CRLs with both safety AND efficacy concerns: {len(both_concerns)}")
print(f"Approval rate: {(both_concerns['approval_status'] == 'approved').mean():.1%}")

In [None]:
# Export results.
# Runs the analyzer's full analysis with ../outputs as the output directory;
# the returned value is kept in `summary` for further inspection.
summary = analyzer.generate_full_analysis(output_dir=Path('../outputs'))
print("\nAnalysis complete! Check ../outputs/ for all visualizations.")