# Inclusion/Exclusion Criteria Analysis

This notebook analyzes the inclusion and exclusion criteria extracted from scientific papers and cross-references them with the extracted participant demographics data.

**Key Questions:**
1. How often do papers have explicit inclusion/exclusion criteria?
2. For papers without explicit criteria, were demographics still extracted?
3. Are papers without explicit criteria more likely to be healthy participant studies (no diagnosis)?

In [None]:
import pandas as pd
import json
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Notebook-wide plotting defaults: seaborn whitegrid theme, 12x6 inch figures
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

## Load Data

In [None]:
# Load inclusion/exclusion criteria data
ie_csv_path = '../../outputs/demographics/inclusion_exclusion/full_inclusion_exclusion_gpt-4o-mini-2024-07-18.csv'
ie_json_path = '../../outputs/demographics/inclusion_exclusion/full_inclusion_exclusion_gpt-4o-mini-2024-07-18.json'

# Load CSV for structured analysis
ie_df = pd.read_csv(ie_csv_path)

# Load JSON for detailed text inspection.
# The encoding is passed explicitly so the read does not depend on the
# platform's locale default (JSON files are UTF-8 by specification).
with open(ie_json_path, 'r', encoding='utf-8') as f:
    ie_json = json.load(f)

print(f"Loaded {len(ie_df)} inclusion/exclusion records")
ie_df.head()

In [None]:
# Load demographics data
demo_json_path = '../../outputs/demographics/extractions/full_md_demographics-zeroshot_gpt-4o-mini-2024-07-18.json'

# The encoding is passed explicitly so the read does not depend on the
# platform's locale default (JSON files are UTF-8 by specification).
with open(demo_json_path, 'r', encoding='utf-8') as f:
    demo_json = json.load(f)

print(f"Loaded {len(demo_json)} demographics records")

In [None]:
# Flatten the demographics JSON into one row per (paper, participant group)
group_fields = ('diagnosis', 'group_name', 'count', 'imaging_sample')

demo_records = []
for record in demo_json:
    # A paper with no extracted groups still contributes a single all-null row;
    # reading the fields from an empty placeholder group produces exactly that.
    paper_groups = record.get('groups', []) or [{}]
    demo_records.extend(
        {'pmcid': record.get('pmcid'), **{field: group.get(field) for field in group_fields}}
        for group in paper_groups
    )

demo_df = pd.DataFrame(demo_records)
n_unique_papers = demo_df['pmcid'].nunique()
print(f"Created {len(demo_df)} demographic group records from {n_unique_papers} unique papers")
demo_df.head()

## 1. How Often Do Papers Have Explicit Inclusion/Exclusion Criteria?

In [None]:
# Flag, per paper, which kinds of criteria were extracted (non-null cell = present)
for kind in ('inclusion', 'exclusion'):
    ie_df[f'has_{kind}'] = ie_df[f'{kind}_criteria'].notna()
ie_df['has_either'] = ie_df['has_inclusion'] | ie_df['has_exclusion']
ie_df['has_both'] = ie_df['has_inclusion'] & ie_df['has_exclusion']

# Headline counts reused by the plots and the final summary
total_papers = len(ie_df)
with_inclusion = ie_df['has_inclusion'].sum()
with_exclusion = ie_df['has_exclusion'].sum()
with_either = ie_df['has_either'].sum()
with_both = ie_df['has_both'].sum()
with_neither = total_papers - with_either

print("\n=== Inclusion/Exclusion Criteria Prevalence ===")
print(f"Total papers analyzed: {total_papers}")

prevalence_rows = [
    ('inclusion', with_inclusion),
    ('exclusion', with_exclusion),
    ('either', with_either),
    ('both', with_both),
    ('neither', with_neither),
]
for position, (label, n_papers) in enumerate(prevalence_rows):
    # Only the first line of the breakdown is preceded by a blank line
    lead = "\n" if position == 0 else ""
    print(f"{lead}Papers with {label} criteria: {n_papers} ({n_papers/total_papers*100:.1f}%)")

In [None]:
# Visualize criteria prevalence
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Pie chart of overall criteria presence: four mutually exclusive slices
inclusion_only = with_inclusion - with_both
exclusion_only = with_exclusion - with_both
criteria_counts = [
    with_both,
    inclusion_only,
    exclusion_only,
    with_neither
]
labels = [
    f'Both\n({with_both})',
    f'Inclusion only\n({inclusion_only})',
    f'Exclusion only\n({exclusion_only})',
    f'Neither\n({with_neither})'
]
colors = ['#2ecc71', '#3498db', '#e74c3c', '#95a5a6']

axes[0].pie(criteria_counts, labels=labels, autopct='%1.1f%%', colors=colors, startangle=90)
axes[0].set_title('Distribution of Inclusion/Exclusion Criteria in Papers', fontsize=12, fontweight='bold')

# Bar chart: these categories overlap (e.g. 'Either' contains 'Both'),
# so the bars are not expected to add up to 100%
categories = ['Inclusion', 'Exclusion', 'Either', 'Both', 'Neither']
counts = [with_inclusion, with_exclusion, with_either, with_both, with_neither]
percentages = [c/total_papers*100 for c in counts]

bars = axes[1].bar(categories, percentages, color=['#3498db', '#e74c3c', '#9b59b6', '#2ecc71', '#95a5a6'])
axes[1].set_ylabel('Percentage of Papers (%)', fontsize=11)
axes[1].set_title('Prevalence of Different Criteria Types', fontsize=12, fontweight='bold')
axes[1].set_ylim(0, 100)

# Add value labels on bars (raw count plus percentage)
for bar, count in zip(bars, counts):
    height = bar.get_height()
    axes[1].text(bar.get_x() + bar.get_width()/2., height + 2,
                f'{count}\n({height:.1f}%)',
                ha='center', va='bottom', fontsize=9)

plt.tight_layout()

# savefig does not create missing directories; make sure the output folder
# exists so a fresh checkout can run this cell without a FileNotFoundError
figure_path = Path('../../outputs/demographics/figures/inclusion_exclusion_prevalence.png')
figure_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(figure_path, dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Analyze dedicated section vs prose
# Anything other than an explicit True (including a missing flag) counts as
# "no dedicated section"
dedicated_mask = ie_df['has_dedicated_section'] == True
dedicated_section_count = dedicated_mask.sum()

# Count prose-only papers row by row instead of `with_either - dedicated_section_count`:
# the subtraction undercounts (and can go negative) whenever a paper is flagged
# as having a dedicated section but has no extracted criteria.
prose_only_count = (ie_df['has_either'] & ~dedicated_mask).sum()
dedicated_without_criteria = (dedicated_mask & ~ie_df['has_either']).sum()

print(f"\nPapers with dedicated inclusion/exclusion section: {dedicated_section_count} ({dedicated_section_count/total_papers*100:.1f}%)")
print(f"Papers with criteria in prose only: {prose_only_count} ({prose_only_count/total_papers*100:.1f}%)")
if dedicated_without_criteria > 0:
    # Surface the inconsistency rather than letting it silently skew the numbers above
    print(f"Note: {dedicated_without_criteria} papers are flagged with a dedicated section but have no extracted criteria")

# Show where criteria are typically found
print("\n=== Criteria Location Distribution ===")
location_counts = ie_df['criteria_location'].value_counts()
print(location_counts)

## 2. Cross-Reference with Demographics Extraction

In [None]:
# Collapse the group-level demographics to one row per paper, then join with
# the inclusion/exclusion table
def _unique_non_null(values):
    """Distinct non-null entries of one paper's column as a list, or None when there are none."""
    present = values.dropna()
    if present.empty:
        return None
    return list(present.unique())

per_paper = demo_df.groupby('pmcid')
demo_summary = per_paper.agg({
    'diagnosis': _unique_non_null,
    'count': 'sum',
    'group_name': _unique_non_null,
}).reset_index()

# A paper counts as "has demographics" when its summed participant count is positive
summed_count = demo_summary['count']
demo_summary['has_demographics'] = summed_count.notna() & (summed_count > 0)

# Outer join keeps papers that appear in only one dataset; `_merge` records the origin
merged_df = ie_df.merge(demo_summary, on='pmcid', how='outer', indicator=True)
merge_origin = merged_df['_merge']

print(f"\nMerged dataset: {len(merged_df)} papers")
print(f"Papers in both datasets: {(merge_origin == 'both').sum()}")
print(f"Papers only in IE dataset: {(merge_origin == 'left_only').sum()}")
print(f"Papers only in demographics dataset: {(merge_origin == 'right_only').sum()}")

In [None]:
# Compare demographics extraction against criteria presence,
# restricted to papers found in both datasets
both_datasets = merged_df.loc[merged_df['_merge'] == 'both'].copy()

# Missing flags are treated as False for the analysis
for flag in ('has_either', 'has_demographics'):
    both_datasets[flag] = both_datasets[flag].fillna(False)

criteria_flag = both_datasets['has_either']
demographics_flag = both_datasets['has_demographics']

# Raw counts, with row/column totals
crosstab = pd.crosstab(
    criteria_flag,
    demographics_flag,
    margins=True,
    margins_name='Total'
)

print("\n=== Demographics Extraction vs Explicit Criteria ===")
print("\nCrosstab (rows=has criteria, columns=has demographics):")
print(crosstab)

# Same table as row-wise percentages (each row sums to 100)
row_percentages = pd.crosstab(criteria_flag, demographics_flag, normalize='index') * 100
print("\nPercentages:")
print(row_percentages)

In [None]:
# Visualize the relationship
fig, ax = plt.subplots(figsize=(10, 6))

# Prepare data for grouped bar chart: split papers by criteria presence,
# then count extracted / not-extracted demographics within each split
demo_flag_with_criteria = both_datasets[both_datasets['has_either'] == True]['has_demographics']
demo_flag_no_criteria = both_datasets[both_datasets['has_either'] == False]['has_demographics']

with_criteria_demo = demo_flag_with_criteria.sum()
with_criteria_no_demo = demo_flag_with_criteria.count() - with_criteria_demo
no_criteria_demo = demo_flag_no_criteria.sum()
no_criteria_no_demo = demo_flag_no_criteria.count() - no_criteria_demo

x = ['Papers with\nExplicit Criteria', 'Papers without\nExplicit Criteria']
demographics_extracted = [with_criteria_demo, no_criteria_demo]
no_demographics = [with_criteria_no_demo, no_criteria_no_demo]

x_pos = range(len(x))
width = 0.35

bars1 = ax.bar([p - width/2 for p in x_pos], demographics_extracted, width, 
               label='Demographics Extracted', color='#2ecc71')
bars2 = ax.bar([p + width/2 for p in x_pos], no_demographics, width,
               label='No Demographics Extracted', color='#e74c3c')

ax.set_ylabel('Number of Papers', fontsize=11)
ax.set_title('Demographics Extraction Success by Presence of Explicit Criteria', 
             fontsize=12, fontweight='bold')
ax.set_xticks(x_pos)
ax.set_xticklabels(x)
ax.legend()

# Add value labels
for bars in [bars1, bars2]:
    for bar in bars:
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width()/2., height,
                f'{int(height)}',
                ha='center', va='bottom', fontsize=10)

plt.tight_layout()

# savefig does not create missing directories; make sure the output folder
# exists so a fresh checkout can run this cell without a FileNotFoundError
figure_path = Path('../../outputs/demographics/figures/demographics_vs_criteria.png')
figure_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(figure_path, dpi=300, bbox_inches='tight')
plt.show()

# Calculate extraction success rate
with_criteria_total = with_criteria_demo + with_criteria_no_demo
no_criteria_total = no_criteria_demo + no_criteria_no_demo


def _success_rate(n_extracted, n_total):
    """Format 'extracted/total (pct%)'; an empty group is reported as n/a instead of dividing by zero."""
    if n_total == 0:
        return f"{n_extracted}/{n_total} (n/a)"
    return f"{n_extracted}/{n_total} ({n_extracted/n_total*100:.1f}%)"


print(f"\nDemographics extraction success rate:")
print(f"  Papers WITH explicit criteria: {_success_rate(with_criteria_demo, with_criteria_total)}")
print(f"  Papers WITHOUT explicit criteria: {_success_rate(no_criteria_demo, no_criteria_total)}")

## 3. Are Papers Without Criteria More Likely to Have Healthy Participants?

In [None]:
# Classify papers by whether they have patient groups
def has_patient_diagnosis(diagnosis_list):
    """Return True when `diagnosis_list` is a list holding at least one real diagnosis.

    An entry is treated as missing when it is None or when its string form,
    stripped and lower-cased, is '', 'none' or 'nan'. Any input that is not a
    list (including None) yields False.
    """
    if not isinstance(diagnosis_list, list):
        return False
    placeholders = ('', 'none', 'nan')
    return any(
        entry is not None and str(entry).strip().lower() not in placeholders
        for entry in diagnosis_list
    )

# Per-paper flag: does the aggregated diagnosis list contain any real diagnosis?
both_datasets['has_patient_diagnosis'] = both_datasets['diagnosis'].apply(has_patient_diagnosis)

criteria_flag = both_datasets['has_either']
patient_flag = both_datasets['has_patient_diagnosis']

# Count papers by criteria and diagnosis status
print("\n=== Patient Diagnosis Status by Presence of Explicit Criteria ===")
diagnosis_crosstab = pd.crosstab(criteria_flag, patient_flag, margins=True, margins_name='Total')

print("\nCrosstab (rows=has criteria, columns=has patient diagnosis):")
print(diagnosis_crosstab)

# Row-normalised view: share of patient / non-patient papers within each criteria group
diagnosis_pct = pd.crosstab(criteria_flag, patient_flag, normalize='index') * 100
print("\nPercentages:")
print(diagnosis_pct)

In [None]:
# Repeat the diagnosis comparison on the subset of papers whose demographics
# were successfully extracted
with_demo = both_datasets.loc[both_datasets['has_demographics'] == True].copy()

demo_criteria_flag = with_demo['has_either']
demo_patient_flag = with_demo['has_patient_diagnosis']

print("\n=== Among Papers with Extracted Demographics ===")
demo_diagnosis_crosstab = pd.crosstab(
    demo_criteria_flag, demo_patient_flag, margins=True, margins_name='Total'
)

print("\nCrosstab (rows=has criteria, columns=has patient diagnosis):")
print(demo_diagnosis_crosstab)

# Row-normalised percentages for the same subset
demo_diagnosis_pct = pd.crosstab(demo_criteria_flag, demo_patient_flag, normalize='index') * 100
print("\nPercentages:")
print(demo_diagnosis_pct)

In [None]:
# Visualize the relationship
def _plot_patient_status(ax, positions, bar_width, tick_labels, patient_counts, healthy_counts, title):
    """Draw grouped patient/healthy bars on `ax`, label each bar with its count, and return both bar containers."""
    patient_bars = ax.bar([p - bar_width/2 for p in positions], patient_counts, bar_width,
                          label='Has Patient Diagnosis', color='#e74c3c')
    healthy_bars = ax.bar([p + bar_width/2 for p in positions], healthy_counts, bar_width,
                          label='Healthy Only', color='#2ecc71')

    ax.set_ylabel('Number of Papers', fontsize=11)
    ax.set_title(title, fontsize=12, fontweight='bold')
    ax.set_xticks(positions)
    ax.set_xticklabels(tick_labels)
    ax.legend()

    # Value labels sit directly on top of each bar
    for bar_group in (patient_bars, healthy_bars):
        for bar in bar_group:
            height = bar.get_height()
            ax.text(bar.get_x() + bar.get_width()/2., height,
                    f'{int(height)}',
                    ha='center', va='bottom', fontsize=10)
    return patient_bars, healthy_bars


fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Left plot: All papers
with_crit_patients = both_datasets[(both_datasets['has_either'] == True) & (both_datasets['has_patient_diagnosis'] == True)].shape[0]
with_crit_healthy = both_datasets[(both_datasets['has_either'] == True) & (both_datasets['has_patient_diagnosis'] == False)].shape[0]
no_crit_patients = both_datasets[(both_datasets['has_either'] == False) & (both_datasets['has_patient_diagnosis'] == True)].shape[0]
no_crit_healthy = both_datasets[(both_datasets['has_either'] == False) & (both_datasets['has_patient_diagnosis'] == False)].shape[0]

x = ['Papers with\nExplicit Criteria', 'Papers without\nExplicit Criteria']
patients = [with_crit_patients, no_crit_patients]
healthy = [with_crit_healthy, no_crit_healthy]

x_pos = range(len(x))
width = 0.35

bars1, bars2 = _plot_patient_status(
    axes[0], x_pos, width, x, patients, healthy,
    'All Papers: Patient Status by Presence of Explicit Criteria',
)

# Right plot: Only papers with demographics
with_crit_patients_demo = with_demo[(with_demo['has_either'] == True) & (with_demo['has_patient_diagnosis'] == True)].shape[0]
with_crit_healthy_demo = with_demo[(with_demo['has_either'] == True) & (with_demo['has_patient_diagnosis'] == False)].shape[0]
no_crit_patients_demo = with_demo[(with_demo['has_either'] == False) & (with_demo['has_patient_diagnosis'] == True)].shape[0]
no_crit_healthy_demo = with_demo[(with_demo['has_either'] == False) & (with_demo['has_patient_diagnosis'] == False)].shape[0]

patients_demo = [with_crit_patients_demo, no_crit_patients_demo]
healthy_demo = [with_crit_healthy_demo, no_crit_healthy_demo]

bars3, bars4 = _plot_patient_status(
    axes[1], x_pos, width, x, patients_demo, healthy_demo,
    'Papers with Demographics Extracted: Patient Status by Criteria',
)

plt.tight_layout()

# savefig does not create missing directories; make sure the output folder
# exists so a fresh checkout can run this cell without a FileNotFoundError
figure_path = Path('../../outputs/demographics/figures/patient_status_vs_criteria.png')
figure_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(figure_path, dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Calculate percentages
def _share(count, total):
    """Format 'count/total (pct%)'; an empty group is reported as n/a instead of raising ZeroDivisionError."""
    if total == 0:
        return f"{count}/{total} (n/a)"
    return f"{count}/{total} ({count/total*100:.1f}%)"


print("\n=== Summary Statistics ===")
print("\nAll papers in both datasets:")
with_crit_total = with_crit_patients + with_crit_healthy
no_crit_total = no_crit_patients + no_crit_healthy

print(f"  Papers WITH criteria:")
print(f"    - Patient diagnosis: {_share(with_crit_patients, with_crit_total)}")
print(f"    - Healthy only: {_share(with_crit_healthy, with_crit_total)}")

print(f"\n  Papers WITHOUT criteria:")
print(f"    - Patient diagnosis: {_share(no_crit_patients, no_crit_total)}")
print(f"    - Healthy only: {_share(no_crit_healthy, no_crit_total)}")

# Same breakdown restricted to papers whose demographics were extracted;
# these subsets are smaller, so an empty group is more likely here
print("\nPapers where demographics were extracted:")
with_crit_total_demo = with_crit_patients_demo + with_crit_healthy_demo
no_crit_total_demo = no_crit_patients_demo + no_crit_healthy_demo

print(f"  Papers WITH criteria:")
print(f"    - Patient diagnosis: {_share(with_crit_patients_demo, with_crit_total_demo)}")
print(f"    - Healthy only: {_share(with_crit_healthy_demo, with_crit_total_demo)}")

print(f"\n  Papers WITHOUT criteria:")
print(f"    - Patient diagnosis: {_share(no_crit_patients_demo, no_crit_total_demo)}")
print(f"    - Healthy only: {_share(no_crit_healthy_demo, no_crit_total_demo)}")

## Sample Inspection

In [None]:
# Sample a few papers that lack explicit criteria yet still yielded demographics
lacks_criteria = both_datasets['has_either'] == False
demographics_found = both_datasets['has_demographics'] == True
no_crit_with_demo = both_datasets[lacks_criteria & demographics_found].copy()

print(f"\n=== Examples: Papers WITHOUT explicit criteria but WITH demographics ===")
print(f"Total: {len(no_crit_with_demo)} papers\n")

# Print the first five matches, one blank line between papers
for _, paper in no_crit_with_demo.head(5).iterrows():
    details = [
        f"PMCID: {paper['pmcid']}",
        f"  Diagnosis: {paper['diagnosis']}",
        f"  Groups: {paper['group_name']}",
        f"  Location note: {paper['criteria_location']}",
    ]
    print("\n".join(details))
    print()

In [None]:
# Sample a few papers that have both explicit criteria and a patient diagnosis
def _preview(text, limit=200):
    """First `limit` characters of `text` as a string, or the literal 'None' when the value is missing."""
    if pd.notna(text):
        return str(text)[:limit]
    return 'None'

criteria_present = both_datasets['has_either'] == True
patient_present = both_datasets['has_patient_diagnosis'] == True
with_crit_with_patients = both_datasets[criteria_present & patient_present].copy()

print(f"\n=== Examples: Papers WITH explicit criteria AND patient diagnosis ===")
print(f"Total: {len(with_crit_with_patients)} papers\n")

# Print the first three matches with a truncated view of their criteria text
for _, paper in with_crit_with_patients.head(3).iterrows():
    details = [
        f"PMCID: {paper['pmcid']}",
        f"  Diagnosis: {paper['diagnosis']}",
        f"  Has dedicated section: {paper['has_dedicated_section']}",
        f"  Inclusion criteria (first 200 chars): {_preview(paper['inclusion_criteria'])}...",
        f"  Exclusion criteria (first 200 chars): {_preview(paper['exclusion_criteria'])}...",
    ]
    print("\n".join(details))
    print()

## Key Findings Summary

In [None]:
# Final recap of the three key questions, built from counts computed in earlier cells
def _rate(count, total):
    """Percentage string for count/total, or 'n/a' when the group is empty."""
    return f"{count/total*100:.1f}%" if total > 0 else "n/a"


banner = "=" * 80
print("\n" + banner)
print("KEY FINDINGS SUMMARY")
print(banner)

print(f"\n1. PREVALENCE OF EXPLICIT CRITERIA:")
print(f"   - {with_either/total_papers*100:.1f}% of papers have explicit inclusion/exclusion criteria")
print(f"   - {with_both/total_papers*100:.1f}% have both types")
print(f"   - {dedicated_section_count/total_papers*100:.1f}% have a dedicated section for criteria")

# Each rate is guarded on its own: previously both lines of a section were
# dropped when only one of the two groups was empty, leaving a bare heading.
print(f"\n2. DEMOGRAPHICS EXTRACTION SUCCESS:")
print(f"   - Papers WITH criteria: {_rate(with_criteria_demo, with_criteria_total)} had demographics extracted")
print(f"   - Papers WITHOUT criteria: {_rate(no_criteria_demo, no_criteria_total)} had demographics extracted")

print(f"\n3. HEALTHY vs PATIENT STUDIES:")
print(f"   - Papers WITH criteria: {_rate(with_crit_patients, with_crit_total)} have patient diagnosis")
print(f"   - Papers WITHOUT criteria: {_rate(no_crit_patients, no_crit_total)} have patient diagnosis")

print("\n" + banner)