# In[12]:
import pandas as pd
import numpy as np
from faker import Faker
from datetime import datetime, timedelta

# In[13]:
# 1. Simulation parameters
# Cohort size and observation window: fixed start, end at the moment of execution.
n_patients = 500
start_date = datetime(2022, 1, 1)
end_date = datetime.now()

# Seed NumPy's global random state and create a Faker instance for the
# Spanish (Spain) locale.
np.random.seed(7)
fake = Faker('es_ES')

# 2. Build the patient demographics table
districts = [
    'Sant Marti',
    'Sants-Montjuic',
    'Eixample',
    'Gracia',
    'Ciutat Vella',
    'Horta-Guinardo',
    'Les Corts',
]

# Columns are filled one at a time in a fixed order (age, gender, district,
# smoker) so the sequence of draws from NumPy's global random state is kept.
patients = pd.DataFrame(
    {'patient_id': ['PAT_{:04d}'.format(idx) for idx in range(n_patients)]}
)

# Age: normal(45, 15) limited to the 18-85 range, truncated to whole years.
patients['age'] = np.clip(
    np.random.normal(45, 15, n_patients), 18, 85
).astype(int)

# Gender: 48% 'M', 52% 'F'.
patients['gender'] = np.random.choice(['M', 'F'], n_patients, p=[0.48, 0.52])

# District: one sampling weight per entry of `districts`, in the same order.
patients['district'] = np.random.choice(
    districts,
    size=n_patients,
    p=[0.25, 0.2, 0.15, 0.1, 0.1, 0.1, 0.1],
)

# Smoker flag: 1 with probability 0.3, otherwise 0.
patients['smoker'] = np.random.choice([0, 1], n_patients, p=[0.7, 0.3])

# 3. Inhaler data generation function

# Per-district pollution baselines as (mean, standard deviation) pairs that
# are passed to np.random.normal. Defined once here rather than being rebuilt
# on every call of generate_inhaler_data.
_POLLUTION_PROFILES = {
    'Sant Marti': {'pm25': (19, 4), 'no2': (35, 6.5)},
    'Sants-Montjuic': {'pm25': (20, 4.5), 'no2': (38, 7)},
    'Eixample': {'pm25': (22, 5), 'no2': (45, 8)},
    'Gracia': {'pm25': (18, 4), 'no2': (32, 6)},
    'Ciutat Vella': {'pm25': (25, 6), 'no2': (48, 9)},
    'Horta-Guinardo': {'pm25': (15, 3), 'no2': (28, 5)},
    'Les Corts': {'pm25': (17, 3.5), 'no2': (30, 5.5)},
}


def generate_inhaler_data(patient_id, district, *, start=None, end=None):
    """Simulate hourly inhaler puffs and pollution readings for one patient.

    Draws from NumPy's global random state in a fixed order (pm25, no2,
    rate noise, puff counts), so results depend on the current seed/state.

    Args:
        patient_id: Identifier copied into every row of the result.
        district: Key into the pollution profile table; selects the
            mean/std used for the pm25 and no2 draws.
        start: First timestamp of the hourly range. Defaults to the
            module-level ``start_date``.
        end: Upper bound of the hourly range. Defaults to the module-level
            ``end_date``.

    Returns:
        A DataFrame with one row per hour and the columns ``patient_id``,
        ``timestamp``, ``puffs``, ``pm25``, ``no2`` and ``date``
        (``timestamp`` normalized to midnight).

    Raises:
        KeyError: If ``district`` has no pollution profile.
    """
    # Fall back to the module-level window only when no explicit one is given.
    if start is None:
        start = start_date
    if end is None:
        end = end_date

    profile = _POLLUTION_PROFILES[district]

    date_range = pd.date_range(start, end, freq='h')
    hours = len(date_range)

    # Pollution series: normal draws with a floor of 10 (pm25) and 20 (no2).
    pm25 = np.random.normal(*profile['pm25'], hours).clip(10)
    no2 = np.random.normal(*profile['no2'], hours).clip(20)

    # Work on plain NumPy arrays so the arithmetic and np.clip below do not
    # depend on how pandas Index objects interact with NumPy functions.
    hour_of_day = np.asarray(date_range.hour)
    day_of_year = np.asarray(date_range.dayofyear)
    is_weekday = np.asarray(date_range.weekday < 5)

    # Temporal patterns: daily sine cycle, weekday uplift, yearly sine cycle.
    hour_factor = 0.3 * np.sin(2 * np.pi * (hour_of_day - 7.5) / 24)
    weekday_factor = 0.15 * is_weekday
    annual_factor = 0.1 * np.sin(2 * np.pi * (day_of_year - 100) / 365)

    # Expected puffs per hour: pollution-driven base plus temporal terms and
    # Gaussian noise, limited to [0, 3] so it is a valid Poisson rate.
    base_rate = 0.1 + (pm25 * 0.005) + (no2 * 0.003)
    noise = np.random.normal(0, 0.05, hours)
    puff_rates = np.clip(
        base_rate + hour_factor + weekday_factor + annual_factor + noise,
        0, 3
    )

    return pd.DataFrame({
        'patient_id': patient_id,
        'timestamp': date_range,
        'puffs': np.random.poisson(puff_rates),
        'pm25': np.round(pm25, 1),
        'no2': np.round(no2, 1),
        'date': date_range.normalize()
    })

# 4. Simulate every patient and stack the per-patient frames into one table
inhaler_data = pd.concat(
    [
        generate_inhaler_data(pid, home_district)
        for pid, home_district in zip(patients['patient_id'], patients['district'])
    ],
    ignore_index=True,
)

# 5. Simulate asthma exacerbation events
# The candidate event hours are the same for every patient, so the hourly
# range is built once instead of once per loop iteration.
event_hours = pd.date_range(start_date, end_date, freq='h')

exacerbations = []
# Walk the id and smoker columns together rather than re-filtering the whole
# patients table for each id (patient ids are unique by construction).
for patient, smoker in zip(patients['patient_id'], patients['smoker']):
    # Expected number of events: 0.5 for non-smokers, 0.8 for smokers.
    base_rate = 0.5 + smoker * 0.3
    n_events = np.random.poisson(base_rate)

    if n_events > 0:
        # Distinct hours for this patient's events (sampling without replacement).
        event_times = np.random.choice(
            event_hours,
            size=n_events,
            replace=False
        )

        for ts in sorted(event_times):
            exacerbations.append({
                'patient_id': patient,
                'timestamp': ts,
                'event_type': 'exacerbation',
                'severity': np.random.choice(['mild', 'moderate', 'severe'], p=[0.6, 0.3, 0.1])
            })

# Explicit columns keep the frame's schema stable even when no events were drawn.
exacerbations_df = pd.DataFrame(
    exacerbations,
    columns=['patient_id', 'timestamp', 'event_type', 'severity']
)

# 6. Attach the demographics to every reading and persist the result
full_data = inhaler_data.merge(right=patients, on='patient_id')

# Store the calendar day as a 'YYYY-MM-DD' string.
full_data['date'] = (
    pd.to_datetime(full_data['date'])
    .dt.strftime('%Y-%m-%d')
)

# Write with pyarrow using snappy compression, partitioned by patient_id.
full_data.to_parquet(
    path='../data/raw/iot_inhaler/bcn_asthma_inhaler_dataset.parquet',
    engine='pyarrow',
    compression='snappy',
    partition_cols=['patient_id'],
)