In [1]:
from google.colab import drive
import os

drive.mount('/content/drive')

import sys
sys.path.append('/content/drive/MyDrive/CPSC 8810 Bio/Final Project/code/data/')

Mounted at /content/drive


In [2]:
# This class instantiates the mean and std of the AQI value for e/ region
# Based off of collected data from the EPA
class AQI():
    # *_aqi: The mean Air Quality Index (AQI) value for the region
    # *_std: The mean Standard Deviation for the region
    def __init__(self):
        self.nw_aqi = 47.82 # Northwest
        self.nw_std = 19.83

        self.se_aqi = 42.47 # Southeast
        self.se_std = 2.39

        self.ne_aqi = 39.23 # Northeast
        self.ne_std = 3.74

        self.sw_aqi = 60.8 # Southwest
        self.sw_std = 41.26


# This class instantiates the amount of asthmatics in e/ region based on data obtained from the CDC
class Asthmatics():
    # *_percent: The mean percentage of people in the region with asthma
    # *_error: The mean standard error for the *_percent statistic
    def __init__(self):
        self.se_percent = 8.55 # Southeast
        self.se_error = 0.52

        self.nw_percent = 10.08 # Northwest
        self.nw_error = 0.44

        self.ne_percent = 9.38 # Northeast
        self.ne_error = 0.50

        self.sw_percent = 9.12 # Southwest
        self.sw_error = 0.55


# This class instantiates data from the CDC based on prevalence of asthma attacks per grouping
class Attack():
    def __init__(self):
        # age_groups: Percentage of individuals in e/ age group who've experienced an asthma attack within the last 12 months
        # *_se: Standard Error for the percentages in age_groups
        self.age_groups = {"0-4": 52.9, "5-14": 45.1, "15-19": 29.1, "20-24":34.7, "25-34": 42.2, "35-64": 46.4, "65+":31.8}
        self.age_groups_se = {"0-4": 11.07, "5-14": 3.88, "15-19": 4.79, "20-24":5.36, "25-34": 3.42, "35-64": 1.8, "65+":2.08}

        # gender: Percentage of individuals in both genders who've experienced an asthma attack within the last 12 months
        # *_se: Standard Error for the percentages in gender
        self.gender = {"Male": 37.5, "Female": 43.1}
        self.gender_se = {"Male": 2.04, "Female": 1.58}

        # race_ethnic: Percentage of individuals in e/ race-ethnicity who've experienced an asthma attack within the last 12 months
        # *_se: Standard Error forthe percentages in race_ethnic
        # AI/AN = American Indian or Alaskan Native
        self.race_ethnic = {"White": 43, "Black": 42.4, "AI/AN": 40.8, "Asian": 44.3, "Hispanic": 40, "Mexican": 38.2, "Other Hispanic": 41.7}
        self.race_ethnic_se = {"White": 0.9, "Black": 1.93, "AI/AN": 5.43, "Asian": 4.57, "Hispanic": 1.91, "Mexican": 2.64, "Other Hispanic": 2.6}


# This class's data is derived from the 2020 NHIS survey and the CDC
class NHIS():
    def __init__(self):
        # CDC Baseline for recent attacks among those with asthma
        #self.recent_attack = {"Yes":25.25, "No":74.75}
        self.recent_attack = {"Yes": 40.7, "No": 59.3}
        # During the past 12 months, have you had to visit an emergency room or urgent care center because of asthma?
        self.hosp_visit = {"Yes":6.17, "No":93.83}
        # During the past 3 months, have you used the kind of PRESCRIPTION asthma inhaler that gives QUICK relief from asthma symptoms during an attack?
        # Since this is based off of only those who have had an attack in the past 12 months, I scale the "Yes" value
        self.rescue_use = {"Yes":64.35, "No":35.65}
        self.rescue_use["Yes"] = (self.rescue_use["Yes"] * self.recent_attack["Yes"]) / 100.0 # "Yes" 26.19%
        self.rescue_use["No"] = 100 - self.rescue_use["Yes"]
        # Are you NOW taking a preventive asthma medication every day, most days, some days, or never?
        # Converted to those who do take and those who don't
        self.preventative_use = {"Yes":56.22, "No":43.78}



In [6]:
import numpy as np
import pandas as pd

# Initialize the classes
aqi_data = AQI()
asthmatics_data = Asthmatics()
attack_data = Attack()
nhis_data = NHIS()

# Define possible values
regions = ['NW', 'SE', 'NE', 'SW']
ages = list(attack_data.age_groups.keys())
genders = list(attack_data.gender.keys())
races = list(attack_data.race_ethnic.keys())
NUM_PATIENTS = 200000

baseline_age = np.mean(list(attack_data.age_groups.values()))
baseline_genders = np.mean(list(attack_data.gender.values()))
baseline_race_ethics = np.mean(list(attack_data.race_ethnic.values()))

def generate_aqi(region):
    aqi_mean = getattr(aqi_data, f'{region.lower()}_aqi')
    aqi_std = getattr(aqi_data, f'{region.lower()}_std')
    aqi = max(0, np.random.normal(aqi_mean, aqi_std))  # Ensure AQI is non-negative
    return aqi

def adjust_attack_probability_by_aqi(p_base, aqi):
    # Assume that each 10 points increase in AQI increases the attack chance by 2%
    incr_factor = (aqi / 10) * 0.02
    p_adjusted = min(1, p_base * (1 + incr_factor))
    return p_adjusted

def adjust_attack_probability_by_demographics(age, gender, race, p_base):
    # Calculate the deviation of each demographic attribute from its baseline
    p_age = attack_data.age_groups[age] - baseline_age
    p_gender = attack_data.gender[gender] - baseline_genders
    p_race_ethnic = attack_data.race_ethnic[race] - baseline_race_ethics

    # Sum the deviations to get the total impact on the base probability
    total_impact = p_age + p_gender + p_race_ethnic

    # Convert total impact into a proportional change factor
    change_factor = total_impact / 100

    adjusted_probability = p_base * (1 + change_factor)
    adjusted_probability = max(0, min(adjusted_probability, 1))

    return adjusted_probability


def generate_nhis_data(aqi, age, gender, race, rescue_use, preventative_use):
    p_recent_attack = nhis_data.recent_attack['Yes'] / 100

    p_recent_attack = adjust_attack_probability_by_aqi(p_recent_attack, aqi)

    p_recent_attack = adjust_attack_probability_by_demographics(age, gender, race, p_recent_attack)

    # Adjust probabilities based on rescue and preventative use based on data from the Global Initiative for Asthma
    if preventative_use == 'Yes':
        p_recent_attack *= 0.35  # 65% less likely to have recent attack
    else:
        p_recent_attack = min(1, p_recent_attack * 1.32)  # 32% more likely to have recent attack
    #if preventative_use == 'No':
        #p_recent_attack = min(1, p_recent_attack * 1.65)  # 65% more likely to have recent attack

    if rescue_use == 'Yes':
        p_recent_attack = min(1, p_recent_attack * 1.75)  # Increase likelihood of attack if using rescue inhaler frequently

    recent_attack = np.random.choice(['Yes', 'No'], p=[p_recent_attack, 1 - p_recent_attack])

    return recent_attack


# Function to generate one individual's data
def generate_individual():
    region = np.random.choice(regions)
    age = np.random.choice(ages)
    gender = np.random.choice(genders)
    race = np.random.choice(races)

    aqi = generate_aqi(region)

    rescue_use = np.random.choice(['Yes', 'No'], p=[nhis_data.rescue_use['Yes'] / 100, nhis_data.rescue_use['No'] / 100])
    preventative_use = np.random.choice(['Yes', 'No'], p=[nhis_data.preventative_use['Yes'] / 100, nhis_data.preventative_use['No'] / 100])

    recent_attack = generate_nhis_data(aqi, age, gender, race, rescue_use, preventative_use)

    return {
        'Region': region,
        'Age': age,
        'Gender': gender,
        'Race': race,
        'AQI': aqi,
        'Recent Attack': recent_attack,
        'Rescue Inhaler Use': rescue_use,
        'Preventative Medication Use': preventative_use,
    }

# Generate dataset
data = [generate_individual() for _ in range(NUM_PATIENTS)]
df = pd.DataFrame(data)

# Display the first few rows of the dataframe
print(df.head())
# Get a summary of the dataframe
print(df.info())
# Statistical summary of numeric columns
print(df.describe())
# Check the balance of classes in the 'Recent Attack' column
print(df['Recent Attack'].value_counts(normalize=True))


  Region    Age  Gender      Race         AQI Recent Attack  \
0     NW  20-24  Female  Hispanic   13.478413            No   
1     NW  20-24  Female     Black   57.048036            No   
2     SW   5-14    Male     AI/AN   60.537909           Yes   
3     SW  20-24  Female     White  150.925547           Yes   
4     SE  15-19    Male   Mexican   42.979544           Yes   

  Rescue Inhaler Use Preventative Medication Use  
0                 No                         Yes  
1                 No                         Yes  
2                 No                         Yes  
3                Yes                         Yes  
4                 No                         Yes  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 8 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Region                       200000 non-null  object 
 1   Age                    

In [7]:
df.to_csv('/content/drive/MyDrive/CPSC 8810 Bio/Final Project/code/data/data_subset.csv', index=False)

#Temp

In [None]:
# Example of adjusting the probability for a 25-34-year-old female who is White
p_base = 0.2525  # Suppose this is the baseline probability of having a recent asthma attack
adjusted_p = adjust_attack_probability_by_demographics('25-34', 'Female', 'White', p_base)
print(f"Adjusted probability of a recent asthma attack: {adjusted_p:.2%}")


Adjusted probability of a recent asthma attack: 26.82%


In [None]:
import pandas as pd
import numpy as np

def summarize_data(df):
    # Define the summary DataFrame
    summary_df = pd.DataFrame()

    # Summarize AQI data by region
    for region in regions:
        region_data = df[df['Region'] == region]
        summary_df.loc[region, 'Mean AQI'] = region_data['AQI'].mean()
        summary_df.loc[region, 'AQI StdDev'] = region_data['AQI'].std()

    # Summarize Asthma Attack, Rescue Use, and Preventative Medication Use
    summary_df['% Recent Attack'] = df.groupby('Region')['Recent Attack'].apply(lambda x: (x == 'Yes').mean() * 100)
    summary_df['% Rescue Use'] = df.groupby('Region')['Rescue Inhaler Use'].apply(lambda x: (x == 'Yes').mean() * 100)
    summary_df['% Preventative Use'] = df.groupby('Region')['Preventative Medication Use'].apply(lambda x: (x == 'Yes').mean() * 100)

    # Calculate baseline characteristics for Age, Gender, Race
    demographic_columns = ['Age', 'Gender', 'Race']
    for column in demographic_columns:
        for category in df[column].unique():
            summary_df[f'% {category}'] = df[column].apply(lambda x: x == category).mean() * 100

    return summary_df

# Generate the summary DataFrame
table1 = summarize_data(df)
print(table1)


     Mean AQI  AQI StdDev  % Recent Attack  % Rescue Use  % Preventative Use  \
NW  47.980185   19.705195        27.786740     16.474431           56.428456   
SE  42.470112    2.393145        27.122443     16.066926           56.210223   
NE  39.282294    3.757223        26.914531     16.625665           55.721768   
SW  62.274109   38.911943        27.862824     16.339582           57.112594   

    % 20-24  % 25-34   % 0-4  % 15-19  % 5-14  ...  % 35-64  % Female  % Male  \
NW   14.182   14.242  14.363   14.392  14.419  ...   14.208    49.999  50.001   
SE   14.182   14.242  14.363   14.392  14.419  ...   14.208    49.999  50.001   
NE   14.182   14.242  14.363   14.392  14.419  ...   14.208    49.999  50.001   
SW   14.182   14.242  14.363   14.392  14.419  ...   14.208    49.999  50.001   

    % White  % Other Hispanic  % Mexican  % Black  % Asian  % Hispanic  \
NW   14.273            14.209     14.323   14.189   14.446      14.329   
SE   14.273            14.209     14.323   14

In [None]:
#from sklearn.model_selection import train_test_split

# -- Split the data into training and testing sets --
#train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)