In [1]:
import pandas as pd
import numpy as np

### Demographics Data

Referencing to CT Browser, SNOMED CT codes:
* Male (finding): 248153007
* Female (finding): 248152002

Store age as a property and create a relationship from patient to age group:
| Age Range                  | Age Group            | ID        |
|----------------------------|----------------------|-----------|
| Child (0–12 years)	     |  Child (person)	    | 67822003 |
| Adolescent (12–18 years)	 |  Adolescent (person)	| 133937008 |
| Adult (18–60 years)	     |  Adult (person)	    | 133936004 |
| Elderly (60+ years)	     |  Elderly person	    | 105436006 |

In KG, relationships are depicted as:
* patient (pid) - [:IN_AGE_GROUP] -> sctid (Age group)
* patient (pid) - [:IS_A] -> sctid (gender)

In [2]:
demographics_df = pd.read_sas("../data/nhanes_data/Demographics.xpt")
demographics_df = demographics_df[['SEQN', 'RIAGENDR', 'RIDAGEYR']]
demographics_df.columns = ['SEQN', 'Gender', 'Age_at_screening']

demographics_df['Gender_ID'] = demographics_df['Gender'].map({1.0: '248153007', 2.0: '248152002'})
demographics_df['Gender'] = demographics_df['Gender'].map({1.0: 'Male', 2.0: 'Female'})
demographics_df['Age_at_screening'] = demographics_df['Age_at_screening'].astype(int)

demographics_df['SEQN'] = demographics_df['SEQN'].astype(int)

In [3]:
# Define age groups
def map_age_group(age):
    if (age < 12):
        return 'Child'
    elif (12 <= age < 18):
        return 'Adolescent'
    elif (18 <= age < 60):
        return 'Adult'
    else:
        return 'Elderly'

# Apply function to create new column
demographics_df['age_group'] = demographics_df['Age_at_screening'].apply(map_age_group)

demographics_df['Age_ID'] = demographics_df['age_group'].map({'Child': '67822003', 
                                                              'Adolescent': '133937008',
                                                              'Adult': '133936004',
                                                              'Elderly': '105436006'})

In [4]:
demographics_df.to_csv("../data/nhanes_data/filtered/nhanes_demographics.csv")

demographics_df.head()

Unnamed: 0,SEQN,Gender,Age_at_screening,Gender_ID,age_group,Age_ID
0,130378,Male,43,248153007,Adult,133936004
1,130379,Male,66,248153007,Elderly,105436006
2,130380,Female,44,248152002,Adult,133936004
3,130381,Female,5,248152002,Child,67822003
4,130382,Male,2,248153007,Child,67822003


### Physical Activity Data

In [None]:
PA_df = pd.read_sas("../data/nhanes_data/PhysicalActivity_Questionnaire.xpt")
PA_df = PA_df[['SEQN', 'PAD790Q', 'PAD790U', 'PAD800', 'PAD810Q', 'PAD810U', 'PAD820', 'PAD680']]
PA_df.columns = ['SEQN', 'freq_moderate_act', 'freq_moderate_unit','min_moderate_act', 'freq_vigorous_act', 
                 'freq_vigorous_unit', 'min_vigorous_act', 'min_sedentary_daily']

PA_df['SEQN'] = PA_df['SEQN'].astype(int)

PA_df.head()

Unnamed: 0,SEQN,freq_moderate_act,freq_moderate_unit,min_moderate_act,freq_vigorous_act,freq_vigorous_unit,min_vigorous_act,min_sedentary_daily
0,130378,3.0,b'W',45.0,3.0,b'W',45.0,360.0
1,130379,4.0,b'W',45.0,3.0,b'W',45.0,480.0
2,130380,1.0,b'W',20.0,5.397605e-79,b'',,240.0
3,130384,5.397605e-79,b'',,5.397605e-79,b'',,60.0
4,130385,1.0,b'D',90.0,1.0,b'W',60.0,180.0


In [6]:
threshold = 1e-10  # Define a small threshold

PA_df['freq_moderate_act'] = PA_df['freq_moderate_act'].mask(PA_df['freq_moderate_act'].abs() < threshold, 0)
PA_df['min_moderate_act'] = PA_df['min_moderate_act'].mask(PA_df['min_moderate_act'].abs() < threshold, 0)
PA_df['freq_vigorous_act'] = PA_df['freq_vigorous_act'].mask(PA_df['freq_vigorous_act'].abs() < threshold, 0)
PA_df['min_vigorous_act'] = PA_df['min_vigorous_act'].mask(PA_df['min_vigorous_act'].abs() < threshold, 0)
PA_df['min_sedentary_daily'] = PA_df['min_sedentary_daily'].mask(PA_df['min_sedentary_daily'].abs() < threshold, 0)

In [7]:
PA_df['freq_moderate_act'] = pd.to_numeric(PA_df['freq_moderate_act'], errors='coerce').fillna(9999).astype(int)
PA_df['freq_vigorous_act'] = pd.to_numeric(PA_df['freq_vigorous_act'], errors='coerce').fillna(9999).astype(int)

PA_df['freq_moderate_act'] = PA_df['freq_moderate_act'].replace([7777, 9999], np.nan)
PA_df['freq_vigorous_act'] = PA_df['freq_vigorous_act'].replace([7777, 9999], np.nan)

def replace_fn(row):
    if row['freq_moderate_act'] == 0:
        row['min_moderate_act'] = 0
        row['freq_moderate_unit'] = 'W'
    if row['freq_vigorous_act'] == 0:
        row['min_vigorous_act'] = 0
        row['freq_vigorous_unit'] = 'W'
    return row
PA_df = PA_df.apply(replace_fn, axis=1)

PA_df['min_moderate_act'] = PA_df['min_moderate_act'].replace([7777, 9999], np.nan)
PA_df['min_vigorous_act'] = PA_df['min_vigorous_act'].replace([7777, 9999], np.nan)

PA_df['freq_moderate_unit'] = PA_df['freq_moderate_unit'].astype(str)
PA_df['freq_vigorous_unit'] = PA_df['freq_vigorous_unit'].astype(str)

PA_df.head()

# PA_df.to_csv('../data/nhanes_data/filtered/check.csv')

Unnamed: 0,SEQN,freq_moderate_act,freq_moderate_unit,min_moderate_act,freq_vigorous_act,freq_vigorous_unit,min_vigorous_act,min_sedentary_daily
0,130378,3.0,W,45.0,3.0,W,45.0,360.0
1,130379,4.0,W,45.0,3.0,W,45.0,480.0
2,130380,1.0,W,20.0,0.0,W,0.0,240.0
3,130384,0.0,W,0.0,0.0,W,0.0,60.0
4,130385,1.0,D,90.0,1.0,W,60.0,180.0


In [8]:
# Conversion factors to per week
conversion_factors = {
    'D': 7,        # 7 days in a week
    'W': 1,       # Already in weeks
    'M': 1/4.345,  # 1 month ≈ 4.345 weeks
    'Y': 1/52.143   # 1 year ≈ 52.143 weeks
}

# Calculate minutes per week
PA_df['minutes_mod_weekly'] = PA_df.apply(
    lambda row: (row['freq_moderate_act'] * conversion_factors.get(row['freq_moderate_unit'], 1) * row['min_moderate_act']), axis=1
)
PA_df['minutes_vig_weekly'] = PA_df.apply(
    lambda row: (row['freq_vigorous_act'] * conversion_factors.get(row['freq_vigorous_unit'], 1) * row['min_vigorous_act']), axis=1
)

PA_df['minutes_mod_weekly'] = PA_df['minutes_mod_weekly'].round(3)
PA_df['minutes_vig_weekly'] = PA_df['minutes_vig_weekly'].round(3)

PA_df.head()

Unnamed: 0,SEQN,freq_moderate_act,freq_moderate_unit,min_moderate_act,freq_vigorous_act,freq_vigorous_unit,min_vigorous_act,min_sedentary_daily,minutes_mod_weekly,minutes_vig_weekly
0,130378,3.0,W,45.0,3.0,W,45.0,360.0,135.0,135.0
1,130379,4.0,W,45.0,3.0,W,45.0,480.0,180.0,135.0
2,130380,1.0,W,20.0,0.0,W,0.0,240.0,20.0,0.0
3,130384,0.0,W,0.0,0.0,W,0.0,60.0,0.0,0.0
4,130385,1.0,D,90.0,1.0,W,60.0,180.0,630.0,60.0


In [9]:
def convert_to_weekly(type, row):
    if type=='moderate':
        freq_column = 'freq_moderate_act'
        unit_column = 'freq_moderate_unit'
    else: 
        freq_column = 'freq_vigorous_act'
        unit_column = 'freq_vigorous_unit'

    if pd.isna(row[freq_column]):  # Handle NaN cases
        return np.nan
    if row[unit_column] == 'D':  # Daily → Weekly
        return int(round(row[freq_column] * 7))
    elif row[unit_column] == 'M':  # Monthly → Weekly (Assume 4.33 weeks/month)
        return int(round(row[freq_column] / 4.33))
    elif row[unit_column] == 'Y':  # Yearly → Weekly (Assume 52 weeks/year)
        return int(round(row[freq_column] / 52))
    elif row[unit_column] == 'W':  # Already Weekly
        return int(row[freq_column])
    else:
        return np.nan  # Handle unknown units

# Apply function to create new column
PA_df['freq_mod_weekly'] = PA_df.apply(lambda row: convert_to_weekly('moderate', row), axis=1)
PA_df['freq_vig_weekly'] = PA_df.apply(lambda row: convert_to_weekly('vigorous', row), axis=1)

PA_df = PA_df.drop(['freq_moderate_act','freq_moderate_unit','min_moderate_act','freq_vigorous_act','freq_vigorous_unit','min_vigorous_act'], axis=1)

PA_df.head()

Unnamed: 0,SEQN,min_sedentary_daily,minutes_mod_weekly,minutes_vig_weekly,freq_mod_weekly,freq_vig_weekly
0,130378,360.0,135.0,135.0,3.0,3.0
1,130379,480.0,180.0,135.0,4.0,3.0
2,130380,240.0,20.0,0.0,1.0,0.0
3,130384,60.0,0.0,0.0,0.0,0.0
4,130385,180.0,630.0,60.0,7.0,1.0


In [10]:
# Classification functions
def classify_aerobic_exercise(moderate_freq, vigorous_freq):
    """Classify aerobic exercise frequency based on SNOMED CT entities."""
    freq = moderate_freq + vigorous_freq
    if freq == 0:
        return "160636006"
    elif freq == 1:
        return "160637002"
    elif freq == 2:
        return "160638007"
    elif freq >= 3:
        return "160639004"
    return ""

def classify_sedentary(minutes_per_day):
    """Classify sedentary behavior based on SNOMED CT entities."""
    if minutes_per_day >= 360:  # More than 6 hours per day
        return "415510005"
    elif 0 <= minutes_per_day < 360: 
        return ""
    return ""

def classify_exercise_level(moderate_minutes, vigorous_minutes):
    """Classify overall exercise level based on WHO guidelines."""
    total_exercise = moderate_minutes + vigorous_minutes
    if total_exercise >= 300:   # Exercise above recommended level
        return "424805008"
    elif total_exercise < 150:  # Exercise below recommended level
        return "413300002"
    elif 150 <= total_exercise < 300: # Physically active code
        return "228447005"
    return ""          # Unknown

# Apply classification functions
PA_df["Sedentary_Behavior_SNOMED"] = PA_df["min_sedentary_daily"].apply(classify_sedentary)
PA_df["Exercise_Level_SNOMED"] = PA_df.apply(lambda row: classify_exercise_level(row["minutes_mod_weekly"], row["minutes_vig_weekly"]), axis=1)
PA_df["Exercise_Freq_SNOMED"] = PA_df.apply(lambda row: classify_aerobic_exercise(row["freq_mod_weekly"], row["freq_vig_weekly"]), axis=1)

# Display the first few rows
PA_df.head()

Unnamed: 0,SEQN,min_sedentary_daily,minutes_mod_weekly,minutes_vig_weekly,freq_mod_weekly,freq_vig_weekly,Sedentary_Behavior_SNOMED,Exercise_Level_SNOMED,Exercise_Freq_SNOMED
0,130378,360.0,135.0,135.0,3.0,3.0,415510005.0,228447005,160639004
1,130379,480.0,180.0,135.0,4.0,3.0,415510005.0,424805008,160639004
2,130380,240.0,20.0,0.0,1.0,0.0,,413300002,160637002
3,130384,60.0,0.0,0.0,0.0,0.0,,413300002,160636006
4,130385,180.0,630.0,60.0,7.0,1.0,,424805008,160639004


In [11]:
PA_df.to_csv("../data/nhanes_data/filtered/nhanes_physicalactivity.csv")

### Physical Activity - Youth Data

In [12]:
PA_youth_df = pd.read_sas("../data/nhanes_data/PhysicalActivity_Youth_Questionnaire.xpt")
PA_youth_df = PA_youth_df[['SEQN', 'PAQ706', 'PAQ711']]
PA_youth_df.columns = ['SEQN', 'days_active', 'hours_sedentary']

PA_youth_df['SEQN'] = PA_youth_df['SEQN'].astype(int)

PA_youth_df['days_active'] = PA_youth_df['days_active'].replace([77, 99], np.nan)
PA_youth_df['hours_sedentary'] = PA_youth_df['hours_sedentary'].replace([77, 99], np.nan)

threshold = 1e-10  # Define a small threshold
PA_youth_df['days_active'] = PA_youth_df['days_active'].mask(PA_youth_df['days_active'].abs() < threshold, 0)
PA_youth_df['hours_sedentary'] = PA_youth_df['hours_sedentary'].mask(PA_youth_df['hours_sedentary'].abs() < threshold, 0)

# Convert to minutes
PA_youth_df['youth_act_weekly'] = PA_youth_df.apply(
    lambda row: row['days_active'] * 60, axis=1
)
PA_youth_df['min_sedentary'] = PA_youth_df.apply(
    lambda row: row['hours_sedentary'] * 60, axis=1
)

# Drop hours column
PA_youth_df = PA_youth_df.drop('hours_sedentary', axis=1)

In [13]:
# Perform mapping
PA_youth_df["Sedentary_Youth_SNOMED"] = PA_youth_df["min_sedentary"].apply(classify_sedentary)
PA_youth_df["Exercise_Level_SNOMED"] = PA_youth_df.apply(lambda row: classify_exercise_level(row["youth_act_weekly"], row["youth_act_weekly"]), axis=1)
PA_youth_df["Exercise_Freq_SNOMED"] = PA_youth_df.apply(lambda row: classify_aerobic_exercise(row["days_active"], 0), axis=1)

PA_youth_df.head()

Unnamed: 0,SEQN,days_active,youth_act_weekly,min_sedentary,Sedentary_Youth_SNOMED,Exercise_Level_SNOMED,Exercise_Freq_SNOMED
0,130381,7.0,420.0,180.0,,424805008.0,160639004.0
1,130382,7.0,420.0,120.0,,424805008.0,160639004.0
2,130383,7.0,420.0,120.0,,424805008.0,160639004.0
3,130403,,,,,,
4,130405,7.0,420.0,180.0,,424805008.0,160639004.0


In [14]:
PA_youth_df.to_csv("../data/nhanes_data/filtered/nhanes_physicalactivity_youth.csv")

### Medical Conditions Data

In [None]:
medcond_df = pd.read_sas("../data/nhanes_data/MedicalConditions_Questionnaire.xpt")

medcond_df = medcond_df[['SEQN','MCQ010','MCQ195','MCQ160B','MCQ160C','MCQ160D','MCQ160E','MCQ160F','MCQ160M','MCQ160P',
                         'MCQ160L','MCQ500','MCQ510A','MCQ510B','MCQ510C','MCQ510D','MCQ510E','MCQ510F','MCQ220','MCQ230A','MCQ230B','MCQ230C']]

medcond_df.columns = ['SEQN', 'asthma_diagnosis','arthritis_diagnosis','congestive_hf_diagnosis','coronheart_disease_diagnosis','angina_diagnosis',
                      'heart_attack_diagnosis', 'stroke_diagnosis','thyroid_diagnosis','copd_diagnosis','liver_cond','liver_cond_youth','fatty_liver_diagnosis',
                      'liver_fibrosis_diagnosis','liver_cirrhosis_diagnosis','viral_hepatitis_diagnosis','autoimmune_hepatitis_diagnosis','other_liver_diagnosis',
                      'cancer_diagnosis','first_cancer_diagnosis','second_cancer_diagnosis','third_cancer_diagnosis']

medcond_df['SEQN'] = medcond_df['SEQN'].astype(int)

# Replacing all the 'refused' 'don't know' answers
medcond_df.replace([7, 9, 77, 99], np.nan, inplace=True)

# Combine liver_cond columns
medcond_df['liver_cond'] = medcond_df['liver_cond'].combine_first(medcond_df['liver_cond_youth'])
medcond_df = medcond_df.drop('liver_cond_youth', axis=1)

medcond_df.shape[0]

11744

##### Blood Pressure and Cholesterol Data

In [16]:
bp_chol_df = pd.read_sas("../data/nhanes_data/BloodPressureCholesterol_Questionnaire.xpt")
bp_chol_df = bp_chol_df[['SEQN', 'BPQ020', 'BPQ080']]
bp_chol_df.columns = ['SEQN', 'high_bp_diagnosis', 'high_chol_diagnosis']

bp_chol_df.replace([7, 9], np.nan, inplace=True)

bp_chol_df.shape[0]

8501

In [17]:
conditions_df = pd.merge(medcond_df, bp_chol_df, on='SEQN', how='outer')
conditions_df.shape[0]

11744

In [18]:
# Define mapping for standard 1/2 encoding
diagnosis_mapping = {
    'asthma_diagnosis': '195967001',
    'arthritis_diagnosis': '3723001',
    'congestive_hf_diagnosis': '42343007',
    'coronheart_disease_diagnosis': '53741008',
    'angina_diagnosis': '194828000',
    'heart_attack_diagnosis': '22298006',
    'stroke_diagnosis': '230690007',
    'thyroid_diagnosis': '14304000',
    'copd_diagnosis': '13645005',
    'liver_cond': '235856003',
    'fatty_liver_diagnosis': '197321007',
    'liver_fibrosis_diagnosis': '62484002',
    'liver_cirrhosis_diagnosis': '19943007',
    'viral_hepatitis_diagnosis': '3738000',
    'autoimmune_hepatitis_diagnosis': '408335007',
    'cancer_diagnosis': '363346000',
    'high_bp_diagnosis': '38341003',
    'high_chol_diagnosis': '13644009'
}

cancer_mapping = {
    10: '399326009',    # Bladder
    11: '129154003',    # Blood/Hematologic Neoplasm
    12: '126537000',     # Bone / Neoplasm of bone 
    13: '126952004',     # Brain /  Neoplasm of brain 
    14: '254837009',    # Breast / Neoplasm of breast
    15: '363354003',    # Cervix / Neoplasm of uterine cervix
    16: '363406005',    # Colon / Neoplasm of colon 
    17: '363402007',    # Esophagus /  Malignant neoplasm of esophagus
    18: '126854002',     # Gallbladder
    19: '126880001',     # Kidney
    20: '363429002',    # Larynx cancer
    21: '93143009',     # Leukemia
    22: '93870000',     # Liver cancer
    23: '363358000',      # Lung
    24: '118599009',    # Lymphoma / Hodgkin's disease
    25: '93655004',    # Melanoma / Skin
    26: '363505006',    # Mouth
    27: '126950007',    # Nervous system
    28: '363443007',    # Ovary
    29: '126859007',    # Pancreas
    30: '399068003',    # Prostate
    31: '363351006',    # Rectal cancer
    32: '372130007',    # Skin cancer
    33: '372130007',    # Skin cancer
    34: '363495004',    # Muscle cancer
    35: '363349007',    # Stomach
    36: '188220005',    # Testis cancer
    37: '363478007',    # Thyroid cancer
    38: '371973000',    # Uterine cancer
}

# Define the 4 columns with numeric diagnosis codes
cancer_diagnosis_columns = ['first_cancer_diagnosis','second_cancer_diagnosis','third_cancer_diagnosis']

# Function to create the diagnosis string
def create_diagnosis_string(row):
    diseases = []
    
    # Process standard 1/2 encoded columns
    for col, code in diagnosis_mapping.items():
        if row[col] == 1:
            diseases.append(code)

    for col in cancer_diagnosis_columns:
        if pd.notna(row[col]) and row[col] in cancer_mapping:
            diseases.append(cancer_mapping[row[col]])  # Map number to code

    return ', '.join(diseases) if diseases else ''  # Return joined string or empty if no diagnosis

# Apply function row-wise
conditions_df['all_diagnosis'] = conditions_df.apply(create_diagnosis_string, axis=1)

conditions_df.head()

Unnamed: 0,SEQN,asthma_diagnosis,arthritis_diagnosis,congestive_hf_diagnosis,coronheart_disease_diagnosis,angina_diagnosis,heart_attack_diagnosis,stroke_diagnosis,thyroid_diagnosis,copd_diagnosis,...,viral_hepatitis_diagnosis,autoimmune_hepatitis_diagnosis,other_liver_diagnosis,cancer_diagnosis,first_cancer_diagnosis,second_cancer_diagnosis,third_cancer_diagnosis,high_bp_diagnosis,high_chol_diagnosis,all_diagnosis
0,130378,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,,,,2.0,,,,1.0,2.0,38341003
1,130379,2.0,,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,,,,1.0,30.0,,,1.0,2.0,"363346000, 38341003, 399068003"
2,130380,2.0,,2.0,2.0,2.0,2.0,2.0,1.0,2.0,...,,,,2.0,,,,2.0,1.0,"14304000, 13644009"
3,130381,2.0,,,,,,,,,...,,,,,,,,,,
4,130382,2.0,,,,,,,,,...,,,,,,,,,,


In [19]:
conditions_df.to_csv("../data/nhanes_data/filtered/nhanes_medconditions.csv")

### Alcohol Use Data

In [20]:
alcohol_use_df = pd.read_sas("../data/nhanes_data/AlcoholUse_Questionnaire.xpt")
alcohol_use_df = alcohol_use_df[['SEQN', 'ALQ121', 'ALQ130', 'ALQ142','ALQ151']]
alcohol_use_df.columns = ['SEQN', 'freq_of_drinking','avg_drinks_daily','days_45_drinks','chronic_drinker']

alcohol_use_df['SEQN'] = alcohol_use_df['SEQN'].astype(int)

alcohol_use_df['freq_of_drinking'] = alcohol_use_df['freq_of_drinking'].replace([77, 99], np.nan)
alcohol_use_df['avg_drinks_daily'] = alcohol_use_df['avg_drinks_daily'].replace([777, 999], np.nan)
alcohol_use_df['days_45_drinks'] = alcohol_use_df['days_45_drinks'].replace([77, 99], np.nan)
alcohol_use_df['chronic_drinker'] = alcohol_use_df['chronic_drinker'].replace([7, 9], np.nan)

threshold = 1e-10  # Define a small value threshold
alcohol_use_df = alcohol_use_df.mask(alcohol_use_df.abs() < threshold, 0)

alcohol_use_df.head()

Unnamed: 0,SEQN,freq_of_drinking,avg_drinks_daily,days_45_drinks,chronic_drinker
0,130378,,,,
1,130379,2.0,3.0,0.0,2.0
2,130380,10.0,1.0,0.0,2.0
3,130386,4.0,2.0,10.0,2.0
4,130387,0.0,,,2.0


In [21]:
# Function to classify drinking behavior
def classify_drinking(row):
    # Heavy Drinker
    if (row["freq_of_drinking"] in [1, 2]  # Drinks every day or nearly every day
        or row["avg_drinks_daily"] >= 4  # Drinks 4+ drinks per day
        or row["days_45_drinks"] <= 3  # Binge drinking 3+ times per week
        or row["chronic_drinker"] == 1):  # Drinks 4/5+ drinks every day
        return "86933000"
    
    # Moderate Drinker
    elif (row["freq_of_drinking"] == 3  # Drinks 3-4 times a week
          or (2 <= row["avg_drinks_daily"] < 4)  # Drinks 1-3 drinks per day
          or row["days_45_drinks"] in [4,5,6]):  # Binge drinks once/twice a week
        return "43783005"
    
    # Light Drinker
    elif (row["freq_of_drinking"] in [4, 5, 6]  # Drinks 1-2 times per week
          or row["avg_drinks_daily"] <= 1  # Drinks <1 drink per day
          or row["days_45_drinks"] >= 7):  # Binge drinks once a month or less
        return "228277002"
    
    # Occasional Drinker
    elif ((row["freq_of_drinking"] >= 7  # Drinks once a month or less
          or row["days_45_drinks"] <= 6)  # No frequent binge drinking
          and (row["chronic_drinker"] != 1)):  # Does not drink 4/5+ drinks daily
        return "228276006"
    
    return ""

# Apply classification function
alcohol_use_df["Alcohol_Consumption_SNOMED"] = alcohol_use_df.apply(classify_drinking, axis=1)

# Display first few rows
alcohol_use_df.head()

Unnamed: 0,SEQN,freq_of_drinking,avg_drinks_daily,days_45_drinks,chronic_drinker,Alcohol_Consumption_SNOMED
0,130378,,,,,
1,130379,2.0,3.0,0.0,2.0,86933000.0
2,130380,10.0,1.0,0.0,2.0,86933000.0
3,130386,4.0,2.0,10.0,2.0,43783005.0
4,130387,0.0,,,2.0,


In [22]:
alcohol_use_df.to_csv("../data/nhanes_data/filtered/nhanes_alchoholuse.csv")