# Feature Engineering

In [1038]:
# imports

import pandas as pd
import numpy as np
import pickle as pkl
import os
import sys

module_path = os.path.abspath(os.path.join('..','utils'))
if module_path not in sys.path:
    sys.path.append(module_path)

import data_utils as du
import importlib
importlib.reload(du) # while developing the module

<module 'data_utils' from '/Users/jack/Repos/apnea-predictor/utils/data_utils.py'>

## Demographics Subset

In [1039]:
# loading

with open('../data/processed/raw_subsets/demographics_subset_raw.pkl', 'rb') as f:
    demographics_df = pkl.load(f)
    
demographics_df.name = 'Demographics Subset - Pre Feature Engineering'
demographics_df.head()


    

Unnamed: 0_level_0,Participant's year of birth,Participant's age,Participant's sex,Height in feet,Height in inches,Weight in pounds,Body mass index (BMI),Participant's ethnicity (hispanic or latino),Participant's ethnicity (sub hispanic or latino origin),Participant's race (main),...,"Self-reported work end time, next shift","Self-reported work start time, 3rd shift","Self-reported work start time, no 3rd shift","Self-reported work end time, 3rd shift",How often change work shifts,Number of people living in your household,Number of children aged 5 years or younger living in your household,Number of children aged 6 - 17 living in your household,Number of adults aged 18 - 59 living in your household,Number of adults aged 60 years of older living in your household
Unnamed: 0_level_1,dem_0100,modified_dem_0110,dem_0500,dem_0600,dem_0610,dem_0700,dem_0800,dem_0900,dem_0910,dem_1000,...,sched_1200,sched_1500,sched_1501,sched_1600,sched_2100,bthbts_0500,bthbts_0510,bthbts_0520,bthbts_0530,bthbts_0540
0,1960.0,58.0,F,5.0,2.0,168.0,30.7,0.0,,1.0,...,,,,,,,,,,
1,1987.0,30.0,F,5.0,7.0,188.0,29.4,0.0,,1.0,...,,,,,,4.0,0.0,0.0,2.0,2.0
2,1988.0,30.0,F,5.0,7.0,165.0,25.8,0.0,,1.0,...,19:00:00,,1.0,,1.0,3.0,0.0,2.0,1.0,0.0
3,1976.0,42.0,M,5.0,4.0,156.0,26.8,0.0,,1.0,...,,,,,,0.0,0.0,0.0,0.0,0.0
4,1982.0,36.0,M,5.0,3.0,255.0,45.2,0.0,,1.0,...,,,,,,3.0,1.0,0.0,2.0,0.0


In [1040]:
du.inspect_structure(demographics_df)

Structure of Demographics Subset - Pre Feature Engineering:
Shape:  1881  rows x  35  columns
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1881 entries, 0 to 1880
Data columns (total 35 columns):
 #   Column                                                                              Non-Null Count  Dtype  
---  ------                                                                              --------------  -----  
 0   (Participant's year of birth, dem_0100)                                             1859 non-null   float64
 1   (Participant's age, modified_dem_0110)                                              1859 non-null   float64
 2   (Participant's sex, dem_0500)                                                       1859 non-null   object 
 3   (Height in feet, dem_0600)                                                          1859 non-null   float64
 4   (Height in inches, dem_0610)                                                        1859 non-null   float64
 5   (We

In [1041]:
display_names = demographics_df.columns.get_level_values(0)

du.make_column_description_table(demographics_df, add_display_names=True, display_names= display_names, show_null_pct=True)

Unnamed: 0,Column Name,Display Name,Data Type,Null Percentage
0,dem_0100,Participant's year of birth,float64,1.17%
1,modified_dem_0110,Participant's age,float64,1.17%
2,dem_0500,Participant's sex,object,1.17%
3,dem_0600,Height in feet,float64,1.17%
4,dem_0610,Height in inches,float64,1.17%
5,dem_0700,Weight in pounds,float64,1.17%
6,dem_0800,Body mass index (BMI),float64,1.17%
7,dem_0900,Participant's ethnicity (hispanic or latino),float64,1.17%
8,dem_0910,Participant's ethnicity (sub hispanic or latin...,float64,94.16%
9,dem_1000,Participant's race (main),float64,1.17%


In [1042]:
edit_demographics_df = demographics_df.copy()

edit_demographics_df = edit_demographics_df.droplevel(1, axis=1)


#reformat column names

edit_demographics_df.columns = edit_demographics_df.columns.str.lower()
edit_demographics_df.columns = edit_demographics_df.columns.str.replace(' - ', '_')
edit_demographics_df.columns = edit_demographics_df.columns.str.replace('?', '')
edit_demographics_df.columns = edit_demographics_df.columns.str.replace(' ', '_')
edit_demographics_df.columns = edit_demographics_df.columns.str.replace('\'', '')
edit_demographics_df.columns = edit_demographics_df.columns.str.replace(',', '')


edit_demographics_df.columns



Index(['participants_year_of_birth', 'participants_age', 'participants_sex',
       'height_in_feet', 'height_in_inches', 'weight_in_pounds',
       'body_mass_index_(bmi)', 'participants_ethnicity_(hispanic_or_latino)',
       'participants_ethnicity_(sub_hispanic_or_latino_origin)',
       'participants_race_(main)', 'participants_race_(sub)',
       'english_as_native_language', 'participants_proficiency_in_english',
       'level_of_school', 'days_per_week_in_school', 'time_school_starts_',
       'time_school_starts_varies', 'time_school_ends',
       'time_school_ends_varies', 'regular_or_irregular_work_schedule',
       'do_you_work_a_split_shift', 'days_per_week_at_work',
       'self-reported_work_start_time_current_shift',
       'self-reported_work_end_time_current_shift',
       'self-reported_work_start_time_next_shift',
       'self-reported_work_end_time_next_shift',
       'self-reported_work_start_time_3rd_shift',
       'self-reported_work_start_time_no_3rd_shift',
  

In [1043]:

edit_demographics_df['day_start_time'] = pd.to_datetime(edit_demographics_df['self-reported_work_start_time_current_shift'], 
                                                        format='%H:%M:%S', errors='coerce')

edit_demographics_df['time_school_starts_'] = pd.to_datetime(edit_demographics_df['time_school_starts_'], 
                                                        format='%H:%M:%S', errors='coerce')

edit_demographics_df['time_school_starts_'] = edit_demographics_df['time_school_starts_'].dt.time

edit_demographics_df['day_start_time'] = edit_demographics_df['day_start_time'].dt.time


print(edit_demographics_df['day_start_time'].isnull().sum())

#add school values

edit_demographics_df['day_start_time'] = edit_demographics_df['day_start_time'].fillna(edit_demographics_df['time_school_starts_'])

    
edit_demographics_df['day_start_time'].head(10)
    
print(edit_demographics_df['day_start_time'].isnull().sum())

edit_demographics_df = edit_demographics_df.drop(columns=['self-reported_work_start_time_current_shift', 'time_school_starts_'])



794
716


In [1044]:
# make varying day start time column


edit_demographics_df['varying_day_start_time'] = edit_demographics_df['regular_or_irregular_work_schedule']

edit_demographics_df['varying_day_start_time'] = edit_demographics_df['varying_day_start_time'].fillna(edit_demographics_df['time_school_starts_varies'])

edit_demographics_df['varying_day_start_time'] = edit_demographics_df['varying_day_start_time'].map({1: 'Yes', 0: 'No'}) 

edit_demographics_df = edit_demographics_df.drop(columns=['regular_or_irregular_work_schedule', 'time_school_starts_varies'])

edit_demographics_df['varying_day_start_time'].value_counts(dropna=False)

varying_day_start_time
No     855
NaN    709
Yes    318
Name: count, dtype: int64

In [1045]:
# make end variables too and utilize the 2nd and 3rd shift ones

edit_demographics_df['day_end_time'] = pd.to_datetime(edit_demographics_df['self-reported_work_end_time_current_shift'], 
                                                      format='%H:%M:%S', errors='coerce')
edit_demographics_df['day_end_time'] = edit_demographics_df['day_end_time'].dt.time

print(edit_demographics_df['day_end_time'].isnull().sum())

edit_demographics_df['day_end_time'] = edit_demographics_df['day_end_time'].fillna(
    pd.to_datetime(edit_demographics_df['self-reported_work_end_time_next_shift'], format='%H:%M:%S', errors='coerce').dt.time)
edit_demographics_df['day_end_time'] = edit_demographics_df['day_end_time'].fillna(
    pd.to_datetime(edit_demographics_df['self-reported_work_end_time_3rd_shift'], format='%H:%M:%S', errors='coerce').dt.time)
print(edit_demographics_df['day_end_time'].isnull().sum())

edit_demographics_df['day_end_time'] = edit_demographics_df['time_school_ends'].combine_first(edit_demographics_df['day_end_time'])

print(edit_demographics_df['day_end_time'].isnull().sum())

edit_demographics_df = edit_demographics_df.drop(columns=['self-reported_work_end_time_current_shift',
                                                        'self-reported_work_end_time_next_shift',
                                                        'self-reported_work_end_time_3rd_shift',
                                                        'time_school_ends', 'time_school_ends_varies'])



795
792
715


In [1046]:
#re assess columns

edit_demographics_df = edit_demographics_df.drop(columns=['do_you_work_a_split_shift', 'self-reported_work_start_time_next_shift', 'self-reported_work_start_time_3rd_shift',
                                                       'how_often_change_work_shifts', 'level_of_school', 'self-reported_work_start_time_no_3rd_shift'  ])
edit_demographics_df.columns

Index(['participants_year_of_birth', 'participants_age', 'participants_sex',
       'height_in_feet', 'height_in_inches', 'weight_in_pounds',
       'body_mass_index_(bmi)', 'participants_ethnicity_(hispanic_or_latino)',
       'participants_ethnicity_(sub_hispanic_or_latino_origin)',
       'participants_race_(main)', 'participants_race_(sub)',
       'english_as_native_language', 'participants_proficiency_in_english',
       'days_per_week_in_school', 'days_per_week_at_work',
       'number_of_people_living_in_your_household',
       'number_of_children_aged_5_years_or_younger_living_in_your_household',
       'number_of_children_aged_6_17_living_in_your_household',
       'number_of_adults_aged_18_59_living_in_your_household',
       'number_of_adults_aged_60_years_of_older_living_in_your_household',
       'day_start_time', 'varying_day_start_time', 'day_end_time'],
      dtype='object')

In [1047]:
edit_demographics_df['days_per_week_at_work_or_school'] = edit_demographics_df['days_per_week_at_work']

edit_demographics_df['days_per_week_at_work_or_school'] = edit_demographics_df['days_per_week_at_work_or_school'].fillna(
    edit_demographics_df['days_per_week_in_school'])
edit_demographics_df = edit_demographics_df.drop(columns=['days_per_week_at_work', 'days_per_week_in_school'])
edit_demographics_df.columns

Index(['participants_year_of_birth', 'participants_age', 'participants_sex',
       'height_in_feet', 'height_in_inches', 'weight_in_pounds',
       'body_mass_index_(bmi)', 'participants_ethnicity_(hispanic_or_latino)',
       'participants_ethnicity_(sub_hispanic_or_latino_origin)',
       'participants_race_(main)', 'participants_race_(sub)',
       'english_as_native_language', 'participants_proficiency_in_english',
       'number_of_people_living_in_your_household',
       'number_of_children_aged_5_years_or_younger_living_in_your_household',
       'number_of_children_aged_6_17_living_in_your_household',
       'number_of_adults_aged_18_59_living_in_your_household',
       'number_of_adults_aged_60_years_of_older_living_in_your_household',
       'day_start_time', 'varying_day_start_time', 'day_end_time',
       'days_per_week_at_work_or_school'],
      dtype='object')

In [1048]:
edit_demographics_df = edit_demographics_df.drop(columns=['participants_ethnicity_(hispanic_or_latino)','participants_ethnicity_(sub_hispanic_or_latino_origin)',
                                                       'participants_race_(main)', 'participants_race_(sub)', 'english_as_native_language' , 'participants_proficiency_in_english' ])

In [1049]:

demos_df_v1 = edit_demographics_df.copy()
demos_df_v1.name = 'Demographics Subset - Post Feature Engineering v1.0'

du.inspect_structure(demos_df_v1)

Structure of Demographics Subset - Post Feature Engineering v1.0:
Shape:  1882  rows x  16  columns
<class 'pandas.core.frame.DataFrame'>
Index: 1882 entries, 0 to Null Percentage
Data columns (total 16 columns):
 #   Column                                                               Non-Null Count  Dtype  
---  ------                                                               --------------  -----  
 0   participants_year_of_birth                                           1860 non-null   float64
 1   participants_age                                                     1860 non-null   float64
 2   participants_sex                                                     1860 non-null   object 
 3   height_in_feet                                                       1860 non-null   float64
 4   height_in_inches                                                     1860 non-null   float64
 5   weight_in_pounds                                                     1860 non-null   float64
 6 

## Health Lifestyle Subset

In [1050]:
with open('../data/processed/raw_subsets/general_health_lifestyle_subset_raw.pkl', 'rb') as f:
    ghl_df = pkl.load(f)
    
ghl_df.name = 'General Health Lifestyle Subset - Pre Feature Engineering'
ghl_df.head()



Unnamed: 0_level_0,Fatigue Severity Scale: Motivation is lower when fatigued,Fatigue Severity Scale: Exercise brings on fatigue,Fatigue Severity Scale: I am easily fatigued,Fatigue Severity Scale: Fatigue interferes with physical functioning,Fatigue Severity Scale: Fatigue causes frequent problems for me,Fatigue Severity Scale: My fatigue prevents sustained physical functioning,Fatigue Severity Scale: Fatigue interferes with carrying out certain duties and responsibilities,Fatigue Severity Scale: Fatigue is among my three most disabling symptoms,"Fatigue Severity Scale: Fatigue interferes with my work, family, or social life",Fatigue Severity Scale: Total Score,...,"Cigarette smoking, time frame","Cigarette smoking, age stopped","Street or recreational drugs consumption, ever","Street or recreational drugs consumption, age started","Street or recreational drugs consumption, age stopped","Cigarette smoking, current smoker","Smokeless user, current smoker","Cigarette smoking, former smoker","Smokeless user, former smoker","Cigarette smoking, never smoker"
Unnamed: 0_level_1,fss_0100,fss_0200,fss_0300,fss_0400,fss_0500,fss_0600,fss_0700,fss_0800,fss_0900,fss_1000,...,soclhx_1310,soclhx_1400,soclhx_1500,soclhx_1700,soclhx_1800,current_cigarette_smoker,current_smokeless_user,former_cigarette_smoker,former_smokeless_user,never_cigarette_smoker
0,,,,,,,,,,,...,,,,,,,,,,
1,7.0,3.0,6.0,6.0,6.0,5.0,6.0,7.0,6.0,52.0,...,,,0.0,,,0.0,0.0,0.0,0.0,1.0
2,7.0,2.0,4.0,4.0,4.0,3.0,4.0,4.0,4.0,36.0,...,,,0.0,,,0.0,0.0,0.0,0.0,1.0
3,4.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,12.0,...,0.0,24.0,0.0,,,0.0,0.0,1.0,0.0,0.0
4,7.0,2.0,3.0,4.0,2.0,2.0,2.0,2.0,3.0,27.0,...,0.0,24.0,0.0,,,0.0,0.0,1.0,0.0,0.0


In [1051]:
display_names = ghl_df.columns.get_level_values(0)

with pd.option_context('display.max_rows', 25):
    display(du.make_column_description_table(ghl_df, add_display_names=True, display_names= display_names, show_null_pct=True))
#NOTE: change the git ignore to keep the folders where the pickels are for easier copying. nothign to do with this section just worth mentioning



Unnamed: 0,Column Name,Display Name,Data Type,Null Percentage
0,fss_0100,Fatigue Severity Scale: Motivation is lower wh...,float64,6.27%
1,fss_0200,Fatigue Severity Scale: Exercise brings on fat...,float64,6.27%
2,fss_0300,Fatigue Severity Scale: I am easily fatigued,float64,6.27%
3,fss_0400,Fatigue Severity Scale: Fatigue interferes wit...,float64,6.27%
4,fss_0500,Fatigue Severity Scale: Fatigue causes frequen...,float64,6.27%
...,...,...,...,...
85,current_cigarette_smoker,"Cigarette smoking, current smoker",float64,10.89%
86,current_smokeless_user,"Smokeless user, current smoker",float64,10.89%
87,former_cigarette_smoker,"Cigarette smoking, former smoker",float64,10.89%
88,former_smokeless_user,"Smokeless user, former smoker",float64,10.89%


In [1052]:
# column reformatting again
edit_ghl_df = ghl_df.copy()

edit_ghl_df = edit_ghl_df.droplevel(1, axis=1)


#reformat column names

edit_ghl_df.columns = edit_ghl_df.columns.str.lower()
edit_ghl_df.columns = edit_ghl_df.columns.str.replace(' - ', '_')
edit_ghl_df.columns = edit_ghl_df.columns.str.replace('?', '')
edit_ghl_df.columns = edit_ghl_df.columns.str.replace(' ', '_')
edit_ghl_df.columns = edit_ghl_df.columns.str.replace('\'', '')
edit_ghl_df.columns = edit_ghl_df.columns.str.replace(',', '')
edit_ghl_df.columns = edit_ghl_df.columns.str.replace(':', '')


display(edit_ghl_df.columns)


Index(['fatigue_severity_scale_motivation_is_lower_when_fatigued',
       'fatigue_severity_scale_exercise_brings_on_fatigue',
       'fatigue_severity_scale_i_am_easily_fatigued',
       'fatigue_severity_scale_fatigue_interferes_with_physical_functioning',
       'fatigue_severity_scale_fatigue_causes_frequent_problems_for_me',
       'fatigue_severity_scale_my_fatigue_prevents_sustained_physical_functioning',
       'fatigue_severity_scale_fatigue_interferes_with_carrying_out_certain_duties_and_responsibilities',
       'fatigue_severity_scale_fatigue_is_among_my_three_most_disabling_symptoms',
       'fatigue_severity_scale_fatigue_interferes_with_my_work_family_or_social_life',
       'fatigue_severity_scale_total_score',
       'generalized_anxiety_disorder-7_questionnaire_feeling_nervous_anxious_or_on_edge_',
       'generalized_anxiety_disorder-7_questionnaire_not_being_able_to_stop_or_control_worrying_',
       'generalized_anxiety_disorder-7_questionnaire_worrying_too_much_ab

In [1053]:
# dropping columns I don't believe to be important 

edit_ghl_df = edit_ghl_df.drop(columns=[
       'usual_additional_meal/snack_time1',
       'usually_no_additional_meal/snack1',
       'usual_additional_meal/snack_time2',
       'usually_no_additional_meal/snack2',
       'usual_additional_meal/snack_time3',
       'usually_no_additional_meal/snack3',
       'usual_additional_meal/snack_time4',
       'usually_no_additional_meal/snack4',
       'usual_additional_meal/snack_time5',
       'usually_no_additional_meal/snack5',
        'percentage_of_snack_2_among_all_food_intake_over_24_hours',
       'percentage_of_snack_3_among_all_food_intake_over_24_hours',
       'percentage_of_snack_4_among_all_food_intake_over_24_hours',
       'percentage_of_snack_5_among_all_food_intake_over_24_hours',
       'percentage_of_breakfast_among_all_food_intake_over_24_hours',
       'percentage_of_lunch_among_all_food_intake_over_24_hours',
       'percentage_of_dinner_among_all_food_intake_over_24_hours',
       'percentage_of_snack_1_among_all_food_intake_over_24_hours',
       'food_intake_no_regular_meals'
])

display(edit_ghl_df.columns)


Index(['fatigue_severity_scale_motivation_is_lower_when_fatigued',
       'fatigue_severity_scale_exercise_brings_on_fatigue',
       'fatigue_severity_scale_i_am_easily_fatigued',
       'fatigue_severity_scale_fatigue_interferes_with_physical_functioning',
       'fatigue_severity_scale_fatigue_causes_frequent_problems_for_me',
       'fatigue_severity_scale_my_fatigue_prevents_sustained_physical_functioning',
       'fatigue_severity_scale_fatigue_interferes_with_carrying_out_certain_duties_and_responsibilities',
       'fatigue_severity_scale_fatigue_is_among_my_three_most_disabling_symptoms',
       'fatigue_severity_scale_fatigue_interferes_with_my_work_family_or_social_life',
       'fatigue_severity_scale_total_score',
       'generalized_anxiety_disorder-7_questionnaire_feeling_nervous_anxious_or_on_edge_',
       'generalized_anxiety_disorder-7_questionnaire_not_being_able_to_stop_or_control_worrying_',
       'generalized_anxiety_disorder-7_questionnaire_worrying_too_much_ab

In [1054]:
# further subsetting on surveys 

fss_df = edit_ghl_df.filter(like='fatigue_severity_scale_', axis=1)
gad7_df = edit_ghl_df.filter(like='generalized_anxiety_disorder-7_', axis=1)
phq9_df = edit_ghl_df.filter(like='patient_health_questionnaire_9_', axis=1)
nose_df = edit_ghl_df[['nasal_congestion_or_stuffiness', 'nasal_blockage_or_obstruction',
       'trouble_breathing_through_nose', 'trouble_sleeping',
       'unable_to_get_enough_air_through_nose_during_exercise_or_exertion',
       'nose_total_score']]

edit2_ghl_df = edit_ghl_df.drop(columns=fss_df.columns.tolist() + gad7_df.columns.tolist() + phq9_df.columns.tolist() + nose_df.columns.tolist())

In [1055]:
edit2_ghl_df = edit2_ghl_df.rename(columns={'dry_and/or_irritated_eyes' : 'dry_and_or_irritated_eyes_days_per_week'}) 

In [1056]:
du.inspect_structure(edit2_ghl_df)

Structure of DataFrame:
Shape:  1882  rows x  37  columns
<class 'pandas.core.frame.DataFrame'>
Index: 1882 entries, 0 to Null Percentage
Data columns (total 37 columns):
 #   Column                                                  Non-Null Count  Dtype  
---  ------                                                  --------------  -----  
 0   heartburn_or_belching_after_going_to_bed_days_per_week  1738 non-null   float64
 1   perspire_heavily_during_the_night_days_per_week         1738 non-null   float64
 2   dry_and_or_irritated_eyes_days_per_week                 1738 non-null   float64
 3   routinely_travel_to_other_time_zones                    1648 non-null   float64
 4   eating_impact_on_alertness/wakefulness                  1595 non-null   float64
 5   feel_more_alert_if_skip_lunch                           1596 non-null   float64
 6   sleep_less_soundly_if_skip_dinner                       1597 non-null   float64
 7   self-perception_of_weight                               160

In [1057]:
# clean up some things

ghl_df_v1 = edit2_ghl_df.copy()
ghl_df_v1.name = 'General Health Lifestyle Subset - Post Feature Engineering v1.0'

fss_df_v1 = fss_df.copy()
fss_df_v1.name = 'Fatigue Severity Scale Subset - Post Feature Engineering v1.0'

gad7_df_v1 = gad7_df.copy()
gad7_df_v1.name = 'Generalized Anxiety Disorder-7 Subset - Post Feature Engineering v1.0'

phq9_df_v1 = phq9_df.copy()
phq9_df_v1.name = 'Patient Health Questionnaire-9 Subset - Post Feature Engineering v1.0'

nose_df_v1 = nose_df.copy()
nose_df_v1.name = 'Nasal Obstruction Symptom Evaluation Subset - Post Feature Engineering v1.0'


## Medical History Subset

In [1058]:
with open('../data/processed/raw_subsets/medical_history_subset_raw.pkl', 'rb') as f:
    mdxh_df  = pkl.load(f)
    
mdxh_df.name = 'Medical History Subset - Pre Feature Engineering'
mdxh_df.head()

Unnamed: 0_level_0,Family History of Insomnia,Family History of Sleep Apnea,Family History of Narcolepsy,Family History of Restless Leg Syndrome,Family History of Other Sleep Disorder,Family History of Sleepwalking,Family History of Fibromyalgia or Chronic Fatigue,Family History of Depression,Family History of Anxiety,Family History of Other Psychiatric Illness,...,Hypercholesterolemia: Self-reported,Type 2 Diabetes: Self-reported,Endocrine or Metabolic Problem: Self-reported,Urologic or Kidney Problem: Self-reported,Dialysis: Self-reported,Pain or Fatigue: Self-reported,Psychiatric or Mental Health Problem: Self-reported,"Medical Problem or Surgery, other: Self-reported",Genetic Testing: Self-reported,"Genetic Testing, source: Self-reported"
Unnamed: 0_level_1,famhx_0100,famhx_0200,famhx_0300,famhx_0400,famhx_0500,famhx_0600,famhx_0700,famhx_0800,famhx_0900,famhx_1000,...,mdhx_6300,mdhx_6310,mdhx_6320,mdhx_6400,mdhx_6420,mdhx_6500,mdhx_6600,mdhx_6700,mdhx_6900,mdhx_6910
0,,,,,,,,,,,...,,,,,,,,,,
1,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,1.0,,1.0,1.0,1.0,0.0,
2,-55.0,0.0,0.0,0.0,-55.0,0.0,-55.0,-55.0,-55.0,-55.0,...,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,
3,-55.0,-55.0,0.0,1.0,-55.0,-55.0,-55.0,-55.0,1.0,-55.0,...,0.0,0.0,0.0,0.0,,1.0,1.0,0.0,0.0,
4,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,,0.0,1.0,0.0,0.0,


In [1059]:
edit_mdxh_df = mdxh_df.copy()

edit_mdxh_df = edit_mdxh_df.droplevel(1, axis=1)

#reformat column names

edit_mdxh_df.columns = edit_mdxh_df.columns.str.lower()
edit_mdxh_df.columns = edit_mdxh_df.columns.str.replace(' - ', '_')
edit_mdxh_df.columns = edit_mdxh_df.columns.str.replace('?', '')
edit_mdxh_df.columns = edit_mdxh_df.columns.str.replace(' ', '_')
edit_mdxh_df.columns = edit_mdxh_df.columns.str.replace('\'', '')
edit_mdxh_df.columns = edit_mdxh_df.columns.str.replace(',', '')
edit_mdxh_df.columns = edit_mdxh_df.columns.str.replace(':', '')
display(edit_mdxh_df.columns)

Index(['family_history_of_insomnia', 'family_history_of_sleep_apnea',
       'family_history_of_narcolepsy',
       'family_history_of_restless_leg_syndrome',
       'family_history_of_other_sleep_disorder',
       'family_history_of_sleepwalking',
       'family_history_of_fibromyalgia_or_chronic_fatigue',
       'family_history_of_depression', 'family_history_of_anxiety',
       'family_history_of_other_psychiatric_illness',
       'family_history_of_psychiatric_treatment',
       'family_history_of_death_during_sleep',
       'number_of_full_siblings_from_the_same_birth_parents',
       'pregnancy_current', 'menopausal_status',
       'oophorectomy_bilateral_self-reported', 'hypertension_self-reported',
       'congestive_heart_failure_self-reported',
       'cardiovascular_problem_other_self-reported', 'asthma_self-reported',
       'chronic_obstructive_pulmonary_disease_self-reported',
       'pulmonary_problem_other_self-reported',
       'allergies_or_sinus_problems_self-reporte

In [1060]:
edit_mdxh_df = edit_mdxh_df.drop(columns=['genetic_testing_source_self-reported', 'dialysis_self-reported', 'dentures_removed_while_sleeping_self-reported'])
#extremely high null count                                          
    

In [1061]:
du.inspect_structure(edit_mdxh_df)

Structure of DataFrame:
Shape:  1881  rows x  37  columns
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1881 entries, 0 to 1880
Data columns (total 37 columns):
 #   Column                                                      Non-Null Count  Dtype  
---  ------                                                      --------------  -----  
 0   family_history_of_insomnia                                  1684 non-null   float64
 1   family_history_of_sleep_apnea                               1699 non-null   float64
 2   family_history_of_narcolepsy                                1660 non-null   float64
 3   family_history_of_restless_leg_syndrome                     1673 non-null   float64
 4   family_history_of_other_sleep_disorder                      1646 non-null   float64
 5   family_history_of_sleepwalking                              1669 non-null   float64
 6   family_history_of_fibromyalgia_or_chronic_fatigue           1622 non-null   float64
 7   family_history_of_depression 

In [None]:
mdhx_df_v1 = edit_mdxh_df.copy()
mdhx_df_v1.name = 'Medical History Subset - Post Feature Engineering v1.0'

## Questionnares Pt. 1