# Feature Engineering

In [617]:
# imports

import pandas as pd
import numpy as np
import pickle as pkl
import os
import sys

module_path = os.path.abspath(os.path.join('..','utils'))
if module_path not in sys.path:
    sys.path.append(module_path)

import data_utils as du
import importlib
importlib.reload(du) # while developing the module

<module 'data_utils' from '/Users/jack/Repos/apnea-predictor/utils/data_utils.py'>

## Demographics Subset

In [618]:
# loading

with open('../data/processed/raw_subsets/demographics_subset_raw.pkl', 'rb') as f:
    demographics_df = pkl.load(f)
    
demographics_df.name = 'Demographics Subset - Pre Feature Engineering'
demographics_df.head()


    

Unnamed: 0_level_0,Participant's year of birth,Participant's age,Participant's sex,Height in feet,Height in inches,Weight in pounds,Body mass index (BMI),Participant's ethnicity (hispanic or latino),Participant's ethnicity (sub hispanic or latino origin),Participant's race (main),...,"Self-reported work end time, next shift","Self-reported work start time, 3rd shift","Self-reported work start time, no 3rd shift","Self-reported work end time, 3rd shift",How often change work shifts,Number of people living in your household,Number of children aged 5 years or younger living in your household,Number of children aged 6 - 17 living in your household,Number of adults aged 18 - 59 living in your household,Number of adults aged 60 years of older living in your household
Unnamed: 0_level_1,dem_0100,modified_dem_0110,dem_0500,dem_0600,dem_0610,dem_0700,dem_0800,dem_0900,dem_0910,dem_1000,...,sched_1200,sched_1500,sched_1501,sched_1600,sched_2100,bthbts_0500,bthbts_0510,bthbts_0520,bthbts_0530,bthbts_0540
0,1960.0,58.0,F,5.0,2.0,168.0,30.7,0.0,,1.0,...,,,,,,,,,,
1,1987.0,30.0,F,5.0,7.0,188.0,29.4,0.0,,1.0,...,,,,,,4.0,0.0,0.0,2.0,2.0
2,1988.0,30.0,F,5.0,7.0,165.0,25.8,0.0,,1.0,...,19:00:00,,1.0,,1.0,3.0,0.0,2.0,1.0,0.0
3,1976.0,42.0,M,5.0,4.0,156.0,26.8,0.0,,1.0,...,,,,,,0.0,0.0,0.0,0.0,0.0
4,1982.0,36.0,M,5.0,3.0,255.0,45.2,0.0,,1.0,...,,,,,,3.0,1.0,0.0,2.0,0.0


In [619]:
du.inspect_structure(demographics_df)

Structure of Demographics Subset - Pre Feature Engineering:
Shape:  1881  rows x  35  columns
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1881 entries, 0 to 1880
Data columns (total 35 columns):
 #   Column                                                                              Non-Null Count  Dtype  
---  ------                                                                              --------------  -----  
 0   (Participant's year of birth, dem_0100)                                             1859 non-null   float64
 1   (Participant's age, modified_dem_0110)                                              1859 non-null   float64
 2   (Participant's sex, dem_0500)                                                       1859 non-null   object 
 3   (Height in feet, dem_0600)                                                          1859 non-null   float64
 4   (Height in inches, dem_0610)                                                        1859 non-null   float64
 5   (We

In [620]:
display_names = demographics_df.columns.get_level_values(0)

du.make_column_description_table(demographics_df, add_display_names=True, display_names= display_names, show_null_pct=True)

Unnamed: 0,Column Name,Display Name,Data Type,Null Percentage
0,dem_0100,Participant's year of birth,float64,1.17%
1,modified_dem_0110,Participant's age,float64,1.17%
2,dem_0500,Participant's sex,object,1.17%
3,dem_0600,Height in feet,float64,1.17%
4,dem_0610,Height in inches,float64,1.17%
5,dem_0700,Weight in pounds,float64,1.17%
6,dem_0800,Body mass index (BMI),float64,1.17%
7,dem_0900,Participant's ethnicity (hispanic or latino),float64,1.17%
8,dem_0910,Participant's ethnicity (sub hispanic or latin...,float64,94.16%
9,dem_1000,Participant's race (main),float64,1.17%


In [621]:
edit_demographics_df = demographics_df.copy()

edit_demographics_df = edit_demographics_df.droplevel(1, axis=1)


#reformat column names

edit_demographics_df.columns = edit_demographics_df.columns.str.lower()
edit_demographics_df.columns = edit_demographics_df.columns.str.replace(' - ', '_')
edit_demographics_df.columns = edit_demographics_df.columns.str.replace('?', '')
edit_demographics_df.columns = edit_demographics_df.columns.str.replace(' ', '_')
edit_demographics_df.columns = edit_demographics_df.columns.str.replace('\'', '')
edit_demographics_df.columns = edit_demographics_df.columns.str.replace(',', '')


edit_demographics_df.columns



Index(['participants_year_of_birth', 'participants_age', 'participants_sex',
       'height_in_feet', 'height_in_inches', 'weight_in_pounds',
       'body_mass_index_(bmi)', 'participants_ethnicity_(hispanic_or_latino)',
       'participants_ethnicity_(sub_hispanic_or_latino_origin)',
       'participants_race_(main)', 'participants_race_(sub)',
       'english_as_native_language', 'participants_proficiency_in_english',
       'level_of_school', 'days_per_week_in_school', 'time_school_starts_',
       'time_school_starts_varies', 'time_school_ends',
       'time_school_ends_varies', 'regular_or_irregular_work_schedule',
       'do_you_work_a_split_shift', 'days_per_week_at_work',
       'self-reported_work_start_time_current_shift',
       'self-reported_work_end_time_current_shift',
       'self-reported_work_start_time_next_shift',
       'self-reported_work_end_time_next_shift',
       'self-reported_work_start_time_3rd_shift',
       'self-reported_work_start_time_no_3rd_shift',
  

In [622]:

edit_demographics_df['day_start_time'] = pd.to_datetime(edit_demographics_df['self-reported_work_start_time_current_shift'], 
                                                        format='%H:%M:%S', errors='coerce')

edit_demographics_df['time_school_starts_'] = pd.to_datetime(edit_demographics_df['time_school_starts_'], 
                                                        format='%H:%M:%S', errors='coerce')

edit_demographics_df['time_school_starts_'] = edit_demographics_df['time_school_starts_'].dt.time

edit_demographics_df['day_start_time'] = edit_demographics_df['day_start_time'].dt.time


print(edit_demographics_df['day_start_time'].isnull().sum())

#add school values

edit_demographics_df['day_start_time'] = edit_demographics_df['day_start_time'].fillna(edit_demographics_df['time_school_starts_'])

    
edit_demographics_df['day_start_time'].head(10)
    
print(edit_demographics_df['day_start_time'].isnull().sum())

edit_demographics_df = edit_demographics_df.drop(columns=['self-reported_work_start_time_current_shift', 'time_school_starts_'])



794
716


In [623]:
# make varying day start time column


edit_demographics_df['varying_day_start_time'] = edit_demographics_df['regular_or_irregular_work_schedule']

edit_demographics_df['varying_day_start_time'] = edit_demographics_df['varying_day_start_time'].fillna(edit_demographics_df['time_school_starts_varies'])

edit_demographics_df['varying_day_start_time'] = edit_demographics_df['varying_day_start_time'].map({1: 'Yes', 0: 'No'}) 

edit_demographics_df = edit_demographics_df.drop(columns=['regular_or_irregular_work_schedule', 'time_school_starts_varies'])

edit_demographics_df['varying_day_start_time'].value_counts(dropna=False)

varying_day_start_time
No     855
NaN    709
Yes    318
Name: count, dtype: int64

In [624]:
# make end variables too and utilize the 2nd and 3rd shift ones

edit_demographics_df['day_end_time'] = pd.to_datetime(edit_demographics_df['self-reported_work_end_time_current_shift'], 
                                                      format='%H:%M:%S', errors='coerce')
edit_demographics_df['day_end_time'] = edit_demographics_df['day_end_time'].dt.time

print(edit_demographics_df['day_end_time'].isnull().sum())

edit_demographics_df['day_end_time'] = edit_demographics_df['day_end_time'].fillna(
    pd.to_datetime(edit_demographics_df['self-reported_work_end_time_next_shift'], format='%H:%M:%S', errors='coerce').dt.time)
edit_demographics_df['day_end_time'] = edit_demographics_df['day_end_time'].fillna(
    pd.to_datetime(edit_demographics_df['self-reported_work_end_time_3rd_shift'], format='%H:%M:%S', errors='coerce').dt.time)
print(edit_demographics_df['day_end_time'].isnull().sum())

edit_demographics_df['day_end_time'] = edit_demographics_df['time_school_ends'].combine_first(edit_demographics_df['day_end_time'])

print(edit_demographics_df['day_end_time'].isnull().sum())

edit_demographics_df = edit_demographics_df.drop(columns=['self-reported_work_end_time_current_shift',
                                                        'self-reported_work_end_time_next_shift',
                                                        'self-reported_work_end_time_3rd_shift',
                                                        'time_school_ends', 'time_school_ends_varies'])



795
792
715


In [625]:
#re assess columns

edit_demographics_df = edit_demographics_df.drop(columns=['do_you_work_a_split_shift', 'self-reported_work_start_time_next_shift', 'self-reported_work_start_time_3rd_shift',
                                                       'how_often_change_work_shifts', 'level_of_school', 'self-reported_work_start_time_no_3rd_shift'  ])
edit_demographics_df.columns

Index(['participants_year_of_birth', 'participants_age', 'participants_sex',
       'height_in_feet', 'height_in_inches', 'weight_in_pounds',
       'body_mass_index_(bmi)', 'participants_ethnicity_(hispanic_or_latino)',
       'participants_ethnicity_(sub_hispanic_or_latino_origin)',
       'participants_race_(main)', 'participants_race_(sub)',
       'english_as_native_language', 'participants_proficiency_in_english',
       'days_per_week_in_school', 'days_per_week_at_work',
       'number_of_people_living_in_your_household',
       'number_of_children_aged_5_years_or_younger_living_in_your_household',
       'number_of_children_aged_6_17_living_in_your_household',
       'number_of_adults_aged_18_59_living_in_your_household',
       'number_of_adults_aged_60_years_of_older_living_in_your_household',
       'day_start_time', 'varying_day_start_time', 'day_end_time'],
      dtype='object')

In [626]:
edit_demographics_df['days_per_week_at_work_or_school'] = edit_demographics_df['days_per_week_at_work']

edit_demographics_df['days_per_week_at_work_or_school'] = edit_demographics_df['days_per_week_at_work_or_school'].fillna(
    edit_demographics_df['days_per_week_in_school'])
edit_demographics_df = edit_demographics_df.drop(columns=['days_per_week_at_work', 'days_per_week_in_school'])
edit_demographics_df.columns

Index(['participants_year_of_birth', 'participants_age', 'participants_sex',
       'height_in_feet', 'height_in_inches', 'weight_in_pounds',
       'body_mass_index_(bmi)', 'participants_ethnicity_(hispanic_or_latino)',
       'participants_ethnicity_(sub_hispanic_or_latino_origin)',
       'participants_race_(main)', 'participants_race_(sub)',
       'english_as_native_language', 'participants_proficiency_in_english',
       'number_of_people_living_in_your_household',
       'number_of_children_aged_5_years_or_younger_living_in_your_household',
       'number_of_children_aged_6_17_living_in_your_household',
       'number_of_adults_aged_18_59_living_in_your_household',
       'number_of_adults_aged_60_years_of_older_living_in_your_household',
       'day_start_time', 'varying_day_start_time', 'day_end_time',
       'days_per_week_at_work_or_school'],
      dtype='object')

In [627]:
edit_demographics_df = edit_demographics_df.drop(columns=['participants_ethnicity_(hispanic_or_latino)','participants_ethnicity_(sub_hispanic_or_latino_origin)',
                                                       'participants_race_(main)', 'participants_race_(sub)', 'english_as_native_language' , 'participants_proficiency_in_english' ])

In [628]:
du.inspect_structure(edit_demographics_df)

Structure of DataFrame:
Shape:  1882  rows x  16  columns
<class 'pandas.core.frame.DataFrame'>
Index: 1882 entries, 0 to Null Percentage
Data columns (total 16 columns):
 #   Column                                                               Non-Null Count  Dtype  
---  ------                                                               --------------  -----  
 0   participants_year_of_birth                                           1860 non-null   float64
 1   participants_age                                                     1860 non-null   float64
 2   participants_sex                                                     1860 non-null   object 
 3   height_in_feet                                                       1860 non-null   float64
 4   height_in_inches                                                     1860 non-null   float64
 5   weight_in_pounds                                                     1860 non-null   float64
 6   body_mass_index_(bmi)                   