# Data Preparation Notebook for the STAGES Dataset
Dataset provided by the National Sleep Research Resource (NSRR).

CURRENTLY WORKING ON: combining flagged columns from subsets

In [285]:
# libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.preprocessing import LabelEncoder
import os
import sys


In [286]:
# settings

# pandas: never truncate columns or rows when a frame is displayed
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# plotting: seaborn grid style first, then the matplotlib size/font defaults
sns.set_style('whitegrid')
plt.rcParams.update({
    'figure.figsize': (10, 6),
    'axes.titlesize': 16,
    'axes.labelsize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
    'legend.fontsize': 12,
    'figure.titlesize': 18,
})

# fixed seed for NumPy's global random generator
np.random.seed(42)


In [287]:
# function import

# Put the sibling `utils` directory (one level above this notebook) on the
# import path, unless it is already there.
module_path = os.path.abspath(os.path.join(os.pardir, 'utils'))
if module_path not in sys.path:
    sys.path.append(module_path)

import importlib
import data_utils

# Reload on every run so edits made to data_utils while developing are picked up.
importlib.reload(data_utils)

<module 'data_utils' from '/Users/jack/Repos/apnea-predictor/utils/data_utils.py'>

## Loading Data

In [288]:
# Read the main STAGES table and its harmonized companion table.
raw_stages_df, raw_stages_harmonized_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/stages/datasets/stages-dataset-0.3.0.csv',
        '../data/stages/datasets/stages-harmonized-dataset-0.3.0.csv',
    )
)

# Summary of the main table, followed by its first rows as the cell output.
raw_stages_df.info()
raw_stages_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1881 entries, 0 to 1880
Columns: 433 entries, modified_created_at to visitcode
dtypes: float64(402), int64(1), object(30)
memory usage: 6.2+ MB


Unnamed: 0,modified_created_at,modified_completed,subject_code,dem_0100,modified_dem_0110,dem_0500,dem_0600,dem_0610,dem_0700,dem_0800,dem_0900,dem_0910,dem_1000,dem_1010,dem_1100,dem_1120,mdhx_0200,mdhx_0400,mdhx_0700,mdhx_0800,mdhx_1200,mdhx_1300,mdhx_1400,mdhx_5500,mdhx_5600,mdhx_5700,mdhx_5710,mdhx_5720,mdhx_5800,mdhx_5810,mdhx_5820,mdhx_5900,mdhx_5910,mdhx_5920,mdhx_5950,mdhx_6000,mdhx_6030,mdhx_6100,mdhx_6200,mdhx_6300,mdhx_6310,mdhx_6320,mdhx_6400,mdhx_6420,mdhx_6500,mdhx_6600,mdhx_6700,mdhx_6900,mdhx_6910,pap_0100,pap_0200,pap_0300,pap_0400,pap_0500,pap_0600,pap_0700,pap_0800,pap_1200,pap_1300,pap_1400,pap_1600,pap_1700,pap_1800,pap_1900,pap_2100,nose_0100,nose_0200,nose_0300,nose_0400,nose_0500,nose_0600,famhx_0100,famhx_0200,famhx_0300,famhx_0400,famhx_0500,famhx_0600,famhx_0700,famhx_0800,famhx_0900,famhx_1000,famhx_1100,famhx_1200,famhx_1300,bthbts_0100,bthbts_0300,bthbts_0500,bthbts_0510,bthbts_0520,bthbts_0530,bthbts_0540,sched_0100,sched_0200,sched_0300,sched_0301,sched_0400,sched_0401,sched_0500,sched_0510,sched_0600,sched_0700,sched_0800,sched_0900,sched_0901,sched_1000,sched_1001,sched_1100,sched_1200,sched_1300,sched_1301,sched_1400,sched_1500,sched_1501,sched_1600,sched_1700,sched_1800,sched_1900,sched_1901,sched_2000,sched_2001,sched_2100,sched_2200,sched_2210,sched_2300,sched_2310,sched_2400,sched_2500,sched_2510,sched_2600,sched_2700,sched_2710,sched_2800,sched_2810,sched_2900,sched_3000,sched_3010,sched_3100,sched_3200,sched_3210,sched_3300,sched_3310,sched_3400,sched_3500,sched_3510,sched_3600,sched_3700,sched_3710,sched_3800,sched_3810,sched_3900,sched_4000,sched_4010,sched_4100,sched_4200,sched_4201,sched_4210,soclhx_0100,soclhx_0110,soclhx_0200,soclhx_0210,soclhx_0300,soclhx_0400,soclhx_0501,soclhx_0520,soclhx_0600,soclhx_0700,soclhx_0701,soclhx_0710,soclhx_0730,soclhx_0800,soclhx_0900,soclhx_0901,soclhx_1000,soclhx_1200,soclhx_1300,soclhx_1310,soclhx_1400,soclhx_1500,soclhx_1700,soclhx_1800,map_0100,map_0200,map_0300,map_0400,map_0500,ma
p_0600,map_0700,map_0800,map_0900,map_1000,map_1010,map_1020,map_1030,map_1040,map_1041,map_1100,map_1110,map_1120,map_1130,map_1131,index_1,index_3,index_4,map_lr,score,osa_0100,osa_0200,osa_0300,ess_0100,ess_0200,ess_0300,ess_0400,ess_0500,ess_0600,ess_0700,ess_0800,ess_0900,slpy_0101,slpy_0100,slpy_0110,slpy_0201,slpy_0200,slpy_0210,slpy_0301,slpy_0300,slpy_0310,slpy_0400,slpy_0410,isi_0100,isi_0200,isi_0300,isi_0400,isi_0500,isi_0600,isi_0700,isi_score,tab_0100,tab_0200,tab_0300,tab_0400,tab_0500,tab_0600,tab_0700,tab_0800,tab_0900,tab_1000,isq_0100,isq_0110,isq_0120,isq_0200,isq_0210,isq_0220,isq_0300,isq_0310,isq_0320,isq_0400,isq_0410,isq_0420,isq_0500,isq_0510,isq_0520,isq_0600,isq_0700,isq_0800,isq_0900,isq_1000,isq_1100,isq_1200,isq_1300,isq_score,rls_0100,rls_0200,rls_0310,rls_0400,rls_0410,rls_0500,rls_0510,rls_0600,rls_0610,rls_0700,rls_0710,cir_0100,cir_0200,cir_0300,cir_0400,cir_0500,cir_0600,cir_0700,narc_0050,narc_0100,narc_0110,narc_0200,narc_0210,narc_0300,narc_0310,narc_0400,narc_0410,narc_0500,narc_0510,narc_0600,narc_0700,narc_0800,narc_0900,narc_1000,narc_1100,narc_1200,narc_1300,narc_1400,narc_1500,narc_1600,narc_1610,narc_1650,narc_1700,narc_1701,narc_1900,narc_2000,narc_2100,narc_2110,narc_2200,par_0100,par_0101,par_0110,par_0200,par_0201,par_0210,par_0230,par_0300,par_0301,par_0310,par_0400,par_0500,par_0501,par_0510,par_0530,par_0531,par_0600,par_0601,par_0610,par_0630,par_0631,par_0700,par_0701,par_0710,par_0800,par_0900,par_0901,par_0910,fosq_0100,fosq_0200,fosq_0300,fosq_0400,fosq_0500,fosq_0600,fosq_0700,fosq_0800,fosq_0900,fosq_1000,fosq_1100,phq_0100,phq_0200,phq_0300,phq_0400,phq_0500,phq_0600,phq_0700,phq_0800,phq_0900,phq_1000,gad_0100,gad_0200,gad_0300,gad_0400,gad_0500,gad_0600,gad_0700,gad_0800,fss_0100,fss_0200,fss_0300,fss_0400,fss_0500,fss_0600,fss_0700,fss_0800,fss_0900,fss_1000,diet_0300,diet_0310,diet_0320,diet_0330,diet_0340,diet_0350,diet_0360,diet_0370,diet_0380,diet_0400,diet_0500,diet_0600,diet_0700,diet_0800,diet_0
801,diet_0810,diet_0811,diet_0820,diet_0821,diet_0830,diet_0831,diet_0840,diet_0841,diet_0850,diet_0851,diet_0860,diet_0861,diet_0870,diet_0871,rls_0300,rls_0800,rls_0801,rls_0900,rls_0910,rls_probability,rls_severity,sched_1401,sched_1701,sched_1801,soclhx_0101,narc_1710,never_cigarette_smoker,former_cigarette_smoker,former_smokeless_user,current_cigarette_smoker,current_smokeless_user,visitcode
0,28SEP18:17:25:32,,BOGN00002,1960.0,58.0,F,5.0,2.0,168.0,30.7,0.0,,1.0,0.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1
1,12OCT18:08:07:19,12OCT18:08:48:00,BOGN00004,1987.0,30.0,F,5.0,7.0,188.0,29.4,0.0,,1.0,0.0,1.0,,1.0,,,,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,1.0,,1.0,1.0,1.0,0.0,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,1.0,0.0,5.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,4.0,0.0,0.0,2.0,2.0,,,,,,,0.0,0.0,,08:00:00,17:00:00,22:30:00,,06:15:00,,,,,,,,,,,,22:30:00,1.0,09:30:00,1.0,,0.0,30.0,6.0,30.0,1.0,0.0,10.0,3.0,,,,,,,,,,,,,,,,,0.0,30.0,9.0,0.0,1.0,0.0,10.0,3.0,,1.0,,,,,,,,0.0,0.0,4.0,2.0,0.0,1.0,3.0,0.0,3.0,0.0,17:00:00,,,,,0.0,,,1.0,0.0,0.0,0.0,0.0,0.0,3.0,1.0,0.0,0.0,,,,,0.0,0.0,,,,0.0,0.333333,1.0,0.0,0.666977,0.095419,1.0,3.0,1.0,1.0,1.0,0.0,1.0,2.0,0.0,0.0,0.0,5.0,0.0,5.0,0.0,0.0,3.0,1.0,1.0,,,0.0,,1.0,0.0,0.0,3.0,4.0,2.0,4.0,14.0,2.0,2.0,3.0,3.0,1.0,3.0,0.0,1.0,0.0,1.0,3.0,5.0,2.0,1.0,10.0,2.0,0.0,,,0.0,,,4.0,10.0,2.0,4.0,3.0,3.0,3.0,4.0,4.0,4.0,3.0,1.0,0.0,0.0,,,,,,,,,,0.0,3.0,1.0,3.0,3.0,2.0,12.0,0.0,0.0,,0.0,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,,,,,1.0,,,1.0,,,1.0,,0.0,0.0,,1.0,,,,,1.0,,,,,1.0,,0.0,1.0,,1.0,1.0,2.0,3.0,2.0,1.0,1.0,2.0,1.0,1.0,2.0,8.0,2.0,1.0,1.0,3.0,0.0,0.0,1.0,0.0,0.0,8.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,2.0,7.0,3.0,6.0,6.0,6.0,5.0,6.0,7.0,6.0,52.0,30.0,20.0,20.0,30.0,,,,,,2.0,-44.0,-44.0,2.0,,,,,,,,,,,,,,,,,,,,,,Unlikely,,,,,,0.0,1.0,0.0,0.0,0.0,0.0,1
2,08MAR19:07:35:09,08MAR19:07:55:00,BOGN00007,1988.0,30.0,F,5.0,7.0,165.0,25.8,0.0,,1.0,1.0,1.0,,1.0,,,,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,4.0,0.0,20.0,-55.0,0.0,0.0,0.0,-55.0,0.0,-55.0,-55.0,-55.0,-55.0,-55.0,0.0,1.0,0.0,1.0,3.0,0.0,2.0,1.0,0.0,,,,,,,1.0,0.0,3.0,07:00:00,19:00:00,21:00:00,,06:00:00,,07:00:00,19:00:00,21:00:00,,06:00:00,,1.0,,,,22:00:00,,07:00:00,,1.0,2.0,0.0,4.0,0.0,3.0,1.0,0.0,1.0,1.0,0.0,4.0,0.0,3.0,2.0,0.0,1.0,,,,,,,,-66.0,2.0,0.0,5.0,0.0,3.0,1.0,0.0,1.0,8.0,,0.0,,,,,,,0.0,0.0,1.0,2.0,0.0,1.0,10.0,0.0,1.0,0.0,07:00:00,,,,,0.0,,,0.0,4.0,0.0,0.0,4.0,0.0,4.0,2.0,0.0,0.0,,,,,0.0,0.0,,,,0.0,0.0,1.33333,0.0,0.666977,0.04767,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,7.0,1.0,1.0,,,1.0,,,,,4.0,4.0,4.0,4.0,3.0,3.0,4.0,26.0,2.0,2.0,4.0,4.0,1.0,1.0,2.0,4.0,2.0,2.0,4.0,6.0,2.0,4.0,6.0,2.0,4.0,6.0,2.0,4.0,6.0,2.0,4.0,6.0,2.0,4.0,2.0,2.0,3.0,4.0,4.0,4.0,4.0,1.0,0.0,0.0,,,,,,,,,,0.0,3.0,2.0,5.0,4.0,2.0,16.0,0.0,0.0,,0.0,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,,,,,1.0,,,1.0,,,1.0,,0.0,-55.0,,1.0,,,,,1.0,,,,,1.0,,0.0,,-55.0,,1.0,2.0,4.0,4.0,3.0,3.0,3.0,3.0,3.0,3.0,14.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,2.0,1.0,0.0,0.0,2.0,0.0,2.0,0.0,5.0,7.0,2.0,4.0,4.0,4.0,3.0,4.0,4.0,4.0,36.0,20.0,40.0,20.0,20.0,,,,,,2.0,-44.0,-44.0,1.0,,,,,,,,,,,,,,,,,,,,,,Unlikely,,,,,,0.0,1.0,0.0,0.0,0.0,0.0,1
3,18OCT18:16:06:54,18OCT18:16:43:00,BOGN00008,1976.0,42.0,M,5.0,4.0,156.0,26.8,0.0,,1.0,4.0,1.0,,1.0,,,,,,,1.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,1.0,0.0,0.0,0.0,0.0,0.0,,1.0,1.0,0.0,0.0,,,,,,,,,,,,,,,,,,1.0,0.0,0.0,3.0,0.0,20.0,-55.0,-55.0,0.0,1.0,-55.0,-55.0,-55.0,-55.0,1.0,-55.0,-55.0,-55.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,7.0,14:00:00,1.0,20:00:00,1.0,0.0,0.0,5.0,06:00:00,08:00:00,22:00:00,,04:30:00,,,,,,,,,,,,22:00:00,,07:00:00,,,2.0,0.0,4.0,0.0,3.0,1.0,0.0,2.0,,,,,,,,,,,,,,,,,1.0,0.0,4.0,0.0,2.0,0.0,30.0,2.0,,1.0,,,,,,,,0.0,0.0,0.0,3.0,0.0,0.0,2.0,1.0,3.0,0.0,08:30:00,10.0,100.0,0.0,24.0,0.0,,,2.0,3.0,0.0,0.0,4.0,0.0,2.0,2.0,0.0,1.0,1.0,2.0,3.0,,-55.0,3.0,5.0,2.0,,-55.0,0.666667,0.666667,2.0,0.666977,0.309199,3.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,4.0,0.0,2.0,1.0,1.0,,,0.0,2.0,1.0,0.0,,2.0,1.0,0.0,3.0,2.0,1.0,2.0,11.0,3.0,0.0,4.0,1.0,0.0,3.0,0.0,2.0,2.0,0.0,3.0,11.0,2.0,2.0,11.0,2.0,3.0,4.0,2.0,4.0,11.0,2.0,4.0,11.0,2.0,2.0,0.0,0.0,2.0,3.0,1.0,1.0,2.0,1.0,2.0,2.0,,,,,,,,,,0.0,4.0,1.0,3.0,4.0,6.0,17.0,0.0,0.0,,0.0,,,,,,,,,,,,,,,,,,,,,,0.0,1.0,3.0,,,,,-55.0,,,1.0,,,,1.0,,2.0,2.0,,1.0,31.0,,,-55.0,,,,5.0,,2.0,-55.0,,-55.0,,3.0,3.0,4.0,4.0,3.0,4.0,3.0,2.0,2.0,4.0,16.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,3.0,0.0,6.0,3.0,1.0,1.0,3.0,3.0,2.0,0.0,13.0,4.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,12.0,,,,,44.0,,,,,2.0,-44.0,0.0,1.0,,,,,,,,,,,,,,,,,1.0,,,,,Unlikely,,,,,,0.0,0.0,1.0,0.0,0.0,0.0,1
4,01MAR19:11:55:20,01MAR19:12:23:00,BOGN00009,1982.0,36.0,M,5.0,3.0,255.0,45.2,0.0,,1.0,1.0,1.0,,1.0,0.0,7.0,7.0,,,,0.0,,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,1.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,4.0,4.0,3.0,3.0,1.0,0.0,3.0,1.0,1.0,1.0,0.0,30.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,3.0,0.0,3.0,1.0,0.0,2.0,0.0,2.0,5.0,,1.0,,1.0,0.0,0.0,5.0,08:15:00,17:30:00,22:00:00,,06:00:00,,,,,,,,,,,,23:00:00,,06:30:00,,,0.0,15.0,6.0,0.0,3.0,0.0,0.0,3.0,,,,,,,,,,,,,,,,,0.0,15.0,6.0,0.0,3.0,0.0,0.0,3.0,6.0,,30.0,,,,,,,1.0,,,2.0,0.0,0.0,1.0,0.0,2.0,0.0,14:00:00,16.0,8.0,0.0,24.0,0.0,,,0.0,3.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0,0.0,,,,,0.0,0.0,,,,0.0,0.0,0.333333,0.0,0.666977,0.837208,0.0,0.0,0.0,2.0,2.0,0.0,3.0,1.0,0.0,0.0,0.0,8.0,0.0,2.0,1.0,1.0,,,1.0,,,0.0,,0.0,2.0,1.0,3.0,1.0,1.0,1.0,9.0,2.0,2.0,1.0,2.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,2.0,1.0,2.0,2.0,2.0,3.0,5.0,2.0,1.0,6.0,1.0,1.0,4.0,1.0,2.0,1.0,1.0,1.0,3.0,3.0,3.0,3.0,0.0,0.0,0.0,,,,,,,,,,0.0,4.0,2.0,4.0,5.0,6.0,20.0,0.0,0.0,,0.0,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,,,,,1.0,,,1.0,,,,1.0,,0.0,,1.0,,,,,1.0,,,,,1.0,,0.0,,1.0,,2.0,3.0,4.0,4.0,4.0,3.0,3.0,2.0,2.0,2.0,15.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,5.0,1.0,1.0,1.0,1.0,1.0,2.0,0.0,7.0,7.0,2.0,3.0,4.0,2.0,2.0,2.0,2.0,3.0,27.0,10.0,25.0,40.0,25.0,,,,,,2.0,0.0,0.0,2.0,,,,,,,,,,,,,,,,,,,,,,Unlikely,,,,,,0.0,0.0,1.0,0.0,0.0,0.0,1


In [289]:
# Overview of the harmonized table: column summary, then the first rows as the cell output.
raw_stages_harmonized_df.info()
raw_stages_harmonized_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1881 entries, 0 to 1880
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   subject_code         1881 non-null   object 
 1   visitcode            1881 non-null   int64  
 2   nsrr_age             1859 non-null   float64
 3   nsrr_age_gt89        1881 non-null   object 
 4   nsrr_sex             1859 non-null   object 
 5   nsrr_race            1859 non-null   object 
 6   nsrr_ethnicity       1881 non-null   object 
 7   nsrr_bmi             1859 non-null   float64
 8   nsrr_current_smoker  1881 non-null   object 
 9   nsrr_ever_smoker     1881 non-null   object 
dtypes: float64(2), int64(1), object(7)
memory usage: 147.1+ KB


Unnamed: 0,subject_code,visitcode,nsrr_age,nsrr_age_gt89,nsrr_sex,nsrr_race,nsrr_ethnicity,nsrr_bmi,nsrr_current_smoker,nsrr_ever_smoker
0,BOGN00002,1,58.0,no,female,white,not hispanic or latino,30.7,not reported,not reported
1,BOGN00004,1,30.0,no,female,white,not hispanic or latino,29.4,no,no
2,BOGN00007,1,30.0,no,female,white,not hispanic or latino,25.8,no,no
3,BOGN00008,1,42.0,no,male,white,not hispanic or latino,26.8,no,yes
4,BOGN00009,1,36.0,no,male,white,not hispanic or latino,45.2,no,yes


# Stages Data Preparation

In [290]:
# The raw dataset is very wide, so it is first carved into column subsets
# that follow the groupings of the data dictionary.

# --- demographics ---

# Candidate columns, selected by the name fragment they contain.
dem_cols = [name for name in raw_stages_df.columns if 'dem' in name]
sched_cols = [name for name in raw_stages_df.columns if 'sched' in name]
bthbts_cols = [name for name in raw_stages_df.columns if 'bthbts' in name]

# Columns that contain "sched" but, per the data dictionary, belong to the
# sleep questionnaire subset.
sleep_disturbance_sched_cols = [
    'sched_2400', 'sched_2500', 'sched_2510', 'sched_2600',
    'sched_2900', 'sched_3000', 'sched_3010', 'sched_3100',
    'sched_3400', 'sched_3500', 'sched_3510', 'sched_3600',
    'sched_3900', 'sched_4000', 'sched_4010', 'sched_4100',
    'sched_4200', 'sched_4201', 'sched_4210',
]

sleep_habits_sched_cols = [
    'sched_0900', 'sched_0901', 'sched_1000', 'sched_1001',
    'sched_1300', 'sched_1301', 'sched_1400', 'sched_1401',
    'sched_1700', 'sched_1701', 'sched_1800', 'sched_1801',
    'sched_1900', 'sched_1901', 'sched_2000', 'sched_2001',
    'sched_2200', 'sched_2210', 'sched_2300', 'sched_2310',
    'sched_2700', 'sched_2710', 'sched_2800', 'sched_2810',
    'sched_3200', 'sched_3210', 'sched_3300', 'sched_3310',
    'sched_3700', 'sched_3710', 'sched_3800', 'sched_3810',
]

# "bthbts" columns that are assigned to other subsets.
sleep_habits_bthbts_cols = ['bthbts_0300']
lifestyle_bthbts_cols = ['bthbts_0100']

# Whatever remains of the sched / bthbts columns stays with the demographics.
dem_sched_cols = [
    name for name in sched_cols
    if not (name in sleep_disturbance_sched_cols or name in sleep_habits_sched_cols)
]

dem_bthbts_cols = [
    name for name in bthbts_cols
    if not (name in sleep_habits_bthbts_cols or name in lifestyle_bthbts_cols)
]

demos = raw_stages_df[[*dem_cols, *dem_sched_cols, *dem_bthbts_cols]]
demos.info()  # per the data dictionary, 17 sched columns are expected here


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1881 entries, 0 to 1880
Data columns (total 35 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   dem_0100           1859 non-null   float64
 1   modified_dem_0110  1859 non-null   float64
 2   dem_0500           1859 non-null   object 
 3   dem_0600           1859 non-null   float64
 4   dem_0610           1859 non-null   float64
 5   dem_0700           1859 non-null   float64
 6   dem_0800           1859 non-null   float64
 7   dem_0900           1859 non-null   float64
 8   dem_0910           109 non-null    float64
 9   dem_1000           1859 non-null   float64
 10  dem_1010           1755 non-null   float64
 11  dem_1100           1810 non-null   float64
 12  dem_1120           203 non-null    float64
 13  sched_0100         184 non-null    float64
 14  sched_0200         171 non-null    float64
 15  sched_0300         126 non-null    object 
 16  sched_0301         88 no

In [291]:
#demo cols from harmonized dataset

#TODO : add harmonized demo cols here

In [292]:
# --- general health and lifestyle ---

# Candidate columns, selected by the name fragment they contain.
fss_cols = [name for name in raw_stages_df.columns if 'fss' in name]
gad_cols = [name for name in raw_stages_df.columns if 'gad' in name]
phq_cols = [name for name in raw_stages_df.columns if 'phq' in name]
nose_cols = [name for name in raw_stages_df.columns if 'nose' in name]
osa_cols = [name for name in raw_stages_df.columns if 'osa' in name]
cir_cols = [name for name in raw_stages_df.columns if 'cir' in name]
diet_cols = [name for name in raw_stages_df.columns if 'diet' in name]
soclhx_cols = [name for name in raw_stages_df.columns if 'soclhx' in name]

# soclhx / cir columns that are assigned to the sleep questionnaire subset instead.
sleep_habits_soclhx_cols = [
    'soclhx_0100', 'soclhx_0101', 'soclhx_0110', 'soclhx_0200',
    'soclhx_0210', 'soclhx_0300', 'soclhx_0400', 'soclhx_0800',
]

sleep_quest_cir_cols = [
    'cir_0200', 'cir_0300', 'cir_0400',
    'cir_0500', 'cir_0600', 'cir_0700',
]

# TODO: add / check smoking cols from harmonized dataset

# Smoking status columns share no common name fragment, so they are listed by hand.
smoking_cols = [
    'current_cigarette_smoker',
    'current_smokeless_user',
    'former_cigarette_smoker',
    'former_smokeless_user',
    'never_cigarette_smoker',
]

# Keep only the soclhx / cir columns that were not claimed above.
lifestyle_solchsx_cols = [name for name in soclhx_cols if not (name in sleep_habits_soclhx_cols)]
lifestyle_cir_cols = [name for name in cir_cols if not (name in sleep_quest_cir_cols)]

general_health_lifestyle = raw_stages_df[[
    *fss_cols, *gad_cols, *phq_cols,
    *nose_cols, *osa_cols, *lifestyle_cir_cols, *diet_cols, *lifestyle_bthbts_cols,
    *lifestyle_solchsx_cols, *smoking_cols,
]]

general_health_lifestyle.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1881 entries, 0 to 1880
Data columns (total 90 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   fss_0100                  1763 non-null   float64
 1   fss_0200                  1763 non-null   float64
 2   fss_0300                  1763 non-null   float64
 3   fss_0400                  1763 non-null   float64
 4   fss_0500                  1763 non-null   float64
 5   fss_0600                  1763 non-null   float64
 6   fss_0700                  1763 non-null   float64
 7   fss_0800                  1763 non-null   float64
 8   fss_0900                  1763 non-null   float64
 9   fss_1000                  1763 non-null   float64
 10  gad_0100                  1764 non-null   float64
 11  gad_0200                  1764 non-null   float64
 12  gad_0300                  1764 non-null   float64
 13  gad_0400                  1764 non-null   float64
 14  gad_0500

In [293]:
# --- medical history ---

famhx_cols = [name for name in raw_stages_df.columns if 'famhx' in name]
mdhx_cols = [name for name in raw_stages_df.columns if 'mdhx' in name]

# mdhx columns that are assigned to the sleep questionnaire / sleep treatment subsets.
sleep_quest_mdhx_cols = ['mdhx_0200', 'mdhx_5500', 'mdhx_5600']
sleep_treatment_mdhx_cols = ['mdhx_0700', 'mdhx_0800', 'mdhx_0400']

# The remaining mdhx columns make up the medical history proper.
med_history_mdhx_cols = [
    name for name in mdhx_cols
    if not (name in sleep_quest_mdhx_cols or name in sleep_treatment_mdhx_cols)
]

medical_history = raw_stages_df[[*famhx_cols, *med_history_mdhx_cols]]
medical_history.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1881 entries, 0 to 1880
Data columns (total 40 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   famhx_0100  1684 non-null   float64
 1   famhx_0200  1699 non-null   float64
 2   famhx_0300  1660 non-null   float64
 3   famhx_0400  1673 non-null   float64
 4   famhx_0500  1646 non-null   float64
 5   famhx_0600  1669 non-null   float64
 6   famhx_0700  1622 non-null   float64
 7   famhx_0800  1636 non-null   float64
 8   famhx_0900  1632 non-null   float64
 9   famhx_1000  1600 non-null   float64
 10  famhx_1100  1627 non-null   float64
 11  famhx_1200  1617 non-null   float64
 12  famhx_1300  1763 non-null   float64
 13  mdhx_1200   924 non-null    float64
 14  mdhx_1300   911 non-null    float64
 15  mdhx_1400   919 non-null    float64
 16  mdhx_5700   1793 non-null   float64
 17  mdhx_5710   1793 non-null   float64
 18  mdhx_5720   1793 non-null   float64
 19  mdhx_5800   1793 non-null  

In [294]:
# Sleep Questionnaires

# Each questionnaire is selected by the PREFIX of its column names.
# A plain substring match over-selects here: 'ess' also occurs in
# 'former_smokeless_user' / 'current_smokeless_user' and 'isi' occurs in
# 'visitcode', which previously had to be removed again by hand.
# Matching on the prefix keeps those columns out in the first place.
ess_cols = [col for col in raw_stages_df.columns if col.startswith('ess')]
map_cols = [col for col in raw_stages_df.columns if col.startswith('map')]
narc_cols = [col for col in raw_stages_df.columns if col.startswith('narc')]
par_cols = [col for col in raw_stages_df.columns if col.startswith('par')]
slpy_cols = [col for col in raw_stages_df.columns if col.startswith('slpy')]
rls_cols = [col for col in raw_stages_df.columns if col.startswith('rls')]
fosq_cols = [col for col in raw_stages_df.columns if col.startswith('fosq')]
isi_cols = [col for col in raw_stages_df.columns if col.startswith('isi')]
isq_cols = [col for col in raw_stages_df.columns if col.startswith('isq')]
tab_cols = [col for col in raw_stages_df.columns if col.startswith('tab')]

# Score / index columns without a questionnaire prefix, listed by hand.
index_cols = ['index_1', 'index_3', 'index_4', 'score']

# Columns set aside for this subset while building the earlier subsets.
sleep_quest_sched_cols = sleep_disturbance_sched_cols + sleep_habits_sched_cols

sleep_quest_others = (sleep_quest_cir_cols + sleep_quest_mdhx_cols + sleep_habits_bthbts_cols + sleep_habits_soclhx_cols)

print(sleep_quest_others)

sleep_questionnaires = raw_stages_df[ess_cols + map_cols + narc_cols + par_cols
                                    + slpy_cols + rls_cols + fosq_cols + isi_cols
                                    + isq_cols + tab_cols + index_cols
                                    + sleep_quest_others + sleep_quest_sched_cols
                                    ]

sleep_questionnaires.info()


['cir_0200', 'cir_0300', 'cir_0400', 'cir_0500', 'cir_0600', 'cir_0700', 'mdhx_0200', 'mdhx_5500', 'mdhx_5600', 'bthbts_0300', 'soclhx_0100', 'soclhx_0101', 'soclhx_0110', 'soclhx_0200', 'soclhx_0210', 'soclhx_0300', 'soclhx_0400', 'soclhx_0800']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1881 entries, 0 to 1880
Columns: 245 entries, ess_0100 to sched_3810
dtypes: float64(236), object(9)
memory usage: 3.5+ MB


In [295]:
# --- sleep treatment ---

# PAP columns plus the mdhx columns set aside for this subset.
pap_cols = [name for name in raw_stages_df.columns if 'pap' in name]

sleep_treatment = raw_stages_df[[*pap_cols, *sleep_treatment_mdhx_cols]]
sleep_treatment.info()

print(sleep_treatment.columns.tolist())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1881 entries, 0 to 1880
Data columns (total 19 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   pap_0100   125 non-null    float64
 1   pap_0200   124 non-null    float64
 2   pap_0300   124 non-null    float64
 3   pap_0400   124 non-null    float64
 4   pap_0500   123 non-null    float64
 5   pap_0600   122 non-null    float64
 6   pap_0700   124 non-null    float64
 7   pap_0800   71 non-null     float64
 8   pap_1200   70 non-null     float64
 9   pap_1300   123 non-null    float64
 10  pap_1400   120 non-null    float64
 11  pap_1600   121 non-null    float64
 12  pap_1700   121 non-null    float64
 13  pap_1800   121 non-null    float64
 14  pap_1900   122 non-null    float64
 15  pap_2100   122 non-null    float64
 16  mdhx_0700  121 non-null    float64
 17  mdhx_0800  116 non-null    float64
 18  mdhx_0400  135 non-null    float64
dtypes: float64(19)
memory usage: 279.3 KB
['pap_0100

In [296]:
# Coverage check: which raw columns ended up in none of the subsets?

all_columns = [
    column
    for subset in (demos, general_health_lifestyle, medical_history,
                   sleep_questionnaires, sleep_treatment)
    for column in subset.columns.tolist()
]

print(f'Total columns in raw dataset: {raw_stages_df.shape[1]}')
print(f'Total columns in subsets: {len(all_columns)}')

# In the recorded run the only unassigned columns are the four bookkeeping ones
# (subject_code, visitcode, modified_created_at, modified_completed).
missing_cols = set(raw_stages_df.columns).difference(all_columns)
print(f'Missing columns: {missing_cols}')

Total columns in raw dataset: 433
Total columns in subsets: 429
Missing columns: {'subject_code', 'modified_completed', 'visitcode', 'modified_created_at'}


In [297]:
# Split sleep_questionnaires into two smaller, roughly equal subsets.

# Part 1: ESS, MAP, narcolepsy, parasomnia, sleepiness, index/score and RLS columns.
# NOTE (from the original author): the MAP group covers index_1, index_3, index_4 and score.
sq_p1_cols = [*ess_cols, *map_cols, *narc_cols, *par_cols, *slpy_cols, *index_cols, *rls_cols]

# Part 2: everything else that belongs to the sleep questionnaires.
sq_p2_cols = [*fosq_cols, *isi_cols, *isq_cols, *tab_cols, *sleep_quest_others, *sleep_quest_sched_cols]

sleep_questionnaires_p1 = raw_stages_df[sq_p1_cols]
sleep_questionnaires_p2 = raw_stages_df[sq_p2_cols]

sleep_questionnaires_p1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1881 entries, 0 to 1880
Columns: 123 entries, ess_0100 to rls_severity
dtypes: float64(122), object(1)
memory usage: 1.8+ MB


In [298]:
# Summary of the second half of the sleep questionnaire columns.
sleep_questionnaires_p2.info() 

# the sleep questionnaires are now split into a 123-column part (p1) and a 122-column part (p2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1881 entries, 0 to 1880
Columns: 122 entries, fosq_0100 to sched_3810
dtypes: float64(114), object(8)
memory usage: 1.8+ MB


In [299]:
# Load the variable-level data dictionary and summarize it.
data_dict = pd.read_csv('../data/stages/datasets/stages-data-dictionary-0.3.0-variables.csv')
data_dict.info()

# Lookup table: variable id -> human-readable display name.
display_names = dict(zip(data_dict['id'], data_dict['display_name']))


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 441 entries, 0 to 440
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   folder         441 non-null    object 
 1   id             441 non-null    object 
 2   display_name   441 non-null    object 
 3   description    365 non-null    object 
 4   type           441 non-null    object 
 5   units          107 non-null    object 
 6   domain         301 non-null    object 
 7   labels         295 non-null    object 
 8   calculation    24 non-null     object 
 9   commonly_used  15 non-null     object 
 10  forms          0 non-null      float64
dtypes: float64(1), object(10)
memory usage: 38.0+ KB


## Demographics


In [300]:
# Disabled demographics exploration, kept for reference. The cell consists of a
# single string literal, so none of the statements inside it are executed; the
# notebook only echoes the text back as the cell output.
'''demos.name = 'demographics'
copy_demos = demos.copy()

demo_display_names = {k: v for k, v in display_names.items() if k in copy_demos.columns}

copy_demos = data_utils.add_multi_index(demos, display_names)

copy_demos.head()


    '''

"demos.name = 'demographics'\ncopy_demos = demos.copy()\n\ndemo_display_names = {k: v for k, v in display_names.items() if k in copy_demos.columns}\n\ncopy_demos = data_utils.add_multi_index(demos, display_names)\n\ncopy_demos.head()\n\n\n    "

In [301]:
# Disabled call, kept as a string literal (not executed).
'''data_utils.inspect_structure(copy_demos)'''

'data_utils.inspect_structure(copy_demos)'

In [302]:
# Disabled duplicate check, kept as a string literal (not executed).
'''# check dupes and nulls

data_utils.check_duplicates(copy_demos) # these don't seem to be actually duplicates just lots of nulls 
'''

"# check dupes and nulls\n\ndata_utils.check_duplicates(copy_demos) # these don't seem to be actually duplicates just lots of nulls \n"

In [303]:
# Disabled null check, kept as a string literal (not executed).
'''data_utils.check_nulls(copy_demos) # lot with > 80% nulls'''

'data_utils.check_nulls(copy_demos) # lot with > 80% nulls'

In [304]:
# Disabled high-null flagging for the demographics subset, kept as a string
# literal (not executed).
'''# flagging columns with high null percentages

flagged_demos = data_utils.flag_high_nulls(copy_demos, threshold=0.8)

non_flagged_demos = copy_demos.drop(columns=flagged_demos)

non_flagged_demos.info()
'''

'# flagging columns with high null percentages\n\nflagged_demos = data_utils.flag_high_nulls(copy_demos, threshold=0.8)\n\nnon_flagged_demos = copy_demos.drop(columns=flagged_demos)\n\nnon_flagged_demos.info()\n'

## General Health & Lifestyle

In [None]:
# NOTE: export_column_description_table was changed to make_column_description_table with return_md as a parameter.
# The block below is disabled: it is a string literal, so nothing inside it is executed.
'''copy_lifestyle = general_health_lifestyle.copy()
copy_lifestyle.name = 'general_health_lifestyle'

lifestyle_display_names = {k: v for k, v in display_names.items() if k in copy_lifestyle.columns}
#data_utils.export_column_description_table(copy_lifestyle, lifestyle_display_names, "lifestyle_column_description.md")
copy_lifestyle = data_utils.add_multi_index(copy_lifestyle, lifestyle_display_names)

copy_lifestyle.head()'''

'copy_lifestyle = general_health_lifestyle.copy()\ncopy_lifestyle.name = \'general_health_lifestyle\'\n\nlifestyle_display_names = {k: v for k, v in display_names.items() if k in copy_lifestyle.columns}\n#data_utils.export_column_description_table(copy_lifestyle, lifestyle_display_names, "lifestyle_column_description.md")\ncopy_lifestyle = data_utils.add_multi_index(copy_lifestyle, lifestyle_display_names)\n\ncopy_lifestyle.head()'

## Brief Subset Analysis

In [306]:
# testing

# Working copies of every subset, so the frames sliced from the raw data stay as they are.
copy_demos = demos.copy()
copy_lifestyle = general_health_lifestyle.copy()
copy_mdx_history = medical_history.copy()
copy_sq_h1 = sleep_questionnaires_p1.copy()
copy_sq_h2 = sleep_questionnaires_p2.copy()
copy_slp_treat = sleep_treatment.copy()

subsets = [copy_demos, copy_lifestyle, copy_mdx_history, copy_sq_h1, copy_sq_h2, copy_slp_treat]

# Attach a readable label to each working copy, in the same order as `subsets`.
subset_labels = [
    'demographics',
    'general_health_lifestyle',
    'medical_history',
    'sleep_questionnaires_part_1',
    'sleep_questionnaires_part_2',
    'sleep_treatment',
]
for df, subset_label in zip(subsets, subset_labels):
    df.name = subset_label

# Add the display names from the data dictionary to each subset's columns.
for df in subsets:
    temp_display_names = {
        col_id: label
        for col_id, label in display_names.items()
        if col_id in df.columns
    }
    # NOTE(review): the return value is only bound to the loop variable and is
    # dropped on the next iteration. The recorded output shows tuple column
    # labels, so add_multi_index presumably modifies the frame in place -
    # confirm against data_utils.
    df = data_utils.add_multi_index(df, temp_display_names)

# Print the structure of every subset, separated by blank lines.
for df in subsets:
    data_utils.inspect_structure(df)
    print('\n\n')


Structure of demographics:
Shape:  1881  rows x  35  columns
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1881 entries, 0 to 1880
Data columns (total 35 columns):
 #   Column                                                                              Non-Null Count  Dtype  
---  ------                                                                              --------------  -----  
 0   (Participant's year of birth, dem_0100)                                             1859 non-null   float64
 1   (Participant's age, modified_dem_0110)                                              1859 non-null   float64
 2   (Participant's sex, dem_0500)                                                       1859 non-null   object 
 3   (Height in feet, dem_0600)                                                          1859 non-null   float64
 4   (Height in inches, dem_0610)                                                        1859 non-null   float64
 5   (Weight in pounds, dem_0700)        

In [307]:
# Run the null-value report for every working subset.
for df in subsets:
    data_utils.check_nulls(df)


 Dataset - Null Values Check:
                                                                      Null Count  \
Participant's year of birth                        dem_0100                   22   
Participant's age                                  modified_dem_0110          22   
Participant's sex                                  dem_0500                   22   
Height in feet                                     dem_0600                   22   
Height in inches                                   dem_0610                   22   
Weight in pounds                                   dem_0700                   22   
Body mass index (BMI)                              dem_0800                   22   
Participant's ethnicity (hispanic or latino)       dem_0900                   22   
Participant's ethnicity (sub hispanic or latino... dem_0910                 1772   
Participant's race (main)                          dem_1000                   22   
Participant's race (sub)                     

In [308]:
# One high-null report per subset (threshold 0.8), in the same order as `subsets`.
df_list = [
    data_utils.flag_high_nulls(df, threshold=0.8, return_df=True)
    for df in subsets
]


In [309]:
df_list[1] # flagged high-null columns of the 2nd subset (general_health_lifestyle)

Unnamed: 0,Column Name,Column Description,Null Percentage,Index
0,diet_0340,Food intake - No regular meals,0.842637,"(Food intake - No regular meals, diet_0340)"
1,diet_0800,Usual breakfast time,0.845295,"(Usual breakfast time, diet_0800)"
2,diet_0810,Usual lunch time,0.82403,"(Usual lunch time, diet_0810)"
3,diet_0820,Usual dinner time,0.809676,"(Usual dinner time, diet_0820)"
4,diet_0830,Usual additional meal/snack time1,0.846358,"(Usual additional meal/snack time1, diet_0830)"
5,diet_0840,Usual additional meal/snack time2,0.917597,"(Usual additional meal/snack time2, diet_0840)"
6,diet_0850,Usual additional meal/snack time3,0.971292,"(Usual additional meal/snack time3, diet_0850)"
7,diet_0860,Usual additional meal/snack time4,0.993089,"(Usual additional meal/snack time4, diet_0860)"
8,diet_0870,Usual additional meal/snack time5,0.996279,"(Usual additional meal/snack time5, diet_0870)"
9,soclhx_1700,"Street or recreational drugs consumption, age ...",0.888357,"(Street or recreational drugs consumption, age..."


In [310]:
# Show the high-null reports of all subsets.
df_list

[   Column Name                                 Column Description  \
 0     dem_0910  Participant's ethnicity (sub hispanic or latin...   
 1     dem_1120               Participant's proficiency in English   
 2   sched_0100                                    Level of school   
 3   sched_0200                            Days per week in school   
 4   sched_0300                                Time School Starts    
 5   sched_0301                        Time School Starts - Varies   
 6   sched_0400                                   Time School Ends   
 7   sched_0401                          Time School Ends - Varies   
 8   sched_1100          Self-reported work start time, next shift   
 9   sched_1200            Self-reported work end time, next shift   
 10  sched_1500           Self-reported work start time, 3rd shift   
 11  sched_1501        Self-reported work start time, no 3rd shift   
 12  sched_1600             Self-reported work end time, 3rd shift   
 13  sched_2100     

For now, all columns flagged for a high null percentage (0.8 threshold) are simply removed; they can be revisited later.

In [311]:
# fixing
# Subset order in df_list:
# copy_demos, copy_lifestyle, copy_mdx_history, copy_sq_h1, copy_sq_h2, copy_slp_treat

# The 'Index' column of each report holds the column labels flagged for removal.
(
    demo_cols_to_remove,
    lifestyle_cols_to_remove,
    mdhx_cols_to_remove,
    sq_h1_cols_to_remove,
    sq_h2_cols_to_remove,
    slp_treat_cols_to_remove,
) = (df_list[position]['Index'] for position in range(6))

# Drop the flagged columns; each result is a new frame, the copies stay as they are.
edit_demos = copy_demos.drop(columns=demo_cols_to_remove)
edit_lifestyle = copy_lifestyle.drop(columns=lifestyle_cols_to_remove)
edit_mdhx = copy_mdx_history.drop(columns=mdhx_cols_to_remove)
edit_sq_h1 = copy_sq_h1.drop(columns=sq_h1_cols_to_remove)
edit_sq_h2 = copy_sq_h2.drop(columns=sq_h2_cols_to_remove)
edit_slp_treat = copy_slp_treat.drop(columns=slp_treat_cols_to_remove)



In [312]:
# Re-run the null check, this time on the subsets with the flagged columns dropped.

# Keyword arguments produce the same string keys, in the same insertion order.
edit_subsets_dict = dict(
    demographics=edit_demos,
    general_health_lifestyle=edit_lifestyle,
    medical_history=edit_mdhx,
    sleep_questionnaires_part_1=edit_sq_h1,
    sleep_questionnaires_part_2=edit_sq_h2,
    sleep_treatment=edit_slp_treat,
)

data_utils.check_nulls(edit_subsets_dict)


 demographics - Null Values Check:
                                                                      Null Count  \
Participant's year of birth                        dem_0100                   22   
Participant's age                                  modified_dem_0110          22   
Participant's sex                                  dem_0500                   22   
Height in feet                                     dem_0600                   22   
Height in inches                                   dem_0610                   22   
Weight in pounds                                   dem_0700                   22   
Body mass index (BMI)                              dem_0800                   22   
Participant's ethnicity (hispanic or latino)       dem_0900                   22   
Participant's race (main)                          dem_1000                   22   
Participant's race (sub)                           dem_1010                  126   
English as native language              

I am now going to do some feature engineering on the subsets, because I don't think it's worth starting with any EDA yet.
To prevent extreme clutter in this notebook, I am going to create a separate notebook for feature engineering.

In [317]:
# Export the column-pruned subsets as pickles so the feature-engineering notebook can load them.

import pickle as pkl  # kept in this cell: the raw-subset export cell further down also uses `pkl`

post_data_prep_dir = '../data/processed/post_data_prep'
# Create the output folder when it is missing; otherwise open(..., 'wb') raises
# FileNotFoundError on a fresh checkout where the processed-data folders do not exist yet.
os.makedirs(post_data_prep_dir, exist_ok=True)

# Output file name for every key of edit_subsets_dict
subset_filenames = {
    'demographics': 'demographics_subset_pre_fe.pkl',
    'general_health_lifestyle': 'general_health_lifestyle_subset_pre_fe.pkl',
    'medical_history': 'medical_history_subset_pre_fe.pkl',
    'sleep_questionnaires_part_1': 'sleep_questionnaires_part_1_subset_pre_fe.pkl',
    'sleep_questionnaires_part_2': 'sleep_questionnaires_part_2_subset_pre_fe.pkl',
    'sleep_treatment': 'sleep_treatment_subset_pre_fe.pkl'
}

for subset_name, df in edit_subsets_dict.items():
    filename = subset_filenames[subset_name]
    filepath = os.path.join(post_data_prep_dir, filename)
    with open(filepath, 'wb') as file:
        pkl.dump(df, file)
    print(f'Exported {subset_name} to {filepath}')


Exported demographics to ../data/processed/post_data_prep/demographics_subset_pre_fe.pkl
Exported general_health_lifestyle to ../data/processed/post_data_prep/general_health_lifestyle_subset_pre_fe.pkl
Exported medical_history to ../data/processed/post_data_prep/medical_history_subset_pre_fe.pkl
Exported sleep_questionnaires_part_1 to ../data/processed/post_data_prep/sleep_questionnaires_part_1_subset_pre_fe.pkl
Exported sleep_questionnaires_part_2 to ../data/processed/post_data_prep/sleep_questionnaires_part_2_subset_pre_fe.pkl
Exported sleep_treatment to ../data/processed/post_data_prep/sleep_treatment_subset_pre_fe.pkl


In [320]:
# Export the original subsets as well, i.e. the versions from `subsets`
# that have NOT had the high-null columns dropped.

raw_subsets_dir = '../data/processed/raw_subsets/'
# Create the output folder when it is missing; otherwise open(..., 'wb') raises
# FileNotFoundError on a fresh checkout where the processed-data folders do not exist yet.
os.makedirs(raw_subsets_dir, exist_ok=True)

for df in subsets:
    # NOTE(review): relies on a custom `.name` attribute having been attached to
    # each DataFrame where `subsets` is built — confirm it survives any copies.
    subset_name = df.name
    filename = f'{subset_name}_subset_raw.pkl'
    filepath = os.path.join(raw_subsets_dir, filename)
    with open(filepath, 'wb') as file:
        pkl.dump(df, file)
    print(f'Exported {subset_name} to {filepath}')


Exported demographics to ../data/processed/raw_subsets/demographics_subset_raw.pkl
Exported general_health_lifestyle to ../data/processed/raw_subsets/general_health_lifestyle_subset_raw.pkl
Exported medical_history to ../data/processed/raw_subsets/medical_history_subset_raw.pkl
Exported sleep_questionnaires_part_1 to ../data/processed/raw_subsets/sleep_questionnaires_part_1_subset_raw.pkl
Exported sleep_questionnaires_part_2 to ../data/processed/raw_subsets/sleep_questionnaires_part_2_subset_raw.pkl
Exported sleep_treatment to ../data/processed/raw_subsets/sleep_treatment_subset_raw.pkl


## Quality Assessment