# Data Preparation Notebook for the STAGES Dataset
Dataset provided by the National Sleep Research Resource (NSRR).

In [1]:
# libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.preprocessing import LabelEncoder


In [2]:
# settings

# pandas display: lift the column and row limits (None = no limit) so that
# frames are rendered without truncation.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

# seaborn theme first, then the matplotlib defaults shared by every figure.
sns.set_style('whitegrid')
plt.rcParams.update({
    'figure.figsize': (10, 6),
    'figure.titlesize': 18,
    'axes.titlesize': 16,
    'axes.labelsize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
    'legend.fontsize': 12,
})

# Seed NumPy's global random state so repeated runs draw the same numbers.
np.random.seed(42)


## Loading Data

In [6]:
# Read the two 0.3.0 CSV exports: the full STAGES table and its harmonized
# companion table.
raw_stages_df = pd.read_csv(
    '../data/stages/datasets/stages-dataset-0.3.0.csv'
)
raw_stages_harmonized_df = pd.read_csv(
    '../data/stages/datasets/stages-harmonized-dataset-0.3.0.csv'
)

# Print the schema summary of the full table, then end the cell on a
# five-row preview so the notebook renders it as the cell output.
raw_stages_df.info()
raw_stages_df.head(n=5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1881 entries, 0 to 1880
Columns: 433 entries, modified_created_at to visitcode
dtypes: float64(402), int64(1), object(30)
memory usage: 6.2+ MB


Unnamed: 0,modified_created_at,modified_completed,subject_code,dem_0100,modified_dem_0110,dem_0500,dem_0600,dem_0610,dem_0700,dem_0800,dem_0900,dem_0910,dem_1000,dem_1010,dem_1100,dem_1120,mdhx_0200,mdhx_0400,mdhx_0700,mdhx_0800,mdhx_1200,mdhx_1300,mdhx_1400,mdhx_5500,mdhx_5600,mdhx_5700,mdhx_5710,mdhx_5720,mdhx_5800,mdhx_5810,mdhx_5820,mdhx_5900,mdhx_5910,mdhx_5920,mdhx_5950,mdhx_6000,mdhx_6030,mdhx_6100,mdhx_6200,mdhx_6300,mdhx_6310,mdhx_6320,mdhx_6400,mdhx_6420,mdhx_6500,mdhx_6600,mdhx_6700,mdhx_6900,mdhx_6910,pap_0100,pap_0200,pap_0300,pap_0400,pap_0500,pap_0600,pap_0700,pap_0800,pap_1200,pap_1300,pap_1400,pap_1600,pap_1700,pap_1800,pap_1900,pap_2100,nose_0100,nose_0200,nose_0300,nose_0400,nose_0500,nose_0600,famhx_0100,famhx_0200,famhx_0300,famhx_0400,famhx_0500,famhx_0600,famhx_0700,famhx_0800,famhx_0900,famhx_1000,famhx_1100,famhx_1200,famhx_1300,bthbts_0100,bthbts_0300,bthbts_0500,bthbts_0510,bthbts_0520,bthbts_0530,bthbts_0540,sched_0100,sched_0200,sched_0300,sched_0301,sched_0400,sched_0401,sched_0500,sched_0510,sched_0600,sched_0700,sched_0800,sched_0900,sched_0901,sched_1000,sched_1001,sched_1100,sched_1200,sched_1300,sched_1301,sched_1400,sched_1500,sched_1501,sched_1600,sched_1700,sched_1800,sched_1900,sched_1901,sched_2000,sched_2001,sched_2100,sched_2200,sched_2210,sched_2300,sched_2310,sched_2400,sched_2500,sched_2510,sched_2600,sched_2700,sched_2710,sched_2800,sched_2810,sched_2900,sched_3000,sched_3010,sched_3100,sched_3200,sched_3210,sched_3300,sched_3310,sched_3400,sched_3500,sched_3510,sched_3600,sched_3700,sched_3710,sched_3800,sched_3810,sched_3900,sched_4000,sched_4010,sched_4100,sched_4200,sched_4201,sched_4210,soclhx_0100,soclhx_0110,soclhx_0200,soclhx_0210,soclhx_0300,soclhx_0400,soclhx_0501,soclhx_0520,soclhx_0600,soclhx_0700,soclhx_0701,soclhx_0710,soclhx_0730,soclhx_0800,soclhx_0900,soclhx_0901,soclhx_1000,soclhx_1200,soclhx_1300,soclhx_1310,soclhx_1400,soclhx_1500,soclhx_1700,soclhx_1800,map_0100,map_0200,map_0300,map_0400,map_0500,ma
p_0600,map_0700,map_0800,map_0900,map_1000,map_1010,map_1020,map_1030,map_1040,map_1041,map_1100,map_1110,map_1120,map_1130,map_1131,index_1,index_3,index_4,map_lr,score,osa_0100,osa_0200,osa_0300,ess_0100,ess_0200,ess_0300,ess_0400,ess_0500,ess_0600,ess_0700,ess_0800,ess_0900,slpy_0101,slpy_0100,slpy_0110,slpy_0201,slpy_0200,slpy_0210,slpy_0301,slpy_0300,slpy_0310,slpy_0400,slpy_0410,isi_0100,isi_0200,isi_0300,isi_0400,isi_0500,isi_0600,isi_0700,isi_score,tab_0100,tab_0200,tab_0300,tab_0400,tab_0500,tab_0600,tab_0700,tab_0800,tab_0900,tab_1000,isq_0100,isq_0110,isq_0120,isq_0200,isq_0210,isq_0220,isq_0300,isq_0310,isq_0320,isq_0400,isq_0410,isq_0420,isq_0500,isq_0510,isq_0520,isq_0600,isq_0700,isq_0800,isq_0900,isq_1000,isq_1100,isq_1200,isq_1300,isq_score,rls_0100,rls_0200,rls_0310,rls_0400,rls_0410,rls_0500,rls_0510,rls_0600,rls_0610,rls_0700,rls_0710,cir_0100,cir_0200,cir_0300,cir_0400,cir_0500,cir_0600,cir_0700,narc_0050,narc_0100,narc_0110,narc_0200,narc_0210,narc_0300,narc_0310,narc_0400,narc_0410,narc_0500,narc_0510,narc_0600,narc_0700,narc_0800,narc_0900,narc_1000,narc_1100,narc_1200,narc_1300,narc_1400,narc_1500,narc_1600,narc_1610,narc_1650,narc_1700,narc_1701,narc_1900,narc_2000,narc_2100,narc_2110,narc_2200,par_0100,par_0101,par_0110,par_0200,par_0201,par_0210,par_0230,par_0300,par_0301,par_0310,par_0400,par_0500,par_0501,par_0510,par_0530,par_0531,par_0600,par_0601,par_0610,par_0630,par_0631,par_0700,par_0701,par_0710,par_0800,par_0900,par_0901,par_0910,fosq_0100,fosq_0200,fosq_0300,fosq_0400,fosq_0500,fosq_0600,fosq_0700,fosq_0800,fosq_0900,fosq_1000,fosq_1100,phq_0100,phq_0200,phq_0300,phq_0400,phq_0500,phq_0600,phq_0700,phq_0800,phq_0900,phq_1000,gad_0100,gad_0200,gad_0300,gad_0400,gad_0500,gad_0600,gad_0700,gad_0800,fss_0100,fss_0200,fss_0300,fss_0400,fss_0500,fss_0600,fss_0700,fss_0800,fss_0900,fss_1000,diet_0300,diet_0310,diet_0320,diet_0330,diet_0340,diet_0350,diet_0360,diet_0370,diet_0380,diet_0400,diet_0500,diet_0600,diet_0700,diet_0800,diet_0
801,diet_0810,diet_0811,diet_0820,diet_0821,diet_0830,diet_0831,diet_0840,diet_0841,diet_0850,diet_0851,diet_0860,diet_0861,diet_0870,diet_0871,rls_0300,rls_0800,rls_0801,rls_0900,rls_0910,rls_probability,rls_severity,sched_1401,sched_1701,sched_1801,soclhx_0101,narc_1710,never_cigarette_smoker,former_cigarette_smoker,former_smokeless_user,current_cigarette_smoker,current_smokeless_user,visitcode
0,28SEP18:17:25:32,,BOGN00002,1960.0,58.0,F,5.0,2.0,168.0,30.7,0.0,,1.0,0.0,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1
1,12OCT18:08:07:19,12OCT18:08:48:00,BOGN00004,1987.0,30.0,F,5.0,7.0,188.0,29.4,0.0,,1.0,0.0,1.0,,1.0,,,,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,1.0,,1.0,1.0,1.0,0.0,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,1.0,0.0,5.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,4.0,0.0,0.0,2.0,2.0,,,,,,,0.0,0.0,,08:00:00,17:00:00,22:30:00,,06:15:00,,,,,,,,,,,,22:30:00,1.0,09:30:00,1.0,,0.0,30.0,6.0,30.0,1.0,0.0,10.0,3.0,,,,,,,,,,,,,,,,,0.0,30.0,9.0,0.0,1.0,0.0,10.0,3.0,,1.0,,,,,,,,0.0,0.0,4.0,2.0,0.0,1.0,3.0,0.0,3.0,0.0,17:00:00,,,,,0.0,,,1.0,0.0,0.0,0.0,0.0,0.0,3.0,1.0,0.0,0.0,,,,,0.0,0.0,,,,0.0,0.333333,1.0,0.0,0.666977,0.095419,1.0,3.0,1.0,1.0,1.0,0.0,1.0,2.0,0.0,0.0,0.0,5.0,0.0,5.0,0.0,0.0,3.0,1.0,1.0,,,0.0,,1.0,0.0,0.0,3.0,4.0,2.0,4.0,14.0,2.0,2.0,3.0,3.0,1.0,3.0,0.0,1.0,0.0,1.0,3.0,5.0,2.0,1.0,10.0,2.0,0.0,,,0.0,,,4.0,10.0,2.0,4.0,3.0,3.0,3.0,4.0,4.0,4.0,3.0,1.0,0.0,0.0,,,,,,,,,,0.0,3.0,1.0,3.0,3.0,2.0,12.0,0.0,0.0,,0.0,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,,,,,1.0,,,1.0,,,1.0,,0.0,0.0,,1.0,,,,,1.0,,,,,1.0,,0.0,1.0,,1.0,1.0,2.0,3.0,2.0,1.0,1.0,2.0,1.0,1.0,2.0,8.0,2.0,1.0,1.0,3.0,0.0,0.0,1.0,0.0,0.0,8.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,2.0,7.0,3.0,6.0,6.0,6.0,5.0,6.0,7.0,6.0,52.0,30.0,20.0,20.0,30.0,,,,,,2.0,-44.0,-44.0,2.0,,,,,,,,,,,,,,,,,,,,,,Unlikely,,,,,,0.0,1.0,0.0,0.0,0.0,0.0,1
2,08MAR19:07:35:09,08MAR19:07:55:00,BOGN00007,1988.0,30.0,F,5.0,7.0,165.0,25.8,0.0,,1.0,1.0,1.0,,1.0,,,,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,0.0,0.0,0.0,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,4.0,0.0,20.0,-55.0,0.0,0.0,0.0,-55.0,0.0,-55.0,-55.0,-55.0,-55.0,-55.0,0.0,1.0,0.0,1.0,3.0,0.0,2.0,1.0,0.0,,,,,,,1.0,0.0,3.0,07:00:00,19:00:00,21:00:00,,06:00:00,,07:00:00,19:00:00,21:00:00,,06:00:00,,1.0,,,,22:00:00,,07:00:00,,1.0,2.0,0.0,4.0,0.0,3.0,1.0,0.0,1.0,1.0,0.0,4.0,0.0,3.0,2.0,0.0,1.0,,,,,,,,-66.0,2.0,0.0,5.0,0.0,3.0,1.0,0.0,1.0,8.0,,0.0,,,,,,,0.0,0.0,1.0,2.0,0.0,1.0,10.0,0.0,1.0,0.0,07:00:00,,,,,0.0,,,0.0,4.0,0.0,0.0,4.0,0.0,4.0,2.0,0.0,0.0,,,,,0.0,0.0,,,,0.0,0.0,1.33333,0.0,0.666977,0.04767,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,7.0,1.0,1.0,,,1.0,,,,,4.0,4.0,4.0,4.0,3.0,3.0,4.0,26.0,2.0,2.0,4.0,4.0,1.0,1.0,2.0,4.0,2.0,2.0,4.0,6.0,2.0,4.0,6.0,2.0,4.0,6.0,2.0,4.0,6.0,2.0,4.0,6.0,2.0,4.0,2.0,2.0,3.0,4.0,4.0,4.0,4.0,1.0,0.0,0.0,,,,,,,,,,0.0,3.0,2.0,5.0,4.0,2.0,16.0,0.0,0.0,,0.0,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,,,,,1.0,,,1.0,,,1.0,,0.0,-55.0,,1.0,,,,,1.0,,,,,1.0,,0.0,,-55.0,,1.0,2.0,4.0,4.0,3.0,3.0,3.0,3.0,3.0,3.0,14.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,2.0,1.0,0.0,0.0,2.0,0.0,2.0,0.0,5.0,7.0,2.0,4.0,4.0,4.0,3.0,4.0,4.0,4.0,36.0,20.0,40.0,20.0,20.0,,,,,,2.0,-44.0,-44.0,1.0,,,,,,,,,,,,,,,,,,,,,,Unlikely,,,,,,0.0,1.0,0.0,0.0,0.0,0.0,1
3,18OCT18:16:06:54,18OCT18:16:43:00,BOGN00008,1976.0,42.0,M,5.0,4.0,156.0,26.8,0.0,,1.0,4.0,1.0,,1.0,,,,,,,1.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,1.0,0.0,0.0,0.0,0.0,0.0,,1.0,1.0,0.0,0.0,,,,,,,,,,,,,,,,,,1.0,0.0,0.0,3.0,0.0,20.0,-55.0,-55.0,0.0,1.0,-55.0,-55.0,-55.0,-55.0,1.0,-55.0,-55.0,-55.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,7.0,14:00:00,1.0,20:00:00,1.0,0.0,0.0,5.0,06:00:00,08:00:00,22:00:00,,04:30:00,,,,,,,,,,,,22:00:00,,07:00:00,,,2.0,0.0,4.0,0.0,3.0,1.0,0.0,2.0,,,,,,,,,,,,,,,,,1.0,0.0,4.0,0.0,2.0,0.0,30.0,2.0,,1.0,,,,,,,,0.0,0.0,0.0,3.0,0.0,0.0,2.0,1.0,3.0,0.0,08:30:00,10.0,100.0,0.0,24.0,0.0,,,2.0,3.0,0.0,0.0,4.0,0.0,2.0,2.0,0.0,1.0,1.0,2.0,3.0,,-55.0,3.0,5.0,2.0,,-55.0,0.666667,0.666667,2.0,0.666977,0.309199,3.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,4.0,0.0,2.0,1.0,1.0,,,0.0,2.0,1.0,0.0,,2.0,1.0,0.0,3.0,2.0,1.0,2.0,11.0,3.0,0.0,4.0,1.0,0.0,3.0,0.0,2.0,2.0,0.0,3.0,11.0,2.0,2.0,11.0,2.0,3.0,4.0,2.0,4.0,11.0,2.0,4.0,11.0,2.0,2.0,0.0,0.0,2.0,3.0,1.0,1.0,2.0,1.0,2.0,2.0,,,,,,,,,,0.0,4.0,1.0,3.0,4.0,6.0,17.0,0.0,0.0,,0.0,,,,,,,,,,,,,,,,,,,,,,0.0,1.0,3.0,,,,,-55.0,,,1.0,,,,1.0,,2.0,2.0,,1.0,31.0,,,-55.0,,,,5.0,,2.0,-55.0,,-55.0,,3.0,3.0,4.0,4.0,3.0,4.0,3.0,2.0,2.0,4.0,16.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,3.0,0.0,6.0,3.0,1.0,1.0,3.0,3.0,2.0,0.0,13.0,4.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,12.0,,,,,44.0,,,,,2.0,-44.0,0.0,1.0,,,,,,,,,,,,,,,,,1.0,,,,,Unlikely,,,,,,0.0,0.0,1.0,0.0,0.0,0.0,1
4,01MAR19:11:55:20,01MAR19:12:23:00,BOGN00009,1982.0,36.0,M,5.0,3.0,255.0,45.2,0.0,,1.0,1.0,1.0,,1.0,0.0,7.0,7.0,,,,0.0,,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,,0.0,0.0,0.0,0.0,0.0,0.0,,0.0,1.0,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,4.0,4.0,3.0,3.0,1.0,0.0,3.0,1.0,1.0,1.0,0.0,30.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,3.0,0.0,3.0,1.0,0.0,2.0,0.0,2.0,5.0,,1.0,,1.0,0.0,0.0,5.0,08:15:00,17:30:00,22:00:00,,06:00:00,,,,,,,,,,,,23:00:00,,06:30:00,,,0.0,15.0,6.0,0.0,3.0,0.0,0.0,3.0,,,,,,,,,,,,,,,,,0.0,15.0,6.0,0.0,3.0,0.0,0.0,3.0,6.0,,30.0,,,,,,,1.0,,,2.0,0.0,0.0,1.0,0.0,2.0,0.0,14:00:00,16.0,8.0,0.0,24.0,0.0,,,0.0,3.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0,0.0,,,,,0.0,0.0,,,,0.0,0.0,0.333333,0.0,0.666977,0.837208,0.0,0.0,0.0,2.0,2.0,0.0,3.0,1.0,0.0,0.0,0.0,8.0,0.0,2.0,1.0,1.0,,,1.0,,,0.0,,0.0,2.0,1.0,3.0,1.0,1.0,1.0,9.0,2.0,2.0,1.0,2.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,2.0,1.0,2.0,2.0,2.0,3.0,5.0,2.0,1.0,6.0,1.0,1.0,4.0,1.0,2.0,1.0,1.0,1.0,3.0,3.0,3.0,3.0,0.0,0.0,0.0,,,,,,,,,,0.0,4.0,2.0,4.0,5.0,6.0,20.0,0.0,0.0,,0.0,,,,,,,,,,,,,,,,,,,,,,0.0,0.0,0.0,,,,,1.0,,,1.0,,,,1.0,,0.0,,1.0,,,,,1.0,,,,,1.0,,0.0,,1.0,,2.0,3.0,4.0,4.0,4.0,3.0,3.0,2.0,2.0,2.0,15.0,1.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,5.0,1.0,1.0,1.0,1.0,1.0,2.0,0.0,7.0,7.0,2.0,3.0,4.0,2.0,2.0,2.0,2.0,3.0,27.0,10.0,25.0,40.0,25.0,,,,,,2.0,0.0,0.0,2.0,,,,,,,,,,,,,,,,,,,,,,Unlikely,,,,,,0.0,0.0,1.0,0.0,0.0,0.0,1


In [7]:
# Schema summary (column names, non-null counts, dtypes) of the harmonized
# table, followed by a five-row preview as the cell output.
raw_stages_harmonized_df.info()
raw_stages_harmonized_df.head(n=5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1881 entries, 0 to 1880
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   subject_code         1881 non-null   object 
 1   visitcode            1881 non-null   int64  
 2   nsrr_age             1859 non-null   float64
 3   nsrr_age_gt89        1881 non-null   object 
 4   nsrr_sex             1859 non-null   object 
 5   nsrr_race            1859 non-null   object 
 6   nsrr_ethnicity       1881 non-null   object 
 7   nsrr_bmi             1859 non-null   float64
 8   nsrr_current_smoker  1881 non-null   object 
 9   nsrr_ever_smoker     1881 non-null   object 
dtypes: float64(2), int64(1), object(7)
memory usage: 147.1+ KB


Unnamed: 0,subject_code,visitcode,nsrr_age,nsrr_age_gt89,nsrr_sex,nsrr_race,nsrr_ethnicity,nsrr_bmi,nsrr_current_smoker,nsrr_ever_smoker
0,BOGN00002,1,58.0,no,female,white,not hispanic or latino,30.7,not reported,not reported
1,BOGN00004,1,30.0,no,female,white,not hispanic or latino,29.4,no,no
2,BOGN00007,1,30.0,no,female,white,not hispanic or latino,25.8,no,no
3,BOGN00008,1,42.0,no,male,white,not hispanic or latino,26.8,no,yes
4,BOGN00009,1,36.0,no,male,white,not hispanic or latino,45.2,no,yes


# Stages Data Preparation

In [None]:
# Because the raw dataset is very wide (433 columns), it is first split into subsections of columns for clarity.

