In [1]:
import numpy as np
import pandas as pd



In [2]:
data = pd.read_csv('data/nhanes32006data.csv')

In [3]:
data.head()

Unnamed: 0.1,Unnamed: 0,age,agestrat,sex,race,ses,smokehx,bmi,obese,htnhx,...,higlu,hba1c,dm,hf,stroke,mi,fasttime,fast,mortstat,PERMTH_INT
0,1,21.75,21-35,male,non hispanic black,0.641,yes,25.5,no,hypertensive,...,no,5.3,no,no,no,no,6.52,no,alive,203.0
1,2,32.0,21-35,female,non hispanic black,4.803,yes,23.4,yes,no,...,no,4.6,no,no,no,no,6.12,no,alive,201.0
2,3,48.583333,35-50,female,hispanic,3.747,no,27.6,yes,no,...,no,5.3,no,no,no,no,13.73,no,alive,201.0
3,4,35.666667,35-50,male,hispanic,5.406,yes,29.4,yes,hypertensive,...,no,4.6,no,no,no,no,15.45,no,alive,196.0
4,5,48.5,35-50,male,non hispanic black,1.676,no,25.0,no,no,...,yes,9.1,diabetes,no,no,no,1.85,yes,alive,190.0


In [4]:
data.shape

(18414, 33)

In [5]:
required_columns = ['obese', 'uralb', 'trig', 'hdl', 'chol', 'hldhx', 'htnhx', 'sbp', 'dbp', 'bmi', 'smokehx', 'ses', 'race', 'sex', 'age', 'hf', 'stroke', 'mi', 'hba1c']
target = "mortstat"

In [6]:
data[required_columns].dtypes

obese       object
uralb      float64
trig       float64
hdl        float64
chol       float64
hldhx      float64
htnhx       object
sbp        float64
dbp        float64
bmi        float64
smokehx     object
ses        float64
race        object
sex         object
age        float64
hf          object
stroke      object
mi          object
hba1c      float64
dtype: object

In [7]:
data = data[required_columns + [target]]

In [8]:
data.head()

Unnamed: 0,obese,uralb,trig,hdl,chol,hldhx,htnhx,sbp,dbp,bmi,smokehx,ses,race,sex,age,hf,stroke,mi,hba1c,mortstat
0,no,5.8,174.0,38.0,268.0,1.0,hypertensive,120.0,67.0,25.5,yes,0.641,non hispanic black,male,21.75,no,no,no,5.3,alive
1,yes,2.2,84.0,55.0,160.0,1.0,no,126.0,86.0,23.4,yes,4.803,non hispanic black,female,32.0,no,no,no,4.6,alive
2,yes,1.6,98.0,66.0,236.0,2.0,no,131.0,73.0,27.6,no,3.747,hispanic,female,48.583333,no,no,no,5.3,alive
3,yes,8.5,109.0,43.0,225.0,2.0,hypertensive,130.0,82.0,29.4,yes,5.406,hispanic,male,35.666667,no,no,no,4.6,alive
4,no,11.3,94.0,51.0,260.0,1.0,no,120.0,70.0,25.0,no,1.676,non hispanic black,male,48.5,no,no,no,9.1,alive


In [9]:
data.shape

(18414, 20)

In [10]:
data = data[(data['age'] >= 45 ) & (data['age'] <= 65)]

In [11]:
data.shape

(4545, 20)

Read the number of nan values

In [12]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4545 entries, 2 to 18407
Data columns (total 20 columns):
obese       4115 non-null object
uralb       4115 non-null float64
trig        3974 non-null float64
hdl         3949 non-null float64
chol        3980 non-null float64
hldhx       4418 non-null float64
htnhx       4525 non-null object
sbp         4108 non-null float64
dbp         4107 non-null float64
bmi         4141 non-null float64
smokehx     4545 non-null object
ses         4545 non-null float64
race        4545 non-null object
sex         4545 non-null object
age         4545 non-null float64
hf          4541 non-null object
stroke      4543 non-null object
mi          4484 non-null object
hba1c       4022 non-null float64
mortstat    4541 non-null object
dtypes: float64(11), object(9)
memory usage: 745.7+ KB


Percentage of missing data

In [13]:
data.isna().any(axis = 1).sum()

821

In [14]:
100*821/data.shape[0]

18.063806380638063

In [15]:
categorical_columns = data.select_dtypes('object').columns
categorical_columns

Index(['obese', 'htnhx', 'smokehx', 'race', 'sex', 'hf', 'stroke', 'mi',
       'mortstat'],
      dtype='object')

In [16]:
numerical_columns = list(set(data.columns) - set(categorical_columns))
numerical_columns

['hdl',
 'trig',
 'bmi',
 'hba1c',
 'ses',
 'sbp',
 'dbp',
 'chol',
 'hldhx',
 'uralb',
 'age']

Missing data in Categorical Columns

In [17]:
def count_empty(data, columns):
    return data[columns].isna().sum()

Percentage of missing data in each column

In [18]:
def percentage_empty(data, columns):
    return 100*data[columns].isna().sum()/data.shape[0]

In [19]:
percentage_empty(data, categorical_columns)

obese       9.460946
htnhx       0.440044
smokehx     0.000000
race        0.000000
sex         0.000000
hf          0.088009
stroke      0.044004
mi          1.342134
mortstat    0.088009
dtype: float64

In [20]:
data.loc[data['obese'].isnull(), 'obese'] = "Not_Available"
data.loc[data['htnhx'].isnull(), 'htnhx'] = "Not_Available"
data.loc[data['mi'].isnull(), 'mi'] = "Not_Available"

In [21]:
percentage_empty(data, categorical_columns)

obese       0.000000
htnhx       0.000000
smokehx     0.000000
race        0.000000
sex         0.000000
hf          0.088009
stroke      0.044004
mi          0.000000
mortstat    0.088009
dtype: float64

In [22]:
count_empty(data, categorical_columns)

obese       0
htnhx       0
smokehx     0
race        0
sex         0
hf          4
stroke      2
mi          0
mortstat    4
dtype: int64

In [23]:
data['hf'].value_counts(dropna = False)

no     4344
yes     197
NaN       4
Name: hf, dtype: int64

In [24]:
data['stroke'].value_counts(dropna = False)

no     4423
yes     120
NaN       2
Name: stroke, dtype: int64

In [25]:
data.loc[data['hf'].isna(), 'hf'] = 'no'
data.loc[data['stroke'].isna(), 'stroke'] = 'no'

In [26]:
data.dropna(subset=['mortstat'], inplace=True)

In [27]:
data.shape

(4541, 20)

In [28]:
count_empty(data, categorical_columns)

obese       0
htnhx       0
smokehx     0
race        0
sex         0
hf          0
stroke      0
mi          0
mortstat    0
dtype: int64

Missing values in Numeric columns

In [29]:
data[numerical_columns].head()

Unnamed: 0,hdl,trig,bmi,hba1c,ses,sbp,dbp,chol,hldhx,uralb,age
2,66.0,98.0,27.6,5.3,3.747,131.0,73.0,236.0,2.0,1.6,48.583333
4,51.0,94.0,25.0,9.1,1.676,120.0,70.0,260.0,1.0,11.3,48.5
9,42.0,171.0,37.0,5.0,5.69,128.0,73.0,156.0,2.0,0.4,56.75
12,47.0,204.0,25.1,5.9,3.416,117.0,74.0,244.0,2.0,1.8,50.166667
14,30.0,479.0,37.5,5.7,0.622,155.0,91.0,212.0,1.0,105.0,48.416667


In [30]:
count_empty(data, numerical_columns)

hdl      594
trig     569
bmi      402
hba1c    521
ses        0
sbp      435
dbp      436
chol     563
hldhx    126
uralb    428
age        0
dtype: int64

In [31]:
data[numerical_columns].describe()

Unnamed: 0,hdl,trig,bmi,hba1c,ses,sbp,dbp,chol,hldhx,uralb,age
count,3947.0,3972.0,4139.0,4020.0,4541.0,4106.0,4105.0,3978.0,4415.0,4113.0,4541.0
mean,50.940208,168.083082,28.259169,5.856741,90829.2267,130.559425,78.114007,220.635244,1.653454,16481.607464,55.274829
std,16.529811,140.306292,5.752427,1.371941,269259.686231,18.245931,9.881913,44.678731,0.475923,119715.813978,6.021957
min,12.0,22.0,13.3,3.3,0.0,81.0,43.0,59.0,1.0,0.4,45.0
25%,40.0,93.0,24.299999,5.2,1.345,118.0,71.0,191.0,1.0,2.7,50.0
50%,48.0,133.0,27.4,5.5,2.701,128.0,77.0,218.0,2.0,6.8,55.333333
75%,59.0,197.0,31.299999,6.0,4.81,141.0,84.0,246.0,2.0,16.2,60.833333
max,191.0,3616.0,67.300003,16.200001,888888.0,237.0,134.0,702.0,2.0,888888.0,65.0


In [32]:
data.loc[data['uralb'] == 888888.000, 'uralb'] = np.nan
data.loc[data['ses'] == 888888.000, 'ses'] = np.nan

In [33]:
data[numerical_columns].describe()

Unnamed: 0,hdl,trig,bmi,hba1c,ses,sbp,dbp,chol,hldhx,uralb,age
count,3947.0,3972.0,4139.0,4020.0,4077.0,4106.0,4105.0,3978.0,4415.0,4037.0,4541.0
mean,50.940208,168.083082,28.259169,5.856741,2.817377,130.559425,78.114007,220.635244,1.653454,57.806168,55.274829
std,16.529811,140.306292,5.752427,1.371941,1.967585,18.245931,9.881913,44.678731,0.475923,406.745718,6.021957
min,12.0,22.0,13.3,3.3,0.0,81.0,43.0,59.0,1.0,0.4,45.0
25%,40.0,93.0,24.299999,5.2,1.215,118.0,71.0,191.0,1.0,2.7,50.0
50%,48.0,133.0,27.4,5.5,2.383,128.0,77.0,218.0,2.0,6.6,55.333333
75%,59.0,197.0,31.299999,6.0,4.02,141.0,84.0,246.0,2.0,15.0,60.833333
max,191.0,3616.0,67.300003,16.200001,11.29,237.0,134.0,702.0,2.0,12400.0,65.0


Data shows outliers in trig and uralb

In [35]:
np.nanpercentile(data['trig'], 99)

717.5799999999999

In [36]:
np.nanpercentile(data['uralb'], 99.5)

2700.0

In [37]:
data.loc[data['trig'] > np.nanpercentile(data['trig'], 99), 'trig'] = np.nanpercentile(data['trig'], 99)

In [38]:
data.loc[data['uralb'] > np.nanpercentile(data['uralb'], 99.5), 'uralb'] = np.nanpercentile(data['uralb'], 99.5)

In [40]:
data.describe()

Unnamed: 0,uralb,trig,hdl,chol,hldhx,sbp,dbp,bmi,ses,age,hba1c
count,4037.0,3972.0,3947.0,3978.0,4415.0,4106.0,4105.0,4139.0,4077.0,4541.0,4020.0
mean,46.896086,165.133736,50.940208,220.635244,1.653454,130.559425,78.114007,28.259169,2.817377,55.274829,5.856741
std,243.498289,114.508905,16.529811,44.678731,0.475923,18.245931,9.881913,5.752427,1.967585,6.021957,1.371941
min,0.4,22.0,12.0,59.0,1.0,81.0,43.0,13.3,0.0,45.0,3.3
25%,2.7,93.0,40.0,191.0,1.0,118.0,71.0,24.299999,1.215,50.0,5.2
50%,6.6,133.0,48.0,218.0,2.0,128.0,77.0,27.4,2.383,55.333333,5.5
75%,15.0,197.0,59.0,246.0,2.0,141.0,84.0,31.299999,4.02,60.833333,6.0
max,2700.0,717.58,191.0,702.0,2.0,237.0,134.0,67.300003,11.29,65.0,16.200001


In [43]:
for column in numerical_columns:
    data.loc[data[column].isna(), column] = data[column].mean()

In [44]:
count_empty(data, numerical_columns)

hdl      0
trig     0
bmi      0
hba1c    0
ses      0
sbp      0
dbp      0
chol     0
hldhx    0
uralb    0
age      0
dtype: int64

In [45]:
data.columns

Index(['obese', 'uralb', 'trig', 'hdl', 'chol', 'hldhx', 'htnhx', 'sbp', 'dbp',
       'bmi', 'smokehx', 'ses', 'race', 'sex', 'age', 'hf', 'stroke', 'mi',
       'hba1c', 'mortstat'],
      dtype='object')