Import the Pandas and NumPy libraries and load in the data files
---

In [42]:
import pandas as pd
import numpy as np

# Read both raw survey tables from CSV files in the working directory:
# `demo` holds the demographics table, `bmx` the body-measures table.
demo, bmx = (
    pd.read_csv(csv_path)
    for csv_path in ('Demographics.csv', 'BodyMeasures.csv')
)

How much data is missing from each column?
---

In [43]:
# Missing entries per column = number of rows minus the non-null count.
total_rows = demo.shape[0]
valid_entries = demo.count()                     # non-null values in each column
missing_data = valid_entries.rsub(total_rows)    # total_rows - valid_entries
missing_data.head()

SEQN          0
SDDSRVYR    211
RIDSTATR    209
RIDEXMON    920
RIAGENDR    211
dtype: int64

As a percentage
---

In [44]:
# Express each column's missing count as a percentage of all rows.
missing_percentage = missing_data.div(total_rows).mul(100)
missing_percentage.head()

SEQN        0.000000
SDDSRVYR    1.993199
RIDSTATR    1.974306
RIDEXMON    8.690724
RIAGENDR    1.993199
dtype: float64

How much data is missing from each row?
---

In [45]:
# Row-wise view: count the null cells in every row (axis=1), then express that
# count as a percentage of the number of columns.
# Note: `missing_data` and `missing_percentage` are re-bound here from their
# earlier per-column meaning to per-row values.
num_cols = demo.shape[1]
missing_data = demo.isnull().sum(axis=1)
missing_percentage = missing_data.div(num_cols).mul(100)

missing_percentage.head()

0    8.333333
1    6.944444
2    4.861111
3    8.333333
4    5.555556
dtype: float64

What about entries equivalent to unknown?
---

DMDSCHOL - Now attending school? ([codebook entry](https://wwwn.cdc.gov/Nchs/Nhanes/1999-2000/DEMO.htm#DMDSCHOL))

|Code|Meaning|
|:--- |:--- |
|1| In school|
|2|	On vacation from school (between grades)|
|3|	Neither in school or on vacation from school (between grades)|
|7|	Refused|
|9|	Don't know|

In [46]:
# Percentage of rows in which DMDSCHOL is missing (NaN).
perc = (len(demo.index) - demo['DMDSCHOL'].count()) / len(demo.index) * 100
print('Percent missing: %d' % perc)

# Codes 7 ("Refused") and 9 ("Don't know") are stored as ordinary values, so
# the NaN-based count above does not include them. Count them with the
# vectorised Series.sum() rather than the builtin sum(), which would iterate
# over the Series one element at a time in Python.
num_refused = (demo['DMDSCHOL'] == 7).sum()
num_dontknow = (demo['DMDSCHOL'] == 9).sum()
print('Number refused: %d' % num_refused)
print('Number unknown: %d' % num_dontknow)

Percent missing: 67
Number refused: 0
Number unknown: 2


In [47]:
# Every DMDSCHOL code above 3 (7 = Refused, 9 = Don't know in the table above)
# carries no usable answer, so replace those entries with NaN.
unknown_ind = demo['DMDSCHOL'].gt(3)
demo.loc[unknown_ind, 'DMDSCHOL'] = np.nan

# Show the distinct values that remain after recoding.
print(demo['DMDSCHOL'].unique())

[nan  1.  3.  2.]


In [48]:
# Re-run the DMDSCHOL report: percentage of rows whose value is NaN.
perc = (len(demo.index) - demo['DMDSCHOL'].count()) / len(demo.index) * 100
print('Percent missing: %d' % perc)

# Count the remaining occurrences of code 7 ("Refused") and code 9
# ("Don't know"). Series.sum() counts the True entries in one vectorised pass;
# the builtin sum() would loop over the Series element by element.
num_refused = (demo['DMDSCHOL'] == 7).sum()
num_dontknow = (demo['DMDSCHOL'] == 9).sum()
print('Number refused: %d' % num_refused)
print('Number unknown: %d' % num_dontknow)

Percent missing: 67
Number refused: 0
Number unknown: 0


Wrong Information
---

In [49]:
# Print the dtypes of the first five columns of each table under its heading;
# a column that should be numeric but shows up as `object` is a hint that it
# contains non-numeric entries.
for heading, frame in (('Demographics:', demo), ('\nBody Measures:', bmx)):
    print(heading)
    print(frame.dtypes.head())

Demographics:
SEQN          int64
SDDSRVYR    float64
RIDSTATR     object
RIDEXMON    float64
RIAGENDR     object
dtype: object

Body Measures:
SEQN         object
BMAEXLEN    float64
BMAEXSTS    float64
BMAEXCMT    float64
BMXWT       float64
dtype: object


In [50]:
# Demonstration: a strict conversion fails because SEQN contains at least one
# non-numeric string. Catch and print the error so the notebook still runs
# top to bottom (Restart & Run All) instead of halting on this cell.
try:
    bmx['SEQN'] = pd.to_numeric(bmx['SEQN'], downcast='integer')
except ValueError as err:
    print('ValueError: %s' % err)

ValueError: Unable to parse string "Jane Doe" at position 1385

In [51]:
# errors='coerce' turns every unparseable entry into NaN instead of raising.
# Assign with bmx['SEQN'] = ... so the column is replaced and takes the new
# dtype; on pandas >= 2.0, `bmx.loc[:, 'SEQN'] = ...` writes into the existing
# object column in place and leaves its dtype unchanged.
bmx['SEQN'] = pd.to_numeric(bmx['SEQN'], errors='coerce', downcast='integer')
bmx.dtypes.head()

SEQN        float64
BMAEXLEN    float64
BMAEXSTS    float64
BMAEXCMT    float64
BMXWT       float64
dtype: object

In [52]:
# Parse SEQN as numbers (unparseable entries become NaN) so this cell works
# whether the column is still object dtype or has already been coerced.
seqn = pd.to_numeric(bmx['SEQN'], errors='coerce')
ind = seqn.isna()

# Keep only rows with a valid SEQN. The explicit copy makes `bmx` an
# independent frame, so the assignment below does not write into a slice of
# the original (which would risk SettingWithCopyWarning).
bmx = bmx.loc[~ind, :].copy()

# With the NaN rows gone the values can be downcast to the smallest integer
# dtype. Direct column assignment replaces the column and adopts that dtype;
# `bmx.loc[:, 'SEQN'] = ...` would set in place and keep the old dtype on
# pandas >= 2.0.
bmx['SEQN'] = pd.to_numeric(seqn[~ind], downcast='integer')
bmx.dtypes.head()

SEQN          int16
BMAEXLEN    float64
BMAEXSTS    float64
BMAEXCMT    float64
BMXWT       float64
dtype: object

Sensitive Data: Minor Marital Status
---

In [53]:
# Boolean mask of respondents whose RIDAGEYR is below 18 (minors).
minor_ind = demo['RIDAGEYR'].lt(18)

# How many minors currently have a non-null marital-status entry (DMDMARTL)?
minor_marital = demo.loc[minor_ind, 'DMDMARTL']
print(minor_marital.count())

1244


In [54]:
# Blank out marital status for every minor: mask() puts NaN wherever
# `minor_ind` is True and leaves the other rows untouched.
demo['DMDMARTL'] = demo['DMDMARTL'].mask(minor_ind)

# Confirm no minor has a non-null DMDMARTL value left.
print(demo.loc[minor_ind, 'DMDMARTL'].count())

0


Individual Practice
---

1. Find the column with the highest percentage of missing information in demographics
2. Remove confidential pregnancy status for minors (columns 'RIDEXPRG' and 'RIDPREG')


In [58]:
# Recompute the per-column missing percentage on the current state of `demo`
# (earlier cells have replaced some entries with NaN since the first pass).
total_rows = demo.shape[0]
valid_entries = demo.count()                 # non-null values per column
missing_data = total_rows - valid_entries    # null values per column
missing_percentage = missing_data.div(total_rows).mul(100)
missing_percentage.head()

SEQN        0.000000
SDDSRVYR    1.993199
RIDSTATR    1.974306
RIDEXMON    8.690724
RIAGENDR    1.993199
dtype: float64

In [59]:
# Rank the columns from most to least missing so the worst ones appear first.
missing_percentage.sort_values(ascending=False)

DMARACE     99.782732
DMAETHN     99.773285
RIDPREG     81.758927
DMDYRSUS    81.267712
RIDEXPRG    80.445872
              ...    
WTMREP03     1.955413
WTMREP47     1.955413
WTMREP30     1.955413
WTIREP40     1.945966
SEQN         0.000000
Length: 144, dtype: float64

In [60]:
# idxmax() returns the index label of the largest value; the index here is the
# column names of `demo`, so this is the column with the most missing data.
missing_percentage.idxmax()

'DMARACE'

In [61]:
# Peek at the first five entries of the column with the most missing data.
demo.loc[:, 'DMARACE'].head(5)

0   NaN
1   NaN
2   NaN
3   NaN
4   NaN
Name: DMARACE, dtype: float64

In [64]:
# Rebuild the minors mask (RIDAGEYR below 18) so this cell does not depend on
# the mask created in the marital-status section.
minor_ind = demo['RIDAGEYR'].lt(18)

# Non-null RIDEXPRG entries for minors before removal.
minors_before = demo.loc[minor_ind, 'RIDEXPRG'].count()
print(minors_before)

# Remove the confidential pregnancy status for minors, then re-count.
demo.loc[minor_ind, 'RIDEXPRG'] = np.nan
minors_after = demo.loc[minor_ind, 'RIDEXPRG'].count()
print(minors_after)

14
0


In [66]:
# Same treatment for the second pregnancy column; reuses the `minor_ind` mask
# built in the previous cell.
preg_col = 'RIDPREG'
print(demo.loc[minor_ind, preg_col].count())   # non-null entries before
demo.loc[minor_ind, preg_col] = np.nan
print(demo.loc[minor_ind, preg_col].count())   # non-null entries after

14
0


Save Data Files
---

In [67]:
# Write the cleaned tables back to disk without the row index.
# NOTE(review): these are the same file names the notebook reads at the top,
# so the raw inputs are overwritten and a second run starts from already
# cleaned data. Confirm that is intended before re-running.
for frame, csv_path in ((demo, 'Demographics.csv'), (bmx, 'BodyMeasures.csv')):
    frame.to_csv(csv_path, index=False)