Import the pandas and NumPy libraries and load the data files
---

In [1]:
import pandas as pd
import numpy as np

# Read the three CSV inputs in one pass; each becomes its own DataFrame.
demo, bmx, ocq = (
    pd.read_csv(csv_name)
    for csv_name in ('Demographics.csv', 'BodyMeasures.csv', 'Occupation.csv')
)

Impossible Data - Continuous Values
---

BMXWT - weight in kg

In [2]:
# Summary statistics for the weight column before any cleaning.
bmx.loc[:, 'BMXWT'].describe()

count     9185.000000
mean        67.506522
std        282.389202
min       -149.000000
25%         39.100000
50%         63.000000
75%         79.700000
max      12870.000000
Name: BMXWT, dtype: float64

In [3]:
# A body weight cannot be zero or negative, so treat every such reading as missing.
ind = bmx['BMXWT'] <= 0
bmx.loc[ind, 'BMXWT'] = np.nan
bmx['BMXWT'].describe()

count     9179.000000
mean        67.606840
std        282.452412
min          3.100000
25%         39.200000
50%         63.000000
75%         79.750000
max      12870.000000
Name: BMXWT, dtype: float64

Impossible Data - Categorical Values
---
BMIWT - Weight Comment ([codebook entry](https://wwwn.cdc.gov/Nchs/Nhanes/1999-2000/BMX.htm#BMIWT))


|Code|Meaning|
|:--- |:--- |
|1|Could not obtain|
|2|Exceeds capacity|
|3|Clothing|
|4|Medical appliance|

In [4]:
# List the distinct weight-comment codes present in the data.
bmx.loc[:, 'BMIWT'].unique()

array([ 3., nan,  4.,  1., 11.,  7.])

In [5]:
# Per the codebook table, only codes 1-4 are defined for BMIWT.
# An allow-list blanks out any undefined code (not just those above 4);
# rows that are already NaN are left alone.
valid_bmiwt_codes = [1, 2, 3, 4]
ind = bmx['BMIWT'].notna() & ~bmx['BMIWT'].isin(valid_bmiwt_codes)
bmx.loc[ind, 'BMIWT'] = np.nan
bmx['BMIWT'].unique()

array([ 3., nan,  4.,  1.])

Extreme Data
---

Heaviest human weight on record: 635 kg

In [6]:
# Re-check the weight distribution: the maximum is still far beyond the record weight.
bmx.loc[:, 'BMXWT'].describe()

count     9179.000000
mean        67.606840
std        282.452412
min          3.100000
25%         39.200000
50%         63.000000
75%         79.750000
max      12870.000000
Name: BMXWT, dtype: float64

In [7]:
# Anything above the heaviest human weight on record (635 kg) is treated as missing.
ind = bmx['BMXWT'].gt(635)
bmx['BMXWT'] = bmx['BMXWT'].mask(ind)
bmx['BMXWT'].describe()

count    9170.000000
mean       59.957381
std        29.842889
min         3.100000
25%        39.200000
50%        62.970000
75%        79.600000
max       193.300000
Name: BMXWT, dtype: float64

In [8]:
# Z-scores of the lightest and heaviest remaining weights, ignoring NaNs.
wt_col = bmx['BMXWT']

mean_wt, std_wt = np.nanmean(wt_col), np.nanstd(wt_col)
min_wt, max_wt = np.nanmin(wt_col), np.nanmax(wt_col)

# Distance of each extreme from the mean, in standard deviations.
high_wt_zscore = (max_wt - mean_wt)/std_wt
low_wt_zscore = (min_wt - mean_wt)/std_wt

print('Max weight z-score: ' + str(high_wt_zscore))
print('Min weight z-score: ' + str(low_wt_zscore))

Max weight z-score: 4.468397485658741
Min weight z-score: -1.9053276258294776


Saturated Data
---

Survey instructions: list ages 85 and above as 85

In [None]:
# Largest recorded age, ignoring missing values.
np.nanmax(demo.loc[:, 'RIDAGEYR'])

In [9]:
# Ages above the survey's top code are set back to 85.
ind = demo['RIDAGEYR'].gt(85)
demo.loc[ind, 'RIDAGEYR'] = 85

Individual Practice
---

1. Find and remove any non-sensible categorical values in the occupation file
    * Check OCQ130, OCQ140, OCQ150, OCQ160
2. Find the z-score of the maximum and minimum values in the occupation column OCQ180 - Hours worked last week at all jobs ([see codebook for more information](https://wwwn.cdc.gov/Nchs/Nhanes/1999-2000/OCQ.htm#OCQ180))
    * Exclude values 7777 and 9999


In [11]:
# Summary statistics for OCQ130 before cleaning.
ocq.loc[:, 'OCQ130'].describe()

count    1316.000000
mean        7.165653
std         6.520391
min         1.000000
25%         7.000000
50%         7.000000
75%         7.000000
max        99.000000
Name: OCQ130, dtype: float64

In [12]:
# Distinct OCQ130 codes present in the raw data.
ocq.loc[:, 'OCQ130'].unique()

array([nan,  7.,  3.,  2.,  1.,  4.,  6.,  5., 38., 99., 21., 54., 68.,
       29., 17., 58., 69., 48., 14., 34., 42.])

In [13]:
# Flag OCQ130 codes strictly between 7 and 77; NaN rows compare False and stay unflagged.
ind = ocq['OCQ130'].gt(7) & ocq['OCQ130'].lt(77)


In [14]:
# The mask has one entry per row of the column it was built from.
ind.size

7749

In [16]:
# Preview the first five flagged values before overwriting them.
ocq.loc[ind, 'OCQ130'].iloc[:5]

876     38.0
1912    21.0
2769    54.0
3235    68.0
3943    29.0
Name: OCQ130, dtype: float64

In [17]:
# Replace every flagged OCQ130 value with NaN.
ocq['OCQ130'] = ocq['OCQ130'].mask(ind)

In [18]:
# Confirm which OCQ130 codes remain after cleaning.
pd.unique(ocq['OCQ130'])

array([nan,  7.,  3.,  2.,  1.,  4.,  6.,  5., 99.])

In [24]:
# Distinct OCQ150 codes present in the raw data.
ocq['OCQ150'].unique()

array([ 2.,  1.,  4., nan,  3., 66., 99., 44.,  7.,  9., 11.,  8., 33.,
       77.])

In [27]:
# Blank out every OCQ150 code above 9.
# NOTE(review): a code of 8 passes this filter and remains in the column --
# confirm against the codebook whether 8 is a defined value.
index = ocq['OCQ150'].gt(9)
ocq.loc[index, 'OCQ150'] = np.nan

In [28]:
# Confirm which OCQ150 codes remain after cleaning.
ocq['OCQ150'].unique()

array([ 2.,  1.,  4., nan,  3.,  7.,  9.,  8.])

In [29]:
# Summary statistics for OCQ150 after cleaning.
ocq.loc[:, 'OCQ150'].describe()

count    6269.000000
mean        2.447121
std         1.481740
min         1.000000
25%         1.000000
50%         2.000000
75%         4.000000
max         9.000000
Name: OCQ150, dtype: float64

In [30]:
# Summary statistics for OCQ160.
ocq.loc[:, 'OCQ160'].describe()

count    293.000000
mean       2.085324
std        1.304454
min        1.000000
25%        2.000000
50%        2.000000
75%        2.000000
max        9.000000
Name: OCQ160, dtype: float64

In [32]:
# Distinct OCQ160 codes present in the data.
ocq.loc[:, 'OCQ160'].unique()

array([ 2., nan,  1.,  7.,  9.])

2. Find the z-score of the maximum and minimum values in the occupation column OCQ180 - Hours worked last week at all jobs ([see codebook for more information](https://wwwn.cdc.gov/Nchs/Nhanes/1999-2000/OCQ.htm#OCQ180))
    * Exclude values 7777 and 9999

In [37]:
# Flag weekly hours above 105; this also catches the 7777 and 9999 codes the exercise says to exclude.
a = ocq['OCQ180'].gt(105)

In [38]:
# Replace every flagged OCQ180 value with NaN.
ocq['OCQ180'] = ocq['OCQ180'].mask(a)

In [39]:
# Confirm which hour values remain after cleaning.
ocq.loc[:, 'OCQ180'].unique()

array([ nan,  40.,  45.,  53.,   4.,  32.,   6.,  60.,  35.,  27.,  38.,
        20.,  50.,  55.,  30.,  12.,  48.,  58.,  16.,  46.,  42.,  36.,
         3.,  10.,   8.,  54.,  75.,  24.,  43.,  44.,  47.,  49.,  25.,
        80.,  70.,  17.,  96.,  15.,  65.,  57.,  63.,  14.,  37.,  26.,
        91.,  39.,  23.,  34.,  13.,  22.,  18.,   1.,  52.,  21.,  33.,
         5., 104.,  28.,  56.,  31.,  67.,   7.,  41.,  62.,   2.,  29.,
        66.,  59.,  82.,  64.,  79., 105.,  84.,  19.,  72.,   9.,  77.,
        68.,  85., 100.,  51.,  61.,  78.,  86.,  88.,  11.,  90.,  76.])

In [40]:
# Z-scores of the smallest and largest reported weekly hours (OCQ180), ignoring NaNs.
# The names carry an _hrs suffix so this cell does not rebind the built-in
# min() and max() for the rest of the kernel session.
mean_hrs = np.nanmean(ocq['OCQ180'])
std_hrs = np.nanstd(ocq['OCQ180'])  # NumPy default ddof=0, i.e. population standard deviation

min_hrs = np.nanmin(ocq['OCQ180'])
max_hrs = np.nanmax(ocq['OCQ180'])

# Distance of each extreme from the mean, in standard deviations.
low_zscore = (min_hrs - mean_hrs)/std_hrs
high_zscore = (max_hrs - mean_hrs)/std_hrs

print('Max hours z-score: ' + str(high_zscore))
print('Min hours z-score: ' + str(low_zscore))

Max hours z-score: 4.3764058992483355
Min hours z-score: -2.527646800029202


Save The Data
---

In [None]:
# NOTE: saving is left disabled. These are the same file names that are read
# at the top of the notebook, so running these lines would overwrite the
# original input files with the cleaned data.
# demo.to_csv('Demographics.csv', index=False)
# bmx.to_csv('BodyMeasures.csv', index=False)
# ocq.to_csv('Occupation.csv', index=False)