In [1]:
import numpy as np
import pandas as pd
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import statsmodels.api as sm

In [3]:
# Read the NHANES 2015-2016 CSV (path relative to the working directory)
# and preview the first rows.
nhanes_csv = 'nhanes_2015_2016.csv'
da = pd.read_csv(nhanes_csv)
da.head()

Unnamed: 0,SEQN,ALQ101,ALQ110,ALQ130,SMQ020,RIAGENDR,RIDAGEYR,RIDRETH1,DMDCITZN,DMDEDUC2,...,BPXSY2,BPXDI2,BMXWT,BMXHT,BMXBMI,BMXLEG,BMXARML,BMXARMC,BMXWAIST,HIQ210
0,83732,1.0,,1.0,1,1,62,3,1.0,5.0,...,124.0,64.0,94.8,184.5,27.8,43.3,43.6,35.9,101.1,2.0
1,83733,1.0,,6.0,1,1,53,3,2.0,3.0,...,140.0,88.0,90.4,171.4,30.8,38.0,40.0,33.2,107.9,
2,83734,1.0,,,1,1,78,3,1.0,3.0,...,132.0,44.0,83.4,170.1,28.8,35.6,37.0,31.0,116.5,2.0
3,83735,2.0,1.0,1.0,2,2,56,3,1.0,5.0,...,134.0,68.0,109.8,160.9,42.4,38.5,37.7,38.3,110.1,2.0
4,83736,2.0,1.0,1.0,2,2,42,4,1.0,4.0,...,114.0,54.0,55.2,164.9,20.3,37.4,36.0,27.2,80.4,2.0


## Investigating and cleaning the data


In [6]:
# Add labelled copies of two coded columns; the raw code columns are left as-is.
gender_labels = {1: 'Male', 2: 'Female'}
# Codes 7 and 9 become NaN so that a later dropna() discards those rows.
smoking_labels = {1: 'Yes', 2: 'No', 7: np.nan, 9: np.nan}

da['RIAGENDRx'] = da['RIAGENDR'].replace(gender_labels)
da['SMQ020x'] = da['SMQ020'].replace(smoking_labels)

In [8]:
# Keep only the two recoded columns and drop rows where either one is missing.
smoking_cols = ['SMQ020x', 'RIAGENDRx']
dx = da[smoking_cols].dropna()
dx.head()

Unnamed: 0,SMQ020x,RIAGENDRx
0,Yes,Male
1,Yes,Male
2,Yes,Male
3,No,Female
4,No,Female


In [9]:
# Count table: SMQ020x values on the rows, RIAGENDRx values on the columns.
pd.crosstab(index=dx['SMQ020x'], columns=dx['RIAGENDRx'])

RIAGENDRx,Female,Male
SMQ020x,Unnamed: 1_level_1,Unnamed: 2_level_1
No,2066,1340
Yes,906,1413


In [10]:
# Recode SMQ020x to a 0/1 indicator so that its group mean is the proportion
# of 'Yes' answers.
# The explicit cast avoids relying on replace() to downcast the object column
# to integers on its own, which recent pandas releases deprecate.
# Re-running the cell is harmless: values that are already 0/1 pass through.
dx['SMQ020x'] = dx['SMQ020x'].replace({'Yes': 1, 'No': 0}).astype('int64')
dx.head()

Unnamed: 0,SMQ020x,RIAGENDRx
0,1,Male
1,1,Male
2,1,Male
3,0,Female
4,0,Female


In [16]:
# Per-gender summary of the 0/1 smoking indicator: the mean is the proportion
# coded 1, and the size is the number of rows in the group.
# String aggregation names replace the NumPy callables: recent pandas releases
# emit a FutureWarning when np.mean is passed to agg(). The results are the same.
dz = dx.groupby('RIAGENDRx').agg({'SMQ020x': ['mean', 'size']})
dz.columns = ['Proportion', 'Total n']
dz

Unnamed: 0_level_0,Proportion,Total n
RIAGENDRx,Unnamed: 1_level_1,Unnamed: 2_level_1
Female,0.304845,2972
Male,0.513258,2753


### Difference of two population proportions (smoking data)

In [19]:
def _proportion_se(p, n):
    """Standard error of a sample proportion p estimated from n observations."""
    return np.sqrt((p*(1-p))/n)


# Females: proportion and group size as shown in the summary table above
p1, n1 = 0.304845, 2972
se_fem = _proportion_se(p1, n1)
print(se_fem)


# Males
p2, n2 = 0.513258, 2753
se_male = _proportion_se(p2, n2)
print(se_male)

0.00844415041930423
0.009526078787008965


In [20]:
# Standard error of the difference between the two proportions: sum the
# squared per-group standard errors, then take the square root.
sum_sq_se = se_fem**2 + se_male**2
se_diff = np.sqrt(sum_sq_se)
se_diff

0.012729880335656654

In [22]:
# Difference of the two sample proportions (female minus male). p1 and p2 are
# reused instead of retyping their values, so this cell cannot drift out of
# sync with the cell that defines them.
d = p1 - p2

# Lower and upper bounds: d minus/plus 1.96 standard errors of the difference
# (1.96 is the standard normal multiplier conventionally used for a 95% interval).
lcb = d - 1.96*se_diff
ucb = d + 1.96*se_diff
(lcb,ucb)

(-0.23336356545788706, -0.18346243454211297)

### Difference of two population means (BMI)

In [26]:
# Preview the first few values of the BMXBMI column.
da.loc[:, 'BMXBMI'].head()

0    27.8
1    30.8
2    28.8
3    42.4
4    20.3
Name: BMXBMI, dtype: float64

In [32]:
# Per-gender mean, standard deviation and group size of BMXBMI.
# String aggregation names replace the NumPy callables: recent pandas releases
# emit a FutureWarning when np.mean / np.std are passed to agg(). 'std' keeps
# the pandas sample standard deviation that the original call produced.
# NOTE(review): 'size' counts every row in the group, including any with a
# missing BMXBMI, whereas mean/std skip missing values. If BMXBMI contains
# NaNs, 'count' would be the matching n for a standard error -- confirm.
dy = da.groupby('RIAGENDRx').agg({'BMXBMI': ['mean', 'std', 'size']})
dy.columns = ["Mean", 'Standard deviation', 'Total n']
dy.head()

Unnamed: 0_level_0,Mean,Standard deviation,Total n
RIAGENDRx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Female,29.939946,7.753319,2976
Male,28.778072,6.252568,2759


In [34]:
# Standard error of the mean BMI per group: standard deviation / sqrt(n),
# using the values shown in the summary table above.
#
# The original cell assigned sem_fem / sem_males but printed se_fem / se_males.
# On a fresh kernel that printed the smoking-proportion standard error left in
# se_fem and then raised NameError on se_males. The pooled standard error cell
# below reads se_fem and se_males, so both pairs of names are bound here.
# Note that se_fem is deliberately rebound from the proportion section.

#Females
sem_fem = 7.753319/np.sqrt(2976)
se_fem = sem_fem
print(se_fem)

#Males
sem_males = 6.252568/np.sqrt(2759)
se_males = sem_males
print(se_males)


0.14212523289878048
0.11903716451870151


In [36]:
# Standard error of the difference between the two group means: sum the
# squared per-group standard errors, then take the square root.
sum_sq_sem = se_fem**2 + se_males**2
se_diff = np.sqrt(sum_sq_sem)
se_diff

0.18538993598139303

In [38]:
# Difference of the two sample means (female mean BMI minus male mean BMI),
# using the means shown in the summary table above.
bmi_mean_fem = 29.939946
bmi_mean_male = 28.778072
d = bmi_mean_fem - bmi_mean_male
d

1.1618739999999974

In [39]:
# Interval bounds: d minus/plus 1.96 standard errors of the difference
margin = 1.96*se_diff
lcb, ucb = d - margin, d + margin

(lcb,ucb)


(0.798509725476467, 1.5252382745235278)