In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import os
os.chdir("/Users/burke/Documents/research/bpCog/microsim")
from microsim.population import NHANESDirectSamplePopulation
from microsim.gender import NHANESGender
from microsim.race_ethnicity import NHANESRaceEthnicity

In [2]:
# Load the fully imputed dataset and attach 0/1 hypertension indicators to it.
baseDF = pd.read_stata('microsim/data/fullyImputedDataset.dta')

# Conditions reused by several of the indicator definitions below.
elevatedBP = (baseDF['meanSBP'] >= 140) | (baseDF['meanDBP'] >= 90)
onAntiHypertensive = baseDF['antiHypertensive'] > 0
reportsHtnMed = baseDF['selfReportCurrentHtnMed'] == 1

# Dict order is the order in which the indicator columns are appended.
indicatorMasks = {
    'htn': elevatedBP | onAntiHypertensive,            # BP above threshold or on medication
    'control': (baseDF.meanSBP < 140) & (baseDF.meanDBP < 90),
    'htnLevelOnly': elevatedBP,                        # BP threshold alone
    'baseLevelAndSelfReport': elevatedBP | reportsHtnMed,
}
for indicator, mask in indicatorMasks.items():
    # Default every row to 0, then switch on the rows that satisfy the rule.
    baseDF[indicator] = 0
    baseDF.loc[mask, indicator] = 1

In [3]:
# Independent per-year copies of the imputed data (2015 and 2017).
base15, base17 = (baseDF.loc[baseDF.year == cycle].copy() for cycle in (2015, 2017))

In [4]:
# Read the non-imputed and complete-case datasets, in that order.
noImpute, completeCase = map(
    pd.read_stata,
    (
        "/Users/burke/Documents/research/bpCog/nhanes/sourceData/nonImputedDataset.dta",
        "/Users/burke/Documents/research/bpCog/nhanes/sourceData/completeCaseDataset.dta",
    ),
)

In [5]:
def _flagHypertension(frame, zeroedColumns):
    """Add 0/1 hypertension indicator columns to ``frame`` in place.

    Every column named in ``zeroedColumns`` is first set to 0 (in the given
    order), then 'htn', 'control' and 'htnLevelOnly' are set to 1 on the rows
    that meet their rule. Columns listed but not covered by a rule stay at 0.
    """
    for column in zeroedColumns:
        frame[column] = 0

    elevatedBP = (frame['meanSBP'] >= 140) | (frame['meanDBP'] >= 90)
    frame.loc[elevatedBP | (frame['antiHypertensive'] > 0), 'htn'] = 1
    frame.loc[(frame.meanSBP < 140) & (frame.meanDBP < 90), 'control'] = 1
    frame.loc[elevatedBP, 'htnLevelOnly'] = 1


# 'controlSelfReport' is only initialised here; no rule in this cell sets it to 1.
_flagHypertension(noImpute, ['htn', 'control', 'controlSelfReport', 'htnLevelOnly'])
_flagHypertension(completeCase, ['htn', 'control', 'htnLevelOnly'])

# Independent per-year copies (2015 and 2017) of each dataset.
noImpute15, noImpute17 = (noImpute.loc[noImpute.year == cycle].copy() for cycle in (2015, 2017))
completeCase15, completeCase17 = (completeCase.loc[completeCase.year == cycle].copy() for cycle in (2015, 2017))

In [6]:
# Trends in Blood Pressure Control Among US Adults With Hypertension,
# 1999-2000 to 2017-2018
# Paul Muntner, PhD; Shakia T. Hardy, PhD; Lawrence J. Fine, MD; Byron C. Jaeger, PhD; Gregory Wozniak, PhD;
# Emily B. Levitan, ScD; Lisandro D. Colantonio, MD, PhD
# In the supplement: in 2015, out of 5,485 individuals available for the BP definition, 1,968 had htn;
# in 2017 it was 2,094 out of 5,176.

In [7]:
# 2015: hypertension share (mean of the 0/1 flag) and count (sum of the flag) for each
# dataset and definition. The "target" line uses the published counts noted earlier
# (1,968 hypertensive out of 5,485).
print(f"2015 target htn %: {1968/5485:.2f}, #: {1968} ")
print(f"2015 imputed htn %: {base15.htn.mean():.2f}, # {base15.htn.sum()} ")
print(f"2015 imputed htn + self report %: {base15.baseLevelAndSelfReport.mean():.2f}, # {base15.baseLevelAndSelfReport.sum()} ")

# Same 'htn' definition on the complete-case and non-imputed datasets.
print(f"2015 complete case %: {completeCase15.htn.mean():.2f}, # {completeCase15.htn.sum()} ")
print(f"2015 no impute %: {noImpute15.htn.mean():.2f}, # {noImpute15.htn.sum()} \n")

# Definition based on measured BP level alone (no medication criterion).
print(f"2015  htn level only, impute %: {base15.htnLevelOnly.mean():.2f}, # {base15.htnLevelOnly.sum()} ")
print(f"2015 htn level only, complete case %: {completeCase15.htnLevelOnly.mean():.2f}, # {completeCase15.htnLevelOnly.sum()} ")
print(f"2015 htn level only, no impute %: {noImpute15.htnLevelOnly.mean():.2f}, # {noImpute15.htnLevelOnly.sum()} ")

2015 target htn %: 0.36, #: 1968 
2015 imputed htn %: 0.44, # 2634 
2015 imputed htn + self report %: 0.35, # 2123 
2015 complete case %: 0.60, # 535 
2015 no impute %: 0.39, # 2332 

2015  htn level only, impute %: 0.19, # 1146 
2015 htn level only, complete case %: 0.24, # 213 
2015 htn level only, no impute %: 0.18, # 1064 


In [8]:
# 2017: hypertension share (mean of the 0/1 flag) and count (sum of the flag) for each
# dataset and definition. The "target" line uses the published counts noted earlier
# (2,094 hypertensive out of 5,176).
print(f"2017 target htn %: {2094/5176:.2f}, # {2094} ")
print(f"2017 imputed htn %: {base17.htn.mean():.2f}, # {base17.htn.sum()} ")
print(f"2017 imputed htn + self report %: {base17.baseLevelAndSelfReport.mean():.2f}, # {base17.baseLevelAndSelfReport.sum()} ")

# Same 'htn' definition on the complete-case and non-imputed datasets.
print(f"2017 complete case %: {completeCase17.htn.mean():.2f}, # {completeCase17.htn.sum()} ")
print(f"2017 no impute %: {noImpute17.htn.mean():.2f}, # {noImpute17.htn.sum()} \n")

# Definition based on measured BP level alone (no medication criterion).
print(f"2017  htn level only, impute %: {base17.htnLevelOnly.mean():.2f}, # {base17.htnLevelOnly.sum()} ")
print(f"2017 htn level only, complete case %: {completeCase17.htnLevelOnly.mean():.2f}, # {completeCase17.htnLevelOnly.sum()} ")
print(f"2017 htn level only, no impute %: {noImpute17.htnLevelOnly.mean():.2f}, # {noImpute17.htnLevelOnly.sum()} ")

2017 target htn %: 0.40, # 2094 
2017 imputed htn %: 0.48, # 2826 
2017 imputed htn + self report %: 0.40, # 2347 
2017 complete case %: 0.59, # 476 
2017 no impute %: 0.43, # 2500 

2017  htn level only, impute %: 0.23, # 1374 
2017 htn level only, complete case %: 0.23, # 183 
2017 htn level only, no impute %: 0.21, # 1237 


In [9]:
# All years pooled: hypertension share (mean of the 0/1 flag) and count (sum of the flag)
# for the imputed, complete-case and non-imputed datasets.
print(f"overall imputed htn %: {baseDF.htn.mean():.2f}, # {baseDF.htn.sum()} ")
print(f"complete case %: {completeCase.htn.mean():.2f}, # {completeCase.htn.sum()} ")
print(f"no impute %: {noImpute.htn.mean():.2f}, # {noImpute.htn.sum()} \n")

# Definition based on measured BP level alone (no medication criterion).
print(f"htn level only, impute %: {baseDF.htnLevelOnly.mean():.2f}, # {baseDF.htnLevelOnly.sum()} ")
print(f"htn level only, complete case %: {completeCase.htnLevelOnly.mean():.2f}, # {completeCase.htnLevelOnly.sum()} ")
print(f"htn level only, no impute %: {noImpute.htnLevelOnly.mean():.2f}, # {noImpute.htnLevelOnly.sum()} ")

overall imputed htn %: 0.41, # 24544 
complete case %: 0.56, # 4736 
no impute %: 0.37, # 21782 

htn level only, impute %: 0.20, # 11895 
htn level only, complete case %: 0.23, # 1954 
htn level only, no impute %: 0.18, # 10537 


In [10]:
### the age-adjusted version of this is in the Figure of the Muntner et al. paper — the trend looks good and the overall levels are believable

In [11]:
# Fraction with control == 0 vs. 1 among rows flagged by baseLevelAndSelfReport (2015, then 2017).
for cycleDF in (base15, base17):
    flaggedRows = cycleDF.loc[cycleDF['baseLevelAndSelfReport'] == 1]
    print(flaggedRows['control'].value_counts(normalize=True))

0    0.539802
1    0.460198
Name: control, dtype: float64
0    0.585428
1    0.414572
Name: control, dtype: float64


### upshot is that our measures of overall htn vary substantially based on how you define medications. 

If you look at level-based definitions, our measures correlate closely. If you define hypertension using BP + self-reported BP meds, our measures correlate closely. If you define hypertension using BP + our imputed anti-hypertensives, we end up with a lot more "hypertension". That's not necessarily a problem, though — we're imputing anti-hypertensive medications, whereas NHANES is measuring "blood pressure medications". The distinction is that an anti-hypertensive medication (e.g. propranolol) might be used for a variety of non-blood-pressure indications (e.g. migraine, tremor), and others might be used for vascular protection in the absence of a hypertension diagnosis (e.g. CHF, chronic renal disease, etc.).

In [12]:
# Share of total WTINT2YR weight in each 'htn' group, 2015.
htnWeight15 = base15.groupby('htn')['WTINT2YR'].sum()
htnWeight15 / base15['WTINT2YR'].sum()

htn
0    0.603151
1    0.396849
Name: WTINT2YR, dtype: float64

In [13]:
# Share of total WTINT2YR weight in each 'baseLevelAndSelfReport' group, 2015.
selfReportWeight15 = base15.groupby('baseLevelAndSelfReport')['WTINT2YR'].sum()
selfReportWeight15 / base15['WTINT2YR'].sum()

baseLevelAndSelfReport
0    0.683647
1    0.316353
Name: WTINT2YR, dtype: float64

In [14]:
### survey weighted: share of WTINT2YR weight by 'control' among 2015 rows with htn set
htnRows15 = base15.loc[base15.htn==True]
htnRows15.groupby('control')['WTINT2YR'].sum() / htnRows15['WTINT2YR'].sum()

control
0    0.409855
1    0.590145
Name: WTINT2YR, dtype: float64

In [15]:
### now compare to the 2013 NHANES population with htn, as identified in this paper: 
# Lamprea-Montealegre, J. A., Zelnick, L. R., Hall, Y. N., Bansal, N. & Boer, I. H. de. Prevalence of Hypertension and Cardiovascular Risk According to Blood Pressure Thresholds Used for Diagnosis. Hypertension 72, 602–609 (2018).

# 2013 rows, split into one cohort per hypertension definition (each an independent copy).
base13 = baseDF.loc[baseDF.year == 2013].copy()
base13Htn, base13HtnLevelOnly, base13HtnLevelAndSelfReport = (
    base13.loc[base13[definition] == 1].copy()
    for definition in ('htn', 'htnLevelOnly', 'baseLevelAndSelfReport')
)

initBinaries = ["female", "white", "aa", "hispanic", "diabetes", "cvd", "hypertensionMed", "statinMed"]

for cohort in (base13Htn, base13HtnLevelOnly, base13HtnLevelAndSelfReport):
    # Every indicator starts at 0 ...
    for binary in initBinaries:
        cohort[binary] = 0

    # ... and is switched to 1 where its rule holds. Race/ethnicity rules compare
    # the integer category codes of the raceEthnicity column.
    raceCode = cohort.raceEthnicity.cat.codes
    rules = {
        'female': cohort.gender == 2,
        'white': raceCode == 2,
        'aa': raceCode == 3,
        'hispanic': (raceCode == 0) | (raceCode == 1),
        'diabetes': cohort.a1c >= 6.5,
        'cvd': (cohort.selfReportMI == 1) | (cohort.selfReportStroke == 1),
        'hypertensionMed': cohort.antiHypertensive >= 1,
        'statinMed': cohort.statin >= 1,
    }
    for binary, rule in rules.items():
        cohort.loc[rule, binary] = 1

In [16]:
# Target values for each characteristic; the notebook compares these against the 2013
# hypertensive cohorts (presumably taken from the paper cited in the previous cell — confirm).
factors = {"age" : 60, "female" : .50, "white" : 0.71, "aa" :0.14, "hispanic": 0.10, "diabetes": 0.25, "cvd" : 0.19,
          "hypertensionMed" : 0.80, "statinMed" : 0.41, "bmi" : 31.0, "meanSBP" : 133.4, "meanDBP" : 71.6}

# Unweighted means of every factor under each hypertension definition.
htnMeans = [base13Htn[factor].mean() for factor in factors]
htnLevelOnlyMeans = [base13HtnLevelOnly[factor].mean() for factor in factors]
htnLevelAndSelfReportMeans = [base13HtnLevelAndSelfReport[factor].mean() for factor in factors]

# Approximate weighted means by resampling rows with probability proportional to WTINT2YR.
# The draw is seeded so the 'survey' column is identical on every run; without a seed the
# reported values change each time the cell is executed.
surveySampleSeed = 42
htnSurveyWeighted = base13Htn.sample(n=100000, weights=base13Htn.WTINT2YR, replace=True,
                                     random_state=surveySampleSeed)
htnSurveyWeightedMeans = [htnSurveyWeighted[factor].mean() for factor in factors]


# One row per factor: target value next to each computed mean.
compareDF = pd.DataFrame({'variable' : list(factors.keys()), 'target': list(factors.values()), 
                          'htnOnly': htnMeans, 'levelOnly' :htnLevelOnlyMeans, 
                          'htnLevelAndSelfReportMeans' :  htnLevelAndSelfReportMeans, 'survey' : htnSurveyWeightedMeans})

compareDF

Unnamed: 0,variable,target,htnOnly,levelOnly,htnLevelAndSelfReportMeans,survey
0,age,60.0,59.843269,60.975586,61.304472,58.44268
1,female,0.5,0.50454,0.493164,0.521408,0.49759
2,white,0.71,0.442163,0.388672,0.438154,0.69729
3,aa,0.14,0.257402,0.288086,0.26784,0.13827
4,hispanic,0.1,0.187919,0.195312,0.182683,0.10358
5,diabetes,0.25,0.202132,0.186523,0.208373,0.17618
6,cvd,0.19,0.134623,0.106445,0.140343,0.11833
7,hypertensionMed,0.8,0.827477,0.573242,0.780209,0.82564
8,statinMed,0.41,0.407422,0.320312,0.41294,0.41691
9,bmi,31.0,30.691433,30.27834,30.931684,30.873554


In [17]:
### the gist is that the survey-weighted values look good by comparison to the table....whew!

In [18]:
# Build the population object used for the 2013 comparison.
# NOTE(review): the arguments look like (sample size, NHANES year) — confirm against
# NHANESDirectSamplePopulation's signature.
pop13 = NHANESDirectSamplePopulation(500000, 2013)

In [19]:
def getDFForPop(pop):
    """Flatten a population into a DataFrame with one row per person.

    Parameters
    ----------
    pop : object exposing ``_people``, a mapping-like container whose
        ``.items()`` yields ``(key, person)`` pairs (e.g. a pandas Series).
        Only the person objects are used; the keys are ignored.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: age, female, white, aa, hispanic, diabetes, cvd,
        hypertensionMed, statinMed, bmi, meanSBP, meanDBP, htn. History-style
        attributes (``_age``, ``_bmi``, ``_sbp``, ``_dbp``, ``_statin``,
        ``_antiHypertensiveCount``) contribute their last element.
    """
    # .items() replaces Series.iteritems(), which was removed in pandas 2.0.
    # The people are materialized once instead of re-iterating per column.
    people = [person for _, person in pop._people.items()]

    columns = {
        'age': [person._age[-1] for person in people],
        'female': [person._gender == NHANESGender.FEMALE for person in people],
        'white': [person._raceEthnicity == NHANESRaceEthnicity.NON_HISPANIC_WHITE for person in people],
        'aa': [person._raceEthnicity == NHANESRaceEthnicity.NON_HISPANIC_BLACK for person in people],
        'hispanic': [person._raceEthnicity == NHANESRaceEthnicity.MEXICAN_AMERICAN
                     or person._raceEthnicity == NHANESRaceEthnicity.OTHER_HISPANIC
                     for person in people],
        'diabetes': [person._current_diabetes for person in people],
        'cvd': [person._stroke or person._mi for person in people],
        'hypertensionMed': [person._antiHypertensiveCount[-1] >= 1 for person in people],
        'statinMed': [person._statin[-1] >= 1 for person in people],
        'bmi': [person._bmi[-1] for person in people],
        'meanSBP': [person._sbp[-1] for person in people],
        'meanDBP': [person._dbp[-1] for person in people],
        # Hypertension: latest SBP >= 140, or latest DBP >= 90, or at least one anti-hypertensive.
        'htn': [person._sbp[-1] >= 140 or person._dbp[-1] >= 90 or person._antiHypertensiveCount[-1] >= 1
                for person in people],
    }
    return pd.DataFrame(columns)

In [23]:
# Flatten the 2013 population and keep only the rows flagged as hypertensive.
df13 = getDFForPop(pop13)

isHypertensive13 = df13.htn == 1
df13Htn = df13.loc[isHypertensive13]

# Plain (unweighted) mean of every comparison factor; appended to the comparison
# table as the 'sim' column, and the table is written to CSV.
weightedFactorsPop = []
for factor in factors:
    weightedFactorsPop.append(df13Htn[factor].mean())
compareDF['sim'] = weightedFactorsPop
compareDF.to_csv('compareNHANESHypertension.csv')

### we're low on CVD...but that's ok — because they include CHF and we can't measure it — so I'll take it out of the pub...

In [25]:
# Populations for 2007 and 2009 (250000 passed as the first argument for each),
# created in that order and then flattened to one DataFrame apiece.
pop7, pop9 = (NHANESDirectSamplePopulation(250000, year) for year in (2007, 2009))
df7, df9 = map(getDFForPop, (pop7, pop9))

In [26]:
# Stack the 2007 and 2009 frames, report the combined row count, and preview the top rows.
df79 = pd.concat((df7, df9))
print(df79.shape[0])
df79.head(5)

500000


Unnamed: 0,age,female,white,aa,hispanic,diabetes,cvd,hypertensionMed,statinMed,bmi,meanSBP,meanDBP,htn
0,44.0,False,True,False,False,False,False,False,False,27.01,117.333333,74.0,False
1,18.0,True,True,False,False,False,False,False,False,19.24,112.666667,50.666667,False
2,42.0,False,True,False,False,False,False,False,False,26.32,120.0,80.0,False
3,63.0,False,True,False,False,False,False,False,False,21.11,145.333333,89.333333,True
4,37.0,False,False,False,True,False,False,False,False,31.56,128.0,79.333333,False


In [30]:
# Target values for the pooled 2007/2009 population, compared with the simulated means.
target79 = {'age' : 45.9, 'female': 1-.483, 'white' : .684, 'aa' : .115, 'hispanic' : .136, 'bmi' : 28.5}

# One row per factor: target next to the mean of the matching df79 column.
compare79DF = pd.DataFrame({
    'variable': list(target79),
    'target': list(target79.values()),
    'sim': [df79[name].mean() for name in target79],
})
compare79DF

Unnamed: 0,variable,target,sim
0,age,45.9,45.915826
1,female,0.517,0.516658
2,white,0.684,0.68338
3,aa,0.115,0.114234
4,hispanic,0.136,0.136058
5,bmi,28.5,28.528073


In [31]:
# Save the 2007/2009 comparison table to 'compare79.csv' (relative path, so it lands in
# the current working directory).
compare79DF.to_csv('compare79.csv')

### this looks pretty good! 

— we're low on CVD...but that included CHF in the paper and we don't have CHF, so we should be under
— we're also low on diabetes...but, again, our definition is different from theirs, which is glucose-lowering med or A1C > 6.5...we're just a1c >= 6.5