### The idea for this model is that atrial fibrillation (a-fib) is NOT recorded in NHANES, so the best we can do is infer its presence. Here, we use the first wave (first visit) of the cohort data to build a baseline model that predicts a-fib.


In [100]:
import pandas as pd
import numpy as np

# Load the cohort file and keep only the columns used in this notebook.
keepColumns = ['newid', 'visitcounter', 'sbp', 'dbp', 'bmi', 'trig', 'smokestatus', 'choltot', 'cholhdl',
               'cholldl', 'age0', 'female0', 'educ0', 'racebpcog', 'hba1c', 'glucosef',
               'afibslfrep', 'Hxafib', 'afibinc']
cohort = pd.read_stata("/Users/burke/Documents/research/bpCog/meds.dta")[keepColumns]

# Strip the first two characters and the last character from the raw identifier.
cohort['newid'] = cohort['newid'].str[2:-1]

# A row counts as a-fib positive when any of the three source indicators equals 1.
afibIndicators = ['afibslfrep', 'Hxafib', 'afibinc']
hasAnyAfib = (cohort['afibslfrep'] == 1) | (cohort['Hxafib'] == 1) | (cohort['afibinc'] == 1)
cohort['anyAfib'] = hasAnyAfib

# Replace the string identifier with its integer category code.
cohort['id'] = cohort['newid'].astype('category').cat.codes

# The source indicators and the raw identifier are no longer needed.
cohort = cohort.drop(columns=afibIndicators + ['newid'])

In [101]:
# Rename the cohort columns to the names used in the rest of this notebook.
cohort = cohort.rename(columns={'smokestatus': 'smokingStatus', 'choltot': 'totChol',
                                'cholhdl': 'hdl', 'cholldl': 'ldl', 'age0': 'age',
                                'female0': 'gender', 'educ0': 'educ',
                                'hba1c': 'a1c', 'racebpcog': 'raceEthnicity'})

# Recode by assigning the result back to the column. Calling replace(..., inplace=True)
# on a column selection (cohort.gender.replace(...)) is a chained in-place update and
# does not modify the frame under pandas Copy-on-Write.
# NOTE: this cell is not idempotent - re-running it recodes gender 1 -> 2 a second time.
cohort['gender'] = cohort['gender'].replace({0: 1, 1: 2})
# have to compress "hispanic" to "other hispanic" and no representation for mexican american in BP Cog
cohort['raceEthnicity'] = cohort['raceEthnicity'].replace({1: 4, 2: 3, 3: 2, 9: 5})

In [102]:
# Keep only the rows from each participant's first visit.
isFirstVisit = cohort['visitcounter'] == 1
firstVisitOnly = cohort[isFirstVisit]

In [103]:
allFactorsImputation = ["a1c", "hdl", "totChol", "bmi", "dbp", "sbp", "ldl", "trig", "glucosef"]

# One-hot encode the two categorical predictors and drop the columns that are
# not used by the imputation model.
raceDummies = pd.get_dummies(firstVisitOnly['raceEthnicity'], prefix="raceEth")
smokeDummies = pd.get_dummies(firstVisitOnly['smokingStatus'], prefix="smoke")
firstVisitOnly = pd.concat([firstVisitOnly, raceDummies, smokeDummies], axis=1)
firstVisitOnly = firstVisitOnly.drop(columns=['visitcounter', 'smokingStatus', 'raceEthnicity', 'educ'])

# get_dummies builds labels from the level values, so a float-coded column yields
# 'raceEth_2.0' while an integer-coded one yields 'raceEth_2'. The later cells refer to
# 'raceEth_2'..'raceEth_5' and 'smoke0'..'smoke2', so normalise both spellings to exactly
# those names (the previous rename produced 'raceEth2', which no later cell uses).
dummyRenames = {}
for level in (2, 3, 4, 5):
    dummyRenames['raceEth_%d.0' % level] = 'raceEth_%d' % level
for level in (0, 1, 2):
    dummyRenames['smoke_%d.0' % level] = 'smoke%d' % level
    dummyRenames['smoke_%d' % level] = 'smoke%d' % level
firstVisitOnly = firstVisitOnly.rename(columns=dummyRenames)


In [104]:
import statsmodels.imputation.mice as mice

def getFormulaForVariable(var, data=None):
    """Build the right-hand side of a formula that uses every column except ``var``.

    Parameters
    ----------
    var : str
        Name of the column being modelled; it is left out of the predictors.
    data : pandas.DataFrame, optional
        Frame whose columns supply the predictor names. Defaults to the
        notebook-level ``firstVisitOnly`` frame, which is what the original
        implementation always used.

    Returns
    -------
    str
        The remaining column names joined with ``+`` (e.g. ``"a+c"``), or an
        empty string when ``var`` is the only column.

    Raises
    ------
    ValueError
        If ``var`` is not a column of the frame.
    """
    if data is None:
        data = firstVisitOnly
    cols = list(data.columns)
    # list.remove raises ValueError when var is absent, as before.
    cols.remove(var)
    return "+".join(cols)
    
from statsmodels.discrete.discrete_model import Logit

# Chained-equations imputation of the first-visit data.
# The imputation is stochastic, so seed it to make the exported model reproducible.
# NOTE(review): this assumes statsmodels' MICE draws from NumPy's global RNG - confirm.
miceSeed = 0
np.random.seed(miceSeed)
imputedCohort = mice.MICEData(firstVisitOnly)

# The dummy columns are binary, so give them a logistic conditional model.
# set_imputer expects a statsmodels model class; the string "logit" that was passed
# before is not one and would fail as soon as one of these variables was actually imputed.
binaryVars = ['smoke0', 'smoke1', 'smoke2', 'raceEth_2', 'raceEth_3', 'raceEth_4', 'raceEth_5']
for var in binaryVars:
    imputedCohort.set_imputer(var, formula=getFormulaForVariable(var), model_class=Logit)

# Run 20 imputation cycles.
imputedCohort.update_all(20)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [105]:
# Work on a copy of the imputed frame.
imputedData = imputedCohort.data.copy()

# Rebuild each categorical from its dummy columns: take the label of the column with
# the largest value in each row (idxmax returns the first such column on ties) and
# keep its last character as the category code.
dummyGroups = {
    'smokingStatus': ['smoke0', 'smoke1', 'smoke2'],
    'raceEthnicity': ['raceEth_2', 'raceEth_3', 'raceEth_4', 'raceEth_5'],
}
for categoricalName, dummyColumns in dummyGroups.items():
    winningColumn = imputedData[dummyColumns].idxmax(axis=1)
    imputedData[categoricalName] = winningColumn.str[-1:]



In [106]:
# Drop the dummy columns now that the categoricals have been rebuilt.
# ('raceEth_3' was previously listed twice; each label appears once here.)
imputedData.drop(['smoke0', 'smoke1', 'smoke2',
                  'raceEth_2', 'raceEth_3', 'raceEth_4', 'raceEth_5'],
                 axis='columns', inplace=True)


In [107]:
# Store the rebuilt codes as pandas categoricals, then order rows by participant and age.
for categoricalColumn in ('raceEthnicity', 'smokingStatus'):
    imputedData[categoricalColumn] = imputedData[categoricalColumn].astype('category')
imputedData = imputedData.sort_values(by=['id', 'age'])

In [108]:
# Preview the first five rows of the imputed data.
imputedData.head(n=5)

Unnamed: 0,sbp,dbp,bmi,trig,totChol,hdl,ldl,age,gender,a1c,glucosef,anyAfib,id,smokingStatus,raceEthnicity
908,126.7,76.7,29.196743,135.0,215.0,40.0,148.0,29,2,6.6,86.0,False,0,0,4
36132,152.0,102.7,29.640689,90.0,128.0,22.0,88.0,54,1,8.2,87.0,False,1,2,3
10795,135.3,107.3,37.576373,195.0,183.0,34.0,110.0,30,2,5.6,92.0,False,2,2,3
30689,147.0,100.0,35.388487,102.0,146.0,20.0,105.02,56,2,4.8,73.0,False,3,1,4
29333,192.0,112.0,30.314982,68.0,207.0,39.0,154.554,51,2,6.4,116.0,False,4,2,4


In [126]:
import statsmodels.formula.api as statsmodel
import sys
import os
sys.path.append(os.path.abspath("../mcm/"))
from mcm.regression_model import RegressionModel

# Fit the baseline a-fib model and export its specification as JSON.
# The boolean outcome is cast to 0/1 so it can be used as the response of an OLS fit.
imputedData['anyAfib'] = imputedData['anyAfib'].astype(int)

model = statsmodel.ols(formula='anyAfib ~ sbp + dbp + bmi + trig + totChol + hdl + ldl + age + gender + a1c + glucosef + smokingStatus + raceEthnicity', data=imputedData)
results = model.fit()
# A bare results.summary() in the middle of a cell is evaluated and discarded;
# print it so the fit is actually shown.
print(results.summary())

# Pass the coefficients, their standard errors, and the mean and standard deviation
# of the residuals to the project's RegressionModel, then write it out.
# NOTE(review): the meaning of these four arguments is inferred from the values passed
# here - confirm against RegressionModel's signature.
mcmRegressionModel = RegressionModel(results.params.to_dict(), results.bse.to_dict(), results.resid.mean(), results.resid.std())
mcmRegressionModel.write_json(os.path.abspath("../mcm/mcm/data/BaselineAFibModelSpec.json"))

### TODO (left off here): the next step is to add the baseline a-fib model to the population code, probably in the `NHANESDirectSample.__init__` method after the NHANES population has been loaded from data, and then apply this model to assign a-fib status.