### The Idea for this model is that a. fib is NOT present in NHANES. The best we can do is to infer its presence. So, here, we will use the first wave of the cohrot models to build a baseline model to predict a. fib. 


In [1]:
import pandas as pd
import numpy as np


cohort = pd.read_stata("/Users/burke/Documents/research/bpCog/meds.dta")
cohort = cohort[['newid', 'visitcounter', 'sbp', 'dbp', 'bmi', 'trig', 'smokestatus', 'choltot', 'cholhdl', 'cholldl', 
                 'age0', 'female0', 'educ0', 'racebpcog', 'hba1c', 'glucosef', 'afibslfrep', 'Hxafib', 'afibinc',
                'educ0', 'physact', 'waistcm']]

cohort['newid'] = cohort['newid'].str[2:]
cohort['newid'] = cohort['newid'].str[:-1]

cohort['anyAfib'] = cohort.index.isin(cohort.loc[(cohort.afibslfrep==1) |  (cohort.Hxafib == 1) | (cohort.afibinc == 1)].index)
cohort.drop(labels=['afibslfrep', 'Hxafib', 'afibinc'], inplace=True, axis='columns')

cohort = cohort.assign(id=(cohort['newid']).astype('category').cat.codes)
cohort.drop(labels=['newid'], axis='columns', inplace=True)
cohort = cohort.loc[:,~cohort.columns.duplicated()]

In [2]:
cohort.head()

Unnamed: 0,visitcounter,sbp,dbp,bmi,trig,smokestatus,choltot,cholhdl,cholldl,age0,female0,educ0,racebpcog,hba1c,glucosef,physact,waistcm,anyAfib,id
0,1,154.0,122.0,24.03079,80.0,2.0,161.0,40.0,105.0,22,0,3.0,2,,108.0,1.0,86.0,False,70
1,2,126.7,97.3,23.599523,88.0,2.0,151.0,46.0,88.0,22,0,3.0,2,,,1.0,78.25,False,70
2,3,134.0,92.7,25.823186,147.0,2.0,186.0,38.0,119.0,22,0,3.0,2,,,1.0,90.5,False,70
3,4,119.3,69.3,26.320689,147.0,2.0,200.0,48.0,123.0,22,0,3.0,2,5.7,116.038199,1.0,90.5,False,70
4,1,142.7,110.7,21.707538,69.0,1.0,198.0,59.0,125.0,30,0,5.0,2,,93.0,1.0,80.0,False,109


In [3]:
cohort.rename(columns={'smokestatus' : 'smokingStatus', 'choltot' : 'totChol',
                      "cholhdl" : 'hdl', 'cholldl' : 'ldl', 'age0' : 'age', 'female0' : 'gender', 'educ0' : 'education',
                      'hba1c' : 'a1c', 'racebpcog' : 'raceEthnicity', 'waistcm' : 'waist', 'physact' : 'anyPhysicalActivity'}, inplace=True)

cohort.gender.replace(to_replace=[0,1], value=[1, 2], inplace=True)
# have to compress "hispanic" to "other hispanic" and no representation for mexican american in BP Cog
cohort.raceEthnicity.replace(to_replace=[1,2,3,9], value=[4,3,2,5], inplace=True)
# have to see if we need to make changes to education...

### we're trying to build a model for baseline a. fib...so, just going to focus in on the initial visit

In [4]:
firstVisitOnly = cohort.loc[cohort.visitcounter ==1]

In [5]:
firstVisitOnly.anyAfib.value_counts(dropna=False, normalize=True)

False    0.990166
True     0.009834
Name: anyAfib, dtype: float64

In [6]:
allFactorsImputation = ["a1c", "hdl", "totChol", "bmi", "dbp", "sbp", "ldl", "trig", "glucosef", 'waist', 'anyPhysicalActivity']


firstVisitOnly = pd.concat([firstVisitOnly, pd.get_dummies(firstVisitOnly['raceEthnicity'], prefix="raceEth")], axis=1)
firstVisitOnly = pd.concat([firstVisitOnly, pd.get_dummies(firstVisitOnly['smokingStatus'], prefix="smoke")], axis=1)
firstVisitOnly = pd.concat([firstVisitOnly, pd.get_dummies(firstVisitOnly['education'], prefix="educ")], axis=1)

firstVisitOnly.drop(['visitcounter', 'smokingStatus', 'raceEthnicity', 'education'], axis='columns', inplace=True)

firstVisitOnly.rename(columns={'raceEth_2' : 'raceEth2', 'raceEth_3' : 'raceEth3', 'raceEth_4' : 'raceEth4',
                        'raceEth_5' : 'raceEth5', 'smoke_0.0' : 'smoke0', 'smoke_1.0' : 'smoke1' , 
                       'smoke_2.0' : 'smoke2', 'educ_1.0' : 'educ1', 'educ_2.0' : 'educ2', 'educ_3.0' : 'educ3',
                              'educ_4.0' : 'educ4', 'educ_5.0' : 'educ5'}, inplace=True)


In [7]:
import statsmodels.imputation.mice as mice

def getFormulaForVariable(var):
    cols = list(firstVisitOnly.columns)
    cols.remove(var)
    colStrings = [col + "+"for col in cols]
    return "".join(colStrings)[:-1]
    
imputedCohort = mice.MICEData(firstVisitOnly)
binaryVars = ['smoke0', 'smoke1', 'smoke2',  'raceEth2', 'raceEth3', 'raceEth4', 'raceEth5', 
              'educ1', 'educ2', 'educ3', 'educ4', 'educ5']
for var in binaryVars:
    imputedCohort.set_imputer(var, formula=getFormulaForVariable(var), model_class="logit")
imputedCohort.update_all(20)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)


In [8]:
imputedData = imputedCohort.data.copy()

# rebvuild categoricals
imputedData['smokingStatus'] = imputedData[['smoke0', 'smoke1', 'smoke2']].idxmax(axis=1)
imputedData['smokingStatus'] = imputedData['smokingStatus'].str[-1:]
imputedData['raceEthnicity'] = imputedData[['raceEth2', 'raceEth3', 'raceEth4', 'raceEth5']].idxmax(axis=1)
imputedData['raceEthnicity'] = imputedData['raceEthnicity'].str[-1:]
imputedData['education'] = imputedData[['educ1', 'educ2', 'educ3', 'educ4', 'educ5']].idxmax(axis=1)
imputedData['education'] = imputedData['raceEthnicity'].str[-1:]

In [9]:
# drop the dummies
imputedData.drop(['smoke0', 'smoke1','smoke2'], axis='columns', inplace=True)
imputedData.drop(['raceEth2','raceEth3', 'raceEth4', 'raceEth5'], axis='columns', inplace=True)
imputedData.drop(['educ1', 'educ2','educ3', 'educ4', 'educ5'], axis='columns', inplace=True)

In [10]:
imputedData.raceEthnicity= imputedData.raceEthnicity.astype('category')
imputedData.smokingStatus= imputedData.smokingStatus.astype('category')
imputedData.education= imputedData.education.astype('category')

imputedData.sort_values(by=['id', 'age'], inplace=True)

In [11]:
import statsmodels.formula.api as statsmodel
import sys
import os
sys.path.append(os.path.abspath("../mcm/"))
from mcm.regression_model import RegressionModel

imputedData['anyAfib'] = imputedData['anyAfib'].astype(int)
# note...dropping race from the model because it leads to convergence issues and it does not have 
# predictiveness independently of age and sex...

model = statsmodel.logit(formula='anyAfib ~ sbp + dbp + bmi + trig + totChol + hdl + ldl + age + gender + a1c + waist + anyPhysicalActivity + smokingStatus  + education', data=imputedData)
results = model.fit()
results.summary()
mcmRegressionModel = RegressionModel(results.params.to_dict(), results.bse.to_dict(), None, None)
mcmRegressionModel.write_json(os.path.abspath("../mcm/mcm/data/BaselineAFibModelSpec.json"))

Optimization terminated successfully.
         Current function value: 0.045705
         Iterations 10


### left off here _ next step is to add the baseline a-fib model into the population code..probably in the NHANESDirectSample __init__ method after loading the nhanes population from data...then apply this model to add a. fib in

In [12]:
results.summary()

0,1,2,3
Dep. Variable:,anyAfib,No. Observations:,40268.0
Model:,Logit,Df Residuals:,40250.0
Method:,MLE,Df Model:,17.0
Date:,"Tue, 02 Jul 2019",Pseudo R-squ.:,0.1726
Time:,20:20:25,Log-Likelihood:,-1840.4
converged:,True,LL-Null:,-2224.3
,,LLR p-value:,3.343e-152

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-10.4787,0.850,-12.333,0.000,-12.144,-8.813
smokingStatus[T.1],0.0518,0.115,0.452,0.651,-0.173,0.277
smokingStatus[T.2],-0.0007,0.178,-0.004,0.997,-0.349,0.348
education[T.3],0.8575,0.219,3.918,0.000,0.429,1.286
education[T.4],0.4965,0.225,2.205,0.027,0.055,0.938
education[T.5],0.6913,0.340,2.035,0.042,0.025,1.357
sbp,-0.0050,0.003,-1.637,0.102,-0.011,0.001
dbp,0.0044,0.005,0.826,0.409,-0.006,0.015
bmi,0.0787,0.018,4.301,0.000,0.043,0.115


In [13]:
pd.Series(results.predict()).mean()

0.009834111453263166

In [14]:
import sys
import os
sys.path.append(os.path.abspath("../mcm/"))


from mcm.population import NHANESDirectSamplePopulation
os.chdir("/Users/burke/Documents/research/bpCog/mcm/")

pop = NHANESDirectSamplePopulation(10000, 1999)

In [15]:
from mcm.person import Person
import json
from mcm.statsmodel_logistic_risk_factor_model import StatsModelLogisticRiskFactorModel
from mcm.race_ethnicity import NHANESRaceEthnicity
from mcm.smoking_status import SmokingStatus
from mcm.education import Education

def initializeAFib(person):
    #abs_module_path = os.path.abspath(os.path.dirname(__file__))
    model_spec_path = os.path.normpath(os.path.join("/Users/burke/Documents/research/bpCog/mcm/mcm/data/",
                                                        "BaselineAFibModel" + "Spec.json"))
    with open(model_spec_path, 'r') as model_spec_file:
        model_spec = json.load(model_spec_file)
    model = RegressionModel(**model_spec)
    statsModel = StatsModelLogisticRiskFactorModel(model)
    return statsModel.estimate_next_risk(person)

tempPeople = [Person(row['age'], row['gender'], NHANESRaceEthnicity(int(row['raceEthnicity'])), row['sbp'], row['dbp'], row['a1c'],
                    row['hdl'], row['totChol'], row['bmi'], row['ldl'], row['trig'],
                    row['waist'], row['anyPhysicalActivity'], Education(int(row['education'])), SmokingStatus(int(row['smokingStatus'])),
                    initializeAFib) for index, row in imputedData.iterrows()]