In [327]:
# widely used imports
import pandas as pd
import numpy as np
import os
import numpy.random as random
import time
import functools

# working directory
os.chdir("/Users/burke/Documents/research/scrooge")

# import key NHATS fields
nhanesDF = pd.read_stata("nhanesForScrooge.dta")

# simple weighting schema using the NHATS weights and turnign into simple probability weights
nhanesDF['probWeight'] = nhanesDF.WTINT2YR / np.sum(nhanesDF.WTINT2YR) 
nhanesDF.patientID.astype("int64")

# load the file mapping screening services to patients
screeningRules = pd.read_excel("simplifiedPreventiveServices.xlsx")

In [175]:
class Person:
    def __init__(self, patientID, gender, age, race, bmi, dm, htn, hl, smoking):
        self.patientID = patientID
        self.gender = gender
        self.age = age
        self.bmi = bmi
        self.dm = dm
        self.htn = htn
        self.hl = hl
        self.smoking = smoking

In [326]:
parameter_annualPanelAttritionRate = 0.30
parameter_proportionOfAllVisitsToPCP = 0.51

parameter_maleVisitRates = {(18,24) : 119.6, (25,44) : 127.3, (45,64) : 312.1, (65,74) : 559.5, (75,80): 799.2}
parameter_femaleVisitRates = {(18,24) : 235.3, (25,44) : 302.7, (45,64) : 417.3, (65,74) : 606.7, (75,80): 736.2}

class Provider:
    def __init__(self, panelSize):
        self.panelSize = panelSize
        self.initPanel()
        self.visits = pd.DataFrame(data=None, columns=['visitDate', 'patientID', 'year', 'age', 'gender', 'raceEth', 'bmi',
                                                      'smokingStatus', 'selfReportHtn', 'selfReportHyperlipidemia', 
                                                      'selfReportDiabetes'])
        self.visits.patientID.astype("int")
    
        self.screeningServices = pd.DataFrame(data=None, columns=['patientID', 'serviceName', 'applicable', 'timeSpent', 'visitDate',])
        self.startYear = 2018
        self.year = self.startYear
        
    def initPanel(self):
        rowIndices = random.choice(nhanesDF.index.values, size = self.panelSize, replace=True, p=nhanesDF.probWeight)
        self.panel = nhanesDF.iloc[rowIndices]
        self.panel = self.panel.reset_index(drop=True)
        self.lifetimePanel = self.panel.copy()
        
    def advancePanelByYear(self, years):
        for i in range(0, years):
            self.year += 1
            self.losePatientsToAttrition(parameter_annualPanelAttritionRate)
            self.addNewPatients(parameter_annualPanelAttritionRate)
            self.generateVisitHistoryForPanel()
        
    def losePatientsToAttrition(self, attritionRate):
        self.panel = self.panel.drop(random.choice(self.panel.index.values, size=int(attritionRate * self.panelSize), replace=False)) 
    
    def addNewPatients(self, attritionRate):
        newRowIndices = random.choice(nhanesDF.index.values, size = int(attritionRate * self.panelSize), replace=True, p=nhanesDF.probWeight)
        self.panel = self.panel.append(nhanesDF.iloc[newRowIndices])
        self.lifetimePanel = self.lifetimePanel.append(nhanesDF.iloc[newRowIndices])
        self.panel = self.panel.reset_index(drop=True)
    
    def generateVisitHistoryForPanel(self):
        men = self.panel.loc[self.panel['gender'] == 'Male']
        women = self.panel.loc[self.panel['gender'] == 'Female']
        
        self.generateVisitsForGender(men, parameter_maleVisitRates)
        self.generateVisitsForGender(women, parameter_femaleVisitRates)
        
        self.applyScreeningsForVisits()
        
    def applyScreeningRuleToVisit(screeningRule, visit):
        applies = True
        if (screeningRule['minAge'] != None and visit.age < screeningRule['minAge']):
            applies = False
        if (screeningRule['maxAge'] != None and visit.age > screeningRule['maxAge']):
            applies = False
        if (screeningRule['Gender'] != None and visit.gender != screeningRule['Gender']):
            applies = False
        if (screeningRule['Smoking'] != None):
            if (screeningRule['Smoking'] == 'Current' and visit.smokingStatus != 1):
                applies = False
            if (screeningRule['Smoking'] == 'Former' and visit.smokingStatus == 0)
                applies = False
            if (screeningRule['Smoking']  == 'Never') and visit.smokingStatus != 0)
                applies = False
        if (screeningRule['VascularRiskFactor'] != None)
            if (screeningRule['VascularRiskFactor'] == 1 and visit.selfReportHtn == 'No' and visit.selfReportHyperlipidemia == 'No' and visit.selfReportDiabetes == 'No'):
                applies = False
            if (screeningRule['vascularRiskFactor'] == 0 and (visit.seltReportHtn == 'Yes' or visit.selfReportHyperlipidemia == 'Yes' or visit.selfReportDiabetes == 'Yes')):
                applies = False
        if (screeningRule['bmiOver' != None] and visit.bmi <= screeningRule['bmiOver']):
            applies = False
        if (screeningRule['proportionOfPopulationAtRisk'] != None and random.random() > (1-screeningRule['proportionOfPopulationAtRisk'])):
            applies = False
        
        # next step is to check to see if the rule has been implemented since the last rule interval..
        # for that, need to get all of the visits for a patient id...
        # maybe the thing to do is to cache those at the panel level?
        # searching through the whole list every time is going to be awfully time-consuming...
        
        return pd.DataFrame({'patientID' : visit.patientID, 'serviceName' : screeningRule['serviceName'],
                            'applicable' : applies, 'timeSpent' : screeningRule['Time'] if apples else 0,
                            'visitDate' : visit.visitDate})
                        
            
    
    def applyScreeningRulesToVisit(visit):
        return screeningRules.apply(partial(applyScreeningRuleToVisit, visit))
    
    def applyScreeningsToVisits():
        self.screeningServices.append(self.visits.apply(applyScreeningRulesToVisit, axis=1))            
    
    def generateVisitsForGender(self, patients, visitRatesByAge):
        for ageRange in visitRatesByAge.keys():
            patientsWithinAgeRange = patients.loc[(patients['age'] >= ageRange[0]) & (patients['age'] <= ageRange[1])]
            totalVisits = int(visitRatesByAge[ageRange] * len(patientsWithinAgeRange) * parameter_proportionOfAllVisitsToPCP/ 100)
            patients = self.panel.loc[random.choice(self.panel.index.values, size=totalVisits, replace=True)]
            timesForVisits = self.generateDatesForVisits("1/1/" + str(self.year), "12/31/" + str(self.year+1), len(patientIDsForVisits))
            newVisits = pd.DataFrame(data={"visitDate" : pd.Series(timesForVisits), "patientID" : patients.patientID, 
                                           'year' : [self.year] * len(patientIDsForVisits), 'age' : patients.age,
                                          'gender' : patients.gender, 'raceEth' : patients.raceEth, 'bmi' : patients.bmi,
                                          'smokingStatus' : patients.smokingStatus, 'selfReportHtn' : selfReportHtn,
                                          'selfReportHyperlipidemia', patients.selfReportHyperlipidemia, 
                                            'selfReportDiabetes' : patients.selfReportDiabetes})
            self.visits = self.visits.append(newVisits)

    def generateDatesForVisits(self, startTime, endTime, numTimes):
        times = []
        stime = time.mktime(time.strptime(startTime, "%m/%d/%Y"))
        etime = time.mktime(time.strptime(endTime, "%m/%d/%Y"))

        for i in range(0, numTimes):
            times.append(time.strftime("%m/%d/%Y", time.localtime(stime + random.rand() * (etime-stime))))
            
        return times

SyntaxError: invalid syntax (<ipython-input-326-670bd493cede>, line 52)

In [318]:
provider = Provider(panelSize=2000)

In [319]:
provider.advancePanelByYear(10)

In [316]:
provider.lifetimePanel.sort_values("patientID")

Unnamed: 0,patientID,gender,age,raceEth,WTINT2YR,WTMEC2YR,sdmvpsu,sdmvstra,bmi,selfReportDiabetes,smokingStatus,selfReportHtn,selfReportHyperlipidemia,probWeight
0,83732,Male,62,Non-Hispanic White,134671.370419,135629.507405,1,125,27.8,Yes,2.0,No,No,0.000567
0,83732,Male,62,Non-Hispanic White,134671.370419,135629.507405,1,125,27.8,Yes,2.0,No,No,0.000567
1,83733,Male,53,Non-Hispanic White,24328.560239,25282.425927,1,125,30.8,No,1.0,No,No,0.000102
3,83735,Female,56,Non-Hispanic White,102717.995647,102078.634508,1,131,42.4,No,0.0,No,No,0.000433
3,83735,Female,56,Non-Hispanic White,102717.995647,102078.634508,1,131,42.4,No,0.0,No,No,0.000433
1665,83741,Male,22,Non-Hispanic Black,37043.087007,39353.307397,2,128,28.0,No,1.0,No,No,0.000156
403,83744,Male,56,Non-Hispanic Black,20395.535310,20068.662891,2,126,33.6,Yes,0.0,Yes,No,0.000086
8,83744,Male,56,Non-Hispanic Black,20395.535310,20068.662891,2,126,33.6,Yes,0.0,Yes,No,0.000086
8,83744,Male,56,Non-Hispanic Black,20395.535310,20068.662891,2,126,33.6,Yes,0.0,Yes,No,0.000086
9,83747,Male,46,Non-Hispanic White,34513.077877,35673.964272,1,121,27.6,No,1.0,Yes,Yes,0.000145


In [323]:
screeningRules

Unnamed: 0,Service,Grade,Frequency,Min Age,Max Age,Gender,Smoking,VascularRiskFactor,bmiOver,Time,proportionOfPopulationAtRisk
0,Abdominal Aortic Aneurysm: Screening,B,Once,65.0,75.0,Male,"Current, Former",,,1,
1,Abdominal Aortic Aneurysm: Screening,C,Once,65.0,75.0,Male,Never,,,1,
2,Tobacco Smoking Cessation in Adults: Behaviora...,A,q1,,,,Current,,,3,
3,Syphilis Infection in Nonpregnant Adults and A...,A,q1,,,,,,,1,0.029
4,Statin Use for the Primary Prevention of Cardi...,B,q5,40.0,75.0,,,,,2.5,0.2
5,Statin Use for the Primary Prevention of Cardi...,C,q5,40.0,75.0,,,,,2.5,0.05
6,Skin Cancer Prevention: Behavioral Counseling,B,q1,18.0,24.0,,,,,1.5,0.7
7,Skin Cancer Prevention: Behavioral Counseling,C,q1,25.0,,,,,,1.5,0.7
8,Sexually Transmitted Infections: Behavioral Co...,B,q1,18.0,25.0,Female,,,,3,
9,Sexually Transmitted Infections: Behavioral Co...,B,q1,,,,,,,3,0.029
