In [56]:
import pandas as pd
from IPython.display import display

# Baseline data
This notebook takes a data dump file from StarPanel covering a period ending on 1/1/17, finds each patient's most recent direct LDL (DirLDL), LDL-C, and HbA1c results along with their current statin therapy, and saves the result as baseline_data.csv.

Important notes
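- Assumes every statin has "statin" in its name (a medication listed only by a name without "statin" in it is not detected). Entries containing "nystatin" or "diflucan" are excluded.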

In [105]:
def open_dump(filename):
    """Load a dump CSV into a DataFrame indexed by (MRN, timestamp).

    The "MedRecNo" column is renamed to "MRN", the "Date" and "Time" columns
    are joined into a single "timestamp" and then dropped, and the three lab
    columns are converted to numbers.

    Parameters
    ----------
    filename : str or file-like
        Passed straight through to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Frame with a ("MRN", "timestamp") MultiIndex. Lab values that cannot
        be parsed as numbers become NaN.
    """
    dump = pd.read_csv(filename).rename(index=str, columns={"MedRecNo": "MRN"})

    # One timestamp per row, built from the textual date and time columns.
    dump["timestamp"] = (dump.Date + " " + dump.Time).apply(pd.Timestamp)
    dump = dump.set_index(["MRN", "timestamp"]).drop(["Date", "Time"], axis=1)

    # errors='coerce' turns non-numeric lab entries into NaN instead of raising.
    for lab in ("HgbA1C", "LDL-C", "DirLDL"):
        dump[lab] = pd.to_numeric(dump[lab], errors='coerce')
    return dump

# Load the baseline dump; the path is relative to the current working directory.
df = open_dump("baseline_dump.csv")
# Bare last expression so the notebook renders the frame as this cell's output.
df

Unnamed: 0_level_0,Unnamed: 1_level_0,PtName,PtDOB,PtSex,PtRace,PtEthnicity,PtLang,PtAge,PtAddr,HgbA1C,LDL-C,DirLDL,PrListMed
MRN,timestamp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
35317700,2014-02-08 15:28:00,"FISHER, ERVIN LEON",11/28/1969,M,B,NH,ENGLISH,47.12,"721 JESSICA TAYLOR DRIVE, MADISON TN 37115-5579",,139.0,,
35317700,2014-02-08 16:52:00,"FISHER, ERVIN LEON",11/28/1969,M,B,NH,ENGLISH,47.12,"721 JESSICA TAYLOR DRIVE, MADISON TN 37115-5579",,,,metformin 1000 mg bid (dispensed for 30 days o...
35317700,2014-02-08 16:52:00,"FISHER, ERVIN LEON",11/28/1969,M,B,NH,ENGLISH,47.12,"721 JESSICA TAYLOR DRIVE, MADISON TN 37115-5579",,,,glyburide 5 mg bid (dispensed for 30 days on 0...
35317700,2014-02-08 16:52:00,"FISHER, ERVIN LEON",11/28/1969,M,B,NH,ENGLISH,47.12,"721 JESSICA TAYLOR DRIVE, MADISON TN 37115-5579",,,,lisinopril 40 mg qd (dispensed for 30 days on ...
35317700,2014-02-08 16:52:00,"FISHER, ERVIN LEON",11/28/1969,M,B,NH,ENGLISH,47.12,"721 JESSICA TAYLOR DRIVE, MADISON TN 37115-5579",,,,aspirin 325 mg 1/2 tablet qd (dispensed for 30...
35317700,2014-02-08 16:52:00,"FISHER, ERVIN LEON",11/28/1969,M,B,NH,ENGLISH,47.12,"721 JESSICA TAYLOR DRIVE, MADISON TN 37115-5579",,,,omeprazole 40 mg qd (dispensed for 30 days on ...
35317700,2014-02-08 16:52:00,"FISHER, ERVIN LEON",11/28/1969,M,B,NH,ENGLISH,47.12,"721 JESSICA TAYLOR DRIVE, MADISON TN 37115-5579",,,,test strips (dispensed for 30 days on 02/08/14)
27412410,2015-06-23 19:22:00,"PEARSON, DONNA",02/20/1962,F,W,NH,ENGLISH,54.89,"203 N CREST COMMONS CIRC, NASHVILLE TN 37211",5.5,,,
27412410,2015-06-23 19:22:00,"PEARSON, DONNA",02/20/1962,F,W,NH,ENGLISH,54.89,"203 N CREST COMMONS CIRC, NASHVILLE TN 37211",,,133.0,
27412410,2015-06-23 19:22:00,"PEARSON, DONNA",02/20/1962,F,W,NH,ENGLISH,54.89,"203 N CREST COMMONS CIRC, NASHVILLE TN 37211",,,,


In [126]:
def create_mostrecent_row(group):
    """Summarise one patient's rows as a single-row DataFrame.

    For each of the HgbA1C, LDL-C and DirLDL labs the row holds a
    ``<lab>_datetime`` / ``<lab>_baseline`` pair taken from
    ``get_recent_lab``, followed by the statin list and statin count taken
    from ``get_statins``.

    Parameters
    ----------
    group : pandas.DataFrame
        The rows belonging to one group of the dump.

    Returns
    -------
    pandas.DataFrame
        Exactly one row with eight columns.
    """
    columns = []
    values = []
    for lab in ("HgbA1C", "LDL-C", "DirLDL"):
        columns += [lab + "_datetime", lab + "_baseline"]
        values += get_recent_lab(group, lab)

    columns += ["Statins_baseline", "Statins_count"]
    values += get_statins(group)

    return pd.DataFrame(data=[values], columns=columns)


def get_recent_lab(group,labname):
    """Return ``[timestamp, value]`` for the latest non-null result of one lab.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows for one patient, indexed by a two-level MultiIndex; the first
        level is dropped and the remaining level is used as the result
        timestamp.
    labname : str
        Name of the lab column to inspect, e.g. "HgbA1C".

    Returns
    -------
    list
        ``[timestamp, value]`` of the chronologically latest non-null entry,
        or ``[nan, nan]`` when the column has no non-null values.
    """
    results = group[labname].dropna()
    if results.empty:
        # Plain float NaN instead of pd.np.nan: the pandas.np alias is no
        # longer available in pandas 2.x.
        return [float("nan"), float("nan")]

    # Keep only the timestamp level so the results can be ordered by time.
    results.index = results.index.droplevel()

    # Order explicitly rather than trusting the row order of the dump.
    # The stable sort keeps file order among identical timestamps, and
    # missing timestamps are placed first so they never beat a dated result.
    results = results.sort_index(kind="mergesort", na_position="first")
    return [results.index[-1], results.iloc[-1]]

def get_statins(group):
    """Collect the statin entries from a group's medication column.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows containing a "PrListMed" column of medication strings.

    Returns
    -------
    list
        ``[entries, count]`` where ``entries`` is the list of non-null
        "PrListMed" values accepted by ``is_statin`` and ``count`` is how
        many there are.
    """
    prescriptions = group["PrListMed"].dropna()
    statin_mask = prescriptions.apply(is_statin)
    matching = prescriptions[statin_mask]
    return [list(matching), matching.count()]

def is_statin(med):
    """Decide whether a medication string counts as a statin.

    The check is a case-insensitive substring match: the text must contain
    "statin" and must not contain "nystatin" or "diflucan".

    Parameters
    ----------
    med : str
        Medication description.

    Returns
    -------
    bool
        True when the text passes the match described above.
    """
    lowered = med.lower()
    if "statin" not in lowered:
        return False
    # Any mention of these terms disqualifies the whole entry.
    excluded_terms = ("nystatin", "diflucan")
    return not any(term in lowered for term in excluded_terms)

# Collapse each MRN's rows into one summary row. groupby-apply stacks the
# one-row frames under a two-level index, so the inner level is dropped to
# leave MRN as the only index.
baseline = (
    df.groupby(level="MRN")
    .apply(create_mostrecent_row)
    .reset_index(level=1, drop=True)
)
display(baseline)
# Persist the per-patient baseline table (path relative to the working directory).
baseline.to_csv("baseline_data.csv")

Unnamed: 0_level_0,HgbA1C_datetime,HgbA1C_baseline,LDL-C_datetime,LDL-C_baseline,DirLDL_datetime,DirLDL_baseline,Statins_baseline,Statins_count
MRN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2480937,,,2013-10-29 09:53:00,88.0,,,[],0
2993988,2016-11-12 16:37:00,6.5,2016-07-31 03:00:00,76.0,,,[atorvastatin 40 mg po qday (dispensed for 90 ...,1
3005089,,,2010-11-06 12:13:00,100.0,2012-09-01 17:21:00,89.0,[],0
3013810,,,,,,,[],0
3722931,2016-05-24 22:22:00,5.3,,,2016-05-24 22:22:00,102.0,[],0
3841947,2013-06-18 20:01:00,5.8,2016-03-12 17:17:00,101.0,2013-06-18 20:01:00,131.0,[],0
4224424,2016-03-15 19:32:00,6.3,2011-08-06 16:15:00,108.0,2015-08-01 17:14:00,69.0,[atorvastatin 40mg qHS (dispensed for 90 days ...,1
4230108,,,2013-02-02 13:59:00,108.0,,,[],0
4844361,2013-06-29 18:17:00,6.7,2013-06-29 18:17:00,64.0,2013-06-29 18:17:00,86.0,[],0
5382437,2016-11-29 20:55:00,5.7,2016-11-29 20:55:00,155.0,2011-11-05 17:16:00,126.0,[],0


In [107]:
# Patients whose most recent LDL-C or direct LDL value is above 70
# (units are whatever the dump uses -- TODO confirm).
baseline[baseline[["LDL-C_baseline", "DirLDL_baseline"]].gt(70).any(axis=1)]

Unnamed: 0_level_0,HgbA1C_datetime,HgbA1C_baseline,LDL-C_datetime,LDL-C_baseline,DirLDL_datetime,DirLDL_baseline,Statins_baseline,Statins_count
MRN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2480937,,,2013-10-29 09:53:00,88.0,,,[],0
2993988,2016-11-12 16:37:00,6.5,2016-07-31 03:00:00,76.0,,,[atorvastatin 40 mg po qday (dispensed for 90 ...,1
3005089,,,2010-11-06 12:13:00,100.0,2012-09-01 17:21:00,89.0,[],0
3722931,2016-05-24 22:22:00,5.3,,,2016-05-24 22:22:00,102.0,[],0
3841947,2013-06-18 20:01:00,5.8,2016-03-12 17:17:00,101.0,2013-06-18 20:01:00,131.0,[],0
4224424,2016-03-15 19:32:00,6.3,2011-08-06 16:15:00,108.0,2015-08-01 17:14:00,69.0,[atorvastatin 40mg qHS (dispensed for 90 days ...,1
4230108,,,2013-02-02 13:59:00,108.0,,,[],0
4844361,2013-06-29 18:17:00,6.7,2013-06-29 18:17:00,64.0,2013-06-29 18:17:00,86.0,[],0
5382437,2016-11-29 20:55:00,5.7,2016-11-29 20:55:00,155.0,2011-11-05 17:16:00,126.0,[],0
5555107,2015-03-24 20:01:00,7.7,2014-08-19 22:46:00,123.0,2014-03-25 21:59:00,127.0,[pravastatin 20mg qday (dispensed as 20mg bc o...,1


In [125]:
# MRNs of patients with no LDL-C, DirLDL, or HgbA1C baseline value at all.
empty_mrn = pd.Series(
    baseline.loc[
        baseline[["LDL-C_baseline", "DirLDL_baseline", "HgbA1C_baseline"]]
        .isnull()
        .all(axis=1)
    ].index
)
# Number of such patients.
empty_mrn.count()

118