In [9]:
import pandas as pd
from IPython.display import display

# Baseline data
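This notebook takes a data dump file from StarPanel covering a period ending on 1/1/17, finds each patient's most recent LDL, LDL-C, HbA1c, and current statin therapy, and saves the result as baseline_data.csv. (The LDL-C, direct LDL, and statin steps are currently commented out in the code below, so only the HbA1c baseline and the diabetes flag are written.)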

Important notes
- Assumes all statins have "statin" in the name (matched case-insensitively); names containing "nystatin" or "diflucan" are not counted as statins.

In [59]:
def open_dump(filename):
    """Read a StarPanel dump CSV and normalise its key columns.

    The ``MedRecNo`` column is renamed to ``MRN``. The separate ``Date`` and
    ``Time`` text columns are joined with a space and parsed, value by value,
    into a single ``timestamp`` column; ``Date`` and ``Time`` are then
    removed. Row labels are converted to strings by the rename.

    Parameters
    ----------
    filename : path or file-like
        Passed straight to ``pd.read_csv``.

    Returns
    -------
    pd.DataFrame
        The dump with ``MRN`` and ``timestamp`` columns and without
        ``Date`` / ``Time``.
    """
    dump = pd.read_csv(filename).rename(index=str, columns={"MedRecNo": "MRN"})
    stamp_text = dump.Date + " " + dump.Time
    dump["timestamp"] = stamp_text.apply(pd.Timestamp)
    # Numeric coercion of the "LDL-C" and "DirLDL" columns is currently disabled.
    return dump.drop(["Date", "Time"], axis=1)



# Load both StarPanel dumps; every row of the diabetes dump is flagged DM = 1.
df_all = open_dump("all_dump.csv")
df_dm = open_dump("dm_dump.csv")
df_dm["DM"] = 1

# Reduce the diabetes dump to one flag row per patient name before joining.
# Joining the raw dump would repeat each df_all row once per matching df_dm
# row whenever a patient appears on several rows of dm_dump.csv.
dm_flags = df_dm[["PtName", "DM"]].drop_duplicates(subset="PtName")

# Left join: patients absent from the diabetes dump keep DM = NaN.
df_all = df_all.merge(dm_flags, on="PtName", how="left")
df_all = df_all.drop(["A1c", "HbA1c"], axis=1)
# Entries in HgbA1C that cannot be parsed as numbers become NaN.
df_all["HgbA1C"] = pd.to_numeric(df_all["HgbA1C"], errors="coerce")
df_all = df_all.set_index(["MRN", "timestamp"])

df_all.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,PtName,HgbA1C,DM
MRN,timestamp,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
35317700,2017-01-22 18:51:00,"FISHER, ERVIN LEON",,1.0
27412410,2015-06-23 19:22:00,"PEARSON, DONNA",5.5,
27412410,2016-11-08 19:22:00,"PEARSON, DONNA",5.4,
31025760,2013-08-03 17:57:00,"STEPHENS, TERRY D",5.5,
38573010,2015-03-03 21:51:00,"CASTILLO-HERNANDEZ, ADELF",6.6,1.0


In [84]:
def create_mostrecent_row(group):
    """Build the one-row baseline frame for a single group of patient rows.

    The row holds the first ``PtName`` in the group, the timestamp/value pair
    returned by ``get_recent_lab`` for ``HgbA1C``, and the first ``DM`` value.
    The LDL-C, DirLDL and statin columns are currently disabled.

    Returns
    -------
    pd.DataFrame
        A single row with columns ``PtName``, ``HgbA1C_datetime``,
        ``HgbA1C_baseline`` and ``DM``.
    """
    patient_name = group["PtName"].iloc[0]
    hgba1c_pair = get_recent_lab(group, "HgbA1C")
    dm_flag = group["DM"].iloc[0]
    record = [patient_name] + hgba1c_pair + [dm_flag]
    header = ["PtName", "HgbA1C_datetime", "HgbA1C_baseline", "DM"]
    return pd.DataFrame(data=[record], columns=header)


def get_recent_lab(group, labname):
    """Return ``[timestamp, value]`` for the latest non-missing lab result.

    Parameters
    ----------
    group : pd.DataFrame
        Rows indexed by a two-level index whose second level is the result
        timestamp (the first level is dropped here).
    labname : str
        Name of the lab column to read.

    Returns
    -------
    list
        ``[timestamp, value]`` of the non-missing result with the greatest
        timestamp, or ``[pd.NaT, nan]`` when the column has no non-missing
        values.
    """
    results = group[labname].dropna()
    if results.empty:
        # NaT keeps the timestamp column datetime-typed downstream; the former
        # pd.np.nan spelling fails on pandas >= 2.0, where pd.np was removed.
        return [pd.NaT, float("nan")]
    # Drop the first index level so only the timestamp remains.
    results.index = results.index.droplevel()
    # Do not rely on file order: sort by timestamp.  The sort is stable, so
    # ties keep file order, and rows with a missing timestamp are put first so
    # they are never picked over a dated result.
    results = results.sort_index(kind="mergesort", na_position="first")
    return list(results.reset_index().iloc[-1])

def get_statins(group):
    """Return the statin entries in the group's medication list and their count.

    Reads the ``PrListMed`` column, ignores missing entries, and keeps the
    ones ``is_statin`` accepts.

    Returns
    -------
    list
        ``[list_of_statin_entries, count]``.
    """
    med_names = group["PrListMed"].dropna()
    statin_mask = med_names.apply(is_statin)
    statin_names = med_names[statin_mask]
    return [list(statin_names), statin_names.count()]

def is_statin(med):
    """Tell whether a medication name counts as a statin.

    The check is case-insensitive: the name must contain "statin" and must
    not contain "nystatin" or "diflucan".
    """
    name = med.lower()
    if "statin" not in name:
        return False
    return not ("nystatin" in name or "diflucan" in name)

# One summary row per MRN; the inner index level added by apply() is discarded.
baseline = df_all.groupby(level="MRN").apply(create_mostrecent_row)
baseline.index = baseline.index.droplevel(level=1)
# Order by HbA1c timestamp (missing first) so that, for repeated patient names,
# keep="last" retains the row with the latest timestamp.
baseline = (
    baseline
    .sort_values(by=["HgbA1C_datetime"], na_position="first")
    .drop_duplicates(subset=["PtName"], keep="last")
)
baseline.tail()

Unnamed: 0_level_0,PtName,HgbA1C_datetime,HgbA1C_baseline,DM
MRN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
21247994,"REED, GREGORY",2016-12-13 19:22:00,13.1,1.0
9869124,"BROOKS, STACEY DEJUAN JR.",2016-12-13 20:34:00,11.9,1.0
40545527,"MEJIA-PRIMERO, FRANCISCA",2016-12-13 21:12:00,9.8,1.0
29625233,"ALVARADO, SHARON",2016-12-13 21:24:00,10.8,
32834632,"LOPEZ-PEREZ, LORENZA",2016-12-15 12:12:00,5.4,


In [82]:
# Rows flagged DM == 1 that have no HbA1c baseline value.
baseline.loc[baseline["DM"].eq(1.0) & baseline["HgbA1C_baseline"].isnull()]

Unnamed: 0_level_0,PtName,HgbA1C_datetime,HgbA1C_baseline,DM
MRN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
7481831,"MALEKADEH, RASTOM",NaT,,1.0
7583768,"PASCHAL, TAMMY DOREN",NaT,,1.0
10193266,"MARTIN, SHANTEL SHAQUIS",NaT,,1.0
12241881,"SILVA-LOPEZ, ISIDRA",NaT,,1.0
12956744,"HARRIS, DAVIS III",NaT,,1.0
13583174,"BREWER, CATHEY",NaT,,1.0
14330294,"GARRISON, DONALD LEE",NaT,,1.0
19190040,"WEBSTER, DEBORAH",NaT,,1.0
19587328,"DYER, HAZEL L",NaT,,1.0
20598728,"BREWER, CATHY",NaT,,1.0


In [85]:
# Write the baseline table (indexed by MRN) to baseline_data.csv.
baseline.to_csv("baseline_data.csv")