In [19]:
import pandas as pd
from thunderpack import ThunderReader
from openpyxl import load_workbook

In [20]:
# Load every patient row from the named sheet of the source workbook.
# The path is reproduced verbatim, including the space before the extension.
source_workbook = '/media/cdac-c-15/External Drive/Dropbox/cardiac arrest project/cardiac_filtered_orginal .xlsx'
allPts = pd.read_excel(source_workbook, sheet_name='cardiac_filtered_new')
print(allPts.shape[0])

12146


In [21]:
# Discard the leading rows of the spreadsheet; only rows from position 1807
# onward are kept.
# NOTE(review): allPts is rebound to a slice of itself, so re-running this cell
# without reloading the workbook would drop a further 1807 rows.
rows_to_skip = 1807
allPts = allPts.iloc[rows_to_skip:]
print(allPts.shape[0])

10339


In [22]:
# Keep only admissions that have a discharge date recorded and whose discharge
# falls on a different calendar day than the admission.
allPtsWDisch = allPts.dropna(subset=['HospitalDischargeDTS'])
print(len(allPtsWDisch))

# Truncate both timestamps to midnight so the comparison is by calendar day.
# .assign() builds a new frame instead of writing through .loc into a frame
# derived from allPts, and .dt.normalize() keeps both columns as datetime64
# (rather than Python date objects) so later timedelta arithmetic on
# HospitalAdmitDTS is well defined.
allPtsWDisch = allPtsWDisch.assign(
    HospitalAdmitDTS=pd.to_datetime(allPtsWDisch['HospitalAdmitDTS']).dt.normalize(),
    HospitalDischargeDTS=pd.to_datetime(allPtsWDisch['HospitalDischargeDTS']).dt.normalize(),
)

# True where discharge day differs from admit day; same-day stays are dropped.
mask = allPtsWDisch['HospitalDischargeDTS'] != allPtsWDisch['HospitalAdmitDTS']
allPtsWDisch = allPtsWDisch[mask]

print(len(allPtsWDisch))

8266
7706


In [23]:
# Keep only rows whose 'reports' text contains the keyword, ignoring case.
# Rows with a missing report count as non-matches and are dropped.
word = 'arrest'

mentions_word = allPtsWDisch['reports'].str.contains(word, na=False, case=False)
allPtsArrest = allPtsWDisch.loc[mentions_word]
print(allPtsArrest.shape[0])

6844


In [24]:
# Keep only rows whose EEG StartTime is no more than 72 hours after admission.

# Parse StartTime into real datetimes. .assign() rebinds allPtsArrest to a new
# frame instead of writing through .loc into a filtered slice of allPtsWDisch,
# which triggers SettingWithCopyWarning and can leave the column unconverted.
allPtsArrest = allPtsArrest.assign(StartTime=pd.to_datetime(allPtsArrest['StartTime']))

# HospitalAdmitDTS is parsed here too, so the subtraction does not depend on
# the dtype an earlier cell left in that column.
time_since_admit = allPtsArrest['StartTime'] - pd.to_datetime(allPtsArrest['HospitalAdmitDTS'])

# Rows with a missing StartTime or admit time compare False and are dropped;
# a StartTime earlier than the admit time gives a negative delta and is kept.
allPtsEEG = allPtsArrest[time_since_admit <= pd.Timedelta(hours=72)]
# NOTE(review): the site-split cells that follow read allPtsArrest, not
# allPtsEEG, so this 72-hour filter does not reach the exported cohorts.
# Confirm whether that is intended.
print(len(allPtsEEG))

3073


In [25]:
# Limit to site 1 by matching its SiteID directly ('S0001' is the ID that the
# site 2 cell excludes). Selecting "everything that is not S0002" would also
# sweep in rows with a missing or unexpected SiteID.
# NOTE(review): this reads allPtsArrest, so the 72-hour EEG filter stored in
# allPtsEEG is not applied to this cohort. Confirm that is intended.
siteOnePatients = allPtsArrest[allPtsArrest['SiteID'] == 'S0001']
print(len(siteOnePatients))

3148


In [26]:
# Count the distinct patient IDs among the site 1 rows (missing IDs are not counted)
site_one_ids = siteOnePatients['BDSPPatientID']
print(site_one_ids.nunique())

484


In [27]:
# Limit to site 2 by matching its SiteID directly ('S0002' is the ID that the
# site 1 cell excludes). Selecting "everything that is not S0001" would also
# sweep in rows with a missing or unexpected SiteID.
# NOTE(review): this reads allPtsArrest, so the 72-hour EEG filter stored in
# allPtsEEG is not applied to this cohort. Confirm that is intended.
siteTwoPatients = allPtsArrest[allPtsArrest['SiteID'] == 'S0002']
print(len(siteTwoPatients))

3696


In [28]:
# Count the distinct patient IDs among the site 2 rows (missing IDs are not counted)
site_two_ids = siteTwoPatients['BDSPPatientID']
print(site_two_ids.nunique())

552


In [29]:
# Build the chosen cohort for each site: rank patients by their most recent
# admission and keep every row belonging to the N most recently admitted.
SITE_ONE_COHORT_SIZE = 250
SITE_TWO_COHORT_SIZE = 400


def rank_patients_by_latest_admission(site_df):
    """Return one row per patient with their latest admission, newest first.

    Parameters
    ----------
    site_df : pandas.DataFrame
        Frame containing 'BDSPPatientID' and a datetime 'HospitalAdmitDTS'
        column.

    Returns
    -------
    pandas.DataFrame
        Columns 'BDSPPatientID' and 'HospitalAdmitDTS' (the per-patient
        maximum), sorted by 'HospitalAdmitDTS' in descending order.
    """
    latest = site_df.groupby('BDSPPatientID')['HospitalAdmitDTS'].max().reset_index()
    return latest.sort_values(by='HospitalAdmitDTS', ascending=False)


# Parse the admit column on a fresh frame; writing through .loc into these
# filtered slices would raise SettingWithCopyWarning.
siteOnePatients = siteOnePatients.assign(
    HospitalAdmitDTS=pd.to_datetime(siteOnePatients['HospitalAdmitDTS'])
)
siteTwoPatients = siteTwoPatients.assign(
    HospitalAdmitDTS=pd.to_datetime(siteTwoPatients['HospitalAdmitDTS'])
)

most_recent_admissions1 = rank_patients_by_latest_admission(siteOnePatients)
top_250_patients1 = most_recent_admissions1.head(SITE_ONE_COHORT_SIZE)
siteOnePatients = siteOnePatients[siteOnePatients['BDSPPatientID'].isin(top_250_patients1['BDSPPatientID'])]

most_recent_admissions2 = rank_patients_by_latest_admission(siteTwoPatients)
# NOTE(review): the name says 250 but site 2 keeps 400 patients. The name is
# left unchanged in case other cells read it; confirm 400 is the intended size.
top_250_patients2 = most_recent_admissions2.head(SITE_TWO_COHORT_SIZE)
siteTwoPatients = siteTwoPatients[siteTwoPatients['BDSPPatientID'].isin(top_250_patients2['BDSPPatientID'])]

In [32]:
# Report how many rows each site cohort contains (site 1 first, then site 2)
for site_rows in (siteOnePatients, siteTwoPatients):
    print(site_rows.shape[0])

1364
2644


In [30]:
# Export both site cohorts to one workbook for Rajib, one sheet per site.
path = '/home/cdac-c-15/Desktop/Final_Patients_Filtered.xlsx'

# Stacked copy of both cohorts (site 1 rows first), kept for any later use
allFilteredPatients = pd.concat([siteOnePatients, siteTwoPatients], axis=0)

# Each (sheet name, frame) pair becomes its own sheet; the row index is not written.
sheets_to_write = (
    ('Site 1 Patients', siteOnePatients),
    ('Site 2 Patients', siteTwoPatients),
)
with pd.ExcelWriter(path, engine='openpyxl') as writer:
    for sheet_title, cohort in sheets_to_write:
        cohort.to_excel(writer, sheet_name=sheet_title, index=False)

In [48]:
# Save the site 1 cohort as a CSV in the current working directory.
# path and sheetName are assigned here but are not consumed by the CSV export;
# they were only referenced by a disabled step that appended a sheet to a workbook.
path = '/media/cdac-c-15/Thunderpack/Dropbox/zz_EHR_Thunderpacks/MGB/thunderpack_adt_1m_MGB/'
sheetName = 'Site One Filtered Patients'

site_one_csv = 'siteOneAllPts.csv'
siteOnePatients.to_csv(site_one_csv, index=False)

In [49]:
# Save the site 2 cohort as a CSV in the current working directory.
# path and sheetName are assigned here but are not consumed by the CSV export;
# they were only referenced by a disabled step that appended a sheet to a workbook.
path = '/media/cdac-c-15/Thunderpack/Dropbox/zz_EHR_Thunderpacks/MGB/thunderpack_adt_1m_MGB/'
sheetName = 'Site Two Filtered Patients'

site_two_csv = 'siteTwoAllPts.csv'
siteTwoPatients.to_csv(site_two_csv, index=False)