In [1]:
import pandas as pd
import os
import numpy as np
from functools import reduce

import matplotlib.pyplot as plt

In [2]:
DATA_DIR = os.path.join('/','home','ngsci','datasets','arrest-ntuh-ecg','v1')
LEAD_ORDER = ['I', 'II', 'III',
              'aVR', 'aVL', 'aVF',
              'V1', 'V2', 'V3', 'V4', 'V5', 'V6']
LEAD_INDEX = {lead: i for i, lead in enumerate(LEAD_ORDER)}
ECG_CAPTURE_TIME = 10  # Each ECG is taken for 10 seconds
ECG_SAMPLE_RATE = 500 # Each ECG is sampled at 500 Hz

## Study Group Filter Generation

We explore the study group data below.

In [3]:
def load_outcomes_df(fp):
    '''Load outcomes'''
    df = pd.read_csv(fp)

    # Convert to datetime
    date_cols = [c for c in df.columns if '_offset' in c]
    for c in date_cols:
        df[c] = pd.to_datetime(df[c], format='ISO8601')

    print('Loaded {}: {}'.format(fp, df.shape))
    return df

def load_study_group_rhythm_ecgs():
    '''Load Study Group rhythm ECGs npy and lookup CSV'''
    ecg_rhythm_fp = os.path.join(DATA_DIR, 'study-group','ecg-waveforms','waveform-rhythm.npy')
    waveform_rhythm_npy = np.load(ecg_rhythm_fp)
    print('Loaded {}: {}'.format(ecg_rhythm_fp, waveform_rhythm_npy.shape))
    
    ecg_lookup_fp = os.path.join(DATA_DIR, 'study-group','ecg-waveforms','waveform-npy.csv')
    waveform_npy_df = pd.read_csv(ecg_lookup_fp)
    print('Loaded {}: {}'.format(ecg_lookup_fp, waveform_npy_df.shape))
    
    return waveform_rhythm_npy, waveform_npy_df

def load_study_group_outcomes():
    # Load all Study Group files
    cohort_fp = os.path.join(DATA_DIR, 'study-group','cohort.csv')
    cohort_df = load_outcomes_df(cohort_fp)

    comorbidities_fp = os.path.join(DATA_DIR, 'study-group','comorbidities.csv')
    comorbidities_df = load_outcomes_df(comorbidities_fp)

    ecg_fp = os.path.join(DATA_DIR, 'study-group','ecg.csv')
    ecg_df = load_outcomes_df(ecg_fp)

    rosc_outcomes_fp = os.path.join(DATA_DIR, 'study-group','rosc-outcomes.csv')
    rosc_outcomes_df = load_outcomes_df(rosc_outcomes_fp)

    study_group_df = reduce(lambda left,right: pd.merge(left, right, on=['patient_ngsci_id', 'year', 'ecg_id'],
                                                        how='outer'), [cohort_df, comorbidities_df, ecg_df, rosc_outcomes_df])
    print('`study_group_df`: {}'.format(study_group_df.shape))
    study_group_df_lookup = {
        'study_group_df': study_group_df,
        'cohort_df': cohort_df,
        'comorbidities_df': comorbidities_df,
        'ecg_df': ecg_df,
        'rosc_outcomes_df': rosc_outcomes_df
    }
    return study_group_df_lookup

def stdg_calculate_time_diff(df):
    """
    Calculate the time difference between control and pre acquisition_datetime_offset values
    for the same patient_ngsci_id and save it in a new column called time_diff.

    Parameters:
    df (pd.DataFrame): The input dataframe containing patient_ngsci_id, ecg_timetag, 
                       and acquisition_datetime_offset columns.

    Returns:
    pd.DataFrame: A new dataframe with patient_ngsci_id and time_diff columns.
    """
    # Filter rows based on ecg_timetag
    control_df = df[df['ecg_timetag'] == '[1]ROSC']
    pre_df = df[df['ecg_timetag'] == '[0]pre']

    # Rename columns to avoid conflicts during merge
    control_df = control_df.rename(columns={'acquisition_datetime_offset': 'control_offset'})
    pre_df = pre_df.rename(columns={'acquisition_datetime_offset': 'pre_offset'})
    
    #Print the number of patients id in both datasets
    # Find the intersection of unique 'patient_ngsci_id' values in both DataFrames
    common_patient_ids = set(control_df['patient_ngsci_id']).intersection(set(pre_df['patient_ngsci_id']))

    # Count and print the number of common patient_ngsci_id
    print('The number of patients ID with both Pre and SCA are: ', len(common_patient_ids))

    # Merge the dataframes on patient_ngsci_id
    merged_df = pd.merge(control_df, pre_df, on='patient_ngsci_id')

    # Calculate the difference and save in a new column
    merged_df['time_diff'] = merged_df['control_offset'] - merged_df['pre_offset']

    # Select relevant columns
    result_df = merged_df[['patient_ngsci_id', 'time_diff']]

    return result_df

def stdg_print_problematic_patients(df):
    print(df[df['patient_ngsci_id']==100308])
    print(df[df['patient_ngsci_id']==101190])
    print(df[df['patient_ngsci_id']==101298])
    
def check_missing_values(df):
    """
    Check for missing values in the dataframe and print the count of missing values in each column.

    Parameters:
    df (pd.DataFrame): The dataframe to check for missing values.

    Returns:
    None
    """
    # Check for missing values in the dataframe
    missing_values = df.isnull().any()

    # Print the result
    print("Columns with missing values:")
    print(missing_values)
    
    # Check the count of missing values in each column
    missing_values_count = df.isnull().sum()
    print("\nCount of missing values in each column:")
    print(missing_values_count)

In [4]:
#Read ECG data from study group
ecg_study_npy, ecg_study_df = load_study_group_rhythm_ecgs()

Loaded /home/ngsci/datasets/arrest-ntuh-ecg/v1/study-group/ecg-waveforms/waveform-rhythm.npy: (1686, 12, 5000)
Loaded /home/ngsci/datasets/arrest-ntuh-ecg/v1/study-group/ecg-waveforms/waveform-npy.csv: (1686, 4)


In [5]:
#Read outcome data from study group
study_group_df_lookup = load_study_group_outcomes()
print(type(study_group_df_lookup))

Loaded /home/ngsci/datasets/arrest-ntuh-ecg/v1/study-group/cohort.csv: (1686, 9)
Loaded /home/ngsci/datasets/arrest-ntuh-ecg/v1/study-group/comorbidities.csv: (1686, 11)
Loaded /home/ngsci/datasets/arrest-ntuh-ecg/v1/study-group/ecg.csv: (1686, 7)
Loaded /home/ngsci/datasets/arrest-ntuh-ecg/v1/study-group/rosc-outcomes.csv: (1686, 37)
`study_group_df`: (1686, 55)
<class 'dict'>


In [6]:
print(ecg_study_df.head(5))

                             ecg_id     year  npy_index ecg_timetag
0  4d2b35fb8850b75dd3f6644978e542b5  2011-14          0     [1]ROSC
1  ea01a56650768f3f41d255fe546e27a9  2011-14          1     [1]ROSC
2  e4a8dfe722c4686695265586d8b5b6bc  2011-14          2      [0]pre
3  bfae87358404a1fb0f2ee6e71296f399  2011-14          3     [1]ROSC
4  041608622e1cbdad60f80257bae3ab61  2011-14          4     [2]24hr


In [7]:
#Print number of unique patients in the Study group
print('# of unique patients: {:,}'.format(len(study_group_df_lookup['cohort_df'][['patient_ngsci_id']].drop_duplicates())))

# of unique patients: 974


In [8]:
#Get and check ECG info
study_ecg_info_df=study_group_df_lookup['ecg_df']
#delete long-so far unecessary information
study_ecg_info_df=study_ecg_info_df.drop(['acquisition_datetime_is_weekend','rosc'],axis=1)
print(study_ecg_info_df.head(5))
print(study_ecg_info_df.shape)

   patient_ngsci_id     year                            ecg_id   
0            100000  2011-14  4d2b35fb8850b75dd3f6644978e542b5  \
1            100001  2011-14  ea01a56650768f3f41d255fe546e27a9   
2            100001  2011-14  e380efe5cc3211b5137f5f6b156f6201   
3            100001  2011-14  fc42a07f6d54ac0db90adf19f710fd02   
4            100002  2011-14  30849ba48b0418b50852bba62ddc4f71   

  acquisition_datetime_offset ecg_timetag  
0         2123-02-07 22:21:00     [1]ROSC  
1         2115-11-22 06:57:00     [1]ROSC  
2         2115-11-23 14:18:00     [2]24hr  
3         2115-11-22 05:45:00      [0]pre  
4         2122-10-06 13:33:00     [1]ROSC  
(1686, 5)


In [9]:
#Print and count the unique values of ecg type.
study_ecg_type_counts = study_ecg_info_df['ecg_timetag'].value_counts()
study_rosc_ecg_n = study_ecg_type_counts[0]
study_pre_ecg_n = study_ecg_type_counts[1]
study_24hr_ecg_n = study_ecg_type_counts[2]

print(f'# of just-after ROSC ECGs is {study_rosc_ecg_n} in the study group')
print(f'# of pre-cardiac arrest ECGs is {study_pre_ecg_n} in the study group')
print(f'# of 24h-after ROSC ECGs is {study_24hr_ecg_n} in the study group')

# of just-after ROSC ECGs is 838 in the study group
# of pre-cardiac arrest ECGs is 605 in the study group
# of 24h-after ROSC ECGs is 243 in the study group


In [10]:
#Delete problematic ECGs
#Problematic patients are who have more than 2 records i.e. multiple Pre and ROSC ECGs
#Delete the 24hr after SCA ECGs
No_24hr_condition = study_ecg_info_df['ecg_timetag'] != '[2]24hr'
stdg_ecg_PreROSC_df=study_ecg_info_df[No_24hr_condition].copy()

count_24hr = (stdg_ecg_PreROSC_df['ecg_timetag'] == '[2]24hr').sum()
print(f"Number of ECG after 24hr of ROSC after deletion: {count_24hr}")

patient_id_counts = stdg_ecg_PreROSC_df['patient_ngsci_id'].value_counts()
#patient_id_counts_df = pd.DataFrame(patient_id_counts)

# Filter patient_id values that appear more than two times
more_than_two_counts = patient_id_counts[patient_id_counts > 2]
more_than_two_counts_df = pd.DataFrame(more_than_two_counts).reset_index('patient_ngsci_id')

# Get the count of different patient_id values that appear more than two times
num_different_values = len(more_than_two_counts)

# Display the result
print(f"Number of different patient_id values appearing more than two times: {num_different_values}")
print(f"The patient ids with more than 2 pre or ROSC records are: {more_than_two_counts}")

#Delete these patients from the dataset
#print(stdg_ecg_PreROSC_df.shape)
#stdg_ecg_PreROSC_df = stdg_ecg_PreROSC_df[~stdg_ecg_PreROSC_df['patient_ngsci_id'].isin(more_than_two_counts['patient_ngsci_id'])]
#print(stdg_ecg_PreROSC_df[patient_id_counts > 2])

Number of ECG after 24hr of ROSC after deletion: 0
Number of different patient_id values appearing more than two times: 3
The patient ids with more than 2 pre or ROSC records are: patient_ngsci_id
101298    4
101190    4
100308    3
Name: count, dtype: int64


In [11]:
#Show the information of patients with more than 2 pre-ECGs
more_than_two_ROSC_PreEcgs = stdg_ecg_PreROSC_df[stdg_ecg_PreROSC_df['patient_ngsci_id'].isin(more_than_two_counts_df['patient_ngsci_id'])]
more_than_two_ROSC_PreEcgs

Unnamed: 0,patient_ngsci_id,year,ecg_id,acquisition_datetime_offset,ecg_timetag
393,100308,2011-14,3fb5e002ac6e540dbb458f57e79c2955,2128-08-19 20:21:00,[1]ROSC
395,100308,2011-14,48ce2822b2839b74a187ea0968f5d201,2128-09-12 19:32:00,[1]ROSC
396,100308,2011-14,a5b43dada49c227ae80e2686df0194a7,2128-08-26 08:27:00,[0]pre
1547,101190,2019,0059a263438c8bfba6a1eaf56c0ddd93,2167-11-10 12:08:33,[0]pre
1548,101190,2019,26dcfd8809f2b9dab06448312152e92a,2168-05-08 15:39:15,[1]ROSC
1549,101190,2019,5bf15010f13d23e77794f4e722929c65,2168-06-04 09:50:15,[0]pre
1550,101190,2019,2d8f70a010b1351ea17d31676e6f3ab6,2168-08-05 09:17:42,[1]ROSC
1682,101298,2015-16,28a6f831b6378c1c254f2e28a720e3ac,2132-07-11 15:20:05,[1]ROSC
1683,101298,2015-16,ba8bd21a6c21015a7c33cd96570b6175,2132-06-15 02:54:40,[0]pre
1684,101298,2017,8f7e1f60f3eb4a7cf54a044d2cc3d3db,2173-11-16 11:02:13,[0]pre


In [49]:
#Calculate the minimum and maximum time from Pre-ECG and ROSC-ECG
#study_ecg_info_df.shape
#print(study_ecg_PreROSC_df.head(10))

# Sort by patient_id first, then by ecg_timetag
#study_ecg_PreROSC_df = study_ecg_PreROSC_df.sort_values(by=['patient_ngsci_id','year','acquisition_datetime_offset','ecg_timetag'])
#print(study_ecg_PreROSC_sorted.head(10))

#Calculate difference
#study_ecg_PreROSC_df['time_diff'] = study_ecg_PreROSC_df.groupby(['patient_ngsci_id','year'])['acquisition_datetime_offset'].diff()
#NaT_condition=PreROSC_diff_df_full='NaT'
#PreROSC_diff_df = PreROSC_diff_df_full.drop(PreROSC_diff_df_full[NaT_condition])

#Calculate difference using the new function
stdg_timeDiff_df=stdg_calculate_time_diff(stdg_ecg_PreROSC_df)
print(stdg_timeDiff_df.shape)

#noTime_condition=study_ecg_PreROSC_df['time_diff']!='NaT'
#noTime_condition

#PreRosc_diff = study_ecg_PreROSC_df[noTime_condition].copy()
#print(study_ecg_PreROSC_df.head(10))
max_PreROSC_time = stdg_timeDiff_df['time_diff'].max()
min_PreROSC_time = stdg_timeDiff_df['time_diff'].min()

print(f'Max # of days between PRE-ECG and ROSC-ECG in the study group: {max_PreROSC_time}')
print(f'Min # of days between PRE-ECG and ROSC-ECG in the study group: {min_PreROSC_time}')

#Working in a "histogram" of the PRE and ROSC ECG time difference:

more_than_day = stdg_timeDiff_df['time_diff'][stdg_timeDiff_df['time_diff'] > pd.Timedelta('1 days')]
more_than_week = stdg_timeDiff_df['time_diff'][stdg_timeDiff_df['time_diff'] > pd.Timedelta('7 days')]
more_than_month = stdg_timeDiff_df['time_diff'][stdg_timeDiff_df['time_diff'] > pd.Timedelta('30 days')]
more_than_6_month = stdg_timeDiff_df['time_diff'][stdg_timeDiff_df['time_diff'] > pd.Timedelta('180 days')]
more_than_year = stdg_timeDiff_df['time_diff'][stdg_timeDiff_df['time_diff'] > pd.Timedelta('365 days')]
less_than_0 = stdg_timeDiff_df[['patient_ngsci_id','time_diff']][stdg_timeDiff_df['time_diff'] < pd.Timedelta('0 seconds')]


# Get the count of time differences
num_more_than_day = len(more_than_day.unique())
num_more_than_week = len(more_than_week.unique())
num_more_than_month = len(more_than_month.unique())
num_more_than_6_month = len(more_than_6_month.unique())
num_more_than_year = len(more_than_year.unique())

num_less_than_0 = len(less_than_0)


# Display the result
print(f"Number of PRE and ROSC ECG time differences more than one day: {num_more_than_day}")
print(f"Number of PRE and ROSC ECG time differences more than one week: {num_more_than_week}")
print(f"Number of PRE and ROSC ECG time differences more than one month: {num_more_than_month}")
print(f"Number of PRE and ROSC ECG time differences more than 6 months: {num_more_than_6_month}")
print(f"Number of PRE and ROSC ECG time differences more than a year: {num_more_than_year}")

print(f"Number of PRE and no-CA ECG time difference NEGATIVE: {num_less_than_0}")

(486, 2)
Max # of days between PRE-ECG and ROSC-ECG in the study group: 15389 days 16:42:17
Min # of days between PRE-ECG and ROSC-ECG in the study group: -15103 days +04:17:52
Number of PRE and ROSC ECG time differences more than one day: 318
Number of PRE and ROSC ECG time differences more than one week: 274
Number of PRE and ROSC ECG time differences more than one month: 220
Number of PRE and ROSC ECG time differences more than 6 months: 115
Number of PRE and ROSC ECG time differences more than a year: 80
Number of PRE and no-CA ECG time difference NEGATIVE: 3


In [50]:
#Show number of pre-OHSCA-ECGs less than a threshold time
df_less_than_day = stdg_timeDiff_df['time_diff'][stdg_timeDiff_df['time_diff'] <= pd.Timedelta('1 days')]
df_less_than_week = stdg_timeDiff_df['time_diff'][stdg_timeDiff_df['time_diff'] <= pd.Timedelta('7 days')]
df_less_than_month = stdg_timeDiff_df['time_diff'][stdg_timeDiff_df['time_diff'] <= pd.Timedelta('30 days')]
df_less_than_6_month = stdg_timeDiff_df['time_diff'][stdg_timeDiff_df['time_diff'] <= pd.Timedelta('180 days')]
df_less_than_year = stdg_timeDiff_df['time_diff'][stdg_timeDiff_df['time_diff'] <= pd.Timedelta('365 days')]

# Get the count of time differences
num_less_than_day = len(df_less_than_day.unique())
num_less_than_week = len(df_less_than_week.unique())
num_less_than_month = len(df_less_than_month.unique())
num_less_than_6_month = len(df_less_than_6_month.unique())
num_less_than_year = len(df_less_than_year.unique())


# Display the result
print(f"Number of PRE-ECG time less than one day: {num_less_than_day}")
print(f"Number of PRE-ECG time less than one week: {num_less_than_week}")
print(f"Number of PRE-ECG time less than one month: {num_less_than_month}")
print(f"Number of PRE-ECG time less than 6 months: {num_less_than_6_month}")
print(f"Number of PRE-ECG time less than a year: {num_less_than_year}")


Number of PRE-ECG time less than one day: 167
Number of PRE-ECG time less than one week: 211
Number of PRE-ECG time less than one month: 265
Number of PRE-ECG time less than 6 months: 370
Number of PRE-ECG time less than a year: 405


In [51]:
print(stdg_timeDiff_df.shape)
stdg_print_problematic_patients(stdg_timeDiff_df)
print(stdg_timeDiff_df.shape)
print(less_than_0)

(486, 2)
     patient_ngsci_id         time_diff
113            100308 -7 days +11:54:00
114            100308  17 days 11:05:00
     patient_ngsci_id          time_diff
449            101190  180 days 03:30:42
450            101190 -27 days +05:49:00
451            101190  268 days 21:09:09
452            101190   61 days 23:27:27
     patient_ngsci_id             time_diff
482            101298      26 days 12:25:25
483            101298 -15103 days +04:17:52
484            101298   15389 days 16:42:17
485            101298     260 days 08:34:44
(486, 2)
     patient_ngsci_id             time_diff
113            100308     -7 days +11:54:00
450            101190    -27 days +05:49:00
483            101298 -15103 days +04:17:52


In [52]:
#Delete the problematic patients
#Original size
print(stdg_ecg_PreROSC_df.shape)
print(stdg_timeDiff_df.shape)

# Filter out the rows with problematic patient_ngsci_id values
stdg_ecg_PreROSC_df = stdg_ecg_PreROSC_df[~stdg_ecg_PreROSC_df['patient_ngsci_id'].isin(less_than_0['patient_ngsci_id'])]
stdg_timeDiff_df = stdg_timeDiff_df[~stdg_timeDiff_df['patient_ngsci_id'].isin(less_than_0['patient_ngsci_id'])]

#Sizes after deleting problematic patients
print(stdg_ecg_PreROSC_df.shape)
print(stdg_timeDiff_df.shape)

# Print the resulting dataframe
#print(study_ecg_PreROSC_df)
#print(stdg_ecg_PreROSC_df[['patient_ngsci_id','ecg_timetag','acquisition_datetime_offset']][stdg_ecg_PreROSC_df['patient_ngsci_id']==100308])
#print(stdg_ecg_PreROSC_df[['patient_ngsci_id','ecg_timetag','acquisition_datetime_offset']][stdg_ecg_PreROSC_df['patient_ngsci_id']==101190])
#print(stdg_ecg_PreROSC_df[['patient_ngsci_id','ecg_timetag','acquisition_datetime_offset']][stdg_ecg_PreROSC_df['patient_ngsci_id']==101298])
#Delete negative difference patients from the dataset

(1443, 5)
(486, 2)
(1432, 5)
(476, 2)


In [53]:
#Keep only the preECGs 
stdg_preECG_df = stdg_ecg_PreROSC_df[stdg_ecg_PreROSC_df['ecg_timetag']=='[0]pre']
#print(stdg_preECG_df.head(5))
#print(stdg_preECG_df.shape)

#Keep the pre-ECG with less than 24h prior the SCA
#patiens_preECG_within24h_df=stdg_timeDiff_df[stdg_timeDiff_df['time_diff'] <= pd.Timedelta('1 days')]

#Keep patients within 1 days 
patiens_preECG_within24h_df=stdg_timeDiff_df[stdg_timeDiff_df['time_diff'] <= pd.Timedelta('1 days')]

#Keep patients within x days 
patiens_preECG_within_1week_df=stdg_timeDiff_df[stdg_timeDiff_df['time_diff'] <= pd.Timedelta('7 days')]
patiens_preECG_within_2weeks_df=stdg_timeDiff_df[stdg_timeDiff_df['time_diff'] <= pd.Timedelta('15 days')]
patiens_preECG_within_1month_df=stdg_timeDiff_df[stdg_timeDiff_df['time_diff'] <= pd.Timedelta('30 days')]
patiens_preECG_within_3months_df=stdg_timeDiff_df[stdg_timeDiff_df['time_diff'] <= pd.Timedelta('90 days')]
patiens_preECG_within_6months_df=stdg_timeDiff_df[stdg_timeDiff_df['time_diff'] <= pd.Timedelta('180 days')]
patiens_preECG_within_1year_df=stdg_timeDiff_df[stdg_timeDiff_df['time_diff'] <= pd.Timedelta('365 days')]

#print(patiens_preECG_within24h_df.head(5))
print('Number of patients with Pre-ECGs (IHCA and OHCA) less than 24 hours:', patiens_preECG_within24h_df.shape[0])

#Create the pre-ECG dataframes with patient_id, year and ecg_id
stdg_preECG_within24h_df = stdg_preECG_df[stdg_preECG_df['patient_ngsci_id'].isin(patiens_preECG_within24h_df['patient_ngsci_id'])]
#print(stdg_preECG_within24h_df.head(5))
#print(stdg_preECG_within24h_df.shape)
stdg_preECG_within_1week_df = stdg_preECG_df[stdg_preECG_df['patient_ngsci_id'].isin(patiens_preECG_within_1week_df['patient_ngsci_id'])]
stdg_preECG_within_2weeks_df = stdg_preECG_df[stdg_preECG_df['patient_ngsci_id'].isin(patiens_preECG_within_2weeks_df['patient_ngsci_id'])]
stdg_preECG_within_1month_df = stdg_preECG_df[stdg_preECG_df['patient_ngsci_id'].isin(patiens_preECG_within_1month_df['patient_ngsci_id'])]
stdg_preECG_within_3months_df = stdg_preECG_df[stdg_preECG_df['patient_ngsci_id'].isin(patiens_preECG_within_3months_df['patient_ngsci_id'])]
stdg_preECG_within_6months_df = stdg_preECG_df[stdg_preECG_df['patient_ngsci_id'].isin(patiens_preECG_within_6months_df['patient_ngsci_id'])]
stdg_preECG_within_1year_df = stdg_preECG_df[stdg_preECG_df['patient_ngsci_id'].isin(patiens_preECG_within_1year_df['patient_ngsci_id'])]

#Delete unnecesary columns
# Drop the specified columns
stdg_preECG_within24h_df = stdg_preECG_within24h_df.drop(columns=['year', 'acquisition_datetime_offset'])
stdg_preECG_within_1week_df = stdg_preECG_within_1week_df.drop(columns=['year', 'acquisition_datetime_offset'])
stdg_preECG_within_2weeks_df = stdg_preECG_within_2weeks_df.drop(columns=['year', 'acquisition_datetime_offset'])
stdg_preECG_within_1month_df = stdg_preECG_within_1month_df.drop(columns=['year', 'acquisition_datetime_offset'])
stdg_preECG_within_3months_df = stdg_preECG_within_3months_df.drop(columns=['year', 'acquisition_datetime_offset'])
stdg_preECG_within_6months_df = stdg_preECG_within_6months_df.drop(columns=['year', 'acquisition_datetime_offset'])
stdg_preECG_within_1year_df = stdg_preECG_within_1year_df.drop(columns=['year', 'acquisition_datetime_offset'])

check_missing_values(stdg_preECG_within24h_df)
check_missing_values(stdg_preECG_within_1week_df)
check_missing_values(stdg_preECG_within_2weeks_df)
check_missing_values(stdg_preECG_within_1month_df)
check_missing_values(stdg_preECG_within_3months_df)
check_missing_values(stdg_preECG_within_6months_df)
check_missing_values(stdg_preECG_within_1year_df)

Number of patients with Pre-ECGs (IHCA and OHCA) less than 24 hours: 165
Columns with missing values:
patient_ngsci_id    False
ecg_id              False
ecg_timetag         False
dtype: bool

Count of missing values in each column:
patient_ngsci_id    0
ecg_id              0
ecg_timetag         0
dtype: int64
Columns with missing values:
patient_ngsci_id    False
ecg_id              False
ecg_timetag         False
dtype: bool

Count of missing values in each column:
patient_ngsci_id    0
ecg_id              0
ecg_timetag         0
dtype: int64
Columns with missing values:
patient_ngsci_id    False
ecg_id              False
ecg_timetag         False
dtype: bool

Count of missing values in each column:
patient_ngsci_id    0
ecg_id              0
ecg_timetag         0
dtype: int64
Columns with missing values:
patient_ngsci_id    False
ecg_id              False
ecg_timetag         False
dtype: bool

Count of missing values in each column:
patient_ngsci_id    0
ecg_id              0
ecg_ti

## OHCA or IHCA

In [54]:
#Check how many IHCA and OHCA are
#Get and check ROSC outcomes info
study_rosc_info_df=study_group_df_lookup['rosc_outcomes_df']
#print(study_rosc_info_df.head(5))
IH_OH_CA_counts=study_rosc_info_df['location_of_cardiac_arrest'].value_counts()
#print(IH_OH_CA_counts)

#Keep only the OHCA
patiens_OHA_df = study_rosc_info_df[['patient_ngsci_id','ecg_id','location_of_cardiac_arrest',\
                                                     'cause_of_cardiac_arrest','initial_rhythm']]\
                                  [study_rosc_info_df['location_of_cardiac_arrest'] == 'OHCA']
#print(patiens_OHA_df.head(5))
print('Number of patients with OHCA-Pre-ECG total:', patiens_OHA_df.shape[0])

#Filter preECGs by Pre-ECG window time
OHCA_preECG_within24h_df = pd.merge(stdg_preECG_within24h_df,patiens_OHA_df, on=['patient_ngsci_id','ecg_id'])
#print(OHCA_preECG_within24h_df.head(5))
OHCA_preECG_within_1week_df = pd.merge(stdg_preECG_within_1week_df,patiens_OHA_df, on=['patient_ngsci_id','ecg_id'])
OHCA_preECG_within_2weeks_df = pd.merge(stdg_preECG_within_2weeks_df,patiens_OHA_df, on=['patient_ngsci_id','ecg_id'])
OHCA_preECG_within_1month_df = pd.merge(stdg_preECG_within_1month_df,patiens_OHA_df, on=['patient_ngsci_id','ecg_id'])
OHCA_preECG_within_3months_df = pd.merge(stdg_preECG_within_3months_df,patiens_OHA_df, on=['patient_ngsci_id','ecg_id'])
OHCA_preECG_within_6months_df = pd.merge(stdg_preECG_within_6months_df,patiens_OHA_df, on=['patient_ngsci_id','ecg_id'])
OHCA_preECG_within_1year_df = pd.merge(stdg_preECG_within_1year_df,patiens_OHA_df, on=['patient_ngsci_id','ecg_id'])

#Print results
print('Number of patients with OHCA-Pre-ECGs within 24 hours:', OHCA_preECG_within24h_df.shape[0])
print('Number of patients with OHCA-Pre-ECGs within 1 week:', OHCA_preECG_within_1week_df.shape[0])
print('Number of patients with OHCA-Pre-ECGs within 2 weeks:', OHCA_preECG_within_2weeks_df.shape[0])
print('Number of patients with OHCA-Pre-ECGs within 1 month:', OHCA_preECG_within_1month_df.shape[0])
print('Number of patients with OHCA-Pre-ECGs within 3 months:', OHCA_preECG_within_3months_df.shape[0])
print('Number of patients with OHCA-Pre-ECGs within 6 months:', OHCA_preECG_within_6months_df.shape[0])
print('Number of patients with OHCA-Pre-ECGs within 1 year:', OHCA_preECG_within_1year_df.shape[0])

#Keep only the IHCA
patiens_IHA_df = study_rosc_info_df[['patient_ngsci_id','ecg_id','location_of_cardiac_arrest',\
                                                     'cause_of_cardiac_arrest','initial_rhythm']]\
                                  [study_rosc_info_df['location_of_cardiac_arrest'] == 'IHCA']
#print(patiens_OHA_df.head(5))
#print(patiens_OHA_df.shape)

IHCA_preECG_within24h_df = pd.merge(stdg_preECG_within24h_df,patiens_IHA_df, on=['patient_ngsci_id','ecg_id'])
#print(IHCA_preECG_within24h_df.head(5))
#print(IHCA_preECG_within24h_df.shape)

#Check for missing values.
#check_missing_values(IHCA_preECG_within24h_df)
#check_missing_values(OHCA_preECG_within24h_df)


Number of patients with OHCA-Pre-ECGs total: 854
Number of patients with OHCA-Pre-ECGs within 24 hours: 19
Number of patients with OHCA-Pre-ECGs within 1 week: 27
Number of patients with OHCA-Pre-ECGs within 2 weeks: 34
Number of patients with OHCA-Pre-ECGs within 1 month: 58
Number of patients with OHCA-Pre-ECGs within 3 months: 104
Number of patients with OHCA-Pre-ECGs within 6 months: 132
Number of patients with OHCA-Pre-ECGs within 1 year: 149


In [55]:
print('Number of IH+OH CA is: ', IH_OH_CA_counts)
print('Number of OHCA ECGs with preECGs', patiens_OHA_df.shape)


stg_preEcgs_OHCA_df = pd.merge(stdg_preECG_df,patiens_OHA_df, on=['patient_ngsci_id','ecg_id'])
print('Number of preECGs OHCA', stg_preEcgs_OHCA_df.shape[0])

stg_preEcgs_IHCA_df = pd.merge(stdg_preECG_df,patiens_IHA_df, on=['patient_ngsci_id','ecg_id'])
print('Number of preECGs IHCA', stg_preEcgs_IHCA_df.shape[0])

patiens_Other_df = study_rosc_info_df[['patient_ngsci_id','ecg_id','location_of_cardiac_arrest',\
                                                     'cause_of_cardiac_arrest','initial_rhythm']]\
                                  [study_rosc_info_df['location_of_cardiac_arrest'] == 'Other']
stg_preEcgs_Other_df = pd.merge(stdg_preECG_df,patiens_Other_df, on=['patient_ngsci_id','ecg_id'])
print('Number of preECGs Other', stg_preEcgs_Other_df.shape[0])

Number of IH+OH CA is:  location_of_cardiac_arrest
OHCA     854
IHCA     707
Other    123
0          2
Name: count, dtype: int64
Number of OHCA ECGs with preECGs (854, 5)
Number of preECGs OHCA 240
Number of preECGs IHCA 332
Number of preECGs Other 27


In [56]:
#Get and check ECG info
stg_cohort_df=study_group_df_lookup['cohort_df']

#Keep only the columns of interest
stg_cohort_df=stg_cohort_df[['patient_ngsci_id','ecg_id','sex','age']]
#print(stg_cohort_df.head(5))

#Merge the cohort information into the filters
IHCA_preECG_within24h_df=pd.merge(IHCA_preECG_within24h_df, stg_cohort_df, on =['patient_ngsci_id','ecg_id'])
#print(IHCA_preECG_within24h_df.head(5))

#Out of Hospital filter
OHCA_preECG_within24h_df=pd.merge(OHCA_preECG_within24h_df, stg_cohort_df, on =['patient_ngsci_id','ecg_id'])
#print(OHCA_preECG_within24h_df.head(5))

### Save the files into csv

In [57]:
#IHCA_preECG_within24h_df.to_csv('fltr_IHCA_preECG_within24h.csv', index=False)
#OHCA_preECG_within24h_df.to_csv('fltr_OHCA_preECG_within24h.csv', index=False)

#Within 30 days
#OHCA_preECG_within24h_df.to_csv('fltr_OHCA_preECG_in30d.csv', index=False)