## Control Group Filter Generation

We explore the control group data below.

In [1]:
import pandas as pd
import os
import numpy as np
from functools import reduce

import matplotlib.pyplot as plt

In [2]:
DATA_DIR = os.path.join('/','home','ngsci','datasets','arrest-ntuh-ecg','v1')
LEAD_ORDER = ['I', 'II', 'III',
              'aVR', 'aVL', 'aVF',
              'V1', 'V2', 'V3', 'V4', 'V5', 'V6']
LEAD_INDEX = {lead: i for i, lead in enumerate(LEAD_ORDER)}
ECG_CAPTURE_TIME = 10  # Each ECG is taken for 10 seconds
ECG_SAMPLE_RATE = 500 # Each ECG is sampled at 500 Hz

In [3]:
#Functions

def load_outcomes_df(fp):
    '''Load outcomes'''
    df = pd.read_csv(fp)

    # Convert to datetime
    date_cols = [c for c in df.columns if '_offset' in c]
    for c in date_cols:
        df[c] = pd.to_datetime(df[c], format='ISO8601')

    print('Loaded {}: {}'.format(fp, df.shape))
    return df

def load_control_group_outcomes():
    # Load Control Group files
    ecg_cohort_fp = os.path.join(DATA_DIR, 'control-group','ecg-cohort.csv')
    ecg_cohort_df = load_outcomes_df(ecg_cohort_fp)

    return ecg_cohort_df

def load_control_group_rhythm_ecgs(year):
    '''Load Control Group rhythm ECGs npy and lookup CSV'''
    # Waveforms - pre
    fp = os.path.join(DATA_DIR, 'control-group/{}/pre/ecg-waveforms/waveform-rhythm.npy'.format(year))
    ecg_rhythm_npy_pre = np.load(fp)
    print('Loaded {}: {}'.format(fp, ecg_rhythm_npy_pre.shape))
    
    # Waveforms - control
    fp = os.path.join(DATA_DIR, 'control-group/{}/control/ecg-waveforms/waveform-rhythm.npy'.format(year))
    ecg_rhythm_npy_control = np.load(fp)
    print('Loaded {}: {}'.format(fp, ecg_rhythm_npy_control.shape))
    
    # Dictionary to store both 'pre' and 'control' waveforms
    ecg_rhythm_npy_precontrol = {
        'pre': ecg_rhythm_npy_pre,
        'control': ecg_rhythm_npy_control
    }
    
    # Waveform lookup df - pre
    fp = os.path.join(DATA_DIR, 'control-group/{}/pre/ecg-waveforms/waveform-npy.csv'.format(year))
    ecg_npy_df_pre = pd.read_csv(fp)
    print('Loaded {}: {}'.format(fp, ecg_npy_df_pre.shape))
    ecg_npy_df_pre['ecg_timetag'] = 'pre'
    
    # Waveform lookup df - control
    fp = os.path.join(DATA_DIR, 'control-group/{}/control/ecg-waveforms/waveform-npy.csv'.format(year))
    ecg_npy_df_control = pd.read_csv(fp)
    print('Loaded {}: {}'.format(fp, ecg_npy_df_control.shape))
    ecg_npy_df_control['ecg_timetag'] = 'control'
    
    ecg_npy_df = pd.concat([ecg_npy_df_pre, ecg_npy_df_control])

    return ecg_rhythm_npy_precontrol, ecg_npy_df

def ctrg_calculate_time_diff(df):
    """
    Calculate the time difference between control and pre acquisition_datetime_offset values
    for the same patient_ngsci_id and save it in a new column called time_diff.

    Parameters:
    df (pd.DataFrame): The input dataframe containing patient_ngsci_id, ecg_timetag, 
                       and acquisition_datetime_offset columns.

    Returns:
    pd.DataFrame: A new dataframe with patient_ngsci_id and time_diff columns.
    """
    # Filter rows based on ecg_timetag
    control_df = df[df['ecg_timetag'] == 'control']
    pre_df = df[df['ecg_timetag'] == 'pre']

    # Rename columns to avoid conflicts during merge
    control_df = control_df.rename(columns={'acquisition_datetime_offset': 'control_offset'})
    pre_df = pre_df.rename(columns={'acquisition_datetime_offset': 'pre_offset'})

    # Merge the dataframes on patient_ngsci_id
    merged_df = pd.merge(control_df, pre_df, on='patient_ngsci_id')

    # Calculate the difference and save in a new column
    merged_df['time_diff'] = merged_df['control_offset'] - merged_df['pre_offset']

    # Select relevant columns
    result_df = merged_df[['patient_ngsci_id', 'time_diff']]

    return result_df

In [4]:
# Load Control Group files
#control_group_df contains the data on ecg-cohort.csv
ctrg_ecg_cohort_df = load_control_group_outcomes()

ecg_rhythm_npy_precontrol_2015, ecg_npy_df_2015 = load_control_group_rhythm_ecgs('2015')

Loaded /home/ngsci/datasets/arrest-ntuh-ecg/v1/control-group/ecg-cohort.csv: (16386, 8)
Loaded /home/ngsci/datasets/arrest-ntuh-ecg/v1/control-group/2015/pre/ecg-waveforms/waveform-rhythm.npy: (1046, 12, 5000)
Loaded /home/ngsci/datasets/arrest-ntuh-ecg/v1/control-group/2015/control/ecg-waveforms/waveform-rhythm.npy: (1712, 12, 5000)
Loaded /home/ngsci/datasets/arrest-ntuh-ecg/v1/control-group/2015/pre/ecg-waveforms/waveform-npy.csv: (1046, 4)
Loaded /home/ngsci/datasets/arrest-ntuh-ecg/v1/control-group/2015/control/ecg-waveforms/waveform-npy.csv: (1712, 4)


In [5]:
ctrg_ecg_cohort_df.head(5)

Unnamed: 0,patient_ngsci_id,year,ecg_id,ecg_timetag,acquisition_datetime_offset,acquisition_datetime_is_weekend,age,sex
0,1510001362,2015,192717e3fd052706ce297b9d36b58354,control,2126-03-10 23:13:00,0,82,male
1,1510001362,2015,9232eb679d9ab57579440e89b12fb3c9,pre,2125-07-19 11:10:00,0,82,male
2,1510010000,2015,20fbe673da581eba4f6c363eef9557a9,control,2210-08-01 22:56:00,0,80,male
3,1510032359,2015,ab6a26e105ab852f49dd395c497b91e1,control,2171-12-26 21:11:00,0,74,female
4,1510032359,2015,64f8fa6e0718a00f87c0760063d94086,pre,2167-11-07 20:02:00,1,70,female


In [6]:
#Print and count the unique values of ecg type and patients
control_ecg_type_counts = ctrg_ecg_cohort_df['ecg_timetag'].value_counts()
print(control_ecg_type_counts)
print(ctrg_ecg_cohort_df.shape)
unique_patient_ids = ctrg_ecg_cohort_df['patient_ngsci_id'].nunique()
print(f"Number of unique patient is: {unique_patient_ids}")

ecg_timetag
control    9976
pre        6410
Name: count, dtype: int64
(16386, 8)
Number of unique patient is: 9976


In [7]:
#Calculate the minimum and maximum time from Pre-ECG and noCA-ECG

# Sort by patient_id first, then by ecg_timetag
#ctrg_ecg_cohort_df = ctrg_ecg_cohort_df.sort_values(by=['patient_ngsci_id','year','ecg_timetag']) #,'acquisition_datetime_offset'

#Calculate difference
ctrg_timeDiff_df = ctrg_calculate_time_diff(ctrg_ecg_cohort_df)

# Print the result
#print(result_df)

#print(ctrg_ecg_cohort_df.head(10))

max_Pre_noCA_time = ctrg_timeDiff_df['time_diff'].max()
min_Pre_noCA_time = ctrg_timeDiff_df['time_diff'].min()

#Checking which rows are the max and minimum 
#max_row_index = ctrg_timeDiff_df['time_diff'].idxmax()
#min_row_index = ctrg_timeDiff_df['time_diff'].idxmin()
#print(f'The index name of the max difference is: {max_row_index}')
#print(f'The index name of the min difference is: {min_row_index}')
#row_number_max = ctrg_timeDiff_df.index.get_loc(max_row_index)
#row_number_min = ctrg_timeDiff_df.index.get_loc(min_row_index)

#print(ctrg_timeDiff_df.iloc[row_number_max - 4 : row_number_max + 5])
#print(ctrg_timeDiff_df.iloc[row_number_min - 4 : row_number_min + 5])

print(f'Max # of days between PRE-ECG and no-CA event in the study group: {max_Pre_noCA_time}')
print(f'Min # of days between PRE-ECG and no-CA event in the study group: {min_Pre_noCA_time}')

#Working in a "histogram" of the PRE and ROSC ECG time difference:

more_than_day = ctrg_timeDiff_df['time_diff'][ctrg_timeDiff_df['time_diff'] > pd.Timedelta('1 days')]
more_than_week = ctrg_timeDiff_df['time_diff'][ctrg_timeDiff_df['time_diff'] > pd.Timedelta('7 days')]
more_than_month = ctrg_timeDiff_df['time_diff'][ctrg_timeDiff_df['time_diff'] > pd.Timedelta('30 days')]
more_than_6_month = ctrg_timeDiff_df['time_diff'][ctrg_timeDiff_df['time_diff'] > pd.Timedelta('180 days')]
more_than_year = ctrg_timeDiff_df['time_diff'][ctrg_timeDiff_df['time_diff'] > pd.Timedelta('365 days')]
less_than_0 = ctrg_timeDiff_df[['patient_ngsci_id','time_diff']][ctrg_timeDiff_df['time_diff'] < pd.Timedelta('0 seconds')]


# Get the count of time differences
num_more_than_day = len(more_than_day.unique())
num_more_than_week = len(more_than_week.unique())
num_more_than_month = len(more_than_month.unique())
num_more_than_6_month = len(more_than_6_month.unique())
num_more_than_year = len(more_than_year.unique())
num_less_than_0 = len(less_than_0['time_diff'].unique())

# Display the result
print(f"Number of PRE and no-CA ECG time difference more than one day: {num_more_than_day}")
print(f"Number of PRE and no-CA ECG time difference more than one week: {num_more_than_week}")
print(f"Number of PRE and no-CA ECG time difference more than one month: {num_more_than_month}")
print(f"Number of PRE and no-CA ECG time difference more than 6 months: {num_more_than_6_month}")
print(f"Number of PRE and no-CA ECG time difference more than a year: {num_more_than_year}")


print(f"Number of PRE and no-CA ECG time difference NEGATIVE: {num_less_than_0}")



Max # of days between PRE-ECG and no-CA event in the study group: 36199 days 00:00:00
Min # of days between PRE-ECG and no-CA event in the study group: -35766 days +00:00:00
Number of PRE and no-CA ECG time difference more than one day: 4949
Number of PRE and no-CA ECG time difference more than one week: 4711
Number of PRE and no-CA ECG time difference more than one month: 4264
Number of PRE and no-CA ECG time difference more than 6 months: 3108
Number of PRE and no-CA ECG time difference more than a year: 2548
Number of PRE and no-CA ECG time difference NEGATIVE: 1370


In [8]:
#Print some examples why the time difference is negative
print(ctrg_ecg_cohort_df[['patient_ngsci_id','ecg_timetag','acquisition_datetime_offset']][ctrg_ecg_cohort_df['patient_ngsci_id']==1819211246])
print(ctrg_ecg_cohort_df[['patient_ngsci_id','ecg_timetag','acquisition_datetime_offset']][ctrg_ecg_cohort_df['patient_ngsci_id']==1819241880])
print(ctrg_ecg_cohort_df[['patient_ngsci_id','ecg_timetag','acquisition_datetime_offset']][ctrg_ecg_cohort_df['patient_ngsci_id']==1819252849])
print(ctrg_ecg_cohort_df[['patient_ngsci_id','ecg_timetag','acquisition_datetime_offset']][ctrg_ecg_cohort_df['patient_ngsci_id']==1819261458])

       patient_ngsci_id ecg_timetag acquisition_datetime_offset
10429        1819211246     control                  2139-06-28
10430        1819211246         pre                  2145-09-28
       patient_ngsci_id ecg_timetag acquisition_datetime_offset
10433        1819241880     control                  2123-08-22
10434        1819241880         pre                  2205-09-23
       patient_ngsci_id ecg_timetag acquisition_datetime_offset
10435        1819252849     control                  2132-12-16
10436        1819252849         pre                  2164-03-31
       patient_ngsci_id ecg_timetag acquisition_datetime_offset
10437        1819261458     control                  2120-02-14
10438        1819261458         pre                  2218-01-17


In [9]:
less_than_0.head(5)

Unnamed: 0,patient_ngsci_id,time_diff
3554,1810061000,-20252 days
3556,1810111613,-3286 days
3557,1810131364,-2704 days
3560,1810182865,-11293 days
3562,1810221077,-1110 days


In [9]:
#Delete all time differences tha are negative
patients_ecgIDs_timeDiff_negative = ctrg_ecg_cohort_df[['patient_ngsci_id','ecg_id','ecg_timetag']]\
                                                        [ctrg_ecg_cohort_df['patient_ngsci_id']\
                                                         .isin(less_than_0['patient_ngsci_id'])]
#print(patients_ecgIDs_timeDiff_negative.head(5))
#print(patients_ecgIDs_timeDiff_negative.shape)

#Remove the control ECGs (non-SCA ECGs taken the same day of other SCA-ECG)
patients_ecgIDs_timeDiff_negative=patients_ecgIDs_timeDiff_negative[\
                                patients_ecgIDs_timeDiff_negative['ecg_timetag']=='pre']

#print(patients_ecgIDs_timeDiff_negative.head(5))
#print(patients_ecgIDs_timeDiff_negative.shape)

#Remove the patients_id from the main dataframe

ctrg_ecg_cohort_df=ctrg_ecg_cohort_df[~ctrg_ecg_cohort_df['patient_ngsci_id']\
                                      .isin(patients_ecgIDs_timeDiff_negative['patient_ngsci_id'])]
#print(ctrg_ecg_cohort_df.shape)
#print(ctrg_ecg_cohort_df.head(5))

KeyboardInterrupt: 

In [None]:
#Test there is no negative difference

#Recalculate difference
ctrg_timeDiff_df = ctrg_calculate_time_diff(ctrg_ecg_cohort_df)

max_Pre_noCA_time = ctrg_timeDiff_df['time_diff'].max()
min_Pre_noCA_time = ctrg_timeDiff_df['time_diff'].min()

print(f'Max # of days between PRE-ECG and no-CA event in the study group: {max_Pre_noCA_time}')
print(f'Min # of days between PRE-ECG and no-CA event in the study group: {min_Pre_noCA_time}')

less_than_0 = ctrg_timeDiff_df[['patient_ngsci_id','time_diff']][ctrg_timeDiff_df['time_diff'] < pd.Timedelta('0 seconds')]

# Get the count of time differences
num_less_than_0 = len(less_than_0['time_diff'].unique())

# Display the result
print(f"Number of PRE and no-CA ECG time difference NEGATIVE: {num_less_than_0}")

In [None]:
#Remove the control ECGs from the main dataframe (non-SCA ECGs taken the same day of other SCA-ECG)
ctrg_preECG_df=ctrg_ecg_cohort_df[['patient_ngsci_id','ecg_id','ecg_timetag','age','sex']]\
                                        [ctrg_ecg_cohort_df['ecg_timetag']=='pre']
print(ctrg_preECG_df.shape)
#print(ctrg_preECG_df.head(5))

#Keep pre-ecgs within 24h before next visit
#preECG_within_Ndays = ctrg_timeDiff_df[ctrg_timeDiff_df['time_diff'] <= pd.Timedelta('7 days')]

#Keep pre-ecgs within 30 days before next visit
preECG_within_Ndays = ctrg_timeDiff_df[ctrg_timeDiff_df['time_diff'] <= pd.Timedelta('30 days')]

#print(preECG_within_Ndays.head(5))
print(preECG_within_Ndays.shape)

ctrg_preECG_within_Ndays_df = ctrg_preECG_df[ctrg_preECG_df['patient_ngsci_id']\
                                             .isin(preECG_within_Ndays['patient_ngsci_id'])]
print(ctrg_preECG_within_Ndays_df.shape)
#print(ctrg_preECG_within_Ndays_df.head(5))

In [None]:
#save the filter in a csv file
#ctrg_preECG_within_Ndays_df.to_csv('fltr_ctrg_preECG_within_7days.csv', index=False)

#save the filter for 30 days control data in a csv file
ctrg_preECG_within_Ndays_df.to_csv('fltr_ctrg_preECG_30days.csv', index=False)