In [1]:
import pandas as pd
import os
import numpy as np
from functools import reduce

import matplotlib.pyplot as plt

In [2]:
DATA_DIR = os.path.join('/','home','ngsci','datasets','arrest-ntuh-ecg','v1')
LEAD_ORDER = ['I', 'II', 'III',
              'aVR', 'aVL', 'aVF',
              'V1', 'V2', 'V3', 'V4', 'V5', 'V6']
LEAD_INDEX = {lead: i for i, lead in enumerate(LEAD_ORDER)}
ECG_CAPTURE_TIME = 10  # Each ECG is taken for 10 seconds
ECG_SAMPLE_RATE = 500 # Each ECG is sampled at 500 Hz

## Study Group Data

We explore the study group data below.

In [3]:
def load_outcomes_df(fp):
    '''Load outcomes. Use this when the csv file has date type data'''
    df = pd.read_csv(fp)

    # Convert to datetime
    date_cols = [c for c in df.columns if '_offset' in c]
    for c in date_cols:
        df[c] = pd.to_datetime(df[c], format='ISO8601')

    print('Loaded {}: {}'.format(fp, df.shape))
    return df

def load_study_group_rhythm_ecgs():
    '''Load Study Group rhythm ECGs npy and lookup CSV'''
    ecg_rhythm_fp = os.path.join(DATA_DIR, 'study-group','ecg-waveforms','waveform-rhythm.npy')
    waveform_rhythm_npy = np.load(ecg_rhythm_fp)
    print('Loaded {}: {}'.format(ecg_rhythm_fp, waveform_rhythm_npy.shape))
    
    ecg_lookup_fp = os.path.join(DATA_DIR, 'study-group','ecg-waveforms','waveform-npy.csv')
    waveform_npy_df = pd.read_csv(ecg_lookup_fp)
    print('Loaded {}: {}'.format(ecg_lookup_fp, waveform_npy_df.shape))
    
    return waveform_rhythm_npy, waveform_npy_df

def load_study_group_outcomes():
    # Load all Study Group files
    cohort_fp = os.path.join(DATA_DIR, 'study-group','cohort.csv')
    cohort_df = load_outcomes_df(cohort_fp)

    comorbidities_fp = os.path.join(DATA_DIR, 'study-group','comorbidities.csv')
    comorbidities_df = load_outcomes_df(comorbidities_fp)

    ecg_fp = os.path.join(DATA_DIR, 'study-group','ecg.csv')
    ecg_df = load_outcomes_df(ecg_fp)

    rosc_outcomes_fp = os.path.join(DATA_DIR, 'study-group','rosc-outcomes.csv')
    rosc_outcomes_df = load_outcomes_df(rosc_outcomes_fp)

    study_group_df = reduce(lambda left,right: pd.merge(left, right, on=['patient_ngsci_id', 'year', 'ecg_id'],
                                                        how='outer'), [cohort_df, comorbidities_df, ecg_df, rosc_outcomes_df])
    print('`study_group_df`: {}'.format(study_group_df.shape))
    study_group_df_lookup = {
        'study_group_df': study_group_df,
        'cohort_df': cohort_df,
        'comorbidities_df': comorbidities_df,
        'ecg_df': ecg_df,
        'rosc_outcomes_df': rosc_outcomes_df
    }
    return study_group_df_lookup

def load_study_group_waveforms_attributes():
    '''Load Study Group Waveform Attribute Table'''
    #Get the file path (fp)
    waveforms_attributes_fp = os.path.join(DATA_DIR, 'study-group','ecg-waveforms','waveform.csv')
    #Read the file on the file path
    waveform_attributes_df = pd.read_csv(waveforms_attributes_fp)
    print('Loaded {}: {}'.format(waveforms_attributes_fp, waveform_attributes_df.shape))
     
    return waveform_attributes_df

def find_unique_values_counts(dataframe, columns):
    unique_values_counts = {}
    for column in columns:
        unique_values_counts[column] = dataframe[column].value_counts().to_dict()
    return unique_values_counts

def load_study_group_ecg_metadata(file):
    '''Load Study Group ecg-metadata file'''
    #Get the file path (fp)
    ecg_metadata_fp = os.path.join(DATA_DIR, 'study-group','ecg-metadata',file)
    #Read the file on the file path
    ecg_metadata_df = pd.read_csv(ecg_metadata_fp,low_memory=False)
    print('Loaded {}: {}'.format(ecg_metadata_fp, ecg_metadata_df.shape))
    
    return  ecg_metadata_df

def print_unique_values_counts(df, columns_to_analyze):
    # Get unique values and their counts
    unique_values_counts = find_unique_values_counts(df, columns_to_analyze)
    
    #print the number of unique ecgs in the dataframe
    num_unique_ecg_ids = df['ecg_id'].nunique()
    print(f"Number of different 'ecg_ids' is:{num_unique_ecg_ids}")
    
    # Print the results
    for column, values_counts in unique_values_counts.items():
        print(f"Column: {column}")
        for value, count in values_counts.items():
            print(f"Value: {value}, Count: {count}")
        print("\n")
        
def transform_to_single_row(df):
    """
    Transforms a dataframe with columns 'ecg_id', 'sample', 'ecg_timetag', and 'lead_meas'
    to a single row dataframe where 'lead_meas' values become column names.

    Parameters:
    df (pd.DataFrame): Input dataframe with columns 'ecg_id', 'sample', 'ecg_timetag', and 'lead_meas'.

    Returns:
    pd.DataFrame: Transformed dataframe with a single row.
    """
    # Get the unique values for ecg_id and ecg_timetag
    ecg_id_value = df['ecg_id'].iloc[0]
    ecg_timetag_value = df['ecg_timetag'].iloc[0]

    # Pivot the dataframe
    pivot_df = df.pivot(index='ecg_id', columns='lead_meas', values='sample')

    # Reset the index to turn the index into a column
    pivot_df.reset_index(inplace=True)

    # Add the ecg_timetag column
    pivot_df['ecg_timetag'] = ecg_timetag_value

    return pivot_df

def one_hot_encode_sex(df):
    # Create a new DataFrame with one-hot encoded columns
    encoded_df = pd.get_dummies(df, columns=['sex'])
    
    # Rename the encoded columns for clarity
    encoded_df = encoded_df.rename(columns={'sex_female': 'female', 'sex_male': 'male'})
    
    return encoded_df

def convert_TrueFalse_column(df, column):
    """
    Converts the values in the column of the dataframe.
    Changes 'True' to 1 and 'False' to 0.

    Parameters:
    df (pd.DataFrame): Input dataframe with a column.

    Returns:
    pd.DataFrame: Dataframe with converted column.
    """
    # Mapping dictionary
    mapping = {False: 0, True: 1}
    
    # Apply the mapping
    df[column] = df[column].map(mapping)
    
    return df

def is_column_numeric(df, column_name):
    """
    Check if a column in a DataFrame contains only numeric values.

    Args:
        df (pd.DataFrame): The DataFrame to check.
        column_name (str): The name of the column to check.

    Returns:
        bool: True if the column contains only numeric values, False otherwise.
    """
    # Check if the column exists in the DataFrame
    if column_name not in df.columns:
        raise ValueError(f"Column '{column_name}' not found in DataFrame")

    # Check if the column contains only numeric values
    return pd.api.types.is_numeric_dtype(df[column_name]) and not df[column_name].isnull().any()

def replace_age_values(df):
    """
    Replace '90+' values in the 'age' column with 95.

    Args:
        df (pd.DataFrame): The DataFrame to modify.

    Returns:
        pd.DataFrame: The modified DataFrame.
    """
    # Check if the 'age' column exists in the DataFrame
    if 'age' not in df.columns:
        raise ValueError("Column 'age' not found in DataFrame")

    # Replace '90+' values with 95
    df['age'] = df['age'].replace('90+', 95)

    return df

In [4]:
#Read ECG data from study group
ecg_study_npy, ecg_study_df = load_study_group_rhythm_ecgs()

Loaded /home/ngsci/datasets/arrest-ntuh-ecg/v1/study-group/ecg-waveforms/waveform-rhythm.npy: (1686, 12, 5000)
Loaded /home/ngsci/datasets/arrest-ntuh-ecg/v1/study-group/ecg-waveforms/waveform-npy.csv: (1686, 4)


In [5]:
#Read outcome data from study group
study_group_df_lookup = load_study_group_outcomes()

Loaded /home/ngsci/datasets/arrest-ntuh-ecg/v1/study-group/cohort.csv: (1686, 9)
Loaded /home/ngsci/datasets/arrest-ntuh-ecg/v1/study-group/comorbidities.csv: (1686, 11)
Loaded /home/ngsci/datasets/arrest-ntuh-ecg/v1/study-group/ecg.csv: (1686, 7)
Loaded /home/ngsci/datasets/arrest-ntuh-ecg/v1/study-group/rosc-outcomes.csv: (1686, 37)
`study_group_df`: (1686, 55)


In [6]:
print(ecg_study_df.head(5))

                             ecg_id     year  npy_index ecg_timetag
0  4d2b35fb8850b75dd3f6644978e542b5  2011-14          0     [1]ROSC
1  ea01a56650768f3f41d255fe546e27a9  2011-14          1     [1]ROSC
2  e4a8dfe722c4686695265586d8b5b6bc  2011-14          2      [0]pre
3  bfae87358404a1fb0f2ee6e71296f399  2011-14          3     [1]ROSC
4  041608622e1cbdad60f80257bae3ab61  2011-14          4     [2]24hr


In [7]:
#Print number of unique patients in the Study group
print('# of unique patients: {:,}'.format(len(study_group_df_lookup['cohort_df'][['patient_ngsci_id']].drop_duplicates())))

# of unique patients: 974


In [8]:
#Get and check ECG info
study_ecg_info_df=study_group_df_lookup['ecg_df']
#delete long-so far unecessary information
study_ecg_info_df=study_ecg_info_df.drop(['acquisition_datetime_is_weekend','rosc'],axis=1)
print(study_ecg_info_df.head(5))
print(study_ecg_info_df.shape)

   patient_ngsci_id     year                            ecg_id   
0            100000  2011-14  4d2b35fb8850b75dd3f6644978e542b5  \
1            100001  2011-14  ea01a56650768f3f41d255fe546e27a9   
2            100001  2011-14  e380efe5cc3211b5137f5f6b156f6201   
3            100001  2011-14  fc42a07f6d54ac0db90adf19f710fd02   
4            100002  2011-14  30849ba48b0418b50852bba62ddc4f71   

  acquisition_datetime_offset ecg_timetag  
0         2123-02-07 22:21:00     [1]ROSC  
1         2115-11-22 06:57:00     [1]ROSC  
2         2115-11-23 14:18:00     [2]24hr  
3         2115-11-22 05:45:00      [0]pre  
4         2122-10-06 13:33:00     [1]ROSC  
(1686, 5)


In [9]:
#Print and count the unique values of ecg type.
study_ecg_type_counts = study_ecg_info_df['ecg_timetag'].value_counts()
study_rosc_ecg_np = study_ecg_type_counts[0]
study_pre_ecg_np = study_ecg_type_counts[1]
study_24hr_ecg_np = study_ecg_type_counts[2]

print(f'# of just-after ROSC ECGs is {study_rosc_ecg_np} in the study group')
print(f'# of pre-cardiac arrest ECGs is {study_pre_ecg_np} in the study group')
print(f'# of 24h-after ROSC ECGs is {study_24hr_ecg_np} in the study group')

# of just-after ROSC ECGs is 838 in the study group
# of pre-cardiac arrest ECGs is 605 in the study group
# of 24h-after ROSC ECGs is 243 in the study group


In [10]:
#Load study group attributes
waveform_attributes_stg_df = load_study_group_waveforms_attributes()

# List of columns to analyze
columns_to_analyze = ['waveform_type', 'waveform_start_time', 'number_of_leads', 
                      'sample_type', 'sample_base', 'sample_exponent', 
                      'highpass_filter', 'lowpass_filter', 'ac_filter']

print_unique_values_counts(waveform_attributes_stg_df, columns_to_analyze)

Loaded /home/ngsci/datasets/arrest-ntuh-ecg/v1/study-group/ecg-waveforms/waveform.csv: (3372, 12)
Number of different 'ecg_ids' is:1686
Column: waveform_type
Value: Median, Count: 1686
Value: Rhythm, Count: 1686


Column: waveform_start_time
Value: 0, Count: 3372


Column: number_of_leads
Value: 8, Count: 3372


Column: sample_type
Value: CONTINUOUS_SAMPLES, Count: 3372


Column: sample_base
Value: 500, Count: 3372


Column: sample_exponent
Value: 0, Count: 3372


Column: highpass_filter
Value: 0.32, Count: 2840
Value: 0.16, Count: 496
Value: 0.56, Count: 36


Column: lowpass_filter
Value: 150, Count: 3372


Column: ac_filter
Value: 60, Count: 3366
Value: NONE, Count: 6




In [11]:
#Load Study Group ECG-metadata amplitude measurements
stg_amplitude_measurements_df = load_study_group_ecg_metadata('amplitude-measurements.csv')

#Columns of the amplitude measurements dataframe
columns_to_analyze=['lead_id', 'amplitude_measurement_beat_id', 'amplitude_measurement_wave_id', 
                    'amplitude_measurement_mode']

#Print unique values counts
print_unique_values_counts(stg_amplitude_measurements_df, columns_to_analyze)

Loaded /home/ngsci/datasets/arrest-ntuh-ecg/v1/study-group/ecg-metadata/amplitude-measurements.csv: (475661, 11)
Number of different 'ecg_ids' is:1616
Column: lead_id
Value: III, Count: 41093
Value: AVL, Count: 40817
Value: V2, Count: 39773
Value: V1, Count: 39723
Value: AVF, Count: 39601
Value: I, Count: 39572
Value: V5, Count: 39476
Value: V3, Count: 39379
Value: V4, Count: 39325
Value: V6, Count: 39230
Value: II, Count: 39068
Value: AVR, Count: 38478
Value: A1, Count: 27
Value: A2, Count: 27
Value: A3, Count: 27
Value: X, Count: 15
Value: Y, Count: 15
Value: Z, Count: 15


Column: amplitude_measurement_beat_id
Value: 3, Count: 158663
Value: 4, Count: 137964
Value: 2, Count: 133302
Value: 5, Count: 25169
Value: 1, Count: 20563


Column: amplitude_measurement_wave_id
Value: IE_STJ, Count: 58203
Value: IE_STJ40, Count: 58203
Value: IE_STJ60, Count: 58203
Value: IE_STJ80, Count: 58203
Value: IE_T, Count: 56083
Value: IE_R, Count: 53591
Value: IE_S, Count: 40033
Value: IE_P, Count: 35442

In [12]:
#Load Study Group ECG-metadata amplitude measurements
stg_diagnosis_df = load_study_group_ecg_metadata('diagnosis.csv')

#Print unique values counts
columns_to_analyze=['modality','stmt_flag', 'stmt_text']
#print_unique_values_counts(stg_diagnosis_df, columns_to_analyze)

#Get the ECG diagnosis of the pre-ECGs only
stg_preECG_diagnosis_df = stg_diagnosis_df[stg_diagnosis_df['ecg_timetag'] == '[0]pre']

print(stg_preECG_diagnosis_df.head(5))
print(stg_preECG_diagnosis_df.shape)

#print_unique_values_counts(stg_preECG_diagnosis_df, columns_to_analyze)

Loaded /home/ngsci/datasets/arrest-ntuh-ecg/v1/study-group/ecg-metadata/diagnosis.csv: (6974, 6)
                              ecg_id     year modality   stmt_flag   
8   e4a8dfe722c4686695265586d8b5b6bc  2011-14  RESTING  USERINSERT  \
9   e4a8dfe722c4686695265586d8b5b6bc  2011-14  RESTING  USERINSERT   
10  e4a8dfe722c4686695265586d8b5b6bc  2011-14  RESTING  USERINSERT   
25  0ae546edc7a3c6c8e2b5f5a3b75b8e04  2011-14  RESTING  USERINSERT   
26  0ae546edc7a3c6c8e2b5f5a3b75b8e04  2011-14  RESTING  USERINSERT   

                               stmt_text ecg_timetag  
8   Sinus rhythmwith 1st degree AV block      [0]pre  
9             Nonspecific ST abnormality      [0]pre  
10                          Abnormal ECG      [0]pre  
25                     Junctional rhythm      [0]pre  
26    Inferior infarct, age undetermined      [0]pre  
(2195, 6)


In [13]:
#Get the ecg-ids that have a normal ECG diagnosis
#condition=stg_diagnosis_df['ecg_timetag'] == '[0]pre'

normal_ecg_ids = stg_preECG_diagnosis_df[stg_preECG_diagnosis_df['stmt_text'] == 'Normal ECG']['ecg_id']
#print(normal_ecg_ids.shape)
#print(normal_ecg_ids.head(5))

# Filter stg_preECG_diagnosis_df to include only rows where ecg_id is in normal_ecg_ids
stg_full_normal_preECGs_df = stg_preECG_diagnosis_df[stg_preECG_diagnosis_df['ecg_id'].isin(normal_ecg_ids)]
print(stg_full_normal_preECGs_df.shape)
print(stg_full_normal_preECGs_df.head(5))
#Save the results.
#stg_full_normal_preECGs_df.to_csv('stg_full_normal_preECGs.csv', index=False)

(161, 6)
                               ecg_id     year modality   stmt_flag   
173  42c7a3d733574af9636e10768048e761  2011-14  RESTING  USERINSERT  \
174  42c7a3d733574af9636e10768048e761  2011-14  RESTING  USERINSERT   
190  f9804c1208c684c162e1c37952ebc7a0  2011-14  RESTING  USERINSERT   
191  f9804c1208c684c162e1c37952ebc7a0  2011-14  RESTING  USERINSERT   
267  b6f4a6c0cdf562ec5def3d9e7d9099fb  2011-14  RESTING  USERINSERT   

               stmt_text ecg_timetag  
173  Normal sinus rhythm      [0]pre  
174           Normal ECG      [0]pre  
190  Normal sinus rhythm      [0]pre  
191           Normal ECG      [0]pre  
267  Normal sinus rhythm      [0]pre  


In [14]:
#Save all measuremnt data per lead

#Load Study Group ECG-metadata measurement-matrix-per-lead
stg_measurementsFull_byColumns_df = load_study_group_ecg_metadata('measurement-matrix-per-lead.csv')

#Print unique values counts
columns_to_analyze=['lead_id','measurement_id', 'ttal', 'stdown', 
                    'stelev', 'jelev', 'dltwv', 'stinj', 'ppdeep']

#Delete ECGs with strange 255 value in lead_id and measurement_id
stg_measurements_byColumns_df = stg_measurementsFull_byColumns_df[stg_measurementsFull_byColumns_df['lead_id'] != 255]
#print_unique_values_counts(stg_measurements_byColumns_df, columns_to_analyze)

#Change lead_id values
lead_id_mapping = {       #Define the mapping from numeric values to lead names
    0: 'I',
    1: 'II',
    2: 'V1',
    3: 'V2',
    4: 'V3',
    5: 'V4',
    6: 'V5',
    7: 'V6',
    8: 'III',
    9: 'AVR',
    10: 'AVL',
    11: 'AVF' }

# Apply the mapping to the lead_id column in the dataframe
stg_measurements_byColumns_df['lead_id'] = stg_measurements_byColumns_df.loc[:,'lead_id'].map(lead_id_mapping)

#Change measurement_id values
meas_id_mapping = {   #Define the mapping from numeric values to lead names
    0: 'PONA',
    1: 'PAMP',
    2: 'PDUR',
    3: 'bmPAR',
    4: 'bmPI',
    5: 'P’AMP',
    6: 'P’DUR',
    7: 'bmPPAR',
    8: 'bmPPI',
    9: 'QAMP',
    10: 'QDUR',
    11: 'bmQAR',
    12: 'bmQI',
    13: 'RAMP',
    14: 'RDUR',
    15: 'bmRAR',
    16: 'bmRI',
    17: 'SAMP',
    18: 'SDUR',
    19: 'bmSAR',
    20: 'bmSI',
    21: 'R’AMP',
    22: 'R’DUR',
    23: 'bmRPAR',
    24: 'bmRPI',
    25: 'S’AMP',
    26: 'S’DUR',
    27: 'bmSPAR',
    28: 'bmSPI',
    29: 'STJ',
    30: 'STM',
    31: 'STE',
    32: 'MXSTA',
    33: 'MNSTA',
    34: 'SPTA',
    35: 'QRSA',
    36: 'QRSDEF',
    37: 'MAXRA',
    38: 'MAXSA',
    39: 'TAMP',
    40: 'TDUR',
    41: 'bmTAR',
    42: 'bmTI',
    43: 'T’AMP',
    44: 'TPDUR',
    45: 'bmTPAR',
    46: 'bmTPI',
    47: 'TEND',
    48: 'PAREA',
    49: 'QRSAR',
    50: 'TAREA',
    51: 'QRSINT',
    52: 'BITFLG'
}

# Apply the mapping to the measurement_id column in the dataframe
stg_measurements_byColumns_df['measurement_id'] = stg_measurements_byColumns_df.loc[:,'measurement_id'].map(meas_id_mapping)
# Display the updated dataframe
#print(stg_measurements_byColumns_df)

#Keep measurements applicable to each Lead
# Define the columns to keep
columns_to_keep = ['ecg_id', 'lead_id', 'ttal', 'stdown', 'stelev', 'jelev', 'dltwv', 'stinj', 'ppdeep', 'ecg_timetag']

# Select the specified columns and create the new dataframe
measurements_per_lead = stg_measurements_byColumns_df[columns_to_keep].dropna()
# Display the new dataframe
#print(measurements_per_lead.head(25))

# Create the new column by concatenating 'lead_id' and 'measurement_id' with an underscore
stg_measurements_byColumns_df['lead_meas'] = stg_measurements_byColumns_df['lead_id'].astype(str) + \
                                            '_' + stg_measurements_byColumns_df['measurement_id'].astype(str)

# Drop the original columns 'lead_id' and 'measurement_id' and others
stg_measurements_byColumns_df.drop(columns=['year','lead_id', 'measurement_id', 'ttal', 'stdown', 'stelev', \
                                            'jelev', 'dltwv', 'stinj', 'ppdeep','8'], inplace=True)

# Display the updated dataframe
print(stg_measurements_byColumns_df)

Loaded /home/ngsci/datasets/arrest-ntuh-ecg/v1/study-group/ecg-metadata/measurement-matrix-per-lead.csv: (1069756, 14)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stg_measurements_byColumns_df['lead_id'] = stg_measurements_byColumns_df.loc[:,'lead_id'].map(lead_id_mapping)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stg_measurements_byColumns_df['measurement_id'] = stg_measurements_byColumns_df.loc[:,'measurement_id'].map(meas_id_mapping)


                                   ecg_id    sample ecg_timetag   lead_meas
0        4d2b35fb8850b75dd3f6644978e542b5        14     [1]ROSC      I_PONA
1        4d2b35fb8850b75dd3f6644978e542b5        29     [1]ROSC      I_PAMP
2        4d2b35fb8850b75dd3f6644978e542b5        82     [1]ROSC      I_PDUR
3        4d2b35fb8850b75dd3f6644978e542b5        48     [1]ROSC     I_bmPAR
4        4d2b35fb8850b75dd3f6644978e542b5        56     [1]ROSC      I_bmPI
...                                   ...       ...         ...         ...
1069751  04018ca836b61e6e49290c590d0c3335         0     [1]ROSC   AVF_PAREA
1069752  04018ca836b61e6e49290c590d0c3335     -1080     [1]ROSC   AVF_QRSAR
1069753  04018ca836b61e6e49290c590d0c3335       460     [1]ROSC   AVF_TAREA
1069754  04018ca836b61e6e49290c590d0c3335        94     [1]ROSC  AVF_QRSINT
1069755  04018ca836b61e6e49290c590d0c3335  b'.\x00'     [1]ROSC  AVF_BITFLG

[1069752 rows x 4 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stg_measurements_byColumns_df['lead_meas'] = stg_measurements_byColumns_df['lead_id'].astype(str) + \
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stg_measurements_byColumns_df.drop(columns=['year','lead_id', 'measurement_id', 'ttal', 'stdown', 'stelev', \


In [15]:
# Group the dataframe by 'ecg_id' and apply the transformation function to each group

# Create the output dataframe for merging results
stg_measurements_byRows_df = pd.DataFrame()

for ecg_id, group in stg_measurements_byColumns_df.groupby('ecg_id'):
    single_row_df = transform_to_single_row(group)
    stg_measurements_byRows_df = pd.concat([stg_measurements_byRows_df, single_row_df], ignore_index=True)

# Drop columns that contain 'BITFLG' in their names
stg_measurements_byRows_df = stg_measurements_byRows_df.loc[:, ~stg_measurements_byRows_df.columns.str.contains('BITFLG')]

#print(stg_measurements_byRows_df.head(2))
#Get only ECG measurements for pre-SCA

# Filter the rows where ecg_timetag equals '[0]pre'
stg_measurements_preECGs_df = stg_measurements_byRows_df[stg_measurements_byRows_df['ecg_timetag'] == '[0]pre']

# Print the resulting dataframe to verify
print(stg_measurements_preECGs_df.shape)
print(stg_measurements_preECGs_df.head(5))

(604, 626)
lead_meas                            ecg_id AVF_MAXRA AVF_MAXSA AVF_MNSTA   
3          0059a263438c8bfba6a1eaf56c0ddd93       976        43        24  \
4          00b1d88f70c487706ce9f862d8bfeae5       112         0       -10   
6          00cdaddb43f4caf5683d4517e47390d6       332       141       -74   
9          016c710cdd1c94821f9c4accc222d5db        48       258        19   
11         0195f94679da47685400b710f3891a92       781         0        39   

lead_meas AVF_MXSTA AVF_PAMP AVF_PAREA AVF_PDUR AVF_PONA AVF_P’AMP  ...   
3                63       92       174       68       53         0  ...  \
4                53      107       270      110       48         0  ...   
6                48       83       189      102        9         0  ...   
9                34       78       160      114       19         0  ...   
11               63       83       240      118       43         0  ...   

lead_meas V6_bmRPI V6_bmSAR V6_bmSI V6_bmSPAR V6_bmSPI V6_bmTAR V6_bmTI   


In [16]:
#Filter the data
#Read the filters
fltr_IHCA_preECG_within24h = pd.read_csv('fltr_IHCA_preECG_within24h.csv')
fltr_OHCA_preECG_within24h = pd.read_csv('fltr_OHCA_preECG_within24h.csv')
fltr_both_preECG_within24h = pd.concat([fltr_IHCA_preECG_within24h, fltr_OHCA_preECG_within24h], ignore_index=True)
                        
print(fltr_both_preECG_within24h.shape)
#fltr_both_preECG_within24h.head(5)
                                      
#Apply filters
stg_meas_IHCA_preECGs_w24h= pd.merge(stg_measurements_preECGs_df,fltr_IHCA_preECG_within24h,\
                                     on=['ecg_id', 'ecg_timetag'])
#print(stg_meas_IHCA_preECGs_w24h.shape)
#stg_meas_IHCA_preECGs_w24h.head(5)

stg_meas_OHCA_preECGs_w24h= pd.merge(stg_measurements_preECGs_df,fltr_OHCA_preECG_within24h,\
                                     on=['ecg_id', 'ecg_timetag'])
print(stg_meas_OHCA_preECGs_w24h.shape)
print(stg_meas_OHCA_preECGs_w24h.head(5))

stg_meas_both_preECGs_w24h=pd.merge(stg_measurements_preECGs_df,fltr_both_preECG_within24h,\
                                     on=['ecg_id', 'ecg_timetag'])
#print(stg_meas_both_preECGs_w24h.shape)
#stg_meas_both_preECGs_w24h.head(5)

(160, 8)
(19, 632)
                             ecg_id AVF_MAXRA AVF_MAXSA AVF_MNSTA AVF_MXSTA   
0  05fa300d950b78940fa60cd8ddb0a845       244       146        24       214  \
1  104702b301e22d70dee15936c8f94376         0      1601      -196       -54   
2  1fa73043ee5f75c015ff715f337cfa5e       751       502       -44        87   
3  27107c4158e3729bf68b4438d5dbc45e        78       673      -250      -215   
4  30cc765d728586ae8f0ee76d1f371202      1342       166       -54       -10   

  AVF_PAMP AVF_PAREA AVF_PDUR AVF_PONA AVF_P’AMP  ... V6_bmTI V6_bmTPAR   
0        0         0        0        0         0  ...       8         0  \
1        0         0        0        0         0  ...     140         0   
2        0         0        0        0         0  ...      48         0   
3       58        33       40     -113         0  ...      70         0   
4      136       432      114       14         0  ...       0         0   

  V6_bmTPI ecg_timetag patient_ngsci_id location_of_car

In [17]:
#Change the name of a column
stg_meas_OHCA_preECGs_w24h = stg_meas_OHCA_preECGs_w24h.rename(columns={'patient_ngsci_id': 'patient_id'})
#print(stg_meas_both_preECGs_w24h.head(5))

#Keep leads II and V1 only
# select columns that contain 'I' or 'V1' and also include ecg_id, patient_id, sex, and age
OHCA_preECGs_w24h = stg_meas_OHCA_preECGs_w24h[
    stg_meas_OHCA_preECGs_w24h.columns[
        (stg_meas_OHCA_preECGs_w24h.columns.str.startswith('I_') | 
         stg_meas_OHCA_preECGs_w24h.columns.str.startswith('V1_')) | 
        stg_meas_OHCA_preECGs_w24h.columns.isin(['ecg_id', 'patient_id', 'sex', 'age'])]]
# Display the new dataframe
#print(OHCA_preECGs_w24h.head(5))

In [18]:
#Keep these variables only. PAMP, PDUR, bmPAR, bmPI, QAMP, QDUR, bmQAR, bmQI, RAMP, RDUR, bmRAR, bmRI
#SAMP, SDUR, bmSAR, bmSI, QRSA, QRSDEF
columns_I = ['I_PAMP', 'I_PDUR', 'I_QAMP', 'I_QDUR', 'I_RAMP',\
                     'I_RDUR', 'I_SAMP', 'I_SDUR', 'I_QRSA', 'I_QRSDEF']

columns_V1 = ['V1_PAMP', 'V1_PDUR', 'V1_QAMP', 'V1_QDUR', 'V1_RAMP',\
                     'V1_RDUR', 'V1_SAMP', 'V1_SDUR', 'V1_QRSA', 'V1_QRSDEF']

columns_to_keep = ['ecg_id', 'patient_id', 'sex', 'age']+columns_I+columns_V1

OHCA_preECGs_w24h=OHCA_preECGs_w24h[columns_to_keep]
#print(OHCA_preECGs_w24h.head(5))
OHCA_preECGs_w24h['label']=1
#print(OHCA_preECGs_w24h.head(5))

#Hot-code sex variable
OHCA_preECGs_w24h = one_hot_encode_sex(OHCA_preECGs_w24h)
print(OHCA_preECGs_w24h.head(5))

                             ecg_id  patient_id age I_PAMP I_PDUR I_QAMP   
0  05fa300d950b78940fa60cd8ddb0a845      101200  82      0      0      0  \
1  104702b301e22d70dee15936c8f94376      101158  82      0      0      0   
2  1fa73043ee5f75c015ff715f337cfa5e      101279  25      0      0      0   
3  27107c4158e3729bf68b4438d5dbc45e      100834  58     34     40      0   
4  30cc765d728586ae8f0ee76d1f371202      100753  37     87    114      0   

  I_QDUR I_RAMP I_RDUR I_SAMP  ... V1_QDUR V1_RAMP V1_RDUR V1_SAMP V1_SDUR   
0      0     48     19    253  ...       0      68      23     180      31  \
1      0    493    105      0  ...      20       0       0       0       0   
2      0    112    100      0  ...       0     473      40     668      60   
3      0    170     44    800  ...       0    1250     132       0       0   
4      0    385     73      0  ...       0     131      22    1674      55   

  V1_QRSA V1_QRSDEF label female   male  
0     352       712     1   True

In [19]:
#Change the female and male to numeric variables
convert_TrueFalse_column(OHCA_preECGs_w24h, 'female')
convert_TrueFalse_column(OHCA_preECGs_w24h, 'male')
print(OHCA_preECGs_w24h.head(5))
#Save the results in a CSV file. 
#OHCA_preECGs_w24h.to_csv('data_OHCA_preECGs_w24h.csv', index=False)

                             ecg_id  patient_id age I_PAMP I_PDUR I_QAMP   
0  05fa300d950b78940fa60cd8ddb0a845      101200  82      0      0      0  \
1  104702b301e22d70dee15936c8f94376      101158  82      0      0      0   
2  1fa73043ee5f75c015ff715f337cfa5e      101279  25      0      0      0   
3  27107c4158e3729bf68b4438d5dbc45e      100834  58     34     40      0   
4  30cc765d728586ae8f0ee76d1f371202      100753  37     87    114      0   

  I_QDUR I_RAMP I_RDUR I_SAMP  ... V1_QDUR V1_RAMP V1_RDUR V1_SAMP V1_SDUR   
0      0     48     19    253  ...       0      68      23     180      31  \
1      0    493    105      0  ...      20       0       0       0       0   
2      0    112    100      0  ...       0     473      40     668      60   
3      0    170     44    800  ...       0    1250     132       0       0   
4      0    385     73      0  ...       0     131      22    1674      55   

  V1_QRSA V1_QRSDEF label female male  
0     352       712     1      1  

In [20]:
#Check if all ages are numeric
#numeric_test=is_column_numeric(OHCA_preECGs_w24h, 'age')
#print(numeric_test)
OHCA_preECGs_w24h=replace_age_values(OHCA_preECGs_w24h)
#numeric_test2=is_column_numeric(OHCA_preECGs_w24h, 'age')
#print(numeric_test2)


#Save the results in a CSV file.
#OHCA_preECGs_w24h.to_csv('data_OHCA_preECGs_w24h.csv', index=False)
OHCA_preECGs_w24h

Unnamed: 0,ecg_id,patient_id,age,I_PAMP,I_PDUR,I_QAMP,I_QDUR,I_RAMP,I_RDUR,I_SAMP,...,V1_QDUR,V1_RAMP,V1_RDUR,V1_SAMP,V1_SDUR,V1_QRSA,V1_QRSDEF,label,female,male
0,05fa300d950b78940fa60cd8ddb0a845,101200,82,0,0,0,0,48,19,253,...,0,68,23,180,31,352,712,1,1,0
1,104702b301e22d70dee15936c8f94376,101158,82,0,0,0,0,493,105,0,...,20,0,0,0,0,-19,19,1,1,0
2,1fa73043ee5f75c015ff715f337cfa5e,101279,25,0,0,0,0,112,100,0,...,0,473,40,668,60,-195,1141,1,0,1
3,27107c4158e3729bf68b4438d5dbc45e,100834,58,34,40,0,0,170,44,800,...,0,1250,132,0,0,1250,1250,1,0,1
4,30cc765d728586ae8f0ee76d1f371202,100753,37,87,114,0,0,385,73,0,...,0,131,22,1674,55,-1543,1805,1,1,0
5,4587987abd6e66d3d316ecdceb5cc23a,101239,95,34,120,14,18,380,43,117,...,0,781,128,0,0,781,781,1,0,1
6,472a68020a2f7c3930affaeeb5c1acdf,100068,83,0,0,0,0,190,41,107,...,0,43,19,229,21,254,712,1,0,1
7,605dcb0ab9192ff72f93ab96337ea7fa,101265,72,126,106,0,0,742,99,0,...,0,219,44,1425,57,-1206,1644,1,1,0
8,685c6130656dc0bc4fa581481f9f2986,101054,75,58,74,0,0,551,59,0,...,0,146,21,815,50,-669,961,1,0,1
9,726ef9d0d0498a6a6b5dbf49a9ff9ac0,100835,79,0,0,0,0,83,27,419,...,15,1162,147,0,0,1133,1191,1,0,1
