## Control Group Data

We explore the control group data below.

In [1]:
import pandas as pd
import os
import numpy as np
from functools import reduce

import matplotlib.pyplot as plt

In [2]:
DATA_DIR = os.path.join('/','home','ngsci','datasets','arrest-ntuh-ecg','v1')
LEAD_ORDER = ['I', 'II', 'III',
              'aVR', 'aVL', 'aVF',
              'V1', 'V2', 'V3', 'V4', 'V5', 'V6']
LEAD_INDEX = {lead: i for i, lead in enumerate(LEAD_ORDER)}
ECG_CAPTURE_TIME = 10  # Each ECG is taken for 10 seconds
ECG_SAMPLE_RATE = 500 # Each ECG is sampled at 500 Hz

In [3]:
#Functions

def load_outcomes_df(fp):
    '''Load outcomes'''
    df = pd.read_csv(fp)

    # Convert to datetime
    date_cols = [c for c in df.columns if '_offset' in c]
    for c in date_cols:
        df[c] = pd.to_datetime(df[c], format='ISO8601')

    print('Loaded {}: {}'.format(fp, df.shape))
    return df

def load_control_group_outcomes():
    # Load Control Group files
    ecg_cohort_fp = os.path.join(DATA_DIR, 'control-group','ecg-cohort.csv')
    ecg_cohort_df = load_outcomes_df(ecg_cohort_fp)

    return ecg_cohort_df

def load_control_group_rhythm_ecgs(year):
    '''Load Control Group rhythm ECGs npy and lookup CSV'''
    # Waveforms - pre
    fp = os.path.join(DATA_DIR, 'control-group/{}/pre/ecg-waveforms/waveform-rhythm.npy'.format(year))
    ecg_rhythm_npy_pre = np.load(fp)
    print('Loaded {}: {}'.format(fp, ecg_rhythm_npy_pre.shape))
    
    # Waveforms - control
    fp = os.path.join(DATA_DIR, 'control-group/{}/control/ecg-waveforms/waveform-rhythm.npy'.format(year))
    ecg_rhythm_npy_control = np.load(fp)
    print('Loaded {}: {}'.format(fp, ecg_rhythm_npy_control.shape))
    
    # Dictionary to store both 'pre' and 'control' waveforms
    ecg_rhythm_npy_precontrol = {
        'pre': ecg_rhythm_npy_pre,
        'control': ecg_rhythm_npy_control
    }
    
    # Waveform lookup df - pre
    fp = os.path.join(DATA_DIR, 'control-group/{}/pre/ecg-waveforms/waveform-npy.csv'.format(year))
    ecg_npy_df_pre = pd.read_csv(fp)
    print('Loaded {}: {}'.format(fp, ecg_npy_df_pre.shape))
    ecg_npy_df_pre['ecg_timetag'] = 'pre'
    
    # Waveform lookup df - control
    fp = os.path.join(DATA_DIR, 'control-group/{}/control/ecg-waveforms/waveform-npy.csv'.format(year))
    ecg_npy_df_control = pd.read_csv(fp)
    print('Loaded {}: {}'.format(fp, ecg_npy_df_control.shape))
    ecg_npy_df_control['ecg_timetag'] = 'control'
    
    ecg_npy_df = pd.concat([ecg_npy_df_pre, ecg_npy_df_control])

    return ecg_rhythm_npy_precontrol, ecg_npy_df

def load_control_group_ecg_metadata(year, control_or_pre, file):
    '''Load Control Group ecg-metadata file
    Inputs = year, control_or_pre, file
    Output = The file as dataframe'''
    #Get the file path (fp)
    ecg_metadata_fp = os.path.join(DATA_DIR,'control-group',year, \
                                   control_or_pre,'ecg-metadata',file)
    #Read the file on the file path
    ecg_metadata_df = pd.read_csv(ecg_metadata_fp,low_memory=False)
    print('Loaded {}: {}'.format(ecg_metadata_fp, ecg_metadata_df.shape))
    
    return  ecg_metadata_df

def find_unique_values_counts(dataframe, columns):
    unique_values_counts = {}
    for column in columns:
        unique_values_counts[column] = dataframe[column].value_counts().to_dict()
    return unique_values_counts


def print_unique_values_counts(df, columns_to_analyze):
    # Get unique values and their counts
    unique_values_counts = find_unique_values_counts(df, columns_to_analyze)
    
    #print the number of unique ecgs in the dataframe
    num_unique_ecg_ids = df['ecg_id'].nunique()
    print(f"Number of different 'ecg_ids' is:{num_unique_ecg_ids}")
    
    # Print the results
    for column, values_counts in unique_values_counts.items():
        print(f"Column: {column}")
        for value, count in values_counts.items():
            print(f"Value: {value}, Count: {count}")
        print("\n")

def load_control_group_waveforms_attributes(year, control_or_pre):
    '''Load Study Group Waveform Attribute Table'''
    #Get the file path (fp)
    waveforms_attributes_fp = os.path.join(DATA_DIR, 'control-group',year,control_or_pre,\
                                           'ecg-waveforms','waveform.csv')
    #Read the file on the file path
    waveform_attributes_df = pd.read_csv(waveforms_attributes_fp)
    print('Loaded {}: {}'.format(waveforms_attributes_fp, waveform_attributes_df.shape))
     
    return waveform_attributes_df

def transform_to_single_row(df):
    """
    Transforms a dataframe with columns 'ecg_id', 'sample', 'ecg_timetag', and 'lead_meas'
    to a single row dataframe where 'lead_meas' values become column names.

    Parameters:
    df (pd.DataFrame): Input dataframe with columns 'ecg_id', 'sample', 'ecg_timetag', and 'lead_meas'.

    Returns:
    pd.DataFrame: Transformed dataframe with a single row.
    """
    # Get the unique values for ecg_id and ecg_timetag
    ecg_id_value = df['ecg_id'].iloc[0]
    ecg_timetag_value = df['ecg_timetag'].iloc[0]

    # Pivot the dataframe
    pivot_df = df.pivot(index='ecg_id', columns='lead_meas', values='sample')

    # Reset the index to turn the index into a column
    pivot_df.reset_index(inplace=True)

    # Add the ecg_timetag column
    pivot_df['ecg_timetag'] = ecg_timetag_value

    return pivot_df

import pandas as pd

def check_missing_values(df):
    """
    Check for missing values in the dataframe and print the count of missing values in each column.

    Parameters:
    df (pd.DataFrame): The dataframe to check for missing values.

    Returns:
    None
    """
    # Check for missing values in the dataframe
    missing_values = df.isnull().any()

    # Print the result
    print("Columns with missing values:")
    print(missing_values)
    
    # Check the count of missing values in each column
    missing_values_count = df.isnull().sum()
    print("\nCount of missing values in each column:")
    print(missing_values_count)

def convert_sex_column(df):
    """
    Converts the values in the 'sex' column of the dataframe.
    Changes 'female' to 0 and 'male' to 1.

    Parameters:
    df (pd.DataFrame): Input dataframe with a 'sex' column.

    Returns:
    pd.DataFrame: Dataframe with converted 'sex' column.
    """
    # Mapping dictionary
    sex_mapping = {'female': 0, 'male': 1}
    
    # Apply the mapping
    df['sex'] = df['sex'].map(sex_mapping)
    
    return df

def one_hot_encode_sex(df):
    # Create a new DataFrame with one-hot encoded columns
    encoded_df = pd.get_dummies(df, columns=['sex'])
    
    # Rename the encoded columns for clarity
    encoded_df = encoded_df.rename(columns={'sex_female': 'female', 'sex_male': 'male'})
    
    return encoded_df

def convert_TrueFalse_column(df, column):
    """
    Converts the values in the column of the dataframe.
    Changes 'True' to 1 and 'False' to 0.

    Parameters:
    df (pd.DataFrame): Input dataframe with a column.

    Returns:
    pd.DataFrame: Dataframe with converted column.
    """
    # Mapping dictionary
    mapping = {False: 0, True: 1}
    
    # Apply the mapping
    df[column] = df[column].map(mapping)
    
    return df

def is_column_numeric(df, column_name):
    """
    Check if a column in a DataFrame contains only numeric values.

    Args:
        df (pd.DataFrame): The DataFrame to check.
        column_name (str): The name of the column to check.

    Returns:
        bool: True if the column contains only numeric values, False otherwise.
    """
    # Check if the column exists in the DataFrame
    if column_name not in df.columns:
        raise ValueError(f"Column '{column_name}' not found in DataFrame")

    # Check if the column contains only numeric values
    return pd.api.types.is_numeric_dtype(df[column_name]) and not df[column_name].isnull().any()

def replace_age_values(df):
    """
    Replace '90+' values in the 'age' column with 95.

    Args:
        df (pd.DataFrame): The DataFrame to modify.

    Returns:
        pd.DataFrame: The modified DataFrame.
    """
    # Check if the 'age' column exists in the DataFrame
    if 'age' not in df.columns:
        raise ValueError("Column 'age' not found in DataFrame")

    # Replace '90+' values with 95
    df['age'] = df['age'].replace('90+', 95)

    return df

In [4]:
# Load Control Group files
control_group_df = load_control_group_outcomes()

ecg_rhythm_npy_precontrol_2015, ecg_npy_df_2015 = load_control_group_rhythm_ecgs('2015')

Loaded /home/ngsci/datasets/arrest-ntuh-ecg/v1/control-group/ecg-cohort.csv: (16386, 8)
Loaded /home/ngsci/datasets/arrest-ntuh-ecg/v1/control-group/2015/pre/ecg-waveforms/waveform-rhythm.npy: (1046, 12, 5000)
Loaded /home/ngsci/datasets/arrest-ntuh-ecg/v1/control-group/2015/control/ecg-waveforms/waveform-rhythm.npy: (1712, 12, 5000)
Loaded /home/ngsci/datasets/arrest-ntuh-ecg/v1/control-group/2015/pre/ecg-waveforms/waveform-npy.csv: (1046, 4)
Loaded /home/ngsci/datasets/arrest-ntuh-ecg/v1/control-group/2015/control/ecg-waveforms/waveform-npy.csv: (1712, 4)


In [5]:
control_group_df.head(5)

Unnamed: 0,patient_ngsci_id,year,ecg_id,ecg_timetag,acquisition_datetime_offset,acquisition_datetime_is_weekend,age,sex
0,1510001362,2015,192717e3fd052706ce297b9d36b58354,control,2126-03-10 23:13:00,0,82,male
1,1510001362,2015,9232eb679d9ab57579440e89b12fb3c9,pre,2125-07-19 11:10:00,0,82,male
2,1510010000,2015,20fbe673da581eba4f6c363eef9557a9,control,2210-08-01 22:56:00,0,80,male
3,1510032359,2015,ab6a26e105ab852f49dd395c497b91e1,control,2171-12-26 21:11:00,0,74,female
4,1510032359,2015,64f8fa6e0718a00f87c0760063d94086,pre,2167-11-07 20:02:00,1,70,female


In [6]:
#Print and count the unique values of ecg type and patients
control_ecg_type_counts = control_group_df['ecg_timetag'].value_counts()
print(control_ecg_type_counts)
print(control_group_df.shape)
unique_patient_ids = control_group_df['patient_ngsci_id'].nunique()
print(f"Number of unique patient is: {unique_patient_ids}")

ecg_timetag
control    9976
pre        6410
Name: count, dtype: int64
(16386, 8)
Number of unique patient is: 9976


In [7]:
#Save the control group years
ctrg_years = {'2015', '2016', '2017', '2018', '2019'}

# List of columns to analyze
columns_to_analyze = ['waveform_type', 'waveform_start_time', 'number_of_leads', 
                      'sample_type', 'sample_base', 'sample_exponent', 
                      'highpass_filter', 'lowpass_filter', 'ac_filter']

#Print all counts in all years
for year in ctrg_years:
    #Load study group attributes
    waveform_attributes_ctrg_df = load_control_group_waveforms_attributes(year,'pre')
    #print_unique_values_counts(waveform_attributes_ctrg_df, columns_to_analyze)

Loaded /home/ngsci/datasets/arrest-ntuh-ecg/v1/control-group/2018/pre/ecg-waveforms/waveform.csv: (2820, 12)
Loaded /home/ngsci/datasets/arrest-ntuh-ecg/v1/control-group/2015/pre/ecg-waveforms/waveform.csv: (2092, 12)
Loaded /home/ngsci/datasets/arrest-ntuh-ecg/v1/control-group/2019/pre/ecg-waveforms/waveform.csv: (2894, 12)
Loaded /home/ngsci/datasets/arrest-ntuh-ecg/v1/control-group/2017/pre/ecg-waveforms/waveform.csv: (2538, 12)
Loaded /home/ngsci/datasets/arrest-ntuh-ecg/v1/control-group/2016/pre/ecg-waveforms/waveform.csv: (2476, 12)


In [8]:
#Save all measuremnt data per lead

#Load Control Group ECG-metadata measurement-matrix-per-lead
columns_to_analyze=['lead_id','measurement_id', 'ttal', 'stdown', 
                    'stelev', 'jelev', 'dltwv', 'stinj', 'ppdeep']
#Change lead_id values
lead_id_mapping = {       #Define the mapping from numeric values to lead names
    0: 'I',
    1: 'II',
    2: 'V1',
    3: 'V2',
    4: 'V3',
    5: 'V4',
    6: 'V5',
    7: 'V6',
    8: 'III',
    9: 'AVR',
    10: 'AVL',
    11: 'AVF' }

#Change measurement_id values
meas_id_mapping = {   #Define the mapping from numeric values to lead names
    0: 'PONA',
    1: 'PAMP',
    2: 'PDUR',
    3: 'bmPAR',
    4: 'bmPI',
    5: 'P’AMP',
    6: 'P’DUR',
    7: 'bmPPAR',
    8: 'bmPPI',
    9: 'QAMP',
    10: 'QDUR',
    11: 'bmQAR',
    12: 'bmQI',
    13: 'RAMP',
    14: 'RDUR',
    15: 'bmRAR',
    16: 'bmRI',
    17: 'SAMP',
    18: 'SDUR',
    19: 'bmSAR',
    20: 'bmSI',
    21: 'R’AMP',
    22: 'R’DUR',
    23: 'bmRPAR',
    24: 'bmRPI',
    25: 'S’AMP',
    26: 'S’DUR',
    27: 'bmSPAR',
    28: 'bmSPI',
    29: 'STJ',
    30: 'STM',
    31: 'STE',
    32: 'MXSTA',
    33: 'MNSTA',
    34: 'SPTA',
    35: 'QRSA',
    36: 'QRSDEF',
    37: 'MAXRA',
    38: 'MAXSA',
    39: 'TAMP',
    40: 'TDUR',
    41: 'bmTAR',
    42: 'bmTI',
    43: 'T’AMP',
    44: 'TPDUR',
    45: 'bmTPAR',
    46: 'bmTPI',
    47: 'TEND',
    48: 'PAREA',
    49: 'QRSAR',
    50: 'TAREA',
    51: 'QRSINT',
    52: 'BITFLG'
}

# Define the columns to keep
columns_to_keep = ['ecg_id', 
                   'lead_id', 
                   'ttal', 
                   'stdown', 
                   'stelev', 
                   'jelev', 
                   'dltwv', 
                   'stinj', 
                   'ppdeep', 
                   'ecg_timetag']

measurements_years_per_lead = pd.DataFrame()

# Create the partial and the final output dataframe for merging results
ctrg_measurements_years_byRows_df = pd.DataFrame()

ctrg_years = {'2015', '2016', '2017', '2018', '2019'}


for year in ctrg_years:
    ctrg_measurements_byColumns_df = load_control_group_ecg_metadata(year,'pre',\
                                                'measurement-matrix-per-lead.csv')
    #Print unique values counts
    #print_unique_values_counts(ctrg_measurementsFull_byColumns_df, columns_to_analyze)

    # Apply the mapping to the lead_id column in the dataframe
    ctrg_measurements_byColumns_df['lead_id'] = ctrg_measurements_byColumns_df.loc[:,'lead_id'].map(lead_id_mapping)
    # Apply the mapping to the measurement_id column in the dataframe
    ctrg_measurements_byColumns_df['measurement_id'] = ctrg_measurements_byColumns_df.loc[:,'measurement_id'].map(meas_id_mapping)
    
    #Keep measurements applicable to each Lead
        # Select the specified columns and create the new dataframe
    measurements_per_lead_partial = ctrg_measurements_byColumns_df[columns_to_keep].dropna() #Concatenate all at the end
    #Concatenate the partial measurements per lead. 
    measurements_years_per_lead = pd.concat([measurements_years_per_lead, measurements_per_lead_partial], ignore_index=True)
    
    
    # Create the new column by concatenating 'lead_id' and 'measurement_id' with an underscore
    ctrg_measurements_byColumns_df['lead_meas'] = ctrg_measurements_byColumns_df['lead_id'].astype(str) + \
                                            '_' + ctrg_measurements_byColumns_df['measurement_id'].astype(str)
    
    # Drop the original columns 'lead_id' and 'measurement_id' and others
    ctrg_measurements_byColumns_df.drop(columns=['lead_id', 'measurement_id', 'ttal', 'stdown', 'stelev', \
                                            'jelev', 'dltwv', 'stinj', 'ppdeep','8'], inplace=True)
    
    # Group the dataframe by 'ecg_id' and apply the transformation function to each group
    ctrg_measurements_byRows_df = pd.DataFrame() #Create the dataframe to group all the partial results
    for ecg_id, group in ctrg_measurements_byColumns_df.groupby('ecg_id'):
        single_row_df = transform_to_single_row(group)
        ctrg_measurements_byRows_df = pd.concat([ctrg_measurements_byRows_df, single_row_df], ignore_index=True)
    
    # Drop columns that contain 'BITFLG' in their names
    ctrg_measurements_byRows_df = ctrg_measurements_byRows_df.loc[:, ~ctrg_measurements_byRows_df.columns.str.contains('BITFLG')]

    #Group dataframes by year
    ctrg_measurements_years_byRows_df = pd.concat([ctrg_measurements_years_byRows_df, ctrg_measurements_byRows_df], ignore_index=True)

print(ctrg_measurements_years_byRows_df.shape)

Loaded /home/ngsci/datasets/arrest-ntuh-ecg/v1/control-group/2018/pre/ecg-metadata/measurement-matrix-per-lead.csv: (896760, 14)
Loaded /home/ngsci/datasets/arrest-ntuh-ecg/v1/control-group/2015/pre/ecg-metadata/measurement-matrix-per-lead.csv: (665256, 14)
Loaded /home/ngsci/datasets/arrest-ntuh-ecg/v1/control-group/2019/pre/ecg-metadata/measurement-matrix-per-lead.csv: (920292, 14)
Loaded /home/ngsci/datasets/arrest-ntuh-ecg/v1/control-group/2017/pre/ecg-metadata/measurement-matrix-per-lead.csv: (807084, 14)
Loaded /home/ngsci/datasets/arrest-ntuh-ecg/v1/control-group/2016/pre/ecg-metadata/measurement-matrix-per-lead.csv: (787368, 14)
(6410, 626)


In [9]:
ctrg_measurements_years_byRows_df.head(5)

lead_meas,ecg_id,AVF_MAXRA,AVF_MAXSA,AVF_MNSTA,AVF_MXSTA,AVF_PAMP,AVF_PAREA,AVF_PDUR,AVF_PONA,AVF_P’AMP,...,V6_bmRPI,V6_bmSAR,V6_bmSI,V6_bmSPAR,V6_bmSPI,V6_bmTAR,V6_bmTI,V6_bmTPAR,V6_bmTPI,ecg_timetag
0,00736c9de5c6359324b2636812d15fb9,410,209,19,39,117,291,100,29,0,...,0,80,60,0,0,1803,156,0,0,pre
1,00c4c3d0de9ff8a9a6a3715565ebef58,200,820,-35,-5,161,375,79,87,-24,...,0,948,62,0,0,-311,102,0,0,pre
2,00c50db07e6f2432b540d1906ecc60db,297,473,-103,48,117,168,63,-10,-34,...,60,12,24,196,94,10,8,-11,14,pre
3,00d3c4e729566f7c94805928a967e0a6,678,48,14,29,131,333,108,63,0,...,0,34,54,0,0,1199,106,0,0,pre
4,0147e10410e6fa98df66bfc36cd7dc82,561,166,-5,97,141,388,104,53,0,...,0,294,50,0,0,1894,84,0,0,pre


In [10]:
#Load the filter
fltr_ctrg_preECG_within_Ndays = pd.read_csv('fltr_ctrg_preECG_within_7days.csv')

#Merge main data with filter
crtg_meas_preECGs_wNdays = pd.merge(fltr_ctrg_preECG_within_Ndays, ctrg_measurements_years_byRows_df, on=['ecg_id', 'ecg_timetag'])
print(crtg_meas_preECGs_wNdays.shape)
print(crtg_meas_preECGs_wNdays.head(5))

#Delete some columns 
#crtg_meas_preECGs_wNdays=crtg_meas_preECGs_wNdays.drop(columns=['patient_ngsci_id','ecg_timetag'])
#print(crtg_meas_preECGs_wNdays.shape)
#print(crtg_meas_preECGs_wNdays.head(5))


(247, 629)
   patient_ngsci_id                            ecg_id ecg_timetag age     sex   
0        1510681000  7ae0c39d4faf61818ab8f2e77bdff1b2         pre  63  female  \
1        1510761001  46bec794a0506fdd53769efa7592df77         pre  56    male   
2        1511071007  00d4f1578b4d8c4a330ef83b24d659cf         pre  65  female   
3        1511161003  31bb0a7cff7aa23d0f24cb5c328d102a         pre  61    male   
4        1511211004  6b9123b3ff7b0e0564383262376f4e69         pre  74  female   

  AVF_MAXRA AVF_MAXSA AVF_MNSTA AVF_MXSTA AVF_PAMP  ... V6_bmRPAR V6_bmRPI   
0       507        53       -44        34      112  ...         0        0  \
1       488       346       -25        14       48  ...         0        0   
2       205       219       -35         9      112  ...         0        0   
3       493        14         4        14      112  ...         0        0   
4       517       249       -35         0       53  ...         0        0   

  V6_bmSAR V6_bmSI V6_bmSPAR V6_b

In [11]:
#Change the name of a column
crtg_meas_preECGs_wNdays = crtg_meas_preECGs_wNdays.rename(columns={'patient_ngsci_id': 'patient_id'})
#print(stg_meas_both_preECGs_w24h.head(5))

#Keep leads II and V1 only
# select columns that contain 'I' or 'V1' and also include ecg_id, patient_id, sex, and age
ctrg_preECGs_wNdays = crtg_meas_preECGs_wNdays[
    crtg_meas_preECGs_wNdays.columns[
        (crtg_meas_preECGs_wNdays.columns.str.startswith('I_') | 
         crtg_meas_preECGs_wNdays.columns.str.startswith('V1_')) | 
        crtg_meas_preECGs_wNdays.columns.isin(['ecg_id', 'patient_id', 'sex', 'age'])]]
# Display the new dataframe
#print(ctrg_preECGs_wNdays.head(5))

In [12]:
check_missing_values(ctrg_preECGs_wNdays)

Columns with missing values:
patient_id    False
ecg_id        False
age           False
sex           False
I_MAXRA       False
              ...  
V1_bmSPI      False
V1_bmTAR      False
V1_bmTI       False
V1_bmTPAR     False
V1_bmTPI      False
Length: 108, dtype: bool

Count of missing values in each column:
patient_id    0
ecg_id        0
age           0
sex           0
I_MAXRA       0
             ..
V1_bmSPI      0
V1_bmTAR      0
V1_bmTI       0
V1_bmTPAR     0
V1_bmTPI      0
Length: 108, dtype: int64


In [13]:
#Keep these variables only. PAMP, PDUR, bmPAR, bmPI, QAMP, QDUR, bmQAR, bmQI, RAMP, RDUR, bmRAR, bmRI
#SAMP, SDUR, bmSAR, bmSI, QRSA, QRSDEF
columns_I = ['I_PAMP', 
             'I_PDUR', 
             'I_QAMP', 
             'I_QDUR', 
             'I_RAMP',\
            'I_RDUR', 
             'I_SAMP', 
             'I_SDUR', 
             'I_QRSA', 
             'I_QRSDEF']

columns_V1 = ['V1_PAMP', 
              'V1_PDUR', 
              'V1_QAMP', 
              'V1_QDUR', 
              'V1_RAMP',\
              'V1_RDUR', 
              'V1_SAMP', 
              'V1_SDUR', 
              'V1_QRSA', 
              'V1_QRSDEF']

columns_to_keep = ['ecg_id', 'patient_id', 'sex', 'age']+columns_I+columns_V1

ctrg_preECGs_wNdays = ctrg_preECGs_wNdays[columns_to_keep]
#print(OHCA_preECGs_w24h.head(5))
ctrg_preECGs_wNdays['label']=0
#print(ctrg_preECGs_wNdays.head(5))

#ctrg_preECGs_wNdays = convert_sex_column(ctrg_preECGs_wNdays)
#print(ctrg_preECGs_wNdays.head(5))

In [14]:
#Hot-code sex variable
ctrg_preECGs_wNdays = one_hot_encode_sex(ctrg_preECGs_wNdays)
#print(ctrg_preECGs_wNdays.head(5))
#Save the results in a CSV file. 
#ctrg_preECGs_wNdays.to_csv('data_ctrg_preECGs_wNdays.csv', index=False)

In [15]:
#Change the female and male to numeric variables
convert_TrueFalse_column(ctrg_preECGs_wNdays, 'female')
convert_TrueFalse_column(ctrg_preECGs_wNdays, 'male')
#print(ctrg_preECGs_wNdays.head(5))

Unnamed: 0,ecg_id,patient_id,age,I_PAMP,I_PDUR,I_QAMP,I_QDUR,I_RAMP,I_RDUR,I_SAMP,...,V1_QDUR,V1_RAMP,V1_RDUR,V1_SAMP,V1_SDUR,V1_QRSA,V1_QRSDEF,label,female,male
0,7ae0c39d4faf61818ab8f2e77bdff1b2,1510681000,63,87,68,0,0,1142,43,170,...,0,405,28,1235,46,-830,1640,0,1,0
1,46bec794a0506fdd53769efa7592df77,1510761001,56,87,62,58,20,688,44,0,...,0,341,25,1655,65,-1314,1996,0,0,1
2,00d4f1578b4d8c4a330ef83b24d659cf,1511071007,65,97,110,0,0,473,78,0,...,0,29,14,747,64,-718,776,0,1,0
3,31bb0a7cff7aa23d0f24cb5c328d102a,1511161003,61,122,104,0,0,546,60,151,...,0,131,27,1523,57,-1392,1654,0,0,1
4,6b9123b3ff7b0e0564383262376f4e69,1511211004,74,97,96,0,0,615,74,0,...,74,0,0,0,0,-1025,1025,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
242,3cfa8ae12254ae7359549de455cfbf38,1736521811,58,63,114,0,0,1049,53,219,...,0,92,25,644,71,-552,736,0,1,0
243,92c9bd1594c9eb7bd0877d68e22907ef,1736681824,50,73,104,0,0,537,70,0,...,0,83,22,517,48,-434,600,0,1,0
244,e106ee1122b4fbc1841b5c3d3ec31d3c,1736971846,68,112,106,0,0,1098,67,151,...,0,380,53,180,12,200,560,0,0,1
245,48b23d5c40098a378465b4f30a8cfdb4,1737081843,64,146,108,24,12,1240,68,0,...,0,48,16,869,64,-821,917,0,1,0


In [16]:
#Check if all ages are numeric
#numeric_test=is_column_numeric(ctrg_preECGs_wNdays, 'age')
#print(numeric_test)
ctrg_preECGs_wNdays=replace_age_values(ctrg_preECGs_wNdays)
#numeric_test2=is_column_numeric(ctrg_preECGs_wNdays, 'age')
#print(numeric_test2)


#Save the results in a CSV file.
#ctrg_preECGs_wNdays.to_csv('data_ctrg_preECGs_wNdays.csv', index=False)

#Print final shape
print(ctrg_preECGs_wNdays.shape)

(247, 26)
