In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as skl

  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [3]:
# Load the per-recording metadata table. The file is semicolon-separated,
# its first row holds the column names and its first column becomes the index.
# Location used previously on Colab:
#   /content/drive/MyDrive/Colab Notebooks/coorteeqsrafva.csv
info_csv_path = '/Users/kris/Library/CloudStorage/OneDrive-Personal/Data science/applications of data science/assessment 3/ECG Data/coorteeqsrafva.csv'
info_df = pd.read_csv(
    info_csv_path,
    sep=';',
    header=0,
    index_col=0,
)
# Report the table size; the trailing expression displays the column names.
print(info_df.shape)
info_df.columns

(6428, 30)


Index(['diagnosi', 'ecg_id', 'ritmi', 'patient_id', 'age', 'sex', 'height',
       'weight', 'nurse', 'site', 'device', 'recording_date', 'report',
       'scp_codes', 'heart_axis', 'infarction_stadium1', 'infarction_stadium2',
       'validated_by', 'second_opinion', 'initial_autogenerated_report',
       'validated_by_human', 'baseline_drift', 'static_noise', 'burst_noise',
       'electrodes_problems', 'extra_beats', 'pacemaker', 'strat_fold',
       'filename_lr', 'filename_hr'],
      dtype='object')

In [4]:
# Encode the three rhythm labels in the `ritmi` column as integer class ids.
rhythm_codes = {'SR': 0, 'AF': 1, 'VA': 2}

# Only encode while the column still holds the raw string labels. `Series.map`
# turns every value that is not a key of the dict into NaN, so running this
# cell a second time would otherwise wipe out the already-encoded 0/1/2 labels.
if not pd.api.types.is_numeric_dtype(info_df['ritmi']):
    info_df['ritmi'] = info_df['ritmi'].map(rhythm_codes)

In [5]:
# Keep the rhythm label together with the quality / annotation columns that
# are used further down to screen out recordings with quality problems.
quality_columns = [
    'ritmi', 'validated_by_human', 'baseline_drift', 'static_noise',
    'burst_noise', 'electrodes_problems', 'extra_beats', 'pacemaker',
    'infarction_stadium1', 'infarction_stadium2', 'second_opinion',
]

# `case` stores the 0-based row position of every recording in info_df.
# Missing entries in any of the selected columns are replaced by 0.
ecg_class = (
    info_df[quality_columns]
    .assign(case=np.arange(len(info_df)))
    .fillna(0)
)
ecg_class

Unnamed: 0,ritmi,validated_by_human,baseline_drift,static_noise,burst_noise,electrodes_problems,extra_beats,pacemaker,infarction_stadium1,infarction_stadium2,second_opinion,case
0,2,False,0,0,0,0,0,0,0,0,False,0
1,1,False,0,0,0,0,0,0,0,0,False,1
2,0,True,0,0,0,0,0,0,0,0,False,2
3,2,True,0,0,0,0,0,0,0,0,False,3
4,2,False,0,", I-AVR,",0,0,0,0,0,0,False,4
...,...,...,...,...,...,...,...,...,...,...,...,...
6423,2,True,0,0,0,0,0,0,0,0,False,6423
6424,2,True,"v3,",0,0,0,2ES,0,Stadium III,0,False,6424
6425,0,True,0,", I-AVR,",0,0,0,0,0,0,False,6425
6426,1,True,0,0,0,0,2ES,0,0,0,False,6426


In [6]:
from sklearn.utils import resample

# Build a balanced two-class table: rows with ritmi == 0 are randomly
# downsampled to the number of rows with ritmi == 1. Rows with any other
# ritmi value are not carried over into df_downsampled.
df_majority = ecg_class[ecg_class.ritmi == 0]
df_minority = ecg_class[ecg_class.ritmi == 1]

# Downsample majority class
df_majority_downsampled = resample(
    df_majority,
    replace=False,                # sample without replacement
    n_samples=len(df_minority),   # derived from the data instead of a hard-coded count
    random_state=123,             # reproducible results
)

# Combine minority class with downsampled majority class
df_downsampled = pd.concat([df_majority_downsampled, df_minority])

# Display new class counts
df_downsampled.ritmi.value_counts()

0    1587
1    1587
Name: ritmi, dtype: int64

In [7]:
# Keep only human-validated class 0 / class 1 recordings with no quality or
# annotation flag set and no second opinion.
# NOTE(review): af_sinus_df is not referenced by the later cells visible in
# this notebook (labels and signals are taken from df_downsampled) - confirm
# whether the filtered table was meant to be used instead.
zero_flag_columns = [
    'baseline_drift', 'static_noise', 'burst_noise', 'electrodes_problems',
    'extra_beats', 'pacemaker', 'infarction_stadium1', 'infarction_stadium2',
]

keep_row = df_downsampled['ritmi'].isin([0, 1])
keep_row = keep_row & (df_downsampled['validated_by_human'] == True)
for flag_column in zero_flag_columns:
    # A flag column counts as "clean" only where its value equals 0.
    keep_row = keep_row & (df_downsampled[flag_column] == 0)
keep_row = keep_row & (df_downsampled['second_opinion'] == False)

af_sinus_df = df_downsampled[keep_row]

af_sinus_df.shape

(1090, 12)

In [8]:
# Display the class-balanced table built from the ritmi == 0 and ritmi == 1 rows.
df_downsampled

Unnamed: 0,ritmi,validated_by_human,baseline_drift,static_noise,burst_noise,electrodes_problems,extra_beats,pacemaker,infarction_stadium1,infarction_stadium2,second_opinion,case
4354,0,False,0,0,0,0,0,0,0,0,False,4354
4345,0,True,0,0,0,0,0,0,0,0,False,4345
589,0,True,0,0,0,0,0,0,0,0,False,589
4327,0,False,0,", I-AVR,",0,0,0,0,unknown,0,False,4327
5830,0,True,", II-V2",", I-V3,",0,0,0,0,0,0,False,5830
...,...,...,...,...,...,...,...,...,...,...,...,...
6405,1,True,0,0,0,0,0,0,0,0,False,6405
6408,1,False,0,", I-AVF,",0,0,0,0,unknown,0,False,6408
6415,1,False,0,0,0,0,0,0,Stadium II-III,0,False,6415
6419,1,False,0,0,0,0,0,0,0,0,False,6419


In [9]:
# Label table: a one-column DataFrame holding the integer `ritmi` class of
# every row of df_downsampled, in the same row order.
y = df_downsampled.loc[:, ['ritmi']]

In [10]:
# Display the label table.
y

Unnamed: 0,ritmi
4354,0
4345,0
589,0
4327,0
5830,0
...,...
6405,1
6408,1
6415,1
6419,1


In [12]:
# Load the ECG signal array from its .npy file and report its shape.
ecg_npy_path = '/Users/kris/Library/CloudStorage/OneDrive-Personal/Data science/applications of data science/assessment 3/ECG Data/ecgeq-500hzsrfava.npy'
ecg = np.load(ecg_npy_path)
print(ecg.shape)

(6428, 5000, 12)


In [13]:
# Shorten the ECG strips: keep the first 2000 entries along axis 1 and the
# first 3 entries along axis 2.
# NOTE(review): axes are presumably (recording, time sample, lead) - confirm.
strip_length = 2000
channel_count = 3

ecg = ecg[:, :strip_length, :channel_count]

ecg.shape

(6428, 2000, 3)

In [14]:
# Row positions (the `case` column) of the recordings kept after balancing.
selected_indices = df_downsampled['case'].to_numpy()

# Pick the matching recordings along the first axis of the ECG array; the
# result follows the row order of df_downsampled.
x = ecg[selected_indices, :, :]

# Report the shape of the selected signal array.
print("Selected ECG NumPy array shape:", x.shape)

Selected ECG NumPy array shape: (3174, 2000, 3)


In [15]:
# Write the selected ECG signals (a NumPy array) to a .npy file.
x_out_path = '/Users/kris/Library/CloudStorage/OneDrive-Personal/Data science/applications of data science/assessment 3/ECG Data/x_data.npy'
np.save(x_out_path, x)

# Write the labels to CSV without the index column.
y_out_path = '/Users/kris/Library/CloudStorage/OneDrive-Personal/Data science/applications of data science/assessment 3/ECG Data/y_labels.csv'
y.to_csv(y_out_path, index=False)