# Pre-Processing: Phonocardiograms

Author: Jake Dumbauld <br>
Contact: jacobmilodumbauld@gmail.com<br>
Date: 3.15.22

In [1]:
# importing libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import librosa
import librosa.display
import IPython

import os
from random import uniform
import time
from IPython.display import display, clear_output

In [2]:
# importing data annotations as a df
patient_info = pd.read_csv('/Users/jmd/Documents/BOOTCAMP/Capstone/the-circor-digiscope-phonocardiogram-dataset-1.0.1/training_data.csv')
patient_info

Unnamed: 0,Patient ID,Locations,Age,Sex,Height,Weight,Pregnancy status,Murmur,Murmur locations,Most audible location,...,Systolic murmur grading,Systolic murmur pitch,Systolic murmur quality,Diastolic murmur timing,Diastolic murmur shape,Diastolic murmur grading,Diastolic murmur pitch,Diastolic murmur quality,Campaign,Additional ID
0,2530,AV+PV+TV+MV,Child,Female,98.0,15.9,False,Absent,,,...,,,,,,,,,CC2015,
1,9979,AV+PV+TV+MV,Child,Female,103.0,13.1,False,Present,AV+MV+PV+TV,TV,...,III/VI,High,Harsh,,,,,,CC2015,
2,9983,AV+PV+TV+MV,Child,Male,115.0,19.1,False,Unknown,,,...,,,,,,,,,CC2015,
3,13918,AV+PV+TV+MV,Child,Male,98.0,15.9,False,Present,TV,TV,...,I/VI,Low,Blowing,,,,,,CC2015,
4,14241,AV+PV+TV+MV,Child,Male,87.0,11.2,False,Present,AV+MV+PV+TV,PV,...,II/VI,Low,Harsh,,,,,,CC2015,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
937,85340,AV+PV+TV+MV,Child,Male,105.0,16.6,False,Absent,,,...,,,,,,,,,CC2015,
938,85341,AV+PV+TV+MV,Child,Male,92.0,15.2,False,Absent,,,...,,,,,,,,,CC2015,
939,85343,AV+PV+TV+MV,Child,Female,97.0,13.5,False,Present,MV+TV,TV,...,I/VI,Low,Blowing,,,,,,CC2015,
940,85345,AV+PV,Child,Female,132.0,38.1,False,Absent,,,...,,,,,,,,,CC2015,


In [3]:
patient_info.columns

Index(['Patient ID', 'Locations', 'Age', 'Sex', 'Height', 'Weight',
       'Pregnancy status', 'Murmur', 'Murmur locations',
       'Most audible location', 'Systolic murmur timing',
       'Systolic murmur shape', 'Systolic murmur grading',
       'Systolic murmur pitch', 'Systolic murmur quality',
       'Diastolic murmur timing', 'Diastolic murmur shape',
       'Diastolic murmur grading', 'Diastolic murmur pitch',
       'Diastolic murmur quality', 'Campaign', 'Additional ID'],
      dtype='object')

In [4]:
training_folder_path = '/Users/jmd/Documents/BOOTCAMP/Capstone/the-circor-digiscope-phonocardiogram-dataset-1.0.1/training_data/'

## Generating 4K Signals

Choosing a Sampling Rate of 4096. 

From Evidence-Based Physical Diagnosis (Fourth Edition)<br>
- Although the human ear can hear sounds with frequencies from 20 to 20,000 cycles per second (Hz), the principal frequencies of heart sounds and murmurs are at the lower end of this range, from 20 to 500 Hz.
- In general, choosing a sampling rate twice that of the maximum frequency you intend to capture is standard. In my review of the data, I did find some sounds at and around 2000 hz. 
- Choosing a sampling rate of twice that to satisfy the Nyquist-Shannon Sampling Theorem. 
- Finally picking 4096 is a nice clean power of 2.

In [5]:
sr = 4096

In [6]:
filenames = []
signals = []

i = 0

for file in os.listdir(training_folder_path):
    filename = os.fsdecode(file)
    if filename.endswith(".wav"):
        signal, sr = librosa.load(training_folder_path + file, sr = sr)
        filenames.append(filename.split('.')[0])
        signals.append(signal)
        i += 1
    if i % 50 == 0:
        clear_output(wait=True)
        display(f"{i} files loaded")
    else:
        continue

'3150 files loaded'

Each filename has an associated patient ID and location, just pulling out that information here.

In [7]:
signal_locations = []
for file in filenames:
    signal_locations.append(file.split('_')[1])

signal_patient_ids = []
for file in filenames:
    signal_patient_ids.append(file.split('_')[0])

Creating a df with the above info as well as the signal info created above.

In [8]:
signal_df = pd.DataFrame({'signal_patient_id': signal_patient_ids,
                         'location': signal_locations,
                         'signal': signals},
                         columns=['signal_patient_id','location','signal'])

signal_df['signal_patient_id'] = signal_df['signal_patient_id'].astype('int')

signal_df.sort_values(by='signal_patient_id').reset_index(drop=True)

Now, I have to expand the patient info df duplicating the rows by the number of locations recordings were taken from.

In [11]:
patient_info['location_count'] = 0
for i in range(0,len(patient_info['location_count'])):
    patient_info.at[i,'location_count'] = len(patient_info['Locations'][i].split('+'))

In [12]:
patient_info = patient_info.loc[patient_info.index.repeat(patient_info.location_count)]

In [13]:
patient_info.reset_index(drop=True,inplace=True)

Concatenating this new patient info df with the signal df I created above.

In [14]:
final_patient_info = pd.concat([patient_info.reset_index(drop=True),
                                signal_df.sort_values(by='signal_patient_id').reset_index(drop=True)], axis=1)

In [15]:
final_patient_info.columns

Index(['Patient ID', 'Locations', 'Age', 'Sex', 'Height', 'Weight',
       'Pregnancy status', 'Murmur', 'Murmur locations',
       'Most audible location', 'Systolic murmur timing',
       'Systolic murmur shape', 'Systolic murmur grading',
       'Systolic murmur pitch', 'Systolic murmur quality',
       'Diastolic murmur timing', 'Diastolic murmur shape',
       'Diastolic murmur grading', 'Diastolic murmur pitch',
       'Diastolic murmur quality', 'Campaign', 'Additional ID',
       'location_count', 'signal_patient_id', 'location', 'signal'],
      dtype='object')

There are some unneeded columns here, but I opted to clean those up at import in later stages in case anything suddenly became necessary.

In [16]:
np.save('/Users/jmd/Documents/BOOTCAMP/Capstone/arrays/patient_signals_4k.npy', final_patient_info.to_numpy())

## Generating 1K SR Signals

Returned to this problem later to preprocess the data in a different way so that it was more palatable to an LTSM. TODO: Write out full explanation here.

In [5]:
sr = 1024

filenames = []
signals = []

i = 0

for file in os.listdir(training_folder_path):
    filename = os.fsdecode(file)
    if filename.endswith(".wav"):
        signal, sr = librosa.load(training_folder_path + file, sr = sr)
        filenames.append(filename.split('.')[0])
        signals.append(signal)
        i += 1
    if i % 50 == 0:
        clear_output(wait=True)
        display(f"{i} files loaded")
    else:
        continue

'3150 files loaded'

Each filename has an associated patient ID and location, just pulling out that information here.

In [6]:
signal_locations = []
for file in filenames:
    signal_locations.append(file.split('_')[1])

signal_patient_ids = []
for file in filenames:
    signal_patient_ids.append(file.split('_')[0])

Creating a df with the above info as well as the signal info created above.

In [7]:
signal_df = pd.DataFrame({'signal_patient_id': signal_patient_ids,
                         'location': signal_locations,
                         'signal': signals},
                         columns=['signal_patient_id','location','signal'])

signal_df['signal_patient_id'] = signal_df['signal_patient_id'].astype('int')

signal_df.sort_values(by='signal_patient_id').reset_index(drop=True)

Unnamed: 0,signal_patient_id,location,signal
0,2530,PV,"[0.037114453, 0.061636038, 0.055389933, 0.0675..."
1,2530,AV,"[0.006357202, -0.010564456, -0.008372305, 0.00..."
2,2530,MV,"[0.16865823, 0.035755683, -0.01783402, 0.00613..."
3,2530,TV,"[0.042926352, 0.08795212, 0.081163116, 0.07926..."
4,9979,MV,"[0.11233253, 0.16617733, 0.22739325, 0.3153002..."
...,...,...,...
3158,85345,AV,"[0.02705935, 0.0051991576, -0.028789269, -0.00..."
3159,85345,PV,"[0.011014578, 0.009928619, 0.014445103, 0.0160..."
3160,85349,AV,"[0.0010312841, -0.0074770185, 0.0053564664, 0...."
3161,85349,PV,"[0.070928715, 0.053460453, -0.012949261, -0.09..."


Now, I have to expand the patient info df duplicating the rows by the number of locations recordings were taken from.

In [8]:
patient_info['location_count'] = 0
for i in range(0,len(patient_info['location_count'])):
    patient_info.at[i,'location_count'] = len(patient_info['Locations'][i].split('+'))

In [9]:
patient_info = patient_info.loc[patient_info.index.repeat(patient_info.location_count)]

In [10]:
patient_info.reset_index(drop=True,inplace=True)

Concatenating this new patient info df with the signal df I created above.

In [11]:
final_patient_info = pd.concat([patient_info.reset_index(drop=True),
                                signal_df.sort_values(by='signal_patient_id').reset_index(drop=True)], axis=1)

In [12]:
final_patient_info.columns

Index(['Patient ID', 'Locations', 'Age', 'Sex', 'Height', 'Weight',
       'Pregnancy status', 'Murmur', 'Murmur locations',
       'Most audible location', 'Systolic murmur timing',
       'Systolic murmur shape', 'Systolic murmur grading',
       'Systolic murmur pitch', 'Systolic murmur quality',
       'Diastolic murmur timing', 'Diastolic murmur shape',
       'Diastolic murmur grading', 'Diastolic murmur pitch',
       'Diastolic murmur quality', 'Campaign', 'Additional ID',
       'location_count', 'signal_patient_id', 'location', 'signal'],
      dtype='object')

There are some unneeded columns here, but I opted to clean those up at import in later stages in case anything suddenly became necessary.

In [13]:
np.save('/Users/jmd/Documents/BOOTCAMP/Capstone/arrays/patient_signals_1k.npy', final_patient_info.to_numpy())