# Pre-Processing: Phonocardiograms

Author: Jake Dumbauld <br>
Contact: jacobmilodumbauld@gmail.com<br>
Date: 3.15.22

In [1]:
# importing libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import librosa
import librosa.display
import IPython

import os
from random import uniform
import time
from IPython.display import display, clear_output

In [2]:
# importing data annotations as a df
patient_info = pd.read_csv('/Users/jmd/Documents/BOOTCAMP/Capstone/the-circor-digiscope-phonocardiogram-dataset-1.0.1/training_data.csv')
patient_info

Unnamed: 0,Patient ID,Locations,Age,Sex,Height,Weight,Pregnancy status,Murmur,Murmur locations,Most audible location,...,Systolic murmur grading,Systolic murmur pitch,Systolic murmur quality,Diastolic murmur timing,Diastolic murmur shape,Diastolic murmur grading,Diastolic murmur pitch,Diastolic murmur quality,Campaign,Additional ID
0,2530,AV+PV+TV+MV,Child,Female,98.0,15.9,False,Absent,,,...,,,,,,,,,CC2015,
1,9979,AV+PV+TV+MV,Child,Female,103.0,13.1,False,Present,AV+MV+PV+TV,TV,...,III/VI,High,Harsh,,,,,,CC2015,
2,9983,AV+PV+TV+MV,Child,Male,115.0,19.1,False,Unknown,,,...,,,,,,,,,CC2015,
3,13918,AV+PV+TV+MV,Child,Male,98.0,15.9,False,Present,TV,TV,...,I/VI,Low,Blowing,,,,,,CC2015,
4,14241,AV+PV+TV+MV,Child,Male,87.0,11.2,False,Present,AV+MV+PV+TV,PV,...,II/VI,Low,Harsh,,,,,,CC2015,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
937,85340,AV+PV+TV+MV,Child,Male,105.0,16.6,False,Absent,,,...,,,,,,,,,CC2015,
938,85341,AV+PV+TV+MV,Child,Male,92.0,15.2,False,Absent,,,...,,,,,,,,,CC2015,
939,85343,AV+PV+TV+MV,Child,Female,97.0,13.5,False,Present,MV+TV,TV,...,I/VI,Low,Blowing,,,,,,CC2015,
940,85345,AV+PV,Child,Female,132.0,38.1,False,Absent,,,...,,,,,,,,,CC2015,


In [3]:
patient_info.columns

Index(['Patient ID', 'Locations', 'Age', 'Sex', 'Height', 'Weight',
       'Pregnancy status', 'Murmur', 'Murmur locations',
       'Most audible location', 'Systolic murmur timing',
       'Systolic murmur shape', 'Systolic murmur grading',
       'Systolic murmur pitch', 'Systolic murmur quality',
       'Diastolic murmur timing', 'Diastolic murmur shape',
       'Diastolic murmur grading', 'Diastolic murmur pitch',
       'Diastolic murmur quality', 'Campaign', 'Additional ID'],
      dtype='object')

In [4]:
training_folder_path = '/Users/jmd/Documents/BOOTCAMP/Capstone/the-circor-digiscope-phonocardiogram-dataset-1.0.1/training_data/'

Looping over the files in my training data and importing with a 4k sampling rate - reason for this is that further exploration of the data later down the line revealed that the important frequencies are all less than 2000 Hz. Choosing a sampling rate of twice that to satisfy the Nyquist-Shannon Sampling Theorem. 

In [5]:
sr = 4096

In [6]:
filenames = []
signals = []

i = 0

for file in os.listdir(training_folder_path):
    filename = os.fsdecode(file)
    if filename.endswith(".wav"):
        signal, sr = librosa.load(training_folder_path + file, sr = sr)
        filenames.append(filename.split('.')[0])
        signals.append(signal)
        i += 1
    if i % 50 == 0:
        clear_output(wait=True)
        display(f"{i} files loaded")
    else:
        continue

'3150 files loaded'

In [7]:
signal_locations = []
for file in filenames:
    signal_locations.append(file.split('_')[1])

signal_patient_ids = []
for file in filenames:
    signal_patient_ids.append(file.split('_')[0])

In [8]:
signal_df = pd.DataFrame({'signal_patient_id': signal_patient_ids,
                         'location': signal_locations,
                         'signal': signals},
                         columns=['signal_patient_id','location','signal'])

In [9]:
signal_df['signal_patient_id'] = signal_df['signal_patient_id'].astype('int')

In [10]:
signal_df.sort_values(by='signal_patient_id').reset_index(drop=True)

Unnamed: 0,signal_patient_id,location,signal
0,2530,PV,"[0.07682987, 0.06061038, 0.039170958, 0.048250..."
1,2530,AV,"[-0.01187718, 0.029969877, 0.01927742, -0.0206..."
2,2530,MV,"[0.37442628, 0.32439327, 0.095518045, -0.06558..."
3,2530,TV,"[0.06770988, 0.073658854, 0.072224066, 0.08253..."
4,9979,MV,"[0.15039496, 0.18560724, 0.17212218, 0.1603406..."
...,...,...,...
3158,85345,AV,"[0.03814805, 0.06008572, 0.03808801, -0.008758..."
3159,85345,PV,"[-0.0061015976, 0.029588033, 0.020953469, 0.00..."
3160,85349,AV,"[0.00026825635, 0.0034792388, 0.014762115, -0...."
3161,85349,PV,"[0.1387334, 0.08302983, 0.13867667, -0.0078439..."


In [11]:
patient_info['location_count'] = 0
for i in range(0,len(patient_info['location_count'])):
    patient_info.at[i,'location_count'] = len(patient_info['Locations'][i].split('+'))

In [12]:
patient_info = patient_info.loc[patient_info.index.repeat(patient_info.location_count)]

In [13]:
patient_info.reset_index(drop=True,inplace=True)

In [14]:
final_patient_info = pd.concat([patient_info.reset_index(drop=True),
                                signal_df.sort_values(by='signal_patient_id').reset_index(drop=True)], axis=1)

In [15]:
final_patient_info.columns

Index(['Patient ID', 'Locations', 'Age', 'Sex', 'Height', 'Weight',
       'Pregnancy status', 'Murmur', 'Murmur locations',
       'Most audible location', 'Systolic murmur timing',
       'Systolic murmur shape', 'Systolic murmur grading',
       'Systolic murmur pitch', 'Systolic murmur quality',
       'Diastolic murmur timing', 'Diastolic murmur shape',
       'Diastolic murmur grading', 'Diastolic murmur pitch',
       'Diastolic murmur quality', 'Campaign', 'Additional ID',
       'location_count', 'signal_patient_id', 'location', 'signal'],
      dtype='object')

In [16]:
np.save('/Users/jmd/Documents/BOOTCAMP/Capstone/patient_signals_4k.npy', final_patient_info.to_numpy())

In [17]:
test_df = pd.DataFrame(data = np.load('/Users/jmd/Documents/BOOTCAMP/Capstone/patient_signals_4k.npy', allow_pickle=True),
                       columns=(final_patient_info.columns))

In [18]:
test_df.shape

(3163, 26)

In [19]:
len(test_df['signal'].to_list()[0])

53478

In [20]:
final_patient_info.shape

(3163, 26)

In [21]:
len(final_patient_info['signal'].to_list()[0])

53478