# Isolating Signals & Target Variable for Simple Modelling

Author: Jake Dumbauld <br>
Contact: jacobmilodumbauld@gmail.com<br>
Date: 3.15.22

In [1]:
# importing libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import librosa
import librosa.display
import IPython

import os
from random import uniform
import time
from IPython.display import display, clear_output

In [2]:
df = pd.DataFrame(data = np.load('/Users/jmd/Documents/BOOTCAMP/Capstone/patient_signals_4k.npy', allow_pickle=True),
                       columns=(['Patient ID', 'Locations', 'Age', 'Sex', 'Height', 'Weight',
                                 'Pregnancy status', 'Murmur', 'Murmur locations',
                                 'Most audible location', 'Systolic murmur timing',
                                 'Systolic murmur shape', 'Systolic murmur grading',
                                 'Systolic murmur pitch', 'Systolic murmur quality',
                                 'Diastolic murmur timing', 'Diastolic murmur shape',
                                 'Diastolic murmur grading', 'Diastolic murmur pitch',
                                 'Diastolic murmur quality', 'Campaign', 'Additional ID',
                                 'location_count', 'signal_patient_id', 'location', 'signal']))

Choosing a Sampling Rate of 4096. 

From Evidence-Based Physical Diagnosis (Fourth Edition)<br>
- Although the human ear can hear sounds with frequencies from 20 to 20,000 cycles per second (Hz), the principal frequencies of heart sounds and murmurs are at the lower end of this range, from 20 to 500 Hz.

In [3]:
sr = 4096

## Basic EDA

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3163 entries, 0 to 3162
Data columns (total 26 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Patient ID                3163 non-null   object
 1   Locations                 3163 non-null   object
 2   Age                       2924 non-null   object
 3   Sex                       3163 non-null   object
 4   Height                    2808 non-null   object
 5   Weight                    2834 non-null   object
 6   Pregnancy status          3163 non-null   object
 7   Murmur                    3163 non-null   object
 8   Murmur locations          616 non-null    object
 9   Most audible location     616 non-null    object
 10  Systolic murmur timing    612 non-null    object
 11  Systolic murmur shape     612 non-null    object
 12  Systolic murmur grading   612 non-null    object
 13  Systolic murmur pitch     612 non-null    object
 14  Systolic murmur quality 

## Binarizing Target

#### Dropping Unknown Murmurs

In [5]:
df.shape

(3163, 26)

In [6]:
df['Murmur'].value_counts()

Absent     2391
Present     616
Unknown     156
Name: Murmur, dtype: int64

In [7]:
df.drop(df[df['Murmur'] == 'Unknown'].index, inplace=True)

In [8]:
df.reset_index(inplace=True, drop=True)

#### Binarizing Murmurs

In [9]:
df['Murmur'].value_counts()

Absent     2391
Present     616
Name: Murmur, dtype: int64

In [10]:
df['Murmur'] = df['Murmur'].map({"Absent": 0,
                                 "Present": 1})

## Extracting a Signal DF

Goal here is trying to select an appropriate length of audio clip to shorten my files down to. My goal is to have 90% of my audio clips be that minimum length, the rest will be padded. 

In [11]:
signal_df = df[['Murmur','signal']]

In [12]:
signal_df.shape

(3007, 2)

In [13]:
signals = signal_df['signal']

In [14]:
lengths = []
for signal in signals:
    lengths.append(len(signal))

In [15]:
lengths = pd.Series(lengths)

In [16]:
lengths.describe() / 22050 #sampling rate for the files in this set, so now the lengths are given in seconds

count     0.136372
mean      4.252887
std       1.355664
min       0.957052
25%       3.539864
50%       3.991610
75%       5.459864
max      11.983764
dtype: float64

In [32]:
for i in range(4, 30):
    print(i, len(lengths[lengths > (i * sr)]) / len(lengths))

4 1.0
5 1.0
6 0.9986697705354174
7 0.9960093116062521
8 0.9890256069171932
9 0.9787163285666778
10 0.9630861323578317
11 0.9441303624875291
12 0.9218490189557699
13 0.8985700033255737
14 0.8782840039906884
15 0.8540073162620552
16 0.8240771533089458
17 0.79847023611573
18 0.7751912204855338
19 0.7509145327569006
20 0.6508147655470569
21 0.5340871300299301
22 0.4672430994346525
23 0.4286664449617559
24 0.40239441303624873
25 0.3831060857998005
26 0.36647821749251747
27 0.3518456933821084
28 0.32790156301962087
29 0.28633189225141337


So it looks like I can work with 12 second clips. To calculate how long that series its we multiply that by the sampling rate which I declared at the top

In [18]:
target_len = 12 * sr
target_len

49152

In [19]:
len(signal_df['signal'])

3007

In [20]:
signal_df['signal'].isna().sum()

0

In [21]:
new_signals = []
for signal in signals:
    if len(signal) == target_len:
        new_signals.append(signal)
    elif len(signal) > target_len:
        new_signals.append(signal[0:target_len])
    elif len(signal) < target_len:
        padwidth = target_len-len(signal)
        new_signals.append(np.pad(signal, (0, padwidth), mode='constant'))
    else:
        print('wtf')


In [22]:
len(new_signals)

3007

In [23]:
new_signals = np.asarray(new_signals)

In [24]:
new_signals.shape

(3007, 49152)

In [25]:
temp_df = pd.DataFrame(new_signals)

In [26]:
signal_df.drop('signal',axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  signal_df.drop('signal',axis=1,inplace=True)


In [27]:
final_df = signal_df.join(temp_df)

In [28]:
final_df.isna().sum().sum()

0

In [29]:
final_df.shape

(3007, 49153)

In [30]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3007 entries, 0 to 3006
Columns: 49153 entries, Murmur to 49151
dtypes: float32(49152), int64(1)
memory usage: 563.8 MB


In [31]:
np.save('/Users/jmd/Documents/BOOTCAMP/Capstone/signal_murmur_prelogreg_4k.npy', final_df.to_numpy())