# Isolating Signals & Target Variable for Simple Modelling

Author: Jake Dumbauld <br>
Contact: jacobmilodumbauld@gmail.com<br>
Date: 3.15.22

In [1]:
# importing libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import librosa
import librosa.display
import IPython

import os
from random import uniform
import time
from IPython.display import display, clear_output

In [2]:
df = pd.DataFrame(data = np.load('/Users/jmd/Documents/BOOTCAMP/Capstone/arrays/patient_signals_4k.npy', allow_pickle=True),
                       columns=(['Patient ID', 'Locations', 'Age', 'Sex', 'Height', 'Weight',
                                 'Pregnancy status', 'Murmur', 'Murmur locations',
                                 'Most audible location', 'Systolic murmur timing',
                                 'Systolic murmur shape', 'Systolic murmur grading',
                                 'Systolic murmur pitch', 'Systolic murmur quality',
                                 'Diastolic murmur timing', 'Diastolic murmur shape',
                                 'Diastolic murmur grading', 'Diastolic murmur pitch',
                                 'Diastolic murmur quality', 'Campaign', 'Additional ID',
                                 'location_count', 'signal_patient_id', 'location', 'signal']))

In [3]:
sr = 4096

## Binarizing Target

#### Dropping Unknown Murmurs

In [4]:
df.shape

(3163, 26)

In [5]:
df['Murmur'].value_counts()

Absent     2391
Present     616
Unknown     156
Name: Murmur, dtype: int64

In [6]:
df.drop(df[df['Murmur'] == 'Unknown'].index, inplace=True)

In [7]:
df.reset_index(inplace=True, drop=True)

#### Binarizing Murmurs

In [8]:
df['Murmur'].value_counts()

Absent     2391
Present     616
Name: Murmur, dtype: int64

In [9]:
df['Murmur'] = df['Murmur'].map({"Absent": 0,
                                 "Present": 1})

## Extracting a Signal DF

Goal here is trying to select an appropriate length of audio clip to shorten my files down to. My goal is to have 90% of my audio clips be that minimum so that they can be trimmed, the rest will be padded. 

In [10]:
signal_df = df[['Murmur','signal']]

In [11]:
signal_df.shape

(3007, 2)

In [12]:
signals = signal_df['signal']

In [13]:
lengths = []
for signal in signals:
    lengths.append(len(signal))

In [14]:
lengths = pd.Series(lengths)

In [15]:
lengths.describe() / sr #sampling rate for the files in this set, so now the lengths are given in seconds

count     0.734131
mean     22.894569
std       7.297946
min       5.152100
25%      19.056152
50%      21.488037
75%      29.392090
max      64.512207
dtype: float64

Printing out a list of the proportion of clips that are at least a certain length in seconds.

In [16]:
for i in range(4, 30):
    print(i, len(lengths[lengths > (i * sr)]) / len(lengths))

4 1.0
5 1.0
6 0.9986697705354174
7 0.9960093116062521
8 0.9890256069171932
9 0.9787163285666778
10 0.9630861323578317
11 0.9441303624875291
12 0.9218490189557699
13 0.8985700033255737
14 0.8782840039906884
15 0.8540073162620552
16 0.8240771533089458
17 0.79847023611573
18 0.7751912204855338
19 0.7509145327569006
20 0.6508147655470569
21 0.5340871300299301
22 0.4672430994346525
23 0.4286664449617559
24 0.40239441303624873
25 0.3831060857998005
26 0.36647821749251747
27 0.3518456933821084
28 0.32790156301962087
29 0.28633189225141337


So it looks like I can work with 12 second clips. To calculate how long that series its we multiply that by the sampling rate which I declared at the top

In [17]:
target_len = 12 * sr
target_len

49152

In [19]:
signal_df['signal'].isna().sum()

0

Looping through the signal data and trimming/padding it to the appropriate length

In [20]:
new_signals = []
for signal in signals:
    if len(signal) == target_len:
        new_signals.append(signal)
    elif len(signal) > target_len:
        new_signals.append(signal[0:target_len])
    elif len(signal) < target_len:
        padwidth = target_len-len(signal)
        new_signals.append(np.pad(signal, (0, padwidth), mode='constant'))
    else:
        print('wtf')

In [22]:
new_signals = np.asarray(new_signals)

In [23]:
new_signals.shape

(3007, 49152)

Making a dataframe out of my `new_signals`

In [24]:
temp_df = pd.DataFrame(new_signals)

Dropping the old signal data from my dataframe

In [25]:
signal_df.drop('signal',axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  signal_df.drop('signal',axis=1,inplace=True)


Concatenating my murmur data with my signal data. 

In [26]:
final_df = signal_df.join(temp_df)

In [27]:
final_df.isna().sum().sum()

0

In [28]:
final_df.shape

(3007, 49153)

In [29]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3007 entries, 0 to 3006
Columns: 49153 entries, Murmur to 49151
dtypes: float32(49152), int64(1)
memory usage: 563.8 MB


In [30]:
np.save('/Users/jmd/Documents/BOOTCAMP/Capstone/arrays/signal_murmur_presimple_4k.npy', final_df.to_numpy())