# Constructing Data Sets for Neural Networks

Author: Jake Dumbauld <br>
Contact: jacobmilodumbauld@gmail.com<br>
Date: 3.15.22

In [1]:
# importing libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import librosa
import librosa.display
import IPython
from IPython.display import display, clear_output

import os
from random import uniform
import time

#options
pd.set_option('display.max_columns', None) #making sure I can see all my columns

Picking up from my df created in `2 - Importing Signal Data`

In [2]:
# NOTE(review): absolute, machine-specific path -- the notebook only runs where this file exists.
signals_path = '/Users/jmd/Documents/BOOTCAMP/Capstone/patient_signals_4k.npy'

# Column labels applied to the loaded array (presumably the column order it was saved with).
column_names = ['Patient ID', 'Locations', 'Age', 'Sex', 'Height', 'Weight',
                'Pregnancy status', 'Murmur', 'Murmur locations',
                'Most audible location', 'Systolic murmur timing',
                'Systolic murmur shape', 'Systolic murmur grading',
                'Systolic murmur pitch', 'Systolic murmur quality',
                'Diastolic murmur timing', 'Diastolic murmur shape',
                'Diastolic murmur grading', 'Diastolic murmur pitch',
                'Diastolic murmur quality', 'Campaign', 'Additional ID',
                'location_count', 'signal_patient_id', 'location', 'signal']

# allow_pickle=True lets numpy unpickle object arrays; unpickling can execute
# arbitrary code, so only load files from a trusted source.
raw_records = np.load(signals_path, allow_pickle=True)
df = pd.DataFrame(data=raw_records, columns=column_names)

## Dropping duplicated or unnecessary columns

Here I am opting to drop all columns qualifying and grading the murmur. The reason is that in the field, ideally this model would be deployed with only the signal data itself plus demographic information like age, sex, height, weight, and pregnancy status as inputs. The murmur descriptors will be useful in the future when evaluating the type of murmurs most misclassified, but right now I am generating my model data sets and thus will be dropping them. <br>

Additionally, `Additional ID` is a reference to another ID in the table, used when a patient participated in the study in 2 different campaigns. I will also be dropping this column, as I expect the variability in a patient's heart rate & sounds from day to day will be enough to avoid problems of multicollinearity if I choose to train any simple models on this data.

In [3]:
# list the available columns before deciding which ones to drop
df.columns

Index(['Patient ID', 'Locations', 'Age', 'Sex', 'Height', 'Weight',
       'Pregnancy status', 'Murmur', 'Murmur locations',
       'Most audible location', 'Systolic murmur timing',
       'Systolic murmur shape', 'Systolic murmur grading',
       'Systolic murmur pitch', 'Systolic murmur quality',
       'Diastolic murmur timing', 'Diastolic murmur shape',
       'Diastolic murmur grading', 'Diastolic murmur pitch',
       'Diastolic murmur quality', 'Campaign', 'Additional ID',
       'location_count', 'signal_patient_id', 'location', 'signal'],
      dtype='object')

In [4]:
# Columns removed before building the model data sets: the murmur descriptors
# (locations, timing, shape, grading, pitch, quality), the campaign / cross-reference IDs,
# and 'Locations', 'signal_patient_id', 'location_count' (presumably duplicated or
# bookkeeping columns -- confirm against the import notebook).
columns_to_drop = ['Locations','signal_patient_id', 'Murmur locations',
                   'Most audible location', 'Systolic murmur timing',
                   'Systolic murmur shape', 'Systolic murmur grading',
                   'Systolic murmur pitch', 'Systolic murmur quality',
                   'Diastolic murmur timing', 'Diastolic murmur shape',
                   'Diastolic murmur grading', 'Diastolic murmur pitch',
                   'Diastolic murmur quality', 'Campaign', 'Additional ID', 'location_count']

In [5]:
# Rebind instead of mutating in place so this cell can be re-run safely:
# errors='ignore' skips columns that an earlier run of the cell already removed
# (the in-place version raised KeyError on a second execution).
df = df.drop(columns=columns_to_drop, errors='ignore')

## Binarizing Columns

### Binarizing Murmurs

In [6]:
#checking to see if we have any NaN's
df['Murmur'].isna().sum()

0

In [7]:
# rows labelled 'Unknown' carry no usable target, so remove them in place
unknown_rows = df.index[df['Murmur'] == 'Unknown']
df.drop(index=unknown_rows, inplace=True)

In [8]:
# class balance of the target after removing the 'Unknown' rows
df['Murmur'].value_counts()

Absent     2391
Present     616
Name: Murmur, dtype: int64

In [9]:
# encode the target as 0 (no murmur) / 1 (murmur);
# any label not listed in the mapping becomes NaN
murmur_codes = {"Absent": 0, "Present": 1}
df['Murmur'] = df['Murmur'].map(murmur_codes)

In [10]:
# rows were dropped above, so rebuild a contiguous 0..n-1 index (discarding the old one)
# to keep positional and label-based lookups in agreement
df.reset_index(inplace=True, drop=True)

### Binarizing Pregnancy Status

In [11]:
#checking to see if we have any NaN's
df['Pregnancy status'].isna().sum()

0

In [12]:
# cast to integers; presumably the column holds booleans, giving 0/1 -- TODO confirm
df['Pregnancy status'] = df['Pregnancy status'].astype(int)

### Binarizing Sex

In [13]:
#checking to see if we have any NaN's
df['Sex'].isna().sum()

0

In [14]:
# confirm which labels are present before mapping them to integers
df['Sex'].value_counts()

Female    1523
Male      1484
Name: Sex, dtype: int64

In [15]:
# encode sex as an integer flag; any label not listed in the mapping becomes NaN
sex_codes = {"Male": 0, "Female": 1}
df['Sex'] = df['Sex'].map(sex_codes)

### Mapping Age to Ints

In [16]:
#checking to see if we have any NaN's
df['Age'].isna().sum()

237

Rather than imputing something potentially incorrect, I'm going to start my map at 0, with NaNs being set to 0.

In [17]:
# age categories present in the data (value_counts omits NaN by default)
df['Age'].value_counts()

Child          2125
Infant          383
Adolescent      230
Young Adult      24
Neonate           8
Name: Age, dtype: int64

In [18]:
# ordinal encoding of the age categories, youngest to oldest;
# code 0 is reserved for missing ages rather than imputing a category
age_labels = ['Neonate', 'Infant', 'Child', 'Adolescent', 'Young Adult']
age_codes = {np.nan: 0}
age_codes.update({label: code for code, label in enumerate(age_labels, start=1)})

df['Age'] = df['Age'].map(age_codes)

In [19]:
# sanity check
df['Age'].value_counts()

3    2125
2     383
0     237
4     230
5      24
1       8
Name: Age, dtype: int64

### Dummy variables for location

In [20]:
# one-hot encode `location`: the column is replaced by one indicator column per value,
# named location_<value>
df = pd.get_dummies(df, columns=['location'])

In [21]:
# column count after one-hot encoding
len(df.columns)

13

In [22]:
# take 'signal' out of the frame so it can be re-inserted as the final column;
# pop mutates df, so re-running this cell alone raises KeyError
last_col = df.pop('signal')

In [23]:
# insertion index equal to the current column count, i.e. one past the last column
last_position = len(df.columns)

In [24]:
# put 'signal' back as the right-most column
df.insert(last_position, 'signal', last_col)

### Dealing with NaNs in Height/Weight

Broad strokes - imputed with means of same sex/age groups where possible, otherwise imputed with the mean of the whole sample. For some patients, particularly those who were pregnant, lack of age information made imputation difficult.

In [25]:
# inspect dtypes and non-null counts to see which columns still need imputation
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3007 entries, 0 to 3006
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Patient ID        3007 non-null   object
 1   Age               3007 non-null   int64 
 2   Sex               3007 non-null   int64 
 3   Height            2663 non-null   object
 4   Weight            2687 non-null   object
 5   Pregnancy status  3007 non-null   int64 
 6   Murmur            3007 non-null   int64 
 7   location_AV       3007 non-null   uint8 
 8   location_MV       3007 non-null   uint8 
 9   location_PV       3007 non-null   uint8 
 10  location_Phc      3007 non-null   uint8 
 11  location_TV       3007 non-null   uint8 
 12  signal            3007 non-null   object
dtypes: int64(4), object(4), uint8(5)
memory usage: 202.7+ KB


In [26]:
#dealing with NaN heights

# Height is stored as an object column; cast to float so the means are well defined
heights = df['Height'].astype(float)

# mean height of each (Age, Sex) group, aligned row-by-row with df and rounded to 1 decimal.
# Grouping by column name replaces the hard-coded positional lookups (iloc[i, 1] / iloc[i, 2]),
# which silently break if the column order changes. A group with no recorded height gives NaN.
group_height_means = heights.groupby([df['Age'], df['Sex']]).transform('mean').round(1)

# fallback when the whole group is missing: mean of the entire sample
overall_height_mean = round(heights.mean(), 1)

# reassigning heights with imputation: recorded value first, then group mean, then overall mean
df['Height'] = heights.fillna(group_height_means).fillna(overall_height_mean)

In [27]:
#dealing with NaN weights

# Weight is stored as an object column; cast to float so the means are well defined
weights = df['Weight'].astype(float)

# mean weight of each (Age, Sex) group, aligned row-by-row with df and rounded to 1 decimal.
# Grouping by column name replaces the hard-coded positional lookups (iloc[i, 1] / iloc[i, 2]),
# which silently break if the column order changes. A group with no recorded weight gives NaN.
group_weight_means = weights.groupby([df['Age'], df['Sex']]).transform('mean').round(1)

# fallback when the whole group is missing: mean of the entire sample
overall_weight_mean = round(weights.mean(), 1)

# reassigning weights with imputation: recorded value first, then group mean, then overall mean
df['Weight'] = weights.fillna(group_weight_means).fillna(overall_weight_mean)

In [28]:
# sanity check
df.isna().sum()

Patient ID          0
Age                 0
Sex                 0
Height              0
Weight              0
Pregnancy status    0
Murmur              0
location_AV         0
location_MV         0
location_PV         0
location_Phc        0
location_TV         0
signal              0
dtype: int64

In [29]:
# preview the cleaned frame (pandas truncates the display for large frames)
df

Unnamed: 0,Patient ID,Age,Sex,Height,Weight,Pregnancy status,Murmur,location_AV,location_MV,location_PV,location_Phc,location_TV,signal
0,2530,3,1,98.0,15.9,0,0,0,0,1,0,0,"[0.07682987, 0.06061038, 0.039170958, 0.048250..."
1,2530,3,1,98.0,15.9,0,0,1,0,0,0,0,"[-0.01187718, 0.029969877, 0.01927742, -0.0206..."
2,2530,3,1,98.0,15.9,0,0,0,1,0,0,0,"[0.37442628, 0.32439327, 0.095518045, -0.06558..."
3,2530,3,1,98.0,15.9,0,0,0,0,0,0,1,"[0.06770988, 0.073658854, 0.072224066, 0.08253..."
4,9979,3,1,103.0,13.1,0,1,0,1,0,0,0,"[0.15039496, 0.18560724, 0.17212218, 0.1603406..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3002,85345,3,1,132.0,38.1,0,0,1,0,0,0,0,"[0.03814805, 0.06008572, 0.03808801, -0.008758..."
3003,85345,3,1,132.0,38.1,0,0,0,0,1,0,0,"[-0.0061015976, 0.029588033, 0.020953469, 0.00..."
3004,85349,0,1,115.9,25.1,1,0,1,0,0,0,0,"[0.00026825635, 0.0034792388, 0.014762115, -0...."
3005,85349,0,1,115.9,25.1,1,0,0,0,1,0,0,"[0.1387334, 0.08302983, 0.13867667, -0.0078439..."


## MFCC Data w/wo Patient Information

In [33]:
# sampling rate handed to librosa; presumably the rate of the "4k" signals loaded above -- TODO confirm
sr = 4096

In [30]:
# the per-recording signal column, kept under its own name
signals = df['signal']

In [32]:
# MFCCs of the first recording. The audio must be passed as `y=`: librosa >= 0.10 made
# this argument keyword-only, so the positional form raises a TypeError there.
librosa.feature.mfcc(y=df['signal'][0], sr=sr)

53478

## Unprocessed signal data w/wo Patient Information