In [1]:
import os
from glob import glob
import boto3
import pandas as pd
import numpy as np
import librosa
from sklearn.model_selection import train_test_split

import random


In [2]:
dir = '/Users/sayyedjilani/datasci210/'
os.chdir(dir)
os.getcwd()

'/Users/sayyedjilani/datasci210'

## Downsampling

In [3]:
fan_minus6 = glob('/Users/sayyedjilani/datasci210/-6dB_fan/*/*/*.wav')
pump_minus6 = glob('/Users/sayyedjilani/datasci210/-6dB_pump/*/*/*.wav')
slider_minus6 = glob('/Users/sayyedjilani/datasci210/-6dB_slider/*/*/*.wav')
valve_minus6 = glob('/Users/sayyedjilani/datasci210/-6dB_valve/*/*/*.wav')

In [4]:
raw_data_list = [] 
sample_rate_list = []
sound_source_list = [] 
snr_list = []
binary_classification_list = [] 
wav_file_list = [] 
model_list = []

for audio_file in fan_minus6:
    split_string = str(audio_file).split('/')
    sound_source = split_string[-4].split('_')[1]
    snr = split_string[-4].split('_')[0]
    binary_classification = split_string[-2]
    wav_file = split_string[-1]
    model = split_string[-3]
    raw_data, sample_rate = librosa.load(audio_file, sr=None)
    sound_source_list.append(sound_source)
    snr_list.append(snr)
    binary_classification_list.append(binary_classification)
    wav_file_list.append(wav_file)
    raw_data_list.append(raw_data)
    sample_rate_list.append(sample_rate)
    model_list.append(model)

In [5]:
df = pd.DataFrame(columns=["Sound Source", "SNR", "Model", ".wav File", "Sample Rate", "Raw Data", "Classification"])

df["Sound Source"] = sound_source_list
df["SNR"] = snr_list
df["Classification"] = binary_classification_list
df[".wav File"] = wav_file_list
df["Sample Rate"] = sample_rate_list
df["Raw Data"] = raw_data_list
df["Model"] = model_list

In [6]:
df.head()

Unnamed: 0,Sound Source,SNR,Model,.wav File,Sample Rate,Raw Data,Classification
0,fan,-6dB,id_00,00000059.wav,16000,"[-0.015693665, -0.016227722, -0.015602112, -0....",abnormal
1,fan,-6dB,id_00,00000071.wav,16000,"[0.008773804, 0.007896423, 0.0039596558, 0.000...",abnormal
2,fan,-6dB,id_00,00000065.wav,16000,"[-0.0024223328, -0.004142761, -0.003967285, -0...",abnormal
3,fan,-6dB,id_00,00000273.wav,16000,"[0.01563263, 0.012084961, 0.0070114136, 0.0028...",abnormal
4,fan,-6dB,id_00,00000267.wav,16000,"[0.0054855347, 0.0052452087, 0.0067481995, 0.0...",abnormal


In [9]:
def stratified_sample(df, n_samples=450):
    sampled_df = pd.DataFrame(columns=df.columns)
    
    for classification, group in df.groupby(['Sound Source', 'SNR', 'Classification']):
        # Proportional sampling within each classification group
        model_counts = group['Model'].value_counts()
        total_count = model_counts.sum()
        
        stratified_group = pd.DataFrame(columns=group.columns)
        
        for model, count in model_counts.items():
            model_sample_size = int((count / total_count) * n_samples)
            stratified_group = pd.concat([stratified_group, group[group['Model'] == model].sample(min(len(group[group['Model'] == model]), model_sample_size), random_state=56)])
        
        # If there are not enough samples, randomly sample the remaining rows to fill the gap
        if len(stratified_group) < n_samples:
            additional_samples = group.drop(stratified_group.index).sample(n_samples - len(stratified_group), random_state=1, replace=True)
            stratified_group = pd.concat([stratified_group, additional_samples])
        
        sampled_df = pd.concat([sampled_df, stratified_group])
    
    return sampled_df

In [10]:
sampled_df = stratified_sample(df, n_samples=456)
print(sampled_df['Classification'].value_counts())
print()
distribution = sampled_df.groupby(['Classification', 'Model']).size()
print(distribution)

Classification
abnormal    456
normal      456
Name: count, dtype: int64

Classification  Model
abnormal        id_00    127
                id_02    111
                id_04    107
                id_06    111
normal          id_00    114
                id_02    113
                id_04    115
                id_06    114
dtype: int64


In [11]:
fan_df = glob('/Users/sayyedjilani/datasci210/*_fan/*/*/*.wav')

raw_data_list = [] 
sample_rate_list = []
sound_source_list = [] 
snr_list = []
binary_classification_list = [] 
wav_file_list = [] 
model_list = []

for audio_file in fan_df:
    split_string = str(audio_file).split('/')
    sound_source = split_string[-4].split('_')[1]
    snr = split_string[-4].split('_')[0]
    binary_classification = split_string[-2]
    wav_file = split_string[-1]
    model = split_string[-3]
    raw_data, sample_rate = librosa.load(audio_file, sr=None)
    sound_source_list.append(sound_source)
    snr_list.append(snr)
    binary_classification_list.append(binary_classification)
    wav_file_list.append(wav_file)
    raw_data_list.append(raw_data)
    sample_rate_list.append(sample_rate)
    model_list.append(model)

df = pd.DataFrame(columns=["Sound Source", "SNR", "Model", ".wav File", "Sample Rate", "Raw Data", "Classification"])

df["Sound Source"] = sound_source_list
df["SNR"] = snr_list
df["Classification"] = binary_classification_list
df[".wav File"] = wav_file_list
df["Sample Rate"] = sample_rate_list
df["Raw Data"] = raw_data_list
df["Model"] = model_list

In [17]:
sampled_fans = stratified_sample(df, n_samples=456)
distribution = sampled_fans.groupby(['Sound Source', 'SNR', 'Classification', 'Model']).size()
print(distribution)

Sound Source  SNR   Classification  Model
fan           -6dB  abnormal        id_00    127
                                    id_02    111
                                    id_04    107
                                    id_06    111
                    normal          id_00    114
                                    id_02    113
                                    id_04    115
                                    id_06    114
              0dB   abnormal        id_00    127
                                    id_02    111
                                    id_04    107
                                    id_06    111
                    normal          id_00    114
                                    id_02    113
                                    id_04    115
                                    id_06    114
              6dB   abnormal        id_00    127
                                    id_02    111
                                    id_04    107
                           

In [18]:
pump_df = glob('/Users/sayyedjilani/datasci210/*_pump/*/*/*.wav')

raw_data_list = [] 
sample_rate_list = []
sound_source_list = [] 
snr_list = []
binary_classification_list = [] 
wav_file_list = [] 
model_list = []

for audio_file in pump_df:
    split_string = str(audio_file).split('/')
    sound_source = split_string[-4].split('_')[1]
    snr = split_string[-4].split('_')[0]
    binary_classification = split_string[-2]
    wav_file = split_string[-1]
    model = split_string[-3]
    raw_data, sample_rate = librosa.load(audio_file, sr=None)
    sound_source_list.append(sound_source)
    snr_list.append(snr)
    binary_classification_list.append(binary_classification)
    wav_file_list.append(wav_file)
    raw_data_list.append(raw_data)
    sample_rate_list.append(sample_rate)
    model_list.append(model)

In [19]:
df = pd.DataFrame(columns=["Sound Source", "SNR", "Model", ".wav File", "Sample Rate", "Raw Data", "Classification"])

df["Sound Source"] = sound_source_list
df["SNR"] = snr_list
df["Classification"] = binary_classification_list
df[".wav File"] = wav_file_list
df["Sample Rate"] = sample_rate_list
df["Raw Data"] = raw_data_list
df["Model"] = model_list

In [20]:
sampled_pumps = stratified_sample(df, n_samples=456)
distribution = sampled_pumps.groupby(['Sound Source', 'SNR', 'Classification', 'Model']).size()
print(distribution)

Sound Source  SNR   Classification  Model
pump          -6dB  abnormal        id_00    143
                                    id_02    111
                                    id_04    100
                                    id_06    102
                    normal          id_00    122
                                    id_02    122
                                    id_04     85
                                    id_06    127
              0dB   abnormal        id_00    143
                                    id_02    111
                                    id_04    100
                                    id_06    102
                    normal          id_00    122
                                    id_02    122
                                    id_04     85
                                    id_06    127
              6dB   abnormal        id_00    143
                                    id_02    111
                                    id_04    100
                           

In [21]:
slider_df = glob('/Users/sayyedjilani/datasci210/*_slider/*/*/*.wav')

raw_data_list = [] 
sample_rate_list = []
sound_source_list = [] 
snr_list = []
binary_classification_list = [] 
wav_file_list = [] 
model_list = []

for audio_file in slider_df:
    split_string = str(audio_file).split('/')
    sound_source = split_string[-4].split('_')[1]
    snr = split_string[-4].split('_')[0]
    binary_classification = split_string[-2]
    wav_file = split_string[-1]
    model = split_string[-3]
    raw_data, sample_rate = librosa.load(audio_file, sr=None)
    sound_source_list.append(sound_source)
    snr_list.append(snr)
    binary_classification_list.append(binary_classification)
    wav_file_list.append(wav_file)
    raw_data_list.append(raw_data)
    sample_rate_list.append(sample_rate)
    model_list.append(model)

In [22]:
df = pd.DataFrame(columns=["Sound Source", "SNR", "Model", ".wav File", "Sample Rate", "Raw Data", "Classification"])

df["Sound Source"] = sound_source_list
df["SNR"] = snr_list
df["Classification"] = binary_classification_list
df[".wav File"] = wav_file_list
df["Sample Rate"] = sample_rate_list
df["Raw Data"] = raw_data_list
df["Model"] = model_list

In [23]:
sampled_sliders = stratified_sample(df, n_samples=456)
distribution = sampled_sliders.groupby(['Sound Source', 'SNR', 'Classification', 'Model']).size()
print(distribution)

Sound Source  SNR   Classification  Model
slider        -6dB  abnormal        id_00    183
                                    id_02    136
                                    id_04     92
                                    id_06     45
                    normal          id_00    152
                                    id_02    152
                                    id_04     76
                                    id_06     76
              0dB   abnormal        id_00    183
                                    id_02    136
                                    id_04     92
                                    id_06     45
                    normal          id_00    152
                                    id_02    152
                                    id_04     76
                                    id_06     76
              6dB   abnormal        id_00    183
                                    id_02    136
                                    id_04     92
                           

In [24]:
valve_df = glob('/Users/sayyedjilani/datasci210/*_valve/*/*/*.wav')

raw_data_list = [] 
sample_rate_list = []
sound_source_list = [] 
snr_list = []
binary_classification_list = [] 
wav_file_list = [] 
model_list = []

for audio_file in valve_df:
    split_string = str(audio_file).split('/')
    sound_source = split_string[-4].split('_')[1]
    snr = split_string[-4].split('_')[0]
    binary_classification = split_string[-2]
    wav_file = split_string[-1]
    model = split_string[-3]
    raw_data, sample_rate = librosa.load(audio_file, sr=None)
    sound_source_list.append(sound_source)
    snr_list.append(snr)
    binary_classification_list.append(binary_classification)
    wav_file_list.append(wav_file)
    raw_data_list.append(raw_data)
    sample_rate_list.append(sample_rate)
    model_list.append(model)

In [25]:
df = pd.DataFrame(columns=["Sound Source", "SNR", "Model", ".wav File", "Sample Rate", "Raw Data", "Classification"])

df["Sound Source"] = sound_source_list
df["SNR"] = snr_list
df["Classification"] = binary_classification_list
df[".wav File"] = wav_file_list
df["Sample Rate"] = sample_rate_list
df["Raw Data"] = raw_data_list
df["Model"] = model_list

In [26]:
sampled_valves = stratified_sample(df, n_samples=456)
distribution = sampled_valves.groupby(['Sound Source', 'SNR', 'Classification', 'Model']).size()
print(distribution)

Sound Source  SNR   Classification  Model
valve         -6dB  abnormal        id_00    114
                                    id_02    114
                                    id_04    114
                                    id_06    114
                    normal          id_00    123
                                    id_02     87
                                    id_04    123
                                    id_06    123
              0dB   abnormal        id_00    114
                                    id_02    114
                                    id_04    114
                                    id_06    114
                    normal          id_00    123
                                    id_02     87
                                    id_04    123
                                    id_06    123
              6dB   abnormal        id_00    114
                                    id_02    114
                                    id_04    114
                           

### Concat all dfs, save as pkl and upload to bucket

In [35]:
df = pd.concat([sampled_fans, sampled_pumps, sampled_sliders, sampled_valves])
df.shape

(10944, 7)

In [36]:
df.head()

Unnamed: 0,Sound Source,SNR,Model,.wav File,Sample Rate,Raw Data,Classification
145,fan,-6dB,id_00,00000174.wav,16000,"[-0.005207062, -0.0051956177, -0.00573349, -0....",abnormal
170,fan,-6dB,id_00,00000188.wav,16000,"[0.010253906, 0.013095856, 0.013023376, 0.0088...",abnormal
2,fan,-6dB,id_00,00000065.wav,16000,"[-0.0024223328, -0.004142761, -0.003967285, -0...",abnormal
329,fan,-6dB,id_00,00000251.wav,16000,"[-0.0028533936, -0.005077362, -0.0044822693, -...",abnormal
362,fan,-6dB,id_00,00000308.wav,16000,"[-0.005332947, -0.0024414062, 0.00037002563, 0...",abnormal


In [64]:
df.tail()

Unnamed: 0,Sound Source,SNR,Model,.wav File,Sample Rate,Raw Data,Classification
12496,valve,6dB,id_02,00000322.wav,16000,"[-0.0011672974, -0.0015106201, -0.0015563965, ...",normal
12138,valve,6dB,id_02,00000635.wav,16000,"[0.00038146973, -0.0006790161, -0.00093078613,...",normal
12096,valve,6dB,id_02,00000177.wav,16000,"[-0.0018463135, -0.0012435913, -0.00075149536,...",normal
9784,valve,6dB,id_06,00000471.wav,16000,"[-0.0062789917, -0.0063591003, -0.0060272217, ...",normal
8721,valve,6dB,id_00,00000198.wav,16000,"[-0.0014572144, -0.0020713806, -0.0020561218, ...",normal


In [46]:
df.to_pickle('sampled_audio_data.pkl')

In [48]:
df_read = pd.read_pickle('sampled_audio_data.pkl')

In [61]:
# check if vectors are preserved
df_read.iloc[0]['Raw Data'].shape

(160000,)