In [1]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell executes so edits to imported .py files are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [40]:
# Imported locally so this cell also runs on a fresh kernel: it is placed
# above the notebook's main import cell, and `plt` is not imported there.
import warnings

import matplotlib.pyplot as plt

# Silence all Python warnings for the rest of the session.
warnings.filterwarnings("ignore")

# Switch to the "Solarize_Light2" style and keep the list of colours from its
# property cycle in `colors`.
plt.style.use("Solarize_Light2")
prop_cycle = plt.rcParams["axes.prop_cycle"]
colors = prop_cycle.by_key()["color"]

In [2]:
import glob
import os
import warnings
import numpy as np
from bokeh.io import export_svgs, output_notebook
from bokeh.models import BoxAnnotation, ColumnDataSource, HoverTool
from bokeh.plotting import figure, show
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    average_precision_score,
    precision_recall_curve,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
from utils.measuring_performance import (
    get_prediction,
    plot_confusion_matrix,
    plot_histogram_by_class,
    plot_loss_per_epoch,
    plot_pr_curve,
    plot_roc_curve,
)
from utils.misc import build_files_list, dump_pickle, load_pickle
from utils.sound_utils import extract_signal_features, generate_dataset, load_sound_file

# Fix NumPy's global random seed.
np.random.seed(42)

# Suppress every Python warning from here on.
warnings.filterwarnings(action="ignore")

# Route Bokeh output to the notebook.
output_notebook()



In [3]:
import tensorflow as tf
from tensorflow.keras import Input
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
#from tensorflow.python.keras.utils.multi_gpu_utils import multi_gpu_model
#from tensorflow.keras.utils import multi_gpu_model
from tensorflow.python.client import device_lib

# Fix TensorFlow's global random seed (NumPy's generator is seeded separately).
tf.random.set_seed(42)

2022-08-25 18:24:47.903367: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-08-25 18:24:47.903398: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [4]:
def get_available_gpus():
    """Return the names of the local devices that are reported as GPUs.

    Returns:
        list: the ``name`` of every entry returned by
        ``device_lib.list_local_devices()`` whose ``device_type`` equals
        ``"GPU"``, in the order reported; empty when there is none.
    """
    gpu_names = []
    for device in device_lib.list_local_devices():
        if device.device_type == "GPU":
            gpu_names.append(device.name)
    return gpu_names

In [5]:
# Locations of the raw data and of the artefacts this notebook produces.
DATA_PATH = "../../data/mimii-anomaly-detection"
IMAGE_PATH = "./img"
MODEL_PATH = "./models"
# True: one entry per <db>/<machine type>.
# False: one entry per <db>/<machine type>/<machine id>.
MERGE_MACHINE_ID = True

# "dataset_cnn" is the folder the split metadata and the feature cache are
# written to further down, so it must exist before those cells run.
# "dataset" is still created to keep the previous behaviour.
for output_dir in (
    os.path.join(DATA_PATH, "dataset"),
    os.path.join(DATA_PATH, "dataset_cnn"),
    MODEL_PATH,
):
    os.makedirs(output_dir, exist_ok=True)

# Keep directories only: the output folders above live inside DATA_PATH, so
# the files stored in them would otherwise match the same glob pattern.
pattern = DATA_PATH + ("/*/*" if MERGE_MACHINE_ID else "/*/*/*")
file_paths = sorted(path for path in glob.glob(pattern) if os.path.isdir(path))

In [6]:
# Work on the first discovered path (a machine type, or a machine ID when
# machine IDs are not merged).
file_path = file_paths[0]
file_path_split = file_path.split("/")

# File-name suffix for everything saved later:
# "_<last path component>_<second-to-last path component>".
SUFFIX = "_" + file_path_split[-1] + "_" + file_path_split[-2]

if not MERGE_MACHINE_ID:
    print(
        f"db: {file_path_split[-3]}, machine type: {file_path_split[-2]}, machine id: {file_path_split[-1]}"
    )
    # The path is one level deeper here, so the third-to-last component is
    # appended to the suffix as well.
    SUFFIX = SUFFIX + "_" + file_path_split[-3]

else:
    print(f"db: {file_path_split[-2]}, machine type: {file_path_split[-1]}")

db: 6dB, machine type: fan


In [7]:
# Lists of normal and abnormal recordings found under the selected path.
normal_files, abnormal_files = build_files_list(root_dir=file_path)

# One float label per file: 0.0 for normal, 1.0 for abnormal.
normal_labels = np.full(len(normal_files), 0.0)
abnormal_labels = np.full(len(abnormal_files), 1.0)
print(normal_labels, abnormal_labels, sep="\n")

[0. 0. 0. ... 0. 0. 0.]
[1. 1. 1. ... 1. 1. 1.]


In [9]:
# Size of the normal subset that matches the abnormal class, next to the
# abnormal count itself (derived from the data instead of a hard-coded 1475).
len(normal_files[: len(abnormal_files)]), len(abnormal_files)

(1475, 1475)

In [18]:
# Same check for the labels: the normal labels are cut to the number of
# abnormal labels (derived from the data instead of a hard-coded 1475).
len(normal_labels[: len(abnormal_labels)]), len(abnormal_labels)

(1475, 1475)

In [12]:
mix = np.concatenate((normal_files[:1475], abnormal_files), axis=0)

In [19]:
mix_labels = np.concatenate((normal_labels[:1475], abnormal_labels), axis=0)

In [21]:
# Sanity check of the balanced mix. The messages name the mix because no
# train/test split exists yet at this point.
print("Amount of mix files:", len(mix))
print("Unique value counts of mix labels", np.unique(mix_labels, return_counts=True))
print("Amount of mix labels:", len(mix_labels))
len(mix), len(mix_labels)


Amount of mix files: 2950
Unique value counts of test labels (array([0., 1.]), array([1475, 1475]))
Amount of test files: 2950


(2950, 2950)

In [23]:
# Shuffle the balanced mix with a fixed seed and keep 80% of it for training;
# the remainder becomes the test set.
train_files, test_files, train_labels, test_labels = train_test_split(
    mix,
    mix_labels,
    train_size=0.8,
    shuffle=True,
    random_state=42,
)

# Training part: sample path, size and class balance.
print(
    f"Example train file path:  {train_files[500]}",
    f"Amount of train files {len(train_files)}",
    f"Unique value counts of train labels {np.unique(train_labels, return_counts=True)}",
    len(train_labels),
    "---------------",
    sep="\n",
)

# Test part: sample path, size and class balance.
print(
    f"Example test file path:  {test_files[500]}",
    f"Amount of test files {len(test_files)}",
    f"Unique value counts of test labels {np.unique(test_labels, return_counts=True)}",
    sep="\n",
)

Example train file path:  ../../data/mimii-anomaly-detection/6dB/fan/id_04/abnormal/00000207.wav
Amount of train files 2360
Unique value counts of train labels (array([0., 1.]), array([1159, 1201]))
2360
---------------
Example test file path:  ../../data/mimii-anomaly-detection/6dB/fan/id_02/abnormal/00000308.wav
Amount of test files 590
Unique value counts of test labels (array([0., 1.]), array([316, 274]))


In [25]:
# Labels are 1 for abnormal and 0 for normal, so the sum of a label array is
# its number of abnormal signals.
print(
    f"Train set has {train_labels.shape[0]} signals including {train_labels.sum():.0f} abnormal signals, "
    f"but test set has {test_labels.shape[0]} signals including {test_labels.sum():.0f} abnormal signals."
)

Train set has 2360 signals including 1201 abnormal signals, but test set has 590 signals including 274 abnormal signals.


In [27]:
# Bundle the four parts of the split under the names used for their files.
dataset = dict(
    train_files=train_files,
    test_files=test_files,
    train_labels=train_labels,
    test_labels=test_labels,
)

# Persist each part as plain text, one item per line, in
# <DATA_PATH>/dataset_cnn/<name><SUFFIX>.txt
for key, values in dataset.items():
    file_name = os.path.join(DATA_PATH, "dataset_cnn", f"{key}{SUFFIX}.txt")
    with open(file_name, "w") as f:
        f.writelines(str(item) + "\n" for item in values)

In [29]:
# Files and labels must have matching lengths within each part of the split.
tuple(len(part) for part in (train_files, train_labels, test_files, test_labels))

(2360, 2360, 590, 590)

In [30]:
# Feature extraction parameters.
n_fft = 2048  # samples in each FFT window
hop_length = 512  # samples shifted between consecutive FFTs
n_mels = 64  # number of Mel bands to generate
frames = 5  # passed to generate_dataset; n_mels * frames is the feature width

# The cache file name depends on SUFFIX only, so changing the parameters above
# does not invalidate an existing cache; delete the file to regenerate.
train_data_path = os.path.join(DATA_PATH, "dataset_cnn", f"train_data{SUFFIX}.pkl")

if not os.path.exists(train_data_path):
    # No cache yet: extract the features and store them for the next run.
    train_data = generate_dataset(
        train_files, n_fft=n_fft, hop_length=hop_length, n_mels=n_mels, frames=frames
    )
    print("Saving train data to disk...")
    dump_pickle(train_data_path, train_data)
    print("Done.")

else:
    print("Train data already exists, loading from file...")
    train_data = load_pickle(train_data_path)

print(f"Train data has a {train_data.shape} shape.")

  0%|          | 0/2360 [00:00<?, ?it/s]

Saving train data to disk...
Done.
Train data has a (729240, 320) shape.


In [37]:
# Shape of a single row of the training matrix (one flat feature vector).
train_data[2].shape

(320,)

In [42]:
# Take Sequential from tensorflow.keras, the same distribution that provides
# the Dense, Input, Adam and EarlyStopping objects used with it. Combining it
# with the standalone `keras` package can fail when the installed versions
# differ.
from tensorflow.keras.models import Sequential

# Width of one input vector: n_mels Mel bands for each of `frames` frames.
input_dim = n_mels * frames

# Small fully connected network: two ReLU hidden layers followed by a single
# sigmoid output unit. The input size is declared with an explicit Input
# object instead of the `input_dim` keyword on the first Dense layer.
model = Sequential(
    [
        Input(shape=(input_dim,)),
        Dense(128, activation="relu"),
        Dense(64, activation="relu"),
        Dense(1, activation="sigmoid"),
    ]
)

In [43]:
# Print the layer-by-layer overview (output shapes and parameter counts).
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_8 (Dense)             (None, 128)               41088     
                                                                 
 dense_9 (Dense)             (None, 64)                8256      
                                                                 
 dense_10 (Dense)            (None, 1)                 65        
                                                                 
Total params: 49,409
Trainable params: 49,409
Non-trainable params: 0
_________________________________________________________________


In [52]:
# Training hyper-parameters used by the fit cell.
batch_size, epochs = 128, 10

# Adam with a learning rate of 0.001, minimising the mean squared error.
model.compile(loss="mean_squared_error", optimizer=Adam(learning_rate=0.001))

In [53]:
%%time
# The network ends in a single sigmoid unit, so it has to be trained against
# one 0/1 label per input row. Fitting it against `train_data` itself (one
# full feature vector per row) compares that single output with every feature
# column, which runs without an error but does not train anything useful.
#
# `train_labels` holds one label per file while `train_data` holds several
# rows per file, so each label is repeated for all rows of its file.
# NOTE(review): this relies on generate_dataset() producing the same number
# of rows for every file, in the order of `train_files` — confirm against
# utils.sound_utils.
windows_per_file, leftover = divmod(len(train_data), len(train_labels))
assert leftover == 0, "train_data rows are not a whole multiple of train_labels"
train_window_labels = np.repeat(train_labels, windows_per_file)

history = model.fit(
    train_data,
    train_window_labels,
    batch_size=batch_size,
    epochs=epochs,
    verbose=True,
    # https://keras.io/api/callbacks/early_stopping/
    # NOTE(review): patience (10) is not smaller than the 10 epochs set in the
    # compile cell, so this callback cannot end the run early as configured.
    callbacks=[EarlyStopping(monitor="val_loss", patience=10)],
    validation_split=0.1,
    shuffle=True,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 2min 36s, sys: 39.9 s, total: 3min 16s
Wall time: 1min 27s
