In [None]:
!pip install py7zr -q

In [None]:
import os
import py7zr
import random
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras import layers, models
import matplotlib.pyplot as plt
from tqdm import tqdm
import seaborn as sns
import librosa
import math
from IPython import display
from IPython.display import Audio, display

In [None]:
# Directory holding the competition archives; "train.7z" is read from here.
data_dir = "/kaggle/input/tensorflow-speech-recognition-challenge/"
# Destination directory the training archive is unpacked into.
extract_dir = '/kaggle/working/extracted_data_train'

def extract_7z(filepath, dest_dir):
    """Unpack every member of the 7z archive at ``filepath`` into ``dest_dir``."""
    archive = py7zr.SevenZipFile(filepath, mode='r')
    # The context manager closes the archive even if extraction fails.
    with archive:
        archive.extractall(path=dest_dir)


# Extraction is slow, so it is skipped whenever the target directory exists.
if os.path.exists(extract_dir):
    print(f"Data already extracted at {extract_dir}")
else:
    filepath = os.path.join(data_dir, "train.7z")
    print(f"Extracting files from {filepath} to {extract_dir}...")
    extract_7z(filepath, extract_dir)

In [None]:

# Every entry of train/audio is treated as a command label, except the folder
# that stores the background-noise recordings.
commands = np.array(
    [
        entry
        for entry in tf.io.gfile.listdir(os.path.join(extract_dir, "train/audio"))
        if entry != '_background_noise_'
    ]
)
print('Commands:', commands)

# All .wav clips in the background-noise folder (used for augmentation).
noise_dir = os.path.join(extract_dir, "train/audio/_background_noise_")
noise_files = tf.io.gfile.glob(noise_dir + '/*.wav')

# decode audio files
def decode_audio(audio_binary):
    """Decode WAV bytes and return the samples with the last axis squeezed out.

    The sample rate reported by the decoder is discarded.
    """
    decoded = tf.audio.decode_wav(contents=audio_binary)
    return tf.squeeze(decoded.audio, axis=-1)

def get_label(file_path):
    """Return the parent-directory name of ``file_path`` as the label.

    The label is replaced by the string "unknown" when it does not match any
    entry of the module-level ``commands``.
    """
    path_parts = tf.strings.split(input=file_path, sep=os.path.sep)
    label = path_parts[-2]
    is_known_command = tf.reduce_any(tf.equal(commands, label))
    return tf.cond(
        is_known_command,
        lambda: label,
        lambda: tf.constant("unknown", dtype=tf.string),
    )

# waveform --> spectrogram
def get_spectrogram(waveform):
    """Turn a 1-D waveform into a magnitude STFT with a trailing channel axis.

    The waveform is truncated to 16000 samples and zero-padded at the end up to
    that length before the STFT (frame_length=255, frame_step=128) is taken.
    """
    input_len = 16000
    clipped = tf.cast(waveform[:input_len], dtype=tf.float32)
    # Number of trailing zeros needed to reach exactly `input_len` samples.
    trailing_zeros = tf.zeros([input_len] - tf.shape(clipped), dtype=tf.float32)
    fixed_length = tf.concat([clipped, trailing_zeros], 0)
    magnitude = tf.abs(tf.signal.stft(fixed_length, frame_length=255, frame_step=128))
    return magnitude[..., tf.newaxis]

# background noise
def add_background_noise(waveform, noise_files):
    """Mix a randomly chosen background-noise clip into ``waveform``.

    Args:
        waveform: 1-D float waveform tensor.
        noise_files: non-empty sequence of paths to noise ``.wav`` files.

    Returns:
        ``waveform + k * noise`` clipped to [-1, 1], where ``k`` is drawn
        uniformly from [0, 0.5) and ``noise`` is a random crop of the chosen
        clip (or the whole clip zero-padded when it is shorter than
        ``waveform``).

    Raises:
        IndexError: if ``noise_files`` is empty.
    """
    if len(noise_files) == 0:
        raise IndexError("noise_files is empty; cannot pick a background noise clip")

    # Select the clip with TensorFlow ops instead of Python's `random.choice`.
    # Inside `tf.data.Dataset.map` a Python-level choice runs only once, while
    # the function is traced, so every element would get the same noise file.
    noise_paths = tf.convert_to_tensor(noise_files, dtype=tf.string)
    noise_index = tf.random.uniform(
        shape=[], minval=0, maxval=tf.shape(noise_paths)[0], dtype=tf.int32
    )
    noise_audio_binary = tf.io.read_file(tf.gather(noise_paths, noise_index))
    noise_waveform = decode_audio(noise_audio_binary)

    waveform_len = tf.shape(waveform)[0]
    noise_len = tf.shape(noise_waveform)[0]
    if noise_len > waveform_len:
        # Crop a random window of the same length as the input waveform.
        offset = tf.random.uniform(shape=[], minval=0, maxval=noise_len - waveform_len, dtype=tf.int32)
        noise_waveform = noise_waveform[offset:offset + waveform_len]
    else:
        # Noise clip is not longer than the input: pad it with trailing silence.
        padding = tf.zeros([waveform_len - noise_len], dtype=tf.float32)
        noise_waveform = tf.concat([noise_waveform, padding], axis=0)

    noise_factor = tf.random.uniform(shape=[], minval=0.0, maxval=0.5)
    augmented_waveform = waveform + noise_factor * noise_waveform
    # Keep the mix inside the [-1, 1] range.
    return tf.clip_by_value(augmented_waveform, -1.0, 1.0)



In [None]:

def adjust_speed_and_pad(waveform, speed_factor=1.0, target_length=16000):
    """Time-stretch ``waveform`` by ``speed_factor`` and fit it to ``target_length``.

    The STFT of the waveform is resampled along the time (frame) axis to
    ``num_frames / speed_factor`` frames and inverted again, so a factor above
    1 shortens the clip and a factor below 1 lengthens it.

    Args:
        waveform: 1-D waveform tensor.
        speed_factor: positive scalar (Python float or scalar tensor).
        target_length: number of samples in the returned waveform.

    Returns:
        1-D float32 waveform of ``target_length`` samples. Shorter results are
        zero-padded equally on both sides; longer ones are truncated at the end.
    """
    waveform = tf.cast(waveform, dtype=tf.float32)
    frame_length = 220
    frame_step = 160
    stft = tf.signal.stft(
        waveform,
        frame_length=frame_length,
        frame_step=frame_step,
        window_fn=tf.signal.hann_window,
    )

    # STRETCHING
    num_frames = tf.shape(stft)[0]
    num_bins = tf.shape(stft)[1]
    # Never request fewer than one frame, whatever the speed factor.
    new_num_frames = tf.maximum(
        tf.cast(tf.cast(num_frames, tf.float32) / speed_factor, tf.int32), 1
    )
    new_size = tf.stack([new_num_frames, num_bins])

    def _resize_time_axis(component):
        # tf.image.resize reads a 3-D tensor as [height, width, channels].
        # The channel axis must therefore be appended, giving [frames, bins, 1].
        # Prepending it would make the frame axis the image *width*, which is
        # then resized to `num_bins` and ignores the speed factor entirely.
        return tf.image.resize(component[..., tf.newaxis], new_size)[..., 0]

    resized_real = _resize_time_axis(tf.math.real(stft))
    resized_imag = _resize_time_axis(tf.math.imag(stft))

    stretched_stft = tf.complex(resized_real, resized_imag)
    waveform_stretched = tf.signal.inverse_stft(
        stretched_stft,
        frame_length=frame_length,
        frame_step=frame_step,
        # The inverse transform needs the window complementary to the forward
        # Hann window; reusing the forward window modulates the amplitude.
        window_fn=tf.signal.inverse_stft_window_fn(
            frame_step, forward_window_fn=tf.signal.hann_window
        ),
    )

    waveform_length = tf.shape(waveform_stretched)[0]
    if waveform_length < target_length:
        # Centre the stretched clip inside the target window.
        padding = target_length - waveform_length
        left_pad = padding // 2
        right_pad = padding - left_pad
        padded_waveform = tf.pad(waveform_stretched, [[left_pad, right_pad]], constant_values=0.0)
    else:
        padded_waveform = waveform_stretched[:target_length]

    return padded_waveform


In [None]:

from IPython.display import Audio, display
audio_dir = os.path.join(extract_dir, "train/audio")
# NOTE(review): this rebinds `commands` (previously a NumPy array) to a plain
# list built with os.listdir.
commands = [
    name
    for name in os.listdir(audio_dir)
    if name != '_background_noise_' and os.path.isdir(os.path.join(audio_dir, name))
]

# Gather every .wav file of every command folder, then keep 10 random ones.
audio_files = []
for command in commands:
    command_path = os.path.join(audio_dir, command)
    audio_files.extend(
        os.path.join(command_path, wav_name)
        for wav_name in os.listdir(command_path)
        if wav_name.endswith('.wav')
    )
audio_files = random.sample(audio_files, 10)



In [None]:

augmented_audio_files = []

plt.figure(figsize=(15, 20))

# One row per sampled file: original waveform/spectrogram followed by the
# noise-augmented waveform/spectrogram.
for i, file_path in enumerate(audio_files):
    audio_binary = tf.io.read_file(file_path)
    waveform = decode_audio(audio_binary)
    spectrogram = get_spectrogram(waveform)
    label = get_label(file_path).numpy().decode('utf-8')
    augmented_waveform = add_background_noise(waveform, noise_files)
    augmented_spectrogram = get_spectrogram(augmented_waveform)
    augmented_audio_files.append(augmented_waveform)

    # Even columns hold waveforms (line plot), odd columns spectrograms (image).
    panels = [
        (waveform, f"Original Waveform: {os.path.basename(file_path)} ({label})"),
        (spectrogram, f"Original Spectrogram: {os.path.basename(file_path)} ({label})"),
        (augmented_waveform, f"Augmented Waveform: {os.path.basename(file_path)} ({label})"),
        (augmented_spectrogram, f"Augmented Spectrogram: {os.path.basename(file_path)} ({label})"),
    ]
    for col, (tensor, title) in enumerate(panels):
        plt.subplot(10, 4, 4 * i + col + 1)
        if col % 2 == 0:
            plt.plot(tensor.numpy())
        else:
            # Log scale; the small constant avoids log(0).
            plt.imshow(np.log(np.squeeze(tensor.numpy()) + 1e-10).T, aspect='auto', origin='lower')
        plt.title(title, fontsize=8)

plt.tight_layout()
plt.show()


In [None]:

print("\nPlaying original and add backgound noise augmented audio files:")

# Play each original clip followed by its noise-augmented counterpart.
for clip_no, (file_path, augmented_waveform) in enumerate(zip(audio_files, augmented_audio_files), start=1):
    print(f"\nOriginal audio {clip_no}: {os.path.basename(file_path)}")
    original_waveform = decode_audio(tf.io.read_file(file_path))
    display(Audio(original_waveform.numpy(), rate=16000))

    print(f"Augmented audio {clip_no}: {os.path.basename(file_path)}")
    display(Audio(augmented_waveform.numpy(), rate=16000))


In [None]:
augmented_audio_files_speed = []

plt.figure(figsize=(15, 20))

# One row per sampled file: original waveform/spectrogram followed by the
# speed-adjusted waveform/spectrogram.
for i, file_path in enumerate(audio_files):
    # original waveform
    audio_binary = tf.io.read_file(file_path)
    waveform = decode_audio(audio_binary)
    spectrogram = get_spectrogram(waveform)
    label = get_label(file_path).numpy().decode('utf-8')

    # A fresh speed factor is drawn for every file.
    speed_factor = random.uniform(0.8, 1.5)
    speed_adjusted_waveform = adjust_speed_and_pad(waveform, speed_factor=speed_factor, target_length=16000)
    speed_adjusted_spectrogram = get_spectrogram(speed_adjusted_waveform)
    augmented_audio_files_speed.append(speed_adjusted_waveform)

    # Even columns hold waveforms (line plot), odd columns spectrograms (image).
    panels = [
        (waveform, f"Original Waveform: {os.path.basename(file_path)} ({label})"),
        (spectrogram, f"Original Spectrogram: {os.path.basename(file_path)} ({label})"),
        (speed_adjusted_waveform, f"Speed-Adjusted Waveform: {os.path.basename(file_path)}"),
        (speed_adjusted_spectrogram, f"Speed-Adjusted Spectrogram: {os.path.basename(file_path)}"),
    ]
    for col, (tensor, title) in enumerate(panels):
        plt.subplot(10, 4, 4 * i + col + 1)
        if col % 2 == 0:
            plt.plot(tensor.numpy())
        else:
            # Log scale; the small constant avoids log(0).
            plt.imshow(np.log(np.squeeze(tensor.numpy()) + 1e-10).T, aspect='auto', origin='lower')
        plt.title(title, fontsize=8)
plt.tight_layout()
plt.show()


In [None]:

print("\nPlaying original and speed adjustment augmented audio files:")

# Play each original clip followed by its speed-adjusted counterpart.
for clip_no, (file_path, speed_augmented_waveform) in enumerate(zip(audio_files, augmented_audio_files_speed), start=1):
    print(f"\nOriginal audio {clip_no}: {os.path.basename(file_path)}")
    original_waveform = decode_audio(tf.io.read_file(file_path))
    display(Audio(original_waveform.numpy(), rate=16000))

    print(f"Augmented audio {clip_no}: {os.path.basename(file_path)}")
    display(Audio(speed_augmented_waveform.numpy(), rate=16000))


3. 노이즈와 속도를 함께 증강

In [None]:
augmented_audio_files_speed_noise = []
plt.figure(figsize=(20, 25))

# One row per sampled file, six panels: original, speed-adjusted, and
# speed-adjusted + noise, each as waveform and spectrogram.
for i, file_path in enumerate(audio_files):
    audio_binary = tf.io.read_file(file_path)
    waveform = decode_audio(audio_binary)
    spectrogram = get_spectrogram(waveform)
    label = get_label(file_path).numpy().decode('utf-8')

    # Step 1: speed change with a per-file random factor.
    speed_factor = random.uniform(0.8, 1.8)
    speed_adjusted_waveform = adjust_speed_and_pad(waveform, speed_factor=speed_factor, target_length=16000)
    speed_adjusted_spectrogram = get_spectrogram(speed_adjusted_waveform)

    # Step 2: background noise on top of the speed-adjusted clip.
    noise_added_waveform = add_background_noise(speed_adjusted_waveform, noise_files)
    noise_added_spectrogram = get_spectrogram(noise_added_waveform)
    augmented_audio_files_speed_noise.append(noise_added_waveform)

    # Even columns hold waveforms (line plot), odd columns spectrograms (image).
    panels = [
        (waveform, f"Original Waveform: {os.path.basename(file_path)} ({label})"),
        (spectrogram, f"Original Spectrogram: {os.path.basename(file_path)} ({label})"),
        (speed_adjusted_waveform, f"Speed-Adjusted Waveform: {os.path.basename(file_path)}"),
        (speed_adjusted_spectrogram, f"Speed-Adjusted Spectrogram: {os.path.basename(file_path)}"),
        (noise_added_waveform, f"Noise-Added Waveform: {os.path.basename(file_path)}"),
        (noise_added_spectrogram, f"Noise-Added Spectrogram: {os.path.basename(file_path)}"),
    ]
    for col, (tensor, title) in enumerate(panels):
        plt.subplot(10, 6, 6 * i + col + 1)
        if col % 2 == 0:
            plt.plot(tensor.numpy())
        else:
            # Log scale; the small constant avoids log(0).
            plt.imshow(np.log(np.squeeze(tensor.numpy()) + 1e-10).T, aspect='auto', origin='lower')
        plt.title(title, fontsize=8)

plt.tight_layout()
plt.show()


In [None]:

print("\nPlaying original and augmented audio files:")

# Play each original clip followed by its speed + noise augmented counterpart.
for clip_no, (file_path, noise_added_waveform) in enumerate(zip(audio_files, augmented_audio_files_speed_noise), start=1):
    print(f"\nOriginal audio {clip_no}: {os.path.basename(file_path)}")
    original_waveform = decode_audio(tf.io.read_file(file_path))
    display(Audio(original_waveform.numpy(), rate=16000))

    print(f"FINAL Augmented audio {clip_no}: {os.path.basename(file_path)}")
    display(Audio(noise_added_waveform.numpy(), rate=16000))


In [None]:
def preprocess_dataset(files, augment=False):
    """Build a ``tf.data`` pipeline yielding ``(spectrogram, label_id)`` pairs.

    Args:
        files: audio file paths, sliced element-wise into a dataset.
        augment: when true, each waveform is speed-adjusted (factor drawn from
            [1.0, 1.3)) and mixed with background noise from the module-level
            ``noise_files`` before the spectrogram is computed.

    Returns:
        Unbatched dataset; ``label_id`` is the position of the label in the
        module-level ``commands``.
    """

    def _read(file_path):
        return tf.io.read_file(file_path), get_label(file_path)

    def _decode(audio_binary, label):
        return decode_audio(audio_binary), label

    def _augment(waveform, label):
        stretched = adjust_speed_and_pad(
            waveform, speed_factor=tf.random.uniform([], 1.0, 1.3), target_length=16000
        )
        return add_background_noise(stretched, noise_files), label

    def _to_model_input(waveform, label):
        return get_spectrogram(waveform), tf.argmax(label == commands)

    # Stages are applied in order; augmentation sits between decoding and the
    # spectrogram conversion.
    stages = [_read, _decode]
    if augment:
        stages.append(_augment)
    stages.append(_to_model_input)

    output_ds = tf.data.Dataset.from_tensor_slices(files)
    for stage in stages:
        output_ds = output_ds.map(stage, num_parallel_calls=tf.data.AUTOTUNE)
    return output_ds


In [None]:
filepath_data = os.path.join(extract_dir, "train/audio")

# One glob pattern per command folder, then a random permutation of all paths.
command_patterns = [os.path.join(filepath_data, d, '*') for d in commands]
filenames = tf.random.shuffle(tf.io.gfile.glob(command_patterns))

# 70 % train / 15 % validation / remainder test.
total_samples = len(filenames)
train_size = int(0.7 * total_samples)
val_size = int(0.15 * total_samples)
test_size = total_samples - train_size - val_size

val_end = train_size + val_size
train_files = filenames[:train_size]
val_files = filenames[train_size:val_end]
test_files = filenames[val_end:]

print('Training set size:', len(train_files))
print('Validation set size:', len(val_files))
print('Test set size:', len(test_files))

In [None]:

# Only the training split is augmented; validation and test stay clean.
train_ds = preprocess_dataset(train_files, augment=True)
val_ds = preprocess_dataset(val_files, augment=False)
test_ds = preprocess_dataset(test_files, augment=False)

In [None]:

num_samples_to_display = 10

# Preprocess only a random handful of training files. Shuffling `train_ds`
# with a buffer as large as the training set would decode, augment and buffer
# every training clip just to display a few of them.
sample_files = tf.random.shuffle(train_files)[:num_samples_to_display]
random_ds = preprocess_dataset(sample_files, augment=True)


def decode_label(label_index):
    """Map an integer class id back to its command name."""
    return commands[label_index]


for spectrogram, label in random_ds:
    label_np = decode_label(label.numpy())
    spectrogram_np = spectrogram.numpy()[:, :, 0]
    # The spectrogram stores magnitudes only, so this inversion assumes zero
    # phase: the curve is a rough visual aid, not the original audio.
    waveform = tf.signal.inverse_stft(
        tf.cast(spectrogram[:, :, 0], tf.complex64), frame_length=255, frame_step=128
    ).numpy()
    plt.figure(figsize=(15, 5))
    plt.subplot(1, 2, 1)
    plt.plot(waveform)
    plt.title(f"Waveform (Label: {label_np})")
    plt.xlabel("Time")
    plt.ylabel("Amplitude")
    plt.subplot(1, 2, 2)
    # Log scale; the small constant avoids log(0).
    plt.imshow(np.log(spectrogram_np + 1e-10).T, aspect='auto', origin='lower')
    plt.title(f"Spectrogram (Label: {label_np})")
    plt.xlabel("Time")
    plt.ylabel("Frequency")
    plt.colorbar()
    plt.tight_layout()
    plt.show()


In [None]:
batch_size = 32
# The training pipeline contains random augmentation, so it must not be cached
# after that step: a cache would replay the first epoch's augmented samples in
# every later epoch. Validation data is deterministic and safe to cache.
train_ds = train_ds.batch(batch_size).prefetch(tf.data.AUTOTUNE)
val_ds = val_ds.batch(batch_size).cache().prefetch(tf.data.AUTOTUNE)

In [None]:
# Print the dataset object itself (its repr), not its contents.
print(train_ds)

In [None]:
# Look at the first batch of each split; label values are printed only for the
# training split.
for split_ds, show_label_values in ((train_ds, True), (val_ds, False)):
    for spectrogram, label in split_ds.take(1):
        print("Spectrogram shape:", spectrogram.shape)
        print("Label shape:", label.shape)
        if show_label_values:
            print("Label sample:", label.numpy())


In [None]:
# Take the per-example input shape (batch axis dropped) from one training batch.
for spectrogram, _ in train_ds.take(1):
    input_shape = spectrogram.shape[1:]

print('Input shape:', input_shape)
num_labels = len(commands)

# Fit the normalization statistics on the training spectrograms only.
norm_layer = layers.Normalization()
norm_layer.adapt(data=train_ds.map(lambda spec, label: spec))

# Number of steps the flattened CNN features are split into for the GRU.
timesteps = 16
model = models.Sequential([
    layers.Input(shape=input_shape),
    layers.Resizing(32, 32),
    norm_layer,
    layers.Conv2D(32, 3, activation='relu'),
    layers.Conv2D(64, 3, activation='relu'),
    layers.Conv2D(128, 3, activation='relu'),
    layers.MaxPooling2D(),
    layers.Dropout(0.25),
    layers.Flatten(),
    # (batch, features) -> (batch, timesteps, features // timesteps).
    # Letting Reshape infer the last axis removes the hard-coded feature count
    # (21632) and the Lambda/squeeze work-around that depended on it.
    layers.Reshape((timesteps, -1)),
    layers.GRU(64),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(num_labels)  # raw logits; the loss applies the softmax
])

model.summary()

model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy']
)

EPOCHS = 10
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=EPOCHS,
    # `callbacks` is documented as a list of callbacks.
    callbacks=[tf.keras.callbacks.EarlyStopping(verbose=1, patience=2)],
)

In [None]:
metrics = history.history
# Training and validation loss per epoch, one explicit line each.
plt.plot(history.epoch, metrics['loss'])
plt.plot(history.epoch, metrics['val_loss'])
plt.legend(['loss', 'val_loss'])
plt.show()

# Materialise the (unbatched) test split as NumPy arrays.
test_audio, test_labels = [], []
for audio, label in test_ds:
    test_audio.append(audio.numpy())
    test_labels.append(label.numpy())

test_audio = np.array(test_audio)
test_labels = np.array(test_labels)

# Predicted class = index of the largest logit.
y_pred = np.argmax(model.predict(test_audio), axis=1)
y_true = test_labels

test_acc = np.mean(y_pred == y_true)
print(f'Test set accuracy: {test_acc:.0%}')

confusion_mtx = tf.math.confusion_matrix(y_true, y_pred)
plt.figure(figsize=(10, 8))
sns.heatmap(confusion_mtx, xticklabels=commands, yticklabels=commands, annot=True, fmt='g')
plt.xlabel('Prediction')
plt.ylabel('Label')
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt
num_classes = len(commands)
# Number of test samples per class, and how many of them were predicted correctly.
true_counts = np.bincount(y_true, minlength=num_classes)
correct_predictions = y_true[y_true == y_pred]
correct_counts = np.bincount(correct_predictions, minlength=num_classes)

x = np.arange(num_classes)
width = 0.4
plt.figure(figsize=(15, 8))
plt.bar(x - width/2, true_counts, width, label="True Counts", color="blue")  # number of true labels
plt.bar(x + width/2, correct_counts, width, label="Correct Predictions", color="green")  # number of correctly predicted labels
plt.xlabel("Labels", fontsize=12)
plt.ylabel("Count", fontsize=12)
plt.title("True Counts : Correct Predictions per Label", fontsize=14)
plt.xticks(x, commands, rotation=45, ha="right")
plt.legend()
# Write the numeric value just above every bar.
for x_pos, total, correct in zip(x, true_counts, correct_counts):
    plt.text(x_pos - width/2, total + 5, str(total), ha='center', va='bottom', fontsize=10, color='blue')
    plt.text(x_pos + width/2, correct + 5, str(correct), ha='center', va='bottom', fontsize=10, color='green')

plt.show()


In [None]:
from sklearn.metrics import classification_report
import pandas as pd
# classification report
# `labels` pins one row per command. Without it, scikit-learn derives the
# classes from the data and raises when a class is missing from both y_true
# and y_pred, because the count no longer matches `target_names`.
report = classification_report(
    y_true,
    y_pred,
    labels=np.arange(len(commands)),
    target_names=commands,
    output_dict=True,
)
report_df = pd.DataFrame(report).transpose()
print(report_df)


In [None]:
def load_waveform(file_path):
    """Read a WAV file and return its decoded samples as a NumPy array.

    The decoder's audio tensor is returned as is (no axis is squeezed out); the
    sample rate is discarded.
    """
    decoded = tf.audio.decode_wav(tf.io.read_file(file_path))
    return np.array(decoded.audio)
def visualize_waveforms(noise_files):
    """Plot the waveform of every file in ``noise_files`` on a two-column grid.

    Prints a message and returns without plotting when ``noise_files`` is empty.
    """
    if not noise_files:
        print("No files to visualize.")
        return

    # Size the grid from the number of files. A fixed 3x2 layout raises as soon
    # as a seventh file is plotted.
    num_cols = 2
    num_rows = math.ceil(len(noise_files) / num_cols)

    # Keep the original height per row (10 inches for 3 rows).
    plt.figure(figsize=(15, 10 * num_rows / 3))
    for i, file_path in enumerate(noise_files):
        waveform = load_waveform(file_path)
        plt.subplot(num_rows, num_cols, i + 1)
        plt.plot(waveform)
        plt.title(f"Waveform: {file_path.split('/')[-1]}")
        plt.xlabel("Sample Index")
        plt.ylabel("Amplitude")
    plt.tight_layout()
    plt.show()

visualize_waveforms(noise_files)


In [None]:
# List every background-noise file, one path per line.
for noise_path in noise_files:
    print(noise_path)

In [None]:

# Drop the pink-noise and white-noise recordings from the augmentation pool.
filtered_noise_files = [
    noise_path
    for noise_path in noise_files
    if not ("pink_noise.wav" in noise_path or "white_noise.wav" in noise_path)
]

def add_background_noise_remove(waveform, noise_files):
    """Mix a random background-noise clip, excluding pink/white noise, into ``waveform``.

    Args:
        waveform: 1-D float waveform tensor.
        noise_files: sequence of paths to noise ``.wav`` files. Paths containing
            "pink_noise.wav" or "white_noise.wav" are ignored, so both the full
            and the already-filtered list may be passed.

    Returns:
        ``waveform + k * noise`` clipped to [-1, 1], where ``k`` is drawn
        uniformly from [0, 0.5) and ``noise`` is a random crop of the chosen
        clip (or the whole clip zero-padded when it is shorter than
        ``waveform``).

    Raises:
        IndexError: if no noise file is left after filtering.
    """
    # Use the argument rather than a module-level list, so the function does
    # what its signature says.
    candidate_files = [
        noise_path
        for noise_path in noise_files
        if "pink_noise.wav" not in noise_path and "white_noise.wav" not in noise_path
    ]
    if len(candidate_files) == 0:
        raise IndexError("no usable background noise files after filtering")

    # Select the clip with TensorFlow ops instead of Python's `random.choice`.
    # Inside `tf.data.Dataset.map` a Python-level choice runs only once, while
    # the function is traced, so every element would get the same noise file.
    noise_paths = tf.convert_to_tensor(candidate_files, dtype=tf.string)
    noise_index = tf.random.uniform(
        shape=[], minval=0, maxval=tf.shape(noise_paths)[0], dtype=tf.int32
    )
    noise_audio_binary = tf.io.read_file(tf.gather(noise_paths, noise_index))
    noise_waveform = decode_audio(noise_audio_binary)

    waveform_len = tf.shape(waveform)[0]
    noise_len = tf.shape(noise_waveform)[0]
    if noise_len > waveform_len:
        # Crop a random window of the same length as the input waveform.
        offset = tf.random.uniform(shape=[], minval=0, maxval=noise_len - waveform_len, dtype=tf.int32)
        noise_waveform = noise_waveform[offset:offset + waveform_len]
    else:
        # Noise clip is not longer than the input: pad it with trailing silence.
        padding = tf.zeros([waveform_len - noise_len], dtype=tf.float32)
        noise_waveform = tf.concat([noise_waveform, padding], axis=0)

    noise_factor = tf.random.uniform(shape=[], minval=0.0, maxval=0.5)
    augmented_waveform = waveform + noise_factor * noise_waveform
    # Keep the mix inside the [-1, 1] range.
    return tf.clip_by_value(augmented_waveform, -1.0, 1.0)


In [None]:
# List the noise files that remain after filtering, one path per line.
for noise_path in filtered_noise_files:
    print(noise_path)

In [None]:
def preprocess_dataset_remove_noise(files, augment=False):
    """Build a ``(spectrogram, label_id)`` pipeline using the filtered noise files.

    Args:
        files: audio file paths, sliced element-wise into a dataset.
        augment: when true, each waveform is speed-adjusted (factor drawn from
            [1.0, 1.3)) and then passed to ``add_background_noise_remove`` with
            the module-level ``filtered_noise_files``.

    Returns:
        Unbatched dataset; ``label_id`` is the position of the label in the
        module-level ``commands``.
    """
    autotune = tf.data.AUTOTUNE

    def _load(file_path):
        return tf.io.read_file(file_path), get_label(file_path)

    def _decode(audio_binary, label):
        return decode_audio(audio_binary), label

    def _augment(waveform, label):
        sped_up = adjust_speed_and_pad(
            waveform, speed_factor=tf.random.uniform([], 1.0, 1.3), target_length=16000
        )
        return add_background_noise_remove(sped_up, filtered_noise_files), label

    def _featurize(waveform, label):
        return get_spectrogram(waveform), tf.argmax(label == commands)

    output_ds = (
        tf.data.Dataset.from_tensor_slices(files)
        .map(_load, num_parallel_calls=autotune)
        .map(_decode, num_parallel_calls=autotune)
    )
    # Augmentation works on raw waveforms, before the spectrogram conversion.
    if augment:
        output_ds = output_ds.map(_augment, num_parallel_calls=autotune)
    return output_ds.map(_featurize, num_parallel_calls=autotune)


In [None]:
# Second experiment: same splits, augmentation limited to the filtered noise files.
train_ds_2 = preprocess_dataset_remove_noise(train_files, augment=True)
val_ds_2 = preprocess_dataset_remove_noise(val_files, augment=False)
test_ds_2 = preprocess_dataset_remove_noise(test_files, augment=False)

In [None]:

num_samples_to_display = 10

# Preprocess only a random handful of training files. Shuffling `train_ds_2`
# with a buffer as large as the training set would decode, augment and buffer
# every training clip just to display a few of them.
sample_files_2 = tf.random.shuffle(train_files)[:num_samples_to_display]
random_ds_2 = preprocess_dataset_remove_noise(sample_files_2, augment=True)


def decode_label(label_index):
    """Map an integer class id back to its command name."""
    return commands[label_index]


for spectrogram, label in random_ds_2:
    label_np = decode_label(label.numpy())
    spectrogram_np = spectrogram.numpy()[:, :, 0]
    # The spectrogram stores magnitudes only, so this inversion assumes zero
    # phase: the curve is a rough visual aid, not the original audio.
    waveform = tf.signal.inverse_stft(
        tf.cast(spectrogram[:, :, 0], tf.complex64), frame_length=255, frame_step=128
    ).numpy()
    plt.figure(figsize=(15, 5))
    plt.subplot(1, 2, 1)
    plt.plot(waveform)
    plt.title(f"Waveform (Label: {label_np})")
    plt.xlabel("Time")
    plt.ylabel("Amplitude")
    plt.subplot(1, 2, 2)
    # Log scale; the small constant avoids log(0).
    plt.imshow(np.log(spectrogram_np + 1e-10).T, aspect='auto', origin='lower')
    plt.title(f"Spectrogram (Label: {label_np})")
    plt.xlabel("Time")
    plt.ylabel("Frequency")
    plt.colorbar()
    plt.tight_layout()
    plt.show()


In [None]:
batch_size = 32
# The training pipeline contains random augmentation, so it must not be cached
# after that step: a cache would replay the first epoch's augmented samples in
# every later epoch. Validation data is deterministic and safe to cache.
train_ds_2 = train_ds_2.batch(batch_size).prefetch(tf.data.AUTOTUNE)
val_ds_2 = val_ds_2.batch(batch_size).cache().prefetch(tf.data.AUTOTUNE)
print(train_ds_2)

# Look at the first batch of each split.
for spectrogram, label in train_ds_2.take(1):
    print("Spectrogram shape:", spectrogram.shape)
    print("Label shape:", label.shape)
    print("Label sample:", label.numpy())
for spectrogram, label in val_ds_2.take(1):
    print("Spectrogram shape:", spectrogram.shape)
    print("Label shape:", label.shape)


In [None]:
# Take the per-example input shape (batch axis dropped) from one training batch.
for spectrogram, _ in train_ds_2.take(1):
    input_shape = spectrogram.shape[1:]

print('Input shape:', input_shape)
num_labels = len(commands)

# Fit the normalization statistics on the training spectrograms only.
norm_layer = layers.Normalization()
norm_layer.adapt(data=train_ds_2.map(lambda spec, label: spec))

# Number of steps the flattened CNN features are split into for the GRU.
timesteps = 16
model = models.Sequential([
    layers.Input(shape=input_shape),
    layers.Resizing(32, 32),
    norm_layer,
    layers.Conv2D(32, 3, activation='relu'),
    layers.Conv2D(64, 3, activation='relu'),
    layers.Conv2D(128, 3, activation='relu'),
    layers.MaxPooling2D(),
    layers.Dropout(0.25),
    layers.Flatten(),
    # (batch, features) -> (batch, timesteps, features // timesteps).
    # Letting Reshape infer the last axis removes the hard-coded feature count
    # (21632) and the Lambda/squeeze work-around that depended on it.
    layers.Reshape((timesteps, -1)),
    layers.GRU(64),
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.5),
    layers.Dense(num_labels)  # raw logits; the loss applies the softmax
])

model.summary()

model.compile(
    optimizer=tf.keras.optimizers.Adam(),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=['accuracy']
)

EPOCHS = 10
history = model.fit(
    train_ds_2,
    validation_data=val_ds_2,
    epochs=EPOCHS,
    # `callbacks` is documented as a list of callbacks.
    callbacks=[tf.keras.callbacks.EarlyStopping(verbose=1, patience=2)],
)

In [None]:
metrics = history.history
# Training and validation loss per epoch, one explicit line each.
plt.plot(history.epoch, metrics['loss'])
plt.plot(history.epoch, metrics['val_loss'])
plt.legend(['loss', 'val_loss'])
plt.show()

# Materialise the (unbatched) test split as NumPy arrays.
test_audio, test_labels = [], []
for audio, label in test_ds_2:
    test_audio.append(audio.numpy())
    test_labels.append(label.numpy())

test_audio = np.array(test_audio)
test_labels = np.array(test_labels)

# Predicted class = index of the largest logit.
y_pred = np.argmax(model.predict(test_audio), axis=1)
y_true = test_labels

test_acc = np.mean(y_pred == y_true)
print(f'Test set accuracy: {test_acc:.0%}')

confusion_mtx = tf.math.confusion_matrix(y_true, y_pred)
plt.figure(figsize=(10, 8))
sns.heatmap(confusion_mtx, xticklabels=commands, yticklabels=commands, annot=True, fmt='g')
plt.xlabel('Prediction')
plt.ylabel('Label')
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt
num_classes = len(commands)
# Number of test samples per class, and how many of them were predicted correctly.
true_counts = np.bincount(y_true, minlength=num_classes)
correct_predictions = y_true[y_true == y_pred]
correct_counts = np.bincount(correct_predictions, minlength=num_classes)

x = np.arange(num_classes)
width = 0.4

plt.figure(figsize=(15, 8))
plt.bar(x - width/2, true_counts, width, label="True Counts", color="blue")  # number of true labels
plt.bar(x + width/2, correct_counts, width, label="Correct Predictions", color="green")  # number of correctly predicted labels

plt.xlabel("Labels", fontsize=12)
plt.ylabel("Count", fontsize=12)
plt.title("True Counts : Correct Predictions per Label", fontsize=14)
plt.xticks(x, commands, rotation=45, ha="right")
plt.legend()
# Write the numeric value just above every bar.
for x_pos, total, correct in zip(x, true_counts, correct_counts):
    plt.text(x_pos - width/2, total + 5, str(total), ha='center', va='bottom', fontsize=10, color='blue')
    plt.text(x_pos + width/2, correct + 5, str(correct), ha='center', va='bottom', fontsize=10, color='green')

plt.show()


In [None]:
from sklearn.metrics import classification_report
import pandas as pd
# `labels` pins one row per command. Without it, scikit-learn derives the
# classes from the data and raises when a class is missing from both y_true
# and y_pred, because the count no longer matches `target_names`.
report = classification_report(
    y_true,
    y_pred,
    labels=np.arange(len(commands)),
    target_names=commands,
    output_dict=True,
)

report_df = pd.DataFrame(report).transpose()

print(report_df)