In [None]:
# Mount Google Drive into the Colab runtime. Later cells read the dataset
# archive and the recorded noise/waveform text files from paths under
# /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from sklearn.model_selection import train_test_split
from glob import glob
import os
import zipfile

In [None]:
# Location of the dataset archive on the mounted Google Drive and the local
# directory it is unpacked into.
zip_path = '/content/drive/MyDrive/Command_Keyword.zip'  # Replace with your zip file path
extracted_path = '/content/unzipped'  # Path to extract files

# Unpack the whole archive; the context manager closes the zip handle even if
# extraction fails part-way.
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extracted_path)

# Sanity check: list the top-level entries that were extracted.
print(f"Files extracted to: {extracted_path}")
print("Sample files:", glob(os.path.join(extracted_path, '*')))

# Default data directory. A later cell reassigns data_dir to the
# 'Command_Keyword' sub-folder before loading the .wav files.
data_dir = extracted_path  # Use this as the data directory for loading files

Files extracted to: /content/unzipped
Sample files: ['/content/unzipped/__MACOSX', '/content/unzipped/Command_Keyword']


In [None]:
import os
import numpy as np
from scipy.signal import stft
from glob import glob
from sklearn.model_selection import train_test_split
import tensorflow as tf
from scipy.io import wavfile
from scipy.signal import resample

# STFT parameters, read as module-level globals by preprocess_audio() and
# load_data_with_fft() below.
fs = 8192  # Target sampling rate in Hz; files with another rate are resampled to this
nperseg = 256  # Window size (samples per STFT segment)
noverlap = nperseg // 2  # 50% overlap between consecutive segments
nfft = 256  # FFT size; the model below expects nfft // 2 + 1 = 129 input features
window = 'hamming'  # Window function name passed to scipy.signal.stft

# Load the recorded background-noise clips used for augmentation.
# Each text file is parsed with np.loadtxt into a numeric array.
noise_files = [
    '/content/drive/MyDrive/record_fft/audio_waveform0.txt',
    '/content/drive/MyDrive/record_fft/audio_waveform1.txt',
    '/content/drive/MyDrive/record_fft/audio_waveform2.txt',
    '/content/drive/MyDrive/record_fft/audio_waveform3.txt',
    '/content/drive/MyDrive/record_fft/audio_waveform4.txt',
    '/content/drive/MyDrive/record_fft/audio_waveform5.txt',
    '/content/drive/MyDrive/record_fft/audio_waveform6.txt',
    '/content/drive/MyDrive/record_fft/audio_waveform7.txt',
]
noise_samples = [np.loadtxt(noise_file) for noise_file in noise_files]

# Peak-normalize every clip so its largest absolute value is 1.
# NOTE(review): a clip that is entirely zero would divide by zero here and
# produce NaNs that propagate into every augmented sample — confirm the
# recordings are never silent.
noise_samples = [noise / np.max(np.abs(noise)) for noise in noise_samples]

# Extract labels from filenames
def extract_label(filename):
    """Return the class label encoded at the start of a file name.

    The label is the text of the last '/'-separated path component up to
    (but not including) its first underscore, e.g.
    '.../Start_p347_variation8.wav' -> 'Start'. A component without an
    underscore is returned unchanged.
    """
    base_name = filename.rsplit('/', 1)[-1]  # drop any leading directories
    label, _, _ = base_name.partition('_')   # keep the text before the first '_'
    return label

# Add customized noise
def add_custom_noise(audio, noise_samples):
    """Mix one randomly chosen noise clip into ``audio``.

    Args:
        audio: 1-D array of samples.
        noise_samples: non-empty sequence of 1-D noise arrays.

    Returns:
        ``audio`` plus the chosen clip, linearly interpolated so it spans
        the full length of ``audio`` and scaled by a gain drawn uniformly
        from [0.01, 0.1).
    """
    chosen_clip = noise_samples[np.random.randint(0, len(noise_samples))]

    # Map the clip onto the audio's sample grid: its points are spread evenly
    # over [0, len(audio)] and read back at every integer sample index.
    n_audio = len(audio)
    audio_positions = np.arange(n_audio)
    clip_positions = np.linspace(0, n_audio, len(chosen_clip))
    fitted_noise = np.interp(audio_positions, clip_positions, chosen_clip)

    gain = np.random.uniform(0.01, 0.1)  # Random noise level
    return audio + gain * fitted_noise

# Augmentation functions
def time_stretch(audio, stretch_rate=None):
    """Speed ``audio`` up or slow it down by linear interpolation.

    The signal is re-read at positions ``0, r, 2r, ...`` (``r`` being
    ``stretch_rate``), so ``r > 1`` plays it faster and ``r < 1`` slower.
    The result always has exactly ``len(audio)`` samples: a slowed-down
    signal is truncated and a sped-up signal is zero-padded at the end.
    (Previously the sped-up case silently returned a shorter array.)

    Args:
        audio: 1-D array of samples.
        stretch_rate: positive step between read positions. When ``None`` a
            rate is drawn uniformly from [0.8, 1.2).

    Returns:
        1-D array with the same length as ``audio``.

    Raises:
        ValueError: if ``stretch_rate`` is not positive.
    """
    if stretch_rate is None:
        stretch_rate = np.random.uniform(0.8, 1.2)  # Random stretch rate
    if stretch_rate <= 0:
        raise ValueError(f"stretch_rate must be positive, got {stretch_rate}")

    n_samples = len(audio)
    if n_samples == 0:
        # np.interp rejects empty sample points; there is nothing to stretch.
        return np.asarray(audio).copy()

    stretched = np.interp(
        np.arange(0, n_samples, stretch_rate), np.arange(0, n_samples), audio
    )[:n_samples]

    # stretch_rate > 1 yields fewer than n_samples points: pad with silence
    # so the output length really matches the input.
    missing = n_samples - len(stretched)
    if missing > 0:
        stretched = np.pad(stretched, (0, missing))
    return stretched

def pitch_shift(audio, shift_samples=None):
    """Circularly shift ``audio`` by a number of samples.

    Despite the name, this rolls the sample array with ``np.roll`` (samples
    pushed past one end wrap around to the other); it does not alter the
    frequency content.

    Args:
        audio: array of samples.
        shift_samples: signed shift in samples. When ``None`` an integer is
            drawn from [-200, 200) (upper bound exclusive).

    Returns:
        The rolled array, same shape as ``audio``.
    """
    offset = np.random.randint(-200, 200) if shift_samples is None else shift_samples
    return np.roll(audio, offset)

def volume_scale(audio, scale_factor=None):
    """Multiply ``audio`` by a gain factor.

    Args:
        audio: array of samples.
        scale_factor: gain to apply. When ``None`` a gain is drawn uniformly
            from [0.8, 1.5).

    Returns:
        ``audio * scale_factor``.
    """
    gain = scale_factor
    if gain is None:
        gain = np.random.uniform(0.8, 1.5)  # Random scale factor
    return audio * gain

# Perform STFT preprocessing
def preprocess_audio(audio):
    """Turn a waveform into a fixed-length spectral feature vector.

    Runs ``scipy.signal.stft`` with the module-level settings (``fs``,
    ``window``, ``nperseg``, ``noverlap``, ``nfft``), takes the magnitude of
    the complex result and averages it over the time axis, leaving one value
    per frequency bin.

    Args:
        audio: 1-D array of samples.

    Returns:
        1-D array of time-averaged STFT magnitudes.
    """
    # Only the complex spectrogram is needed; the frequency and time axes
    # returned alongside it are discarded.
    _, _, spectrogram = stft(
        audio, fs, window=window, nperseg=nperseg, noverlap=noverlap, nfft=nfft
    )
    return np.abs(spectrogram).mean(axis=1)

# Augment audio data with combined random augmentations
def augment_audio(audio, noise_samples, num_augmentations=1, include_original=False):
    """Create randomly augmented variants of one audio clip.

    Background noise is mixed into ``audio`` once; every variant then starts
    from that noisy clip and independently receives, each with probability
    0.5, a time stretch, a circular sample shift (``pitch_shift``) and a
    volume change.

    Args:
        audio: 1-D array of samples.
        noise_samples: non-empty sequence of 1-D noise arrays to pick from.
        num_augmentations: number of augmented variants to generate. The
            default of 1 matches the previous hard-coded behaviour.
        include_original: when True, the untouched ``audio`` is placed first
            in the returned list. Defaults to False (previous behaviour).

    Returns:
        List of 1-D arrays: the optional original followed by
        ``num_augmentations`` augmented clips.
    """
    augmented_data = [audio] if include_original else []

    # The same noisy base is shared by every variant of this clip.
    audio_with_noise = add_custom_noise(audio, noise_samples)

    for _ in range(num_augmentations):
        augmented_audio = audio_with_noise
        if np.random.rand() < 0.5:  # Apply time stretch randomly
            augmented_audio = time_stretch(augmented_audio)
        if np.random.rand() < 0.5:  # Apply sample shift randomly
            augmented_audio = pitch_shift(augmented_audio)
        if np.random.rand() < 0.5:  # Apply volume scaling randomly
            augmented_audio = volume_scale(augmented_audio)
        augmented_data.append(augmented_audio)

    return augmented_data

# Load audio data, augment, and extract FFT features
def load_data_with_fft(data_dir):
    """Load every ``*.wav`` in ``data_dir`` and build augmented FFT features.

    Each file is read, converted to mono floating point, resampled to the
    module-level rate ``fs`` when needed, peak-normalized, augmented with
    ``augment_audio`` and converted to a feature vector with
    ``preprocess_audio``. The label of each feature vector comes from
    ``extract_label`` applied to the file path.

    Args:
        data_dir: directory containing the ``.wav`` files.

    Returns:
        Tuple ``(data, labels)`` of NumPy arrays: one feature vector and one
        label string per augmented clip.
    """
    # glob order is arbitrary; sorting keeps the sample order, and therefore
    # the seeded train/val/test split, reproducible between runs.
    file_paths = sorted(glob(os.path.join(data_dir, '*.wav')))
    print(f"Found {len(file_paths)} files in {data_dir}")
    data = []
    labels = []
    for file_path in file_paths:
        print(f"Loading file: {file_path}")
        sr, audio = wavfile.read(file_path)

        # Work in floating point from the start: np.abs on integer PCM
        # overflows at the most negative value.
        audio = np.asarray(audio, dtype=np.float64)
        if audio.ndim > 1:
            # Multi-channel files come back as (samples, channels); the
            # augmentation functions expect a 1-D signal, so mix down to mono.
            audio = audio.mean(axis=1)
        if audio.size == 0:
            print(f"Skipping empty file: {file_path}")
            continue

        if sr != fs:
            print(f"Resampling from {sr} Hz to {fs} Hz")
            # Resample to match the target sampling rate (fs)
            audio = resample(audio, int(len(audio) * fs / sr))

        # Peak-normalize; a completely silent clip is left as zeros rather
        # than being turned into NaNs by a division by zero.
        peak = np.max(np.abs(audio))
        if peak > 0:
            audio = audio / peak

        # Augment audio and extract FFT features
        augmented_audios = augment_audio(audio, noise_samples)
        for augmented_audio in augmented_audios:
            fft_features = preprocess_audio(augmented_audio)
            data.append(fft_features)
            labels.append(extract_label(file_path))
    return np.array(data), np.array(labels)

# Encode labels as integers
def encode_labels(labels):
    """Map label values to integer class indices.

    Indices are assigned in sorted order of the unique labels, so the mapping
    is identical on every run. (Enumerating a bare ``set`` of strings gives
    an order that can change between interpreter sessions, which silently
    invalidates any saved model or hand-written index table.) Callers should
    translate predictions back with the returned mapping rather than a
    hard-coded table.

    Args:
        labels: iterable of sortable, hashable labels (e.g. strings).

    Returns:
        Tuple ``(encoded, label_to_index)``: a NumPy array holding the index
        of every input label, and the dict used for the encoding.
    """
    label_to_index = {label: idx for idx, label in enumerate(sorted(set(labels)))}
    return np.array([label_to_index[label] for label in labels]), label_to_index

# Load and preprocess data
data_dir = '/content/unzipped/Command_Keyword'
data, labels = load_data_with_fft(data_dir)

# Replace the label strings with integer class indices; label_to_index is
# kept so predictions can be translated back to names later.
labels, label_to_index = encode_labels(labels)

# Split dataset into training (80%), validation (10%), and test (10%) sets
X_train, X_temp, y_train, y_temp = train_test_split(data, labels, test_size=0.2, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Convert to TensorFlow datasets.
# Shuffle BEFORE batching, with a buffer that covers the whole training set:
# shuffling after .batch() only reorders whole batches, so every epoch would
# see the same samples grouped together.
train_ds = (
    tf.data.Dataset.from_tensor_slices((X_train, y_train))
    .shuffle(buffer_size=len(X_train))
    .batch(32)
)
val_ds = tf.data.Dataset.from_tensor_slices((X_val, y_val)).batch(32)
test_ds = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(32)

# Print data statistics
print(f"Training samples: {len(X_train)}, Validation samples: {len(X_val)}, Testing samples: {len(X_test)}")
print(f"Labels mapping: {label_to_index}")

[1;30;43m串流輸出內容已截斷至最後 5000 行。[0m
Loading file: /content/unzipped/Command_Keyword/Previous_p293_variation7.wav
Loading file: /content/unzipped/Command_Keyword/Pause_p301_variation10.wav
Loading file: /content/unzipped/Command_Keyword/Start_p347_variation8.wav
Resampling from 8000 Hz to 8192 Hz
Loading file: /content/unzipped/Command_Keyword/Start_p343_variation10.wav
Resampling from 8000 Hz to 8192 Hz
Loading file: /content/unzipped/Command_Keyword/Pause_p236_variation6.wav
Loading file: /content/unzipped/Command_Keyword/Start_p246_variation4.wav
Resampling from 8000 Hz to 8192 Hz
Loading file: /content/unzipped/Command_Keyword/Next_p363_variation7.wav
Loading file: /content/unzipped/Command_Keyword/Previous_p295_variation1.wav
Loading file: /content/unzipped/Command_Keyword/Start_p363_variation2.wav
Resampling from 8000 Hz to 8192 Hz
Loading file: /content/unzipped/Command_Keyword/Next_p258_variation4.wav
Loading file: /content/unzipped/Command_Keyword/Pause_p343_variation9.wav
Loadi

In [None]:
import tensorflow as tf

# Define the model.
# Layer sizes are taken from the prepared data instead of being hard-coded,
# so the network stays consistent with the feature extractor (one input per
# FFT bin) and with the label set (one output per class).
num_features = X_train.shape[1]
num_classes = len(label_to_index)

model = tf.keras.Sequential([
    tf.keras.layers.InputLayer(input_shape=(num_features,)),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(num_classes, activation='softmax')
])

# Compile the model. Labels are integer class indices, hence the sparse loss.
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Train the model.
# train_ds / val_ds are already batched, so batch_size must not be passed to
# fit(): for dataset inputs it is rejected or ignored depending on the Keras
# version. Early stopping watches the validation loss (the callback default)
# and restores the best weights seen.
history = model.fit(train_ds,
                    validation_data=val_ds,
                    epochs=50,
                    callbacks=[
                        tf.keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True)
                    ])

# Test the model on the held-out test split
test_loss, test_accuracy = model.evaluate(test_ds)
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")



Epoch 1/50
[1m109/109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 20ms/step - accuracy: 0.3837 - loss: 1.3656 - val_accuracy: 0.5619 - val_loss: 1.1439
Epoch 2/50
[1m109/109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - accuracy: 0.6056 - loss: 0.9888 - val_accuracy: 0.6789 - val_loss: 0.7314
Epoch 3/50
[1m109/109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7094 - loss: 0.6894 - val_accuracy: 0.7523 - val_loss: 0.5954
Epoch 4/50
[1m109/109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7285 - loss: 0.5819 - val_accuracy: 0.7844 - val_loss: 0.5283
Epoch 5/50
[1m109/109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7408 - loss: 0.5421 - val_accuracy: 0.7661 - val_loss: 0.5136
Epoch 6/50
[1m109/109[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7770 - loss: 0.4908 - val_accuracy: 0.7936 - val_loss: 0.4540
Epoch 7/50
[1m109/109[0m 

In [None]:
import numpy as np
from scipy.signal import stft
import tensorflow as tf

# The STFT settings (fs, nperseg, noverlap, nfft, window) are deliberately
# NOT redefined in this cell. Inference must use exactly the settings the
# model was trained with, and overwriting the global fs here with a different
# value would also change how load_data_with_fft() resamples if it is called
# again afterwards.

# Load the input vector from the text file
file_path = '/content/drive/MyDrive/record_fft/audio_waveform_fft0.txt'  # Replace with the correct path
# NOTE(review): the file name hints at FFT data, but this cell treats the
# contents as a raw time-domain waveform — confirm which one it holds.
audio_waveform = np.loadtxt(file_path)

# Peak-normalize exactly like the training pipeline does. Without this the
# features are orders of magnitude larger than anything the model saw during
# training and the prediction is meaningless.
peak = np.max(np.abs(audio_waveform))
if peak > 0:
    audio_waveform = audio_waveform / peak


def preprocess_audio_input(audio_waveform):
    """Return the time-averaged STFT magnitude vector for ``audio_waveform``.

    Delegates to ``preprocess_audio`` so training and inference share one
    feature extractor.
    """
    return preprocess_audio(audio_waveform)


# Preprocess the input
fft_waveform = preprocess_audio_input(audio_waveform)

# Ensure the shape matches the model input: a batch containing one sample
fft_waveform = fft_waveform.reshape(1, -1)
print(fft_waveform)

# `model` is the network trained in the cell above. To use a saved model
# instead, load it first, e.g.:
# model = tf.keras.models.load_model('/content/drive/My Drive/your_model_path')

# Make predictions
predictions = model.predict(fft_waveform)

# Get the predicted class
predicted_class = np.argmax(predictions, axis=1)
print(f"Predicted class: {predicted_class}")

[[439685.53730623 294499.87239277 269707.473398   233827.97921626
  282722.87540077 215341.73336263 230299.00988721 172892.40422036
  193227.35234346 148363.37392089 169648.62703439 148851.1644218
  183975.07875963 144659.22225853 171204.94403809 143132.18587429
  161388.97749156 136581.498293   158016.89846991 129780.20737541
  147262.62200236 124253.96978713 147749.76817722 130516.18518615
  154836.8291903  133395.52665069 160876.83383689 135025.01109406
  155416.47925142 128924.62872922 146871.14913435 121654.30225363
  134851.4748933  114503.82236278 131540.2965634  113758.90140942
  130021.99743322 113466.5456002  133566.73776615 116410.86939847
  136792.97366333 118813.90883759 136495.31000968 117914.73426564
  135744.44323298 118808.14683361 136111.16397515 119382.9203198
  135317.81123955 120962.79963544 141908.3853626  121412.62340855
  140857.05616057 124740.86969196 145713.66885202 126280.02450403
  145954.10367482 123979.87369624 144347.33508864 124182.12365077
  141124.228

In [None]:
file_path = '/content/output_audio 3.wav'

# decode_wav returns float samples shaped (samples, channels) together with
# the file's sample rate as a scalar tensor.
audio, sr = tf.audio.decode_wav(tf.io.read_file(file_path))
audio = audio.numpy().mean(axis=1)  # Mix channels down to a 1-D mono signal
sr = int(sr.numpy())

# Bring the clip to the module-level rate fs the same way the training loader
# does, instead of assuming the rates already match.
if sr != fs:
    print(f"Resampling from {sr} Hz to {fs} Hz")
    audio = resample(audio, int(len(audio) * fs / sr))

# Normalize audio to peak 1 (skipped for a silent clip to avoid dividing by zero)
peak = np.max(np.abs(audio))
if peak > 0:
    audio = audio / peak

# Preprocess to extract FFT features
fft_features = preprocess_audio(audio)

# Make predictions on a batch containing this one sample
predictions = model.predict(fft_features.reshape(1, -1))

# See relative probabilities
print(predictions)

# Get the predicted class
predicted_class = np.argmax(predictions, axis=1)
print(f"Predicted class: {predicted_class}")

# Translate the class index back to its name using the mapping produced when
# the training labels were encoded. A hand-written table goes stale whenever
# that mapping changes.
index_to_label = {idx: label for label, idx in label_to_index.items()}
print(f"Predicted class: {index_to_label.get(int(predicted_class[0]), 'Unknown')}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 181ms/step
[[3.0020727e-03 9.9699783e-01 8.6879140e-08 9.1314465e-12]]
Predicted class: [1]
Predicted class: Next


In [None]:
# Persist the trained model twice, under two file extensions.
model.save('command_spotter.h5')  # Saves the model in the legacy HDF5 (.h5) format, not SavedModel
model.save('command_spotter.keras')  # Saves the model in the native Keras (.keras) format

