In [None]:
# Mount Google Drive into the Colab VM; the dataset folders used below live
# under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os

import librosa
import numpy as np
from sklearn.model_selection import StratifiedGroupKFold, train_test_split

def prepare_data(samples, num_of_samples=176400, num_of_common=44100):
    """Cut a sample sequence into fixed-length, overlapping windows.

    Windows start at offsets 0, num_of_common, 2 * num_of_common, ... and each
    spans num_of_samples items. A window that would run past the end of
    `samples` is shorter than num_of_samples and is therefore dropped.

    Args:
        samples: Sliceable sequence of audio samples (anything with len()).
        num_of_samples: Length of every returned window.
        num_of_common: Step between the starts of consecutive windows.

    Returns:
        A list of slices of `samples`, each exactly num_of_samples long.
    """
    candidate_windows = (
        samples[begin:begin + num_of_samples]
        for begin in range(0, len(samples), num_of_common)
    )
    # Only complete windows are kept; trailing partial ones are discarded.
    return [window for window in candidate_windows if len(window) == num_of_samples]

# Sample rate (Hz) every clip is resampled to on load. It is assigned
# explicitly because later cells read the module-level `sample_rate`;
# previously it only existed as a by-product of the loading loop.
sample_rate = 44100

# Class name -> Drive folder holding that class's .wav recordings.
# The insertion order of this dict defines each class's integer label.
folder_paths = {
    'car': '/content/drive/MyDrive/sound/car',
    'bike': '/content/drive/MyDrive/sound/bike',
    'bird': '/content/drive/MyDrive/sound/bird',
    'wind': '/content/drive/MyDrive/sound/wind',
    'rain': '/content/drive/MyDrive/sound/rain',
    'crowd': '/content/drive/MyDrive/sound/crowd',
    'chatter': '/content/drive/MyDrive/sound/chatter',
    'park': '/content/drive/MyDrive/sound/park'
}

# os.listdir returns entries in arbitrary order. Sorting makes the
# files[:max_files] subset below (and therefore the whole dataset and the
# seeded split) reproducible from run to run.
wav_files = {
    category: sorted(f for f in os.listdir(folder) if f.endswith('.wav'))
    for category, folder in folder_paths.items()
}

# Upper bound on how many files are read per class.
max_files = {
    'car': 150,
    'bike': 1000,
    'bird': 15,
    'wind': 15,
    'rain': 14,
    'crowd': 15,
    'chatter': 12,
    'park': 25
}

# Per-class list of fixed-length windows, plus a parallel list recording the
# file each window was cut from (needed for the leakage-free split below).
categories = {category: [] for category in folder_paths}
chunk_sources = {category: [] for category in folder_paths}

for category, files in wav_files.items():
    for file in files[:max_files[category]]:
        file_path = os.path.join(folder_paths[category], file)
        samples, _ = librosa.load(file_path, sr=sample_rate)
        processed_data = prepare_data(samples)
        categories[category].extend(processed_data)
        chunk_sources[category].extend([file_path] * len(processed_data))

all_chunks = [chunk for category in folder_paths for chunk in categories[category]]
if not all_chunks:
    raise ValueError(
        "No audio windows were extracted; check folder_paths and that the "
        "recordings are at least one window long."
    )

# Stack once into a (num_windows, window_length) array. Labels and groups are
# built in the same class order so row i of each describes the same window.
audio = np.stack(all_chunks)
labels = np.concatenate([
    np.full(len(categories[category]), idx) for idx, category in enumerate(folder_paths)
])
groups = np.array([src for category in folder_paths for src in chunk_sources[category]])

# With prepare_data's defaults, consecutive windows of one file share 3/4 of
# their samples. Splitting individual windows at random would put
# near-duplicate audio in both train and validation and inflate val_accuracy,
# so whole files are held out instead. Ten stratified folds give a ~10%
# validation set; only the first fold is used.
splitter = StratifiedGroupKFold(n_splits=10, shuffle=True, random_state=777)
train_idx, val_idx = next(splitter.split(audio, labels, groups))
x_tr, x_val = audio[train_idx], audio[val_idx]
y_tr, y_val = labels[train_idx], labels[val_idx]

In [None]:
import numpy as np

# Count how many windows each integer label received.
unique_labels, counts = np.unique(labels, return_counts=True)

# Labels were assigned by enumerating folder_paths, so the display names are
# taken from the same dict. A hand-maintained list could silently fall out of
# step with the label indices if a class were added or reordered.
class_names = list(folder_paths)

print("Number of samples per class:")
for label, count in zip(unique_labels, counts):
    print(f"{class_names[int(label)]}: {count}")


Number of samples per class:
car: 832
bike: 1408
bird: 1001
wind: 1376
rain: 1145
crowd: 998
chatter: 707
park: 2406


In [None]:
from keras.layers import *
from keras.models import *
from keras.callbacks import *
from keras.utils import to_categorical
from keras import backend as K

def cnn(x_tr, num_classes=3):
    """Build and compile the 1-D convolutional classifier.

    Args:
        x_tr: Feature array; only its shape is used. Axes 1 and 2 define the
            per-example input shape of the network.
        num_classes: Number of units in the final softmax layer.

    Returns:
        (model, model_checkpoint): the compiled Keras model and a
        ModelCheckpoint callback that writes 'best_model.hdf5' whenever
        val_accuracy improves.
    """
    K.clear_session()

    steps, channels = x_tr.shape[1], x_tr.shape[2]
    inputs = Input(shape=(steps, channels))

    # Two identical conv -> dropout -> pool stages, differing only in the
    # number of filters and the kernel width.
    hidden = inputs
    for n_filters, kernel_width in ((8, 13), (16, 11)):
        hidden = Conv1D(n_filters, kernel_width, padding='same', activation='relu')(hidden)
        hidden = Dropout(0.3)(hidden)
        hidden = MaxPooling1D(2)(hidden)

    # Collapse the remaining time axis, then classify.
    hidden = GlobalMaxPool1D()(hidden)
    hidden = Dense(16, activation='relu')(hidden)
    outputs = Dense(num_classes, activation='softmax')(hidden)

    model = Model(inputs, outputs)
    model.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )

    model_checkpoint = ModelCheckpoint(
        'best_model.hdf5',
        monitor='val_accuracy',
        verbose=1,
        save_best_only=True,
        mode='max',
    )
    return model, model_checkpoint

In [None]:
from scipy import signal

In [None]:
def log_specgram(audio, sample_rate, eps=1e-10):
    """Compute a log-scaled spectrogram with a time-major layout.

    Uses 1764-sample segments that overlap by 441 samples, with no detrending.

    Args:
        audio: 1-D array of samples.
        sample_rate: Sampling frequency passed to scipy as `fs`.
        eps: Added to the spectrogram before the log so zeros stay finite.

    Returns:
        (freqs, times, log_spec), where log_spec is float32 and transposed so
        that rows are time segments and columns are frequency bins.
    """
    freqs, times, power = signal.spectrogram(
        audio,
        fs=sample_rate,
        nperseg=1764,
        noverlap=441,
        detrend=False,
    )
    log_power = np.log(power.T.astype(np.float32) + eps)
    return freqs, times, log_power

def extract_spectrogram_features(x_tr, sample_rate=44100):
    """Turn raw audio clips into per-bin standardized log-spectrograms.

    Args:
        x_tr: Iterable of 1-D sample arrays, one per clip.
        sample_rate: Sampling frequency of the clips. It is a parameter rather
            than a lookup of a notebook global, so the function does not
            depend on which cells happened to run before it.

    Returns:
        np.ndarray holding one standardized spectrogram per clip, each laid
        out as (time segments, frequency bins).
    """
    features = []
    for clip in x_tr:
        _, _, spectrogram = log_specgram(clip, sample_rate)
        # Standardize every frequency bin across the clip's time axis.
        mean = np.mean(spectrogram, axis=0)
        std = np.std(spectrogram, axis=0)
        # A bin that is constant over time (e.g. digital silence) has zero
        # spread; dividing by it would fill the features with NaN/inf.
        # Dividing by 1 instead leaves those bins at exactly 0.
        std[std == 0] = 1.0
        features.append((spectrogram - mean) / std)
    return np.array(features)

# Convert the raw audio windows of both splits into spectrogram features.
x_tr_features = extract_spectrogram_features(x_tr)
x_val_features = extract_spectrogram_features(x_val)

from keras.utils import to_categorical

# Take the class count from folder_paths everywhere, so the one-hot width and
# the network's output layer cannot disagree.
num_classes = len(folder_paths)
y_tr_encoded = to_categorical(y_tr, num_classes=num_classes)
y_val_encoded = to_categorical(y_val, num_classes=num_classes)

model, mc = cnn(x_tr_features, num_classes=num_classes)
history = model.fit(
    x_tr_features,
    y_tr_encoded,
    epochs=20,
    callbacks=[mc],
    batch_size=32,
    validation_data=(x_val_features, y_val_encoded),
)

Epoch 1/20
Epoch 1: val_accuracy improved from -inf to 0.53239, saving model to best_model.hdf5
Epoch 2/20
 12/278 [>.............................] - ETA: 2s - loss: 1.1480 - accuracy: 0.6016

  saving_api.save_model(


Epoch 2: val_accuracy improved from 0.53239 to 0.63765, saving model to best_model.hdf5
Epoch 3/20
Epoch 3: val_accuracy improved from 0.63765 to 0.68117, saving model to best_model.hdf5
Epoch 4/20
Epoch 4: val_accuracy improved from 0.68117 to 0.69332, saving model to best_model.hdf5
Epoch 5/20
Epoch 5: val_accuracy improved from 0.69332 to 0.74798, saving model to best_model.hdf5
Epoch 6/20
Epoch 6: val_accuracy improved from 0.74798 to 0.76923, saving model to best_model.hdf5
Epoch 7/20
Epoch 7: val_accuracy improved from 0.76923 to 0.77126, saving model to best_model.hdf5
Epoch 8/20
Epoch 8: val_accuracy did not improve from 0.77126
Epoch 9/20
Epoch 9: val_accuracy improved from 0.77126 to 0.80972, saving model to best_model.hdf5
Epoch 10/20
Epoch 10: val_accuracy did not improve from 0.80972
Epoch 11/20
Epoch 11: val_accuracy improved from 0.80972 to 0.81781, saving model to best_model.hdf5
Epoch 12/20
Epoch 12: val_accuracy did not improve from 0.81781
Epoch 13/20
Epoch 13: val_a

In [None]:
# Choose one validation clip to classify.
sample_rate = 44100
ind = 6
test_audio = x_val[ind]

# The extractor works on batches, so the single clip is wrapped in a list and
# the one resulting feature map is taken back out.
test_audio_features = extract_spectrogram_features([test_audio])
feature = test_audio_features[0]

# Prepend the batch axis before handing the feature map to the model.
prob = model.predict(feature[np.newaxis, ...])

print("Class probabilities:")
for class_name, probability in zip(class_names, prob[0]):
    print(f"{class_name}: {probability:.4f}")

# Index of the most probable class for the (only) example in the batch.
pred_index = prob.argmax(axis=1)[0]
predicted_class = class_names[pred_index]

print("\nPrediction:", predicted_class)

Class probabilities:
car: 0.0000
bike: 0.0000
bird: 0.0000
wind: 0.0000
rain: 0.0000
crowd: 0.0261
chatter: 0.0228
park: 0.9511

Prediction: park


In [None]:
from IPython.display import Audio

# Display an inline player for the clip selected in the prediction cell above.
Audio(data=test_audio, rate=sample_rate)

In [None]:
# Choose a different validation clip to classify.
sample_rate = 44100
ind = 0
test_audio = x_val[ind]

# Feature extraction is batch-oriented: pass a one-element batch and unpack it.
test_audio_features = extract_spectrogram_features([test_audio])
feature = test_audio_features[0]

# The model expects a leading batch dimension.
prob = model.predict(np.expand_dims(feature, axis=0))

print("Class probabilities:")
for class_name, probability in zip(class_names, prob[0]):
    print(f"{class_name}: {probability:.4f}")

# Most probable class for the single example that was predicted.
pred_index = np.argmax(prob, axis=1)[0]
predicted_class = class_names[pred_index]

print("\nPrediction:", predicted_class)

Class probabilities:
car: 0.0000
bike: 0.0000
bird: 0.0002
wind: 0.0000
rain: 0.9950
crowd: 0.0016
chatter: 0.0011
park: 0.0020

Prediction: rain


In [None]:
# Display an inline player for the clip selected in the prediction cell above.
Audio(data=test_audio, rate=sample_rate)

In [None]:
import tensorflow as tf

# Reload the checkpoint saved during training and convert it to TFLite.
new_model = tf.keras.models.load_model("best_model.hdf5")
converter = tf.lite.TFLiteConverter.from_keras_model(new_model)
tflite_model = converter.convert()

# Write the converted model bytes to disk for the inference cells below.
with open('model.tflite', 'wb') as tflite_file:
    tflite_file.write(tflite_model)

In [None]:
import numpy as np
import tensorflow as tf
import numpy as np
import librosa
from scipy import signal
import os

def log_specgram(audio, sample_rate, eps=1e-10):
    """Return (freqs, times, log-spectrogram) for a 1-D signal.

    The spectrogram uses 1764-sample segments overlapping by 441 samples,
    without detrending. The result is cast to float32, transposed so that time
    segments come first, offset by `eps` and log-scaled.
    """
    segment_length = 1764
    segment_overlap = 441
    freqs, times, power = signal.spectrogram(
        audio,
        fs=sample_rate,
        nperseg=segment_length,
        noverlap=segment_overlap,
        detrend=False,
    )
    time_major = power.T.astype(np.float32)
    return freqs, times, np.log(time_major + eps)

def prepare_data(samples, num_of_samples=176400):
    """Return the leading window of `samples` as a one-element list.

    Args:
        samples: Sliceable sequence of audio samples.
        num_of_samples: Required window length.

    Returns:
        [samples[:num_of_samples]] when the input is long enough, otherwise
        an empty list.
    """
    # Too short to fill a whole window: nothing to return.
    if len(samples) < num_of_samples:
        return []
    return [samples[:num_of_samples]]

def extract_spectrogram_features(x_tr, sample_rate=44100):
    """Turn raw audio clips into per-bin standardized log-spectrograms.

    Args:
        x_tr: Iterable of 1-D sample arrays, one per clip.
        sample_rate: Sampling frequency of the clips.

    Returns:
        np.ndarray holding one standardized spectrogram per clip, each laid
        out as (time segments, frequency bins). An empty input yields an
        empty array.
    """
    features = []
    for clip in x_tr:
        _, _, spectrogram = log_specgram(clip, sample_rate)
        # Standardize every frequency bin across the clip's time axis.
        mean = np.mean(spectrogram, axis=0)
        std = np.std(spectrogram, axis=0)
        # A bin that is constant over time (e.g. digital silence) has zero
        # spread; dividing by it would fill the features with NaN/inf.
        # Dividing by 1 instead leaves those bins at exactly 0.
        std[std == 0] = 1.0
        features.append((spectrogram - mean) / std)
    return np.array(features)

def extract_features_from_file(file_path, sample_rate=44100):
    """Load an audio file and return its spectrogram features.

    The file is loaded at `sample_rate`, cut with prepare_data (default window
    length) and passed to extract_spectrogram_features.

    Args:
        file_path: Path of the audio file to load.
        sample_rate: Rate the file is resampled to and analysed at.

    Returns:
        Whatever extract_spectrogram_features returns for the prepared
        windows.
    """
    waveform, _ = librosa.load(file_path, sr=sample_rate)
    windows = prepare_data(waveform)
    return extract_spectrogram_features(windows, sample_rate)

file_path = '1.wav'

features = extract_features_from_file(file_path)
# prepare_data yields nothing for clips shorter than one window; fail with a
# clear message instead of an IndexError on features[0] below.
if features.size == 0:
    raise ValueError(f"No features extracted from '{file_path}': the clip is shorter than one model window.")

# Load the converted model and allocate its tensors.
interpreter = tf.lite.Interpreter(model_path="model.tflite")
interpreter.allocate_tensors()

input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

# Shape the single feature map to exactly what the model's input tensor expects.
input_shape = input_details[0]['shape']
input_data = np.array(features[0]).reshape(input_shape).astype(np.float32)

interpreter.set_tensor(input_details[0]['index'], input_data)

interpreter.invoke()

output_data = interpreter.get_tensor(output_details[0]['index'])
# The converted Keras model already ends in a softmax layer, so output_data
# holds class probabilities. Applying softmax a second time would squash them
# towards a uniform distribution, so the output is used as-is.
softmax_output = output_data
print(softmax_output)

  samples, _ = librosa.load(file_path, sr=sample_rate)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


FileNotFoundError: [Errno 2] No such file or directory: '1.wav'

In [None]:
# `processed_data` is a local of extract_features_from_file, so at module
# level it only resolved to a stale list left behind by the training loop.
# Rebuild the window for the file being classified and play that instead.
# NOTE(review): assumes the intent was to audition the clip given to the model.
clip_samples, _ = librosa.load(file_path, sr=44100)
clip_windows = prepare_data(clip_samples)
Audio(data=clip_windows[0], rate=44100)

In [None]:
# Load the model
interpreter = tf.lite.Interpreter(model_path="model.tflite")
interpreter.allocate_tensors()

# Look up the input tensor metadata and report the expected input shape
input_details = interpreter.get_input_details()
first_input = input_details[0]
input_shape = first_input['shape']
print("모델의 입력 형태:", input_shape)

모델의 입력 형태: [  1 133 883]


In [None]:
# Path of the file to classify
file_path = '1.wav'

# Extract features from the file
features = extract_features_from_file(file_path)

# Report the feature shape, or the failure when nothing was extracted
if features.size == 0:
    print("피처 추출 실패: 데이터가 비어 있습니다.")
else:
    print("추출된 피처의 형태:", features.shape)

In [None]:
# Load the model and allocate its tensors
interpreter = tf.lite.Interpreter(model_path="model.tflite")
interpreter.allocate_tensors()

# Fetch input and output tensor metadata
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

# An empty feature array (clip shorter than one window) cannot be fed to the
# model; stop with a clear message rather than a tensor shape error.
if features.size == 0:
    raise ValueError("No features available: the clip is shorter than one model window.")

# Use the extracted features as the model input
input_data = features.astype(np.float32)

# Feed the data to the model
interpreter.set_tensor(input_details[0]['index'], input_data)

# Run inference
interpreter.invoke()

# Read the result. The converted Keras model already ends in a softmax layer,
# so output_data holds class probabilities; applying softmax a second time
# would squash them towards a uniform distribution.
output_data = interpreter.get_tensor(output_details[0]['index'])
softmax_output = output_data
print(softmax_output)