In [None]:
import gradio as gr
import numpy as np  

In [45]:
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.regularizers import L2

# Define the CNN as one Sequential stack.
# Five identical stages (3x3 same-padded ReLU convolution followed by 2x2 max pooling)
# with 16, 32, 64, 64 and 32 filters, then Flatten and a 4-unit Dense head.
# No BatchNormalization layer is used in any stage.
model = models.Sequential([
    # Stage 1 (declares the 64x64 RGB input)
    layers.Conv2D(16, kernel_size=(3, 3), strides=(1, 1), padding='same', activation='relu', input_shape=(64, 64, 3)),
    layers.MaxPooling2D(pool_size=(2, 2)),
    # Stage 2
    layers.Conv2D(32, kernel_size=(3, 3), strides=(1, 1), padding='same', activation='relu'),
    layers.MaxPooling2D(pool_size=(2, 2)),
    # Stage 3
    layers.Conv2D(64, kernel_size=(3, 3), strides=(1, 1), padding='same', activation='relu'),
    layers.MaxPooling2D(pool_size=(2, 2)),
    # Stage 4
    layers.Conv2D(64, kernel_size=(3, 3), strides=(1, 1), padding='same', activation='relu'),
    layers.MaxPooling2D(pool_size=(2, 2)),
    # Stage 5
    layers.Conv2D(32, kernel_size=(3, 3), strides=(1, 1), padding='same', activation='relu'),
    layers.MaxPooling2D(pool_size=(2, 2)),
    # Classifier head: the Dense layer has no activation, so it emits raw scores
    # (the sigmoid is applied later, at prediction time).
    layers.Flatten(),
    layers.Dense(4),
])

# Print the layer-by-layer structure of the model
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_15 (Conv2D)          (None, 64, 64, 16)        448       
                                                                 
 max_pooling2d_15 (MaxPoolin  (None, 32, 32, 16)       0         
 g2D)                                                            
                                                                 
 conv2d_16 (Conv2D)          (None, 32, 32, 32)        4640      
                                                                 
 max_pooling2d_16 (MaxPoolin  (None, 16, 16, 32)       0         
 g2D)                                                            
                                                                 
 conv2d_17 (Conv2D)          (None, 16, 16, 64)        18496     
                                                                 
 max_pooling2d_17 (MaxPoolin  (None, 8, 8, 64)        

In [38]:
# Relative path of the .h5 weight file that is passed to model.load_weights in a later cell.
weight_path = '../tf_model_no_batchnorm_v3/TFModel.h5'

In [46]:
# Load the trained weights into the model defined above.
# The return value is deliberately not assigned back to `model`: doing so would
# rebind `model` to whatever load_weights returns instead of keeping the network.
# NOTE(review): with by_name=True, layers are presumably matched by name and
# non-matching layers skipped silently. The layer names here are auto-generated
# (e.g. conv2d_15 in the summary above) and change every time the model cell is
# re-run — confirm they match the names stored in the weight file, otherwise the
# model may be left with its initial weights.
model.load_weights(weight_path, by_name=True)

Note: this is only a simple test. The mel-spectrogram of the audio file is drawn with the Python version of the plotting code,
so the model's predictions may be wrong.

In [None]:
from tensorflow.keras.preprocessing.image import load_img, img_to_array
import numpy as np

# Image size (height, width)
image_size = (64, 64)

# Lookup table: label name -> multi-hot vector.
# Judging by the single-class rows, the positions are [boat, dolphin, fish, whale].
dict_for_label = {
    # single classes
    'boat': [1, 0, 0, 0],
    'dolphin': [0, 1, 0, 0],
    'fish': [0, 0, 1, 0],
    'whale': [0, 0, 0, 1],
    # pairs of classes
    'boat+fish': [1, 0, 1, 0],
    'boat+whale': [1, 0, 0, 1],
    'dolphin+boat': [1, 1, 0, 0],
    'dolphin+whale': [0, 1, 0, 1],
    'dolphin+fish': [0, 1, 1, 0],
    'fish+whale': [0, 0, 1, 1],
}

# Reverse lookup: multi-hot vector -> label name.
# A list cannot be a dict key, so each vector is stored as the string form of its tuple,
# e.g. "(1, 0, 0, 1)".
pred_for_label = {
    str(tuple(multi_hot)): label_name
    for label_name, multi_hot in dict_for_label.items()
}

def preprocess_image(image_path, target_size=(64, 64)):
    """
    Load an image file and turn it into a single-item batch for the model.

    :param image_path: path of the image file to load
    :param target_size: size passed to ``load_img`` so the image is resized on load
    :return: array holding the pixel values divided by 255, with a new leading
             batch axis, i.e. shape (1, height, width, channels)
    """
    resized = load_img(image_path, target_size=target_size)
    pixels = img_to_array(resized)
    # Scale to [0, 1] (assuming 8-bit pixel values).
    scaled = pixels / 255.0
    # Prepend the batch axis.
    return scaled[np.newaxis, ...]

def predict_image_class(image_path, model):
    """
    Run the model on one image file and return its multi-label decision.

    :param image_path: path of the image to classify
    :param model: model whose ``predict`` output is passed through a sigmoid
    :return: integer array (singleton axes squeezed away) holding 1 for every
             class whose sigmoid probability is above 0.5 and 0 otherwise
    """
    # preprocess_image already returns a batch of one image.
    batch = preprocess_image(image_path)
    print("Processed Image Shape:", batch.shape)

    raw_scores = model.predict(batch)
    # The sigmoid maps each raw score to an independent per-class probability;
    # a class counts as present when its probability exceeds the 0.5 threshold.
    is_present = tf.sigmoid(raw_scores) > 0.5
    multi_hot = tf.cast(is_present, tf.int32)
    # Drop the batch axis so callers get a flat vector.
    return np.array(multi_hot).squeeze()


# Path of the test image (replace with your own image path)
test_image_path = r'20201016_070712.png'

# Classify the test image and translate the multi-hot vector into a label name.
predictions = predict_image_class(test_image_path, model)
# Convert every element to a plain int before building the lookup key: NumPy scalars
# may render as "np.int32(1)" inside a tuple, which would never match the table keys.
# .get() keeps combinations that are missing from the table (e.g. no class detected)
# from raising KeyError.
label_key = str(tuple(int(flag) for flag in predictions))
print("Predicted result: ", pred_for_label.get(label_key, f"unknown combination {label_key}"))

Processed Image Shape: (1, 64, 64, 3)
Predicted result:  boat+whale


In [79]:
# --- Audio parameters ---
SAMPLE_RATE = 22050  # samples per second
DURATION = 5.0  # clip length in seconds
AUDIO_LEN = int(DURATION * SAMPLE_RATE)  # number of samples in one clip of DURATION seconds

# --- Mel-spectrogram parameters ---
N_MELS = 128  # number of mel filters (frequency axis)
N_FFT = 2048  # samples per analysis frame
HOP_LEN = 512  # step between frames: N_FFT / 4, so consecutive frames overlap by 3/4
SPEC_WIDTH = 1 + AUDIO_LEN // HOP_LEN  # number of frames (time axis)
FMAX = SAMPLE_RATE // 2  # highest frequency kept: half of SAMPLE_RATE
SPEC_SHAPE = [N_MELS, SPEC_WIDTH]  # expected spectrogram shape: [frequency, time]

In [82]:
import PIL
import PIL.Image
import matplotlib.pyplot as plt
import librosa
import os

# Illustration shown in the result figure for each detectable class.
_CLASS_IMAGE_PATHS = {
    'boat': './images/boat.jpg',
    'dolphin': './images/dolphin.jpg',
    'fish': './images/fish.jpg',
    'whale': './images/whale.jpg',
}


def prediction(audio_input):
    """
    Process audio input, generate a spectrogram, and predict the sound type.

    The audio is turned into a mel-spectrogram plot saved as ``temp_audio.png``;
    that picture is classified with ``predict_image_class`` and the detected
    classes are illustrated in ``prediction_result.png``.

    Args:
        audio_input: Audio input from Gradio (tuple of (sample_rate, audio_data)),
            or None when nothing was provided.

    Returns:
        Always a 2-tuple ``(text, image)`` so that it matches an interface with a
        text output and an image output. ``image`` is a PIL image of the detected
        classes, or None when the input is unusable, the predicted combination is
        not in ``pred_for_label``, or no illustration file was found.
    """
    # Check if audio input is provided
    if audio_input is None:
        return "請提供音頻檔案", None

    # Gradio audio input format is (sample_rate, audio_data)
    sr, audio_data = audio_input

    # Validate the type BEFORE calling ndarray methods on it, and reject empty clips.
    if not isinstance(audio_data, np.ndarray) or audio_data.size == 0:
        return "音頻數據格式錯誤", None

    # librosa works on floating-point samples.
    # NOTE(review): the samples are cast but neither rescaled nor resampled to
    # SAMPLE_RATE — confirm this matches how the training spectrograms were made.
    audio_data = audio_data.astype(np.float32)
    # Multi-channel recordings arrive as (samples, channels); average the channels
    # so librosa receives a 1-D mono signal.
    if audio_data.ndim > 1:
        audio_data = audio_data.mean(axis=1)

    # Make sure the display submodule is loaded (older librosa releases do not
    # load it with a plain `import librosa`).
    import librosa.display

    # Create a temporary file name for the output spectrogram
    temp_filename = "temp_audio.png"

    # Generate the mel spectrogram directly from the audio data
    spec = librosa.feature.melspectrogram(
        y=audio_data, sr=sr, fmax=FMAX,
        n_mels=N_MELS, hop_length=HOP_LEN, n_fft=N_FFT
    )
    spec = librosa.power_to_db(spec)

    # Plot and save the mel spectrogram; the figure is closed even if plotting fails.
    spec_fig = plt.figure()
    try:
        librosa.display.specshow(
            spec, sr=sr, hop_length=HOP_LEN,
            x_axis='time', y_axis='mel', cmap='viridis'
        )
        plt.title("Spectrogram_temp_audio", fontsize=17)
        plt.colorbar(format='%+2.0f dB')
        plt.tight_layout()
        plt.savefig(temp_filename)
    finally:
        plt.close(spec_fig)

    # Print the path for debugging
    print(f"Saved spectrogram to {temp_filename}")

    # Use the model to predict the class
    predictions = predict_image_class(temp_filename, model)
    # Plain ints keep the key in the "(1, 0, 0, 1)" form regardless of how NumPy
    # renders its scalars.
    label_key = str(tuple(int(flag) for flag in predictions))
    result = pred_for_label.get(label_key)
    if result is None:
        # e.g. no class above the threshold, or a combination missing from the table
        return f"預測結果: 無法辨識的類別組合 {label_key}", None

    # Collect (class name, picture) pairs; keeping them together guarantees every
    # subplot title matches its picture even when some image files are missing.
    class_result = result.split('+')
    detected = []
    for cls in class_result:
        img_path = _CLASS_IMAGE_PATHS.get(cls)
        if img_path is not None and os.path.exists(img_path):
            detected.append((cls, plt.imread(img_path)))
        else:
            print(f"Warning: Image file {img_path} not found")

    # If no images were found, return only the text prediction
    if not detected:
        return f"預測結果: {result}", None

    # One subplot per detected class; squeeze=False always yields a 2-D axes array,
    # so the single-class case needs no special handling.
    result_img_path = "prediction_result.png"
    fig, axs = plt.subplots(1, len(detected), figsize=(5 * len(detected), 5), squeeze=False)
    try:
        for ax, (cls, img) in zip(axs[0], detected):
            ax.imshow(img)
            ax.axis('off')
            ax.set_title(cls)
        fig.tight_layout()
        fig.savefig(result_img_path)
    finally:
        plt.close(fig)

    # Load the picture fully and release the file handle before returning it.
    with PIL.Image.open(result_img_path) as saved:
        result_image = saved.copy()

    # Return both the text prediction and the image
    return f"預測結果: {result}", result_image

In [None]:
# Wrap `prediction` in a Gradio interface: one audio input, and a text + image
# pair as outputs for the prediction result.
demo = gr.Interface(
    fn=prediction,
    inputs=[gr.Audio()],
    outputs=["text", "image"],
    title='水下聲音辨識測試',
    description='可預測聲音類別為：船、海豚、魚、鯨魚',
)

# Start the demo
demo.launch()

Running on local URL:  http://127.0.0.1:7869

To create a public link, set `share=True` in `launch()`.




Saved spectrogram to temp_audio.png
