In [1]:
import numpy as np
import torch.onnx
import torch.nn as nn
import onnx
import onnx_tf
import tensorflow as tf
import torchaudio
import torchaudio.transforms as T
from torchvision.transforms import Compose


TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 

For more information see: https://github.com/tensorflow/addons/issues/2807 

 The versions of TensorFlow you are currently using is 2.9.3 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version. 
You can find the compatibility matrix in TensorFlow Addon's readme:
https://github.com/tensorflow/addons


In [2]:
class ResBlock(nn.Module):
    """Residual block built from two 3x3 convolutions, each followed by batch norm.

    The main path is conv -> BN -> ReLU -> conv -> BN. Its result is added to a
    shortcut branch, then passed through ReLU and dropout (p=0.2).

    Args:
        in_channels: Number of channels of the incoming feature map.
        out_channels: Number of channels produced by the block.
        stride: Stride of the first convolution (and of the shortcut projection).
    """

    def __init__(self, in_channels, out_channels, stride=1):
        super().__init__()
        # Main path. Attribute names double as state_dict keys, so they are fixed.
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.2)

        # Shortcut path: a 1x1 projection is only required when the main path
        # changes the spatial size or the channel count.
        shape_changes = stride != 1 or in_channels != out_channels
        if shape_changes:
            self.downsample = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride),
                nn.BatchNorm2d(out_channels)
            )
        else:
            self.downsample = nn.Identity()

    def forward(self, x):
        """Return dropout(relu(main_path(x) + shortcut(x)))."""
        shortcut = self.downsample(x)
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))
        out += shortcut
        return self.dropout(self.relu(out))

In [3]:
class SoundClassifier(nn.Module):
    """Small ResNet-style classifier over single-channel 2-D inputs.

    Architecture: a 7x7 stride-2 stem convolution (no bias) with batch norm,
    ReLU and dropout, eight ``ResBlock`` stages widening from 16 to 128
    channels (three of them with stride 2), global average pooling and a
    final linear layer.

    Args:
        num_classes: Size of the output layer. Defaults to 15, the value the
            original hard-coded, so existing checkpoints and callers are
            unaffected.
    """

    def __init__(self, num_classes=15):
        super().__init__()
        # NOTE: attribute names (including ``con1``) are state_dict keys of the
        # saved checkpoint and must not be renamed.
        self.con1 = nn.Conv2d(1, 16, kernel_size=7, stride=2, padding=3, bias=False)
        self.dropout = nn.Dropout(0.2)
        self.bn1 = nn.BatchNorm2d(16)
        self.layers = nn.Sequential(
            ResBlock(16, 16, 1),
            ResBlock(16, 32, 2),
            ResBlock(32, 32, 1),
            ResBlock(32, 32, 1),
            ResBlock(32, 64, 2),
            ResBlock(64, 64, 1),
            ResBlock(64, 64, 1),
            ResBlock(64, 128, 2)
        )
        # Pooling to 1x1 makes the head independent of the input's spatial size.
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(128, num_classes)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Map a ``(batch, 1, H, W)`` tensor to ``(batch, num_classes)`` logits.

        No softmax is applied here; callers convert logits to probabilities.
        """
        x = self.relu(self.bn1(self.con1(x)))
        x = self.dropout(x)
        x = self.layers(x)

        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.fc(x)
        return x

In [4]:
# Use the GPU when one is present, but fall back to the CPU so the conversion
# notebook also runs on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
classifier = SoundClassifier().to(device)
# map_location remaps tensors saved from a GPU onto the selected device.
# NOTE(review): torch.load unpickles the file; only load checkpoints from a trusted source.
classifier.load_state_dict(torch.load("../model/classifier.pth", map_location=device))
# Inference mode: disables dropout and makes batch norm use its running statistics.
classifier.eval()

SoundClassifier(
  (con1): Conv2d(1, 16, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (dropout): Dropout(p=0.2, inplace=False)
  (bn1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (layers): Sequential(
    (0): ResBlock(
      (conv1): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (bn1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (bn2): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU()
      (dropout): Dropout(p=0.2, inplace=False)
      (downsample): Identity()
    )
    (1): ResBlock(
      (conv1): Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
      (bn1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1

In [5]:
# export to ONNX
# Build the tracing input on the same device as the model's weights instead of
# assuming CUDA, so the export works wherever the model was loaded.
model_device = next(classifier.parameters()).device
dummy_input = torch.randn(1, 1, 64, 431).to(model_device)  # match input shape
onnx_path = "../model/classifier.onnx"
torch.onnx.export(
    classifier,
    dummy_input,
    onnx_path,
    input_names=["input"],
    output_names=["output"],
    # Batch size and the time axis are exported as dynamic dimensions; consumers
    # of the converted model therefore have to fix the time axis before inference.
    dynamic_axes={"input": {0: "batch_size", 3: "time_steps"}, "output": {0: "batch_size"}},
    opset_version=11,
)

In [6]:
# convert ONNX to TensorFlow
# Destination directory for the exported TensorFlow SavedModel.
tf_path = "../model/tf_model"

# Read the ONNX graph written by the export step and wrap it in the
# onnx-tf backend representation, then write it out to disk.
onnx_model = onnx.load(onnx_path)
tf_rep = onnx_tf.backend.prepare(onnx_model)
tf_rep.export_graph(tf_path)



INFO:tensorflow:Assets written to: ../model/tf_model\assets


INFO:tensorflow:Assets written to: ../model/tf_model\assets


In [7]:
# convert TensorFlow to TFLite
converter = tf.lite.TFLiteConverter.from_saved_model(tf_path)
converter.experimental_new_converter = True
# Restrict the model to built-in TFLite operators.
converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS]

# `allow_custom_ops` is intentionally left at its default (False). The flag has
# nothing to do with input shapes: when enabled, operators TFLite does not know
# are emitted as custom ops and only fail later at runtime. Leaving it off makes
# an unsupported operator fail here, at conversion time.
converter.optimizations = [tf.lite.Optimize.DEFAULT]  # default post-training optimizations

tflite_model = converter.convert()
tflite_path = "../model/classifier.tflite"
with open(tflite_path, "wb") as f:
    f.write(tflite_model)

In [8]:
# Load the converted model and show the input shape the interpreter reports.
interpreter = tf.lite.Interpreter(model_path=tflite_path)
interpreter.allocate_tensors()

input_details = interpreter.get_input_details()
first_input = input_details[0]
print("Expected input shape:", first_input['shape'])

Expected input shape: [ 1  1 64  1]


In [9]:
# load the TFLite model
interpreter = tf.lite.Interpreter(model_path=tflite_path)
interpreter.allocate_tensors()

# Tensor metadata (index, shape, dtype) used by the inference helpers below.
input_details, output_details = (
    interpreter.get_input_details(),
    interpreter.get_output_details(),
)

In [10]:
def preprocess_audio(audio_path, target_sample_rate=44100, target_time_steps=431):
    """Load an audio file and turn it into a fixed-size dB mel spectrogram.

    Args:
        audio_path: Path of the audio file, passed to ``torchaudio.load``.
        target_sample_rate: Sample rate the mel filterbank is built for. Audio
            recorded at a different rate is resampled to it first.
        target_time_steps: Number of spectrogram frames in the result; shorter
            clips are zero-padded on the right, longer clips are truncated.

    Returns:
        A NumPy array of shape ``[1, 1, 64, target_time_steps]``.
    """
    waveform, sample_rate = torchaudio.load(audio_path)

    # The network takes a single channel: average multi-channel audio to mono
    # so the channel dimension of the result is always 1.
    if waveform.shape[0] > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # The mel transform assumes `target_sample_rate`; without resampling, a file
    # recorded at another rate would be mapped to the wrong frequency bins.
    if sample_rate != target_sample_rate:
        waveform = T.Resample(orig_freq=sample_rate, new_freq=target_sample_rate)(waveform)

    transform = Compose([
        T.MelSpectrogram(sample_rate=target_sample_rate, n_fft=1024, hop_length=512, n_mels=64),
        T.AmplitudeToDB()
    ])
    spectrogram = transform(waveform)  # Shape: [1, 64, time_steps]

    # Pad or truncate to match the expected number of time steps
    current_time_steps = spectrogram.shape[2]

    if current_time_steps < target_time_steps:
        # Pad with zeros on the right of the time axis
        pad_size = target_time_steps - current_time_steps
        spectrogram = torch.nn.functional.pad(spectrogram, (0, pad_size), "constant", 0)
    elif current_time_steps > target_time_steps:
        # Truncate
        spectrogram = spectrogram[:, :, :target_time_steps]

    # Add the batch dimension in front of the channel dimension
    spectrogram = spectrogram.unsqueeze(0)  # Shape: [1, 1, 64, target_time_steps]
    return spectrogram.numpy()

In [11]:
# run inference
def classify_audio(audio_path, confidence_threshold=0.8):
    """Classify one audio file with the TFLite interpreter and print the result.

    Prints ``Audio category: <name>`` for the most probable class, or
    ``Unknown audio class`` when the highest softmax probability is below
    ``confidence_threshold``.

    Args:
        audio_path: Path of the audio file to classify.
        confidence_threshold: Minimum softmax probability required to report a class.
    """
    target_classes = [
        'siren', 'car_horn', 'chainsaw', 'fireworks', 'glass_breaking',
        'door_wood_knock', 'clock_alarm', 'crying_baby', 'thunderstorm',
        'helicopter', 'train', 'door_wood_creaks', 'washing_machine',
        'clapping', 'footsteps'
    ]

    input_data = preprocess_audio(audio_path)
    input_index = input_details[0]['index']

    # The model was exported with a dynamic time axis, so a freshly loaded
    # interpreter does not necessarily have an input tensor of the right size.
    # Resize here instead of relying on another cell having done it.
    current_shape = tuple(interpreter.get_input_details()[0]['shape'])
    if current_shape != tuple(input_data.shape):
        interpreter.resize_tensor_input(input_index, list(input_data.shape))
        interpreter.allocate_tensors()

    interpreter.set_tensor(input_index, input_data)
    interpreter.invoke()
    output_data = interpreter.get_tensor(output_details[0]['index'])

    # The model outputs logits; convert them to probabilities.
    probabilities = tf.nn.softmax(output_data).numpy()
    confidence = np.max(probabilities)
    pred = np.argmax(probabilities)

    if confidence < confidence_threshold:
        print("Unknown audio class")
    else:
        print(f"Audio category: {target_classes[pred]}")

In [12]:
# example usage
audio_path = "../data/ESC-50-master/audio/1-101296-B-19.wav"
# The interpreter reports a placeholder size for the dynamic time axis, so fix
# the input tensor to the spectrogram shape before running inference.
interpreter.resize_tensor_input(input_details[0]['index'], [1, 1, 64, 431])
interpreter.allocate_tensors()
# classify_audio preprocesses the file and feeds the tensor itself, so no
# separate preprocess/set_tensor step is needed here.
classify_audio(audio_path)

Audio category: thunderstorm
