## Library Preparation

In [20]:
! pip install librosa
! pip install tensorflow
! pip install torch
! pip install torchvision

Collecting torchvision
  Downloading torchvision-0.21.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (6.1 kB)
Downloading torchvision-0.21.0-cp311-cp311-macosx_11_0_arm64.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: torchvision
Successfully installed torchvision-0.21.0


In [None]:
import pandas as pd
import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt
import os
from tqdm import tqdm

In [14]:
# Input locations (all derived from ESC50_ROOT) and the image output folder.
ESC50_ROOT = 'ESC-50-master'
META_CSV = os.path.join(ESC50_ROOT, 'meta', 'esc50.csv')
AUDIO_DIR = os.path.join(ESC50_ROOT, 'audio')
OUT_DIR = 'spectrograms'

# Values of the metadata 'category' column that are kept for this project.
URBAN_CLASSES = [
    'siren',
    'car_horn',
    'engine',
    'train',
    'airplane',
    'fireworks',
    'chainsaw',
    'helicopter',
    'hand_saw',
]

In [None]:
# Step 1: read the metadata table.
meta = pd.read_csv(META_CSV)

# Step 2: keep only the rows whose category is listed in URBAN_CLASSES,
# then renumber the surviving rows from 0.
df = (
    meta
    .loc[meta['category'].isin(URBAN_CLASSES)]
    .reset_index(drop=True)
)

# Step 3: make sure the output folder exists (no error if it already does).
os.makedirs(OUT_DIR, exist_ok=True)

def save_mel_spectrogram(wav_path, img_path,
                         n_mels=128, n_fft=2048, hop_length=512):
    """Load WAV → compute log‑Mel spectrogram → save PNG.

    The audio is loaded with ``sr=None`` (librosa then keeps the file's own
    sampling rate), turned into a Mel power spectrogram, converted to decibels
    relative to the clip's maximum, and drawn with the axes hidden.

    Args:
        wav_path: Path of the audio file to read.
        img_path: Path the rendered image is written to.
        n_mels: Number of Mel bands passed to ``melspectrogram``.
        n_fft: FFT window length passed to ``melspectrogram``.
        hop_length: Hop length used for both the spectrogram and the plot.

    Returns:
        None. The only result is the image file written to ``img_path``.
    """
    y, sr = librosa.load(wav_path, sr=None)

    S = librosa.feature.melspectrogram(
        y=y,
        sr=sr,
        n_mels=n_mels,
        n_fft=n_fft,
        hop_length=hop_length
    )
    S_db = librosa.power_to_db(S, ref=np.max)

    # Hold an explicit figure handle so it can be closed even when drawing or
    # saving fails; this function is called in a loop and unclosed figures
    # would otherwise accumulate in memory.
    fig, ax = plt.subplots(figsize=(4, 4))
    try:
        ax.axis('off')
        librosa.display.specshow(
            S_db,
            sr=sr,
            hop_length=hop_length,
            x_axis='time',
            y_axis='mel',
            ax=ax
        )
        fig.savefig(img_path, bbox_inches='tight', pad_inches=0)
    finally:
        plt.close(fig)

# 4. loop and save: one PNG per clip, written to <OUT_DIR>/<category>/
for _, row in tqdm(df.iterrows(), total=len(df)):
    fname = row['filename']               # e.g. "1-100032-A-0.wav"
    category = row['category']            # e.g. "siren"
    wav_path = os.path.join(AUDIO_DIR, fname)

    # organize by class: each category gets its own sub-folder
    class_dir = os.path.join(OUT_DIR, category)
    os.makedirs(class_dir, exist_ok=True)

    # same base name as the audio file, with a .png extension
    out_fname = os.path.splitext(fname)[0] + '.png'
    out_path = os.path.join(class_dir, out_fname)

    save_mel_spectrogram(wav_path, out_path)

100%|██████████| 360/360 [00:27<00:00, 13.00it/s]


In [25]:
# dataset.py
import os, json
import numpy as np
import torch
from PIL import Image, ImageDraw
import torchvision.transforms as T

In [26]:
class UrbanNoiseDataset(torch.utils.data.Dataset):
    """Detection / instance-segmentation dataset read from a COCO-style JSON file.

    Indexing returns ``(image, target)``. ``image`` is the RGB image after
    ``T.ToTensor()``. ``target`` is a dict with:

    * ``boxes``    - float32 ``(N, 4)`` tensor, ``[x_min, y_min, x_max, y_max]``
      (converted from the JSON's ``[x, y, w, h]``).
    * ``labels``   - int64 ``(N,)`` tensor; each value is the 1-based position
      of the annotation's category in the JSON ``categories`` list
      (presumably 0 is left free for a background class; confirm with the model).
    * ``masks``    - uint8 ``(N, H, W)`` tensor of 0/1 masks rasterized from the
      annotation polygons, with ``H``/``W`` taken from the image record.
    * ``image_id`` - one-element tensor holding the image id.

    An image without annotations yields empty tensors of shape ``(0, 4)``,
    ``(0,)`` and ``(0, H, W)`` instead of raising.
    """

    def __init__(self, annotations_file, img_dir, classes, transforms=None):
        """Load the annotation file and index it by image id.

        Args:
            annotations_file: Path to a JSON file with ``images``,
                ``annotations`` and ``categories`` lists.
            img_dir: Directory that each image's ``file_name`` is joined onto.
            classes: Stored as ``self.classes``; not otherwise used by this class.
            transforms: Optional callable invoked as ``transforms(img, target)``
                and expected to return ``(img, target)``.
        """
        # load COCO json
        with open(annotations_file) as f:
            self.coco = json.load(f)

        self.img_dir    = img_dir
        self.classes    = classes
        self.transforms = transforms

        # build image lookup
        self.images = { img['id']: img for img in self.coco['images'] }

        # build annotations per image
        self.anns = {}
        for ann in self.coco['annotations']:
            self.anns.setdefault(ann['image_id'], []).append(ann)

        self.ids = list(self.images.keys())

        # build a map: category_id → label index
        # assume your JSON 'categories' list has unique "id" fields
        self.catid2label = {
            cat['id']: idx+1
            for idx, cat in enumerate(self.coco['categories'])
        }

    def __len__(self):
        """Return the number of image records in the annotation file."""
        return len(self.ids)

    @staticmethod
    def _polygons_to_mask(polygons, width, height):
        """Rasterize flat ``[x0, y0, x1, y1, ...]`` polygons into one uint8 mask."""
        mask = Image.new('L', (width, height), 0)
        draw = ImageDraw.Draw(mask)
        for seg in polygons:
            poly = [(seg[i], seg[i+1]) for i in range(0, len(seg), 2)]
            draw.polygon(poly, outline=1, fill=1)
        return torch.as_tensor(np.array(mask), dtype=torch.uint8)

    def __getitem__(self, idx):
        """Return ``(image_tensor, target)`` for the ``idx``-th image record."""
        img_id   = self.ids[idx]
        img_info = self.images[img_id]
        fname    = img_info['file_name']      # e.g. "siren/1-100032-A-0.png"
        width, height = img_info['width'], img_info['height']

        # full path
        img_path = os.path.join(self.img_dir, fname)
        img      = Image.open(img_path).convert("RGB")

        boxes, masks, labels = [], [], []
        for ann in self.anns.get(img_id, []):
            # bounding box: COCO [x, y, w, h] → [x_min, y_min, x_max, y_max]
            x, y, w, h = ann['bbox']
            boxes.append([x, y, x + w, y + h])

            # all polygons of one annotation are drawn into a single mask
            masks.append(self._polygons_to_mask(ann['segmentation'], width, height))
            labels.append(self.catid2label[ann['category_id']])

        # torch.stack rejects an empty list, so an image without annotations
        # needs an explicit empty mask tensor.
        if masks:
            mask_tensor = torch.stack(masks)
        else:
            mask_tensor = torch.zeros((0, height, width), dtype=torch.uint8)

        target = {
            # reshape keeps the (N, 4) layout even when N == 0
            'boxes':     torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4),
            'labels':    torch.as_tensor(labels, dtype=torch.int64),
            'masks':     mask_tensor,
            'image_id':  torch.tensor([img_id])
        }

        # optional augmentations
        if self.transforms:
            img, target = self.transforms(img, target)

        # final ToTensor on image only
        return T.ToTensor()(img), target

def get_transform(train, flip_prob=0.5):
    """Build the augmentation callable used as a dataset's ``transforms``.

    ``UrbanNoiseDataset.__getitem__`` calls ``self.transforms(img, target)``
    and unpacks two return values. ``T.Compose`` only accepts a single image,
    and an image-only flip would leave boxes and masks unmirrored, so this
    returns a paired callable that flips all three together.

    Args:
        train: When true, apply a random horizontal flip; when false the
            returned callable passes its inputs through unchanged.
        flip_prob: Probability of flipping when ``train`` is true.

    Returns:
        A callable ``f(img, target=None)``. With a target it returns
        ``(img, target)``; called with an image only it returns just the
        image. ``img`` may be a PIL image or a ``(..., H, W)`` tensor. The
        input target dict and its tensors are not modified.
    """
    p = flip_prob if train else 0.0

    def paired_transform(img, target=None):
        # Same decision rule as torchvision's RandomHorizontalFlip.
        if p <= 0 or torch.rand(1).item() >= p:
            return img if target is None else (img, target)

        if isinstance(img, torch.Tensor):
            width = img.shape[-1]
            img = img.flip(-1)
        else:
            width = img.size[0]            # PIL reports (width, height)
            img = T.functional.hflip(img)

        if target is None:
            return img

        target = dict(target)              # shallow copy: leave caller's dict alone

        boxes = target.get('boxes')
        if boxes is not None and boxes.numel() > 0:
            # Mirror x coordinates; x_min and x_max swap roles after the flip.
            mirrored = boxes.clone()
            mirrored[:, 0] = width - boxes[:, 2]
            mirrored[:, 2] = width - boxes[:, 0]
            target['boxes'] = mirrored

        masks = target.get('masks')
        if masks is not None:
            target['masks'] = masks.flip(-1)

        return img, target

    return paired_transform
