<a href="https://colab.research.google.com/github/harsha-9977/AIML/blob/main/dog_audio_detector.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
import os
import requests
from tqdm import tqdm

API_KEY = "oPoKowbhXAZ0Y7eMWaRbVhphU1EUhfMjIUoIYIBL"
BASE_URL = "https://freesound.org/apiv2"
HEADERS = {"Authorization": f"Token {API_KEY}"}

def search_and_download(query, label, count=50):
    save_dir = f"data/{label}"
    os.makedirs(save_dir, exist_ok=True)

    params = {
        "query": query,
        "fields": "id,previews",  # make sure previews field is included
        "page_size": count,
    }

    r = requests.get(f"{BASE_URL}/search/text/", headers=HEADERS, params=params)
    results = r.json()["results"]

    for i, item in enumerate(tqdm(results, desc=f"Downloading {label}")):
        try:
            preview_url = item["previews"].get("preview-hq-mp3") or item["previews"].get("preview_lq_mp3")
            if not preview_url:
                print(f"Skipping {i}: no preview found")
                continue
            audio_data = requests.get(preview_url).content
            with open(f"{save_dir}/{label}_{i}.mp3", "wb") as f:
                f.write(audio_data)
        except Exception as e:
            print(f"Error with item {i}: {e}")

# Get 50 dog barks and 50 not-dog sounds
search_and_download("dog bark", "dog", 50)
search_and_download("car horn", "not_dog", 50)  # you can change this to birds, rain, etc.


Downloading dog: 100%|██████████| 50/50 [00:53<00:00,  1.08s/it]
Downloading not_dog: 100%|██████████| 50/50 [00:51<00:00,  1.03s/it]


In [7]:
import librosa
import librosa.display
import numpy as np
import os
import soundfile as sf

def extract_features(folder):
    features, labels = [], []
    for label in ["dog", "not_dog"]:
        files = os.listdir(f"data/{label}")
        for file in tqdm(files, desc=f"Processing {label}"):
            filepath = f"data/{label}/{file}"
            try:
                y, sr = librosa.load(filepath, sr=22050)
                melspec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)
                melspec_db = librosa.power_to_db(melspec, ref=np.max)
                if melspec_db.shape[1] < 128:
                    continue  # skip short clips
                features.append(melspec_db[:, :128])  # crop/pad to same shape
                labels.append(1 if label == "dog" else 0)
            except Exception as e:
                print(f"Error with {file}: {e}")
    return np.array(features), np.array(labels)

X, y = extract_features("data/")
X = X[..., np.newaxis]  # add channel dimension for CNN
print("Feature shape:", X.shape, "Labels:", y.shape)


Processing dog: 100%|██████████| 50/50 [00:21<00:00,  2.29it/s]
Processing not_dog: 100%|██████████| 50/50 [00:06<00:00,  8.08it/s]

Feature shape: (73, 128, 128, 1) Labels: (73,)





In [8]:
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = models.Sequential([
    layers.Conv2D(32, (3, 3), activation='relu', input_shape=X.shape[1:]),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Flatten(),
    layers.Dense(64, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.fit(X_train, y_train, epochs=10, validation_data=(X_test, y_test))
model.save("dog_audio_classifier.h5")


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 2s/step - accuracy: 0.5575 - loss: 62.6729 - val_accuracy: 0.4000 - val_loss: 226.6372
Epoch 2/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 193ms/step - accuracy: 0.5679 - loss: 132.3344 - val_accuracy: 0.6000 - val_loss: 28.5874
Epoch 3/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 141ms/step - accuracy: 0.4217 - loss: 44.5325 - val_accuracy: 0.6000 - val_loss: 19.0648
Epoch 4/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 254ms/step - accuracy: 0.4009 - loss: 23.4182 - val_accuracy: 0.4000 - val_loss: 16.0125
Epoch 5/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 332ms/step - accuracy: 0.6096 - loss: 12.4711 - val_accuracy: 0.4000 - val_loss: 20.1751
Epoch 6/10
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 302ms/step - accuracy: 0.5887 - loss: 12.3224 - val_accuracy: 0.4000 - val_loss: 6.4880
Epoch 7/10
[1m2/2[0m [32m━━



In [57]:
import librosa
import numpy as np
from tensorflow.keras.models import load_model

# Load trained model once globally
model = load_model("dog_audio_classifier.h5")

def predict_audio(filename):
    try:
        # Load and process audio
        y, sr = librosa.load(filename, sr=22050)
        melspec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)
        melspec_db = librosa.power_to_db(melspec, ref=np.max)

        # Pad or crop to (128, 128)
        if melspec_db.shape[1] < 128:
            melspec_db = np.pad(melspec_db, ((0, 0), (0, 128 - melspec_db.shape[1])), mode='constant')
        else:
            melspec_db = melspec_db[:, :128]

        # Add channel and batch dimension
        input_data = melspec_db[np.newaxis, ..., np.newaxis]

        # Predict
        prediction = model.predict(input_data, verbose=0)[0][0]
        result = "🐶 Dog Sound Detected!" if prediction > 0.5 else "❌ Not a Dog Sound"
        confidence = f"Confidence: {prediction:.2f}"

        return result, confidence

    except Exception as e:
        return f"Error: {e}", None




In [58]:
result, confidence = predict_audio("/content/561801__garuda1982__radio-controlled-toy-car-on-wooden-floor-sound-effect.wav")
print(result)
print(confidence)


🐶 Dog Sound Detected!
Confidence: 1.00


In [59]:
result, confidence = predict_audio("/content/432754__xpoki__dog_bark_small.wav")
print(result)
print(confidence)


🐶 Dog Sound Detected!
Confidence: 0.98


In [60]:
result, confidence = predict_audio("/content/test.mp3")
print(result)
print(confidence)


🐶 Dog Sound Detected!
Confidence: 0.94


In [61]:
result, confidence = predict_audio("/content/data/not_dog/not_dog_16.mp3")
print(result)
print(confidence)


❌ Not a Dog Sound
Confidence: 0.01


In [62]:
result, confidence = predict_audio("/content/data/dog/dog_43.mp3")
print(result)
print(confidence)


🐶 Dog Sound Detected!
Confidence: 1.00


In [63]:
result, confidence = predict_audio("/content/data/dog/dog_16.mp3")
print(result)
print(confidence)


🐶 Dog Sound Detected!
Confidence: 1.00


In [68]:
result, confidence = predict_audio("/content/data/not_dog/not_dog_49.mp3")
print(result)
print(confidence)


❌ Not a Dog Sound
Confidence: 0.03


In [48]:
label, confidence = predict_audio("data/dog/dog_0.mp3")
print(label, confidence)


❌ Not a Dog Sound Confidence: 0.24


In [37]:
from sklearn.metrics import accuracy_score

y_pred = (model.predict(X_test) > 0.5).astype("int")
print("Accuracy:", accuracy_score(y_test, y_pred))


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
Accuracy: 0.8666666666666667
