In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import csv

import matplotlib.pyplot as plt
from IPython.display import Audio
from scipy.io import wavfile

In [None]:
# Load the YAMNet model from its TensorFlow Hub handle. `model` is a notebook-wide
# name used by the cells below; several later cells load the same handle again.
model = hub.load('https://tfhub.dev/google/yamnet/1')

In [None]:
def get_top_classes(scores, class_names, top_n=5):
    """Rank classes by their score averaged over axis 0 of `scores`.

    Args:
        scores: array exposing `.mean(axis=0)`; after averaging, position `i`
            is labelled by `class_names[i]`.
        class_names: sequence of labels indexed by class position.
        top_n: how many of the best-scoring classes to return.

    Returns:
        A list of `(class_name, mean_score)` tuples, highest mean first.
    """
    averaged = scores.mean(axis=0)
    ranked = np.argsort(averaged)[::-1]  # class indices, best mean score first
    return [(class_names[idx], averaged[idx]) for idx in ranked[:top_n]]

def guess_voicemail_type(top_classes):
    """Guess whether the audio is a human or a machine voicemail from class names.

    Only the class names are inspected; the scores are ignored. Human evidence
    takes precedence: if any class name looks like human speech the result is
    "human", otherwise the names are checked for machine sounds.

    Args:
        top_classes: iterable of `(class_name, score)` pairs, e.g. the output
            of `get_top_classes`.

    Returns:
        One of three fixed message strings (human, machine, or unclear).
    """
    human_keywords = ['Speech', 'Male speech', 'Female speech', 'Child speech']
    machine_keywords = ['Telephone', 'DTMF tone', 'Dial tone', 'Synthesizer', 'Beep']
    # 'Speech synthesizer' contains the substring 'Speech' but denotes a
    # synthetic voice, so it counts as machine evidence, never human evidence.
    synthetic_speech = 'Speech synthesizer'

    top_class_names = [name for name, score in top_classes]

    def mentions(name, keywords):
        return any(k in name for k in keywords)

    # Keywords are matched inside each name separately (not in one joined
    # string) so a keyword can never match across two adjacent class names.
    if any(synthetic_speech not in name and mentions(name, human_keywords)
           for name in top_class_names):
        return "Likely Human Voicemail (Human voice detected)"

    if any(synthetic_speech in name or mentions(name, machine_keywords)
           for name in top_class_names):
        return "Likely Machine Voicemail (Machine sounds detected)"

    return "Voicemail type unclear"

In [None]:
# NOTE(review): `waveform` and `class_names` are not assigned by any cell above
# this one; they are only created by later cells, so this cell depends on
# execution order and fails on a fresh kernel.
# Run the model on the waveform; it returns three outputs.
scores, embeddings, spectrogram = model(waveform)

# Rank the classes by mean score and print the five best, highest first.
top_classes = get_top_classes(scores.numpy(), class_names, top_n=5)
print("Top predicted classes and scores:")
for name, score in top_classes:
    print(f"{name}: {score:.3f}")

# Turn the ranked class names into a human/machine voicemail guess.
voicemail_guess = guess_voicemail_type(top_classes)
print(voicemail_guess)

In [None]:
# NOTE(review): this cell repeats the previous cell statement for statement and,
# like it, needs `waveform` and `class_names` to exist in the kernel already.
# Run the model on the waveform; it returns three outputs.
scores, embeddings, spectrogram = model(waveform)

# Rank the classes by mean score and print the five best, highest first.
top_classes = get_top_classes(scores.numpy(), class_names, top_n=5)
print("Top predicted classes and scores:")
for name, score in top_classes:
    print(f"{name}: {score:.3f}")

# Turn the ranked class names into a human/machine voicemail guess.
voicemail_guess = guess_voicemail_type(top_classes)
print(voicemail_guess)

In [None]:
# imports and model loading code...

def get_top_classes(scores, class_names, top_n=5):
    """Return the `top_n` classes with the highest score averaged over axis 0.

    Args:
        scores: array exposing `.mean(axis=0)`; after averaging, position `i`
            is labelled by `class_names[i]`.
        class_names: sequence of labels indexed by class position.
        top_n: number of classes to return; zero or a negative value yields an
            empty list.

    Returns:
        A list of `(class_name, mean_score)` tuples, highest mean first.
    """
    mean_scores = scores.mean(axis=0)
    # Reverse first, then slice: the earlier `[-top_n:]` form returned *every*
    # class for top_n == 0 because `[-0:]` is the whole array.
    top_indices = mean_scores.argsort()[::-1][:max(top_n, 0)]
    return [(class_names[i], mean_scores[i]) for i in top_indices]

def guess_voicemail_type(top_classes):
    """Classify the audio as human or machine from the ranked class names.

    The pairs are scanned in the order given and the first class name that
    matches a keyword decides the result; scores are not used.

    Args:
        top_classes: iterable of `(class_name, score)` pairs.

    Returns:
        One of three fixed message strings (human, machine, or undetermined).
    """
    # Example heuristic:
    keywords_human = ['Speech', 'Human voice', 'Conversation']
    keywords_machine = ['Machine', 'Beep', 'Ringtone']
    # Checked before the human keywords: 'Speech synthesizer' contains 'Speech'
    # but is a synthetic voice, so it must not be reported as human.
    synthetic_speech = 'Speech synthesizer'
    machine_message = "This audio likely contains machine voicemail or automated message."

    for name, score in top_classes:
        if synthetic_speech in name:
            return machine_message
        if any(k in name for k in keywords_human):
            return "This audio likely contains human voice."
        if any(k in name for k in keywords_machine):
            return machine_message
    return "Unable to determine voicemail type."

# Load audio, preprocess, normalize waveform...
# NOTE(review): the line above is only a placeholder; nothing in this cell
# creates `waveform`, `class_names` or `model`, so they must already exist in
# the kernel.
scores, embeddings, spectrogram = model(waveform)

# Rank the classes by mean score and print the five best, highest first.
top_classes = get_top_classes(scores.numpy(), class_names, top_n=5)
print("Top predicted classes and scores:")
for name, score in top_classes:
    print(f"{name}: {score:.3f}")

# Turn the ranked class names into a human/machine guess.
voicemail_guess = guess_voicemail_type(top_classes)
print(voicemail_guess)

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import csv
import scipy.signal
from scipy.io import wavfile

# Load the YAMNet model from TensorFlow Hub.
# NOTE(review): an earlier cell already loads this same handle; this call
# rebinds the notebook-wide `model` to a fresh load.
model = hub.load('https://tfhub.dev/google/yamnet/1')

# Load class names for the model
def class_names_from_csv(class_map_csv_text):
    """Read the class-map CSV and return its `display_name` column.

    Args:
        class_map_csv_text: path of the CSV file; it is opened through
            `tf.io.gfile.GFile`.

    Returns:
        List of `display_name` values in file row order.
    """
    with tf.io.gfile.GFile(class_map_csv_text) as handle:
        return [record['display_name'] for record in csv.DictReader(handle)]

# Ask the loaded model where its class-map CSV lives, then read the display
# names from it. `class_names` is later indexed by class position.
class_map_path = model.class_map_path().numpy()
class_names = class_names_from_csv(class_map_path)

# Resample waveform to 16kHz if needed
def ensure_sample_rate(original_sample_rate, waveform, desired_sample_rate=16000):
    """Resample `waveform` to `desired_sample_rate` when the rates differ.

    Args:
        original_sample_rate: rate the samples were recorded at.
        waveform: sample array; its length along the first axis is used to
            compute the output length.
        desired_sample_rate: rate to convert to.

    Returns:
        `(desired_sample_rate, waveform)`; the input array is returned
        untouched when no resampling is needed.
    """
    if original_sample_rate == desired_sample_rate:
        return desired_sample_rate, waveform

    # Keep the duration constant: new length = duration * new rate.
    duration = float(len(waveform)) / original_sample_rate
    target_length = int(round(duration * desired_sample_rate))
    return desired_sample_rate, scipy.signal.resample(waveform, target_length)

# Function to get top classes from scores
def get_top_classes(scores, class_names, top_n=5):
    """Return the `top_n` classes with the highest score averaged over axis 0.

    Args:
        scores: array-like of scores; after averaging over axis 0, position
            `i` is labelled by `class_names[i]`.
        class_names: sequence of labels indexed by class position.
        top_n: number of classes to return; zero or a negative value yields an
            empty list.

    Returns:
        A list of `(class_name, mean_score)` tuples, highest mean first.
    """
    mean_scores = np.mean(scores, axis=0)
    # Reverse first, then slice: `argsort()[-top_n:]` returned every class for
    # top_n == 0 because `[-0:]` is the whole array.
    top_indices = mean_scores.argsort()[::-1][:max(top_n, 0)]
    return [(class_names[i], mean_scores[i]) for i in top_indices]

# Function to guess voicemail type based on predicted classes
def guess_voicemail_type(top_classes):
    """Classify the audio as human or machine from the ranked class names.

    The pairs are scanned in the order given and the first class name that
    matches a keyword decides the result; scores are not used.

    Args:
        top_classes: iterable of `(class_name, score)` pairs.

    Returns:
        One of three fixed message strings (human, machine, or undetermined).
    """
    keywords_human = ['Speech', 'Conversation', 'Narration', 'Male speech', 'Female speech']
    keywords_machine = ['Telephone', 'Beep', 'Ring', 'Alarm', 'Computer', 'Electronic']
    # Checked before the human keywords: 'Speech synthesizer' contains 'Speech'
    # but is a synthetic voice, so it must not be reported as human.
    synthetic_speech = 'Speech synthesizer'

    for class_name, score in top_classes:
        if synthetic_speech in class_name:
            return "Detected: Machine voicemail or alert"
        if any(k in class_name for k in keywords_human):
            return "Detected: Human voice or voicemail"
        if any(k in class_name for k in keywords_machine):
            return "Detected: Machine voicemail or alert"
    return "Unable to determine voicemail type."

# Main prediction function
def predict_audio_file(audio_path):
    """Classify a WAV file with YAMNet and print the top classes and a voicemail guess.

    Args:
        audio_path: path to a WAV file. Samples are scaled by the int16
            maximum, so 16-bit PCM input is assumed.

    Returns:
        None; the ranked classes and the guess are printed.
    """
    # scipy's wavfile.read takes `mmap` as its second positional parameter, not
    # a file mode; the former 'rb' argument silently enabled memory-mapping.
    sample_rate, wav_data = wavfile.read(audio_path)

    # Multi-channel files decode to a 2-D array; keep the first channel so the
    # model receives a 1-D waveform.
    if wav_data.ndim > 1:
        wav_data = wav_data[:, 0]

    sample_rate, wav_data = ensure_sample_rate(sample_rate, wav_data)

    # Normalize waveform to [-1.0, 1.0]
    waveform = wav_data / tf.int16.max

    # Run the model
    scores, embeddings, spectrogram = model(waveform)

    # Get top predicted classes
    top_classes = get_top_classes(scores.numpy(), class_names, top_n=5)

    print(f"Top predicted classes and scores for '{audio_path}':")
    for name, score in top_classes:
        print(f"{name}: {score:.3f}")

    # Guess voicemail type
    voicemail_guess = guess_voicemail_type(top_classes)
    print(voicemail_guess)

# Example usage:
# Replace the path below with the path of your own WAV file.
# NOTE(review): this is an absolute '/content/...' path, so the cell only runs
# where that file exists.
audio_file_path = '/content/20250430-194418-17328631663.wav'
predict_audio_file(audio_file_path)

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import csv
import scipy.signal
from scipy.io import wavfile

# Load the YAMNet model from TensorFlow Hub.
# NOTE(review): earlier cells already load this same handle; this call rebinds
# the notebook-wide `model` to a fresh load.
model = hub.load('https://tfhub.dev/google/yamnet/1')

# Load class names for the model
def class_names_from_csv(class_map_csv_text):
    """Collect the `display_name` of every row of the class-map CSV.

    Args:
        class_map_csv_text: path of the CSV file; it is opened through
            `tf.io.gfile.GFile`.

    Returns:
        List of display names in file row order.
    """
    with tf.io.gfile.GFile(class_map_csv_text) as source:
        rows = csv.DictReader(source)
        return [entry['display_name'] for entry in rows]

# Ask the loaded model where its class-map CSV lives, then read the display
# names from it. `class_names` is later indexed by class position.
class_map_path = model.class_map_path().numpy()
class_names = class_names_from_csv(class_map_path)

# Resample waveform to 16kHz if needed
def ensure_sample_rate(original_sample_rate, waveform, desired_sample_rate=16000):
    """Return `waveform` at `desired_sample_rate`, resampling only if required.

    Args:
        original_sample_rate: rate the samples were recorded at.
        waveform: sample array; its first-axis length sets the output length.
        desired_sample_rate: rate to convert to.

    Returns:
        `(desired_sample_rate, waveform)`; the input array itself when the
        rates already match.
    """
    needs_resampling = original_sample_rate != desired_sample_rate
    if needs_resampling:
        seconds = float(len(waveform)) / original_sample_rate
        new_length = int(round(seconds * desired_sample_rate))
        waveform = scipy.signal.resample(waveform, new_length)
    return desired_sample_rate, waveform

# Function to get top classes from scores
def get_top_classes(scores, class_names, top_n=5):
    """Return the `top_n` classes with the highest score averaged over axis 0.

    Args:
        scores: array-like of scores; after averaging over axis 0, position
            `i` is labelled by `class_names[i]`.
        class_names: sequence of labels indexed by class position.
        top_n: number of classes to return; zero or a negative value yields an
            empty list.

    Returns:
        A list of `(class_name, mean_score)` tuples, highest mean first.
    """
    mean_scores = np.mean(scores, axis=0)
    # Reverse first, then slice: `argsort()[-top_n:]` returned every class for
    # top_n == 0 because `[-0:]` is the whole array.
    top_indices = mean_scores.argsort()[::-1][:max(top_n, 0)]
    return [(class_names[i], mean_scores[i]) for i in top_indices]

# Improved function to guess voicemail type based on predicted classes
def guess_voicemail_type(top_classes):
    """Guess the voicemail type from the scores of a few named classes.

    Class names are lower-cased and looked up exactly. Speech above its
    threshold means a human; accompanying telephone or machine sounds turn
    "human voice" into "human voicemail". Without speech, only machine sounds
    (telephone excluded) yield "machine voicemail".

    Args:
        top_classes: iterable of `(class_name, score)` pairs.

    Returns:
        One of four fixed "Detected: ..." / "Unable to determine ..." strings.
    """
    class_scores = {name.lower(): score for name, score in top_classes}

    def score_of(*labels):
        # Highest score among the given lower-cased labels, 0 if none present.
        return max((class_scores.get(label, 0) for label in labels), default=0)

    speech_score = score_of('speech')
    telephone_score = score_of('telephone')
    # The class map spells this class "Beep, bleep"; a bare 'beep' lookup alone
    # can never match it.
    beep_score = score_of('beep', 'beep, bleep')
    dial_tone_score = score_of('dial tone')
    ringtone_score = score_of('ringtone')
    modem_score = score_of('modem')
    fax_score = score_of('fax')

    speech_threshold = 0.3
    machine_sound_threshold = 0.1
    telephone_threshold = 0.05

    machine_sound = any(
        s > machine_sound_threshold
        for s in (beep_score, dial_tone_score, ringtone_score, modem_score, fax_score)
    )

    if speech_score > speech_threshold:
        # Telephone evidence only matters alongside speech.
        if telephone_score > telephone_threshold or machine_sound:
            return "Detected: Human voicemail"
        return "Detected: Human voice"

    if machine_sound:
        return "Detected: Machine voicemail"
    return "Unable to determine voicemail type."

# Main prediction function
def predict_audio_file(audio_path):
    """Classify a WAV file with YAMNet and print the top classes and a voicemail guess.

    Args:
        audio_path: path to a WAV file. Samples are scaled by the int16
            maximum, so 16-bit PCM input is assumed.

    Returns:
        None; the ranked classes and the guess are printed.
    """
    # scipy's wavfile.read takes `mmap` as its second positional parameter, not
    # a file mode; the former 'rb' argument silently enabled memory-mapping.
    sample_rate, wav_data = wavfile.read(audio_path)

    # Multi-channel files decode to a 2-D array; keep the first channel so the
    # model receives a 1-D waveform.
    if wav_data.ndim > 1:
        wav_data = wav_data[:, 0]

    sample_rate, wav_data = ensure_sample_rate(sample_rate, wav_data)

    # Scale samples by the int16 maximum.
    waveform = wav_data / tf.int16.max

    scores, embeddings, spectrogram = model(waveform)

    # Ten classes are kept so the heuristic can see lower-ranked sounds too.
    top_classes = get_top_classes(scores.numpy(), class_names, top_n=10)

    print(f"Top predicted classes and scores for '{audio_path}':")
    for name, score in top_classes:
        print(f"{name}: {score:.3f}")

    voicemail_guess = guess_voicemail_type(top_classes)
    print(voicemail_guess)

# Example usage:
# NOTE(review): this is an absolute '/content/...' path, so the cell only runs
# where that file exists.
audio_file_path = '/content/20250430-194418-17328631663.wav'
predict_audio_file(audio_file_path)

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import csv
import scipy.signal
from scipy.io import wavfile

# Load the YAMNet model from TensorFlow Hub.
# NOTE(review): earlier cells already load this same handle; this call rebinds
# the notebook-wide `model` to a fresh load.
model = hub.load('https://tfhub.dev/google/yamnet/1')

# Load class names for the model
def class_names_from_csv(class_map_csv_text):
    """Return the `display_name` field of each row in the class-map CSV.

    Args:
        class_map_csv_text: path of the CSV file; it is opened through
            `tf.io.gfile.GFile`.

    Returns:
        List of display names in file row order.
    """
    with tf.io.gfile.GFile(class_map_csv_text) as csv_handle:
        names = [line['display_name'] for line in csv.DictReader(csv_handle)]
    return names

# Ask the loaded model where its class-map CSV lives, then read the display
# names from it. `class_names` is later indexed by class position.
class_map_path = model.class_map_path().numpy()
class_names = class_names_from_csv(class_map_path)

# Resample waveform to 16kHz if needed
def ensure_sample_rate(original_sample_rate, waveform, desired_sample_rate=16000):
    """Convert `waveform` to `desired_sample_rate` if it is at another rate.

    Args:
        original_sample_rate: rate the samples were recorded at.
        waveform: sample array; its first-axis length sets the output length.
        desired_sample_rate: rate to convert to.

    Returns:
        `(desired_sample_rate, waveform)`; the input array itself when the
        rates already match.
    """
    if original_sample_rate == desired_sample_rate:
        resampled = waveform
    else:
        # Same duration at the new rate.
        n_samples = int(round(float(len(waveform)) / original_sample_rate * desired_sample_rate))
        resampled = scipy.signal.resample(waveform, n_samples)
    return desired_sample_rate, resampled

# Function to get top classes from scores
def get_top_classes(scores, class_names, top_n=5):
    """Return the `top_n` classes with the highest score averaged over axis 0.

    Args:
        scores: array-like of scores; after averaging over axis 0, position
            `i` is labelled by `class_names[i]`.
        class_names: sequence of labels indexed by class position.
        top_n: number of classes to return; zero or a negative value yields an
            empty list.

    Returns:
        A list of `(class_name, mean_score)` tuples, highest mean first.
    """
    mean_scores = np.mean(scores, axis=0)
    # Reverse first, then slice: `argsort()[-top_n:]` returned every class for
    # top_n == 0 because `[-0:]` is the whole array.
    top_indices = mean_scores.argsort()[::-1][:max(top_n, 0)]
    return [(class_names[i], mean_scores[i]) for i in top_indices]

# Improved function to guess voicemail type based on predicted classes
def guess_voicemail_type(top_classes):
    """Guess the voicemail type from the scores of a few named classes.

    Class names are lower-cased and looked up exactly. Speech above its
    threshold means a human; accompanying telephone or machine sounds turn
    "human voice" into "human voicemail". Without speech, only machine sounds
    (telephone excluded) yield "machine voicemail".

    Args:
        top_classes: iterable of `(class_name, score)` pairs.

    Returns:
        One of four fixed "Detected: ..." / "Unable to determine ..." strings.
    """
    class_scores = {name.lower(): score for name, score in top_classes}

    def score_of(*labels):
        # Highest score among the given lower-cased labels, 0 if none present.
        return max((class_scores.get(label, 0) for label in labels), default=0)

    speech_score = score_of('speech')
    telephone_score = score_of('telephone')
    # The class map spells this class "Beep, bleep"; a bare 'beep' lookup alone
    # can never match it.
    beep_score = score_of('beep', 'beep, bleep')
    dial_tone_score = score_of('dial tone')
    ringtone_score = score_of('ringtone')
    modem_score = score_of('modem')
    fax_score = score_of('fax')

    speech_threshold = 0.3
    machine_sound_threshold = 0.1
    telephone_threshold = 0.05

    machine_sound = any(
        s > machine_sound_threshold
        for s in (beep_score, dial_tone_score, ringtone_score, modem_score, fax_score)
    )

    if speech_score > speech_threshold:
        # Telephone evidence only matters alongside speech.
        if telephone_score > telephone_threshold or machine_sound:
            return "Detected: Human voicemail"
        return "Detected: Human voice"

    if machine_sound:
        return "Detected: Machine voicemail"
    return "Unable to determine voicemail type."

# Main prediction function
def predict_audio_file(audio_path):
    """Classify a WAV file with YAMNet and print the top classes and a voicemail guess.

    Args:
        audio_path: path to a WAV file. Samples are scaled by the int16
            maximum, so 16-bit PCM input is assumed.

    Returns:
        None; the ranked classes and the guess are printed.
    """
    # scipy's wavfile.read takes `mmap` as its second positional parameter, not
    # a file mode; the former 'rb' argument silently enabled memory-mapping.
    sample_rate, wav_data = wavfile.read(audio_path)

    # Multi-channel files decode to a 2-D array; keep the first channel so the
    # model receives a 1-D waveform.
    if wav_data.ndim > 1:
        wav_data = wav_data[:, 0]

    sample_rate, wav_data = ensure_sample_rate(sample_rate, wav_data)

    # Scale samples by the int16 maximum.
    waveform = wav_data / tf.int16.max

    scores, embeddings, spectrogram = model(waveform)

    # Ten classes are kept so the heuristic can see lower-ranked sounds too.
    top_classes = get_top_classes(scores.numpy(), class_names, top_n=10)

    print(f"Top predicted classes and scores for '{audio_path}':")
    for name, score in top_classes:
        print(f"{name}: {score:.3f}")

    voicemail_guess = guess_voicemail_type(top_classes)
    print(voicemail_guess)

# Example usage:
# NOTE(review): this is an absolute '/content/...' path, so the cell only runs
# where that file exists.
audio_file_path = '/content/20250422-211457-12692086177.wav'
predict_audio_file(audio_file_path)

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import csv
import scipy.signal
from scipy.io import wavfile

# Load the YAMNet model from TensorFlow Hub.
# NOTE(review): earlier cells already load this same handle; this call rebinds
# the notebook-wide `model` to a fresh load.
model = hub.load('https://tfhub.dev/google/yamnet/1')

# Load class names for the model
def class_names_from_csv(class_map_csv_text):
    """Load the list of class display names from the class-map CSV.

    Args:
        class_map_csv_text: path of the CSV file; it is opened through
            `tf.io.gfile.GFile`.

    Returns:
        List of `display_name` values in file row order.
    """
    with tf.io.gfile.GFile(class_map_csv_text) as stream:
        return list(map(lambda row: row['display_name'], csv.DictReader(stream)))

# Ask the loaded model where its class-map CSV lives, then read the display
# names from it. `class_names` is later indexed by class position.
class_map_path = model.class_map_path().numpy()
class_names = class_names_from_csv(class_map_path)

# Resample waveform to 16kHz if needed
def ensure_sample_rate(original_sample_rate, waveform, desired_sample_rate=16000):
    """Resample `waveform` to `desired_sample_rate` unless it is already there.

    Args:
        original_sample_rate: rate the samples were recorded at.
        waveform: sample array; its first-axis length sets the output length.
        desired_sample_rate: rate to convert to.

    Returns:
        `(desired_sample_rate, waveform)`; the input array itself when the
        rates already match.
    """
    if original_sample_rate == desired_sample_rate:
        return desired_sample_rate, waveform

    # Output length that preserves the clip's duration at the new rate.
    clip_seconds = float(len(waveform)) / original_sample_rate
    out_len = int(round(clip_seconds * desired_sample_rate))
    return desired_sample_rate, scipy.signal.resample(waveform, out_len)

# Function to get top classes from scores
def get_top_classes(scores, class_names, top_n=5):
    """Return the `top_n` classes with the highest score averaged over axis 0.

    Args:
        scores: array-like of scores; after averaging over axis 0, position
            `i` is labelled by `class_names[i]`.
        class_names: sequence of labels indexed by class position.
        top_n: number of classes to return; zero or a negative value yields an
            empty list.

    Returns:
        A list of `(class_name, mean_score)` tuples, highest mean first.
    """
    mean_scores = np.mean(scores, axis=0)
    # Reverse first, then slice: `argsort()[-top_n:]` returned every class for
    # top_n == 0 because `[-0:]` is the whole array.
    top_indices = mean_scores.argsort()[::-1][:max(top_n, 0)]
    return [(class_names[i], mean_scores[i]) for i in top_indices]

# Improved function to guess voicemail type based on predicted classes and scores
def guess_voicemail_type(top_classes):
    """Guess the voicemail type from exact-name class scores.

    Speech above `speech_threshold` means a human: "human voice" when every
    machine-sound class is below `machine_threshold`, "human voicemail" when at
    least one reaches it. Without speech, a machine-sound class or
    'Speech synthesizer' reaching `speech_threshold` yields "machine voicemail".

    Args:
        top_classes: iterable of `(class_name, score)` pairs.

    Returns:
        One of four fixed "Detected: ..." / "Unable to determine ..." strings.
    """
    # Convert list of tuples to dict for easy lookup
    class_score_dict = {name: score for name, score in top_classes}

    # Thresholds - you can tune these based on your dataset
    speech_threshold = 0.4
    machine_threshold = 0.1

    # 'Beep, bleep' is the spelling used by the class map; a bare 'Beep' entry
    # alone can never match it.
    # NOTE(review): lookups are exact-name; confirm 'Ring', 'Computer' and
    # 'Electronic' exist as display names in the class map.
    machine_labels = ['Telephone', 'Beep', 'Beep, bleep', 'Ring', 'Computer', 'Electronic']

    def score_of(label):
        return class_score_dict.get(label, 0)

    speech_detected = score_of('Speech') > speech_threshold

    if speech_detected and all(score_of(k) < machine_threshold for k in machine_labels):
        return "Detected: Human voice"
    if speech_detected and any(score_of(k) >= machine_threshold for k in machine_labels):
        return "Detected: Human voicemail"
    # NOTE(review): this branch compares machine classes against
    # speech_threshold, not machine_threshold - kept as written; confirm intent.
    if any(score_of(k) >= speech_threshold for k in machine_labels + ['Speech synthesizer']):
        return "Detected: Machine voicemail"
    return "Unable to determine voicemail type."

# Main prediction function
def predict_audio_file(audio_path):
    """Classify a WAV file with YAMNet and print the top classes and a voicemail guess.

    Args:
        audio_path: path to a WAV file. Samples are scaled by the int16
            maximum, so 16-bit PCM input is assumed.

    Returns:
        None; the ranked classes and the guess are printed.
    """
    # scipy's wavfile.read takes `mmap` as its second positional parameter, not
    # a file mode; the former 'rb' argument silently enabled memory-mapping.
    sample_rate, wav_data = wavfile.read(audio_path)

    # Multi-channel files decode to a 2-D array; keep the first channel so the
    # model receives a 1-D waveform.
    if wav_data.ndim > 1:
        wav_data = wav_data[:, 0]

    sample_rate, wav_data = ensure_sample_rate(sample_rate, wav_data)

    # Normalize waveform to [-1.0, 1.0]
    waveform = wav_data / tf.int16.max

    # Run the model
    scores, embeddings, spectrogram = model(waveform)

    # Get top predicted classes
    top_classes = get_top_classes(scores.numpy(), class_names, top_n=10)

    print(f"Top predicted classes and scores for '{audio_path}':")
    for name, score in top_classes:
        print(f"{name}: {score:.3f}")

    # Guess voicemail type
    voicemail_guess = guess_voicemail_type(top_classes)
    print(voicemail_guess)

# Example usage:
# Replace the path below with the path of your own WAV file.
# NOTE(review): this is an absolute '/content/...' path, so the cell only runs
# where that file exists.
audio_file_path = '/content/20250422-211457-12692086177.wav'
predict_audio_file(audio_file_path)

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import csv
import scipy.signal
from scipy.io import wavfile

# Load the YAMNet model from TensorFlow Hub.
# NOTE(review): earlier cells already load this same handle; this call rebinds
# the notebook-wide `model` to a fresh load.
model = hub.load('https://tfhub.dev/google/yamnet/1')

# Load class names from YAMNet
def class_names_from_csv(csv_path):
    """Read the class-map CSV and return its `display_name` column.

    Args:
        csv_path: path of the CSV file; it is opened through
            `tf.io.gfile.GFile`.

    Returns:
        List of display names in file row order.
    """
    with tf.io.gfile.GFile(csv_path) as csv_file:
        return [record['display_name'] for record in csv.DictReader(csv_file)]

# Ask the loaded model where its class-map CSV lives, then read the display
# names from it. `class_names` is later indexed by class position.
class_map_path = model.class_map_path().numpy()
class_names = class_names_from_csv(class_map_path)

# Ensure 16kHz sample rate
def ensure_sample_rate(sr, waveform, target_sr=16000):
    """Resample `waveform` from `sr` to `target_sr` when the two differ.

    Args:
        sr: rate the samples were recorded at.
        waveform: sample array; its first-axis length sets the output length.
        target_sr: rate to convert to.

    Returns:
        `(rate, waveform)`: the target rate with the resampled array, or the
        inputs unchanged when the rates already match.
    """
    if sr == target_sr:
        return sr, waveform

    # Output length that preserves the clip's duration at the new rate.
    n_target = int(round(float(len(waveform)) / sr * target_sr))
    return target_sr, scipy.signal.resample(waveform, n_target)

# Get top N classes from prediction scores
def get_top_classes(scores, class_names, top_n=5):
    """Return the `top_n` classes with the highest score averaged over axis 0.

    Args:
        scores: array-like of scores; after averaging over axis 0, position
            `i` is labelled by `class_names[i]`.
        class_names: sequence of labels indexed by class position.
        top_n: number of classes to return; zero or a negative value yields an
            empty list.

    Returns:
        A list of `(class_name, mean_score)` tuples, highest mean first.
    """
    mean_scores = np.mean(scores, axis=0)
    # Reverse first, then slice: `argsort()[-top_n:]` returned every class for
    # top_n == 0 because `[-0:]` is the whole array.
    top_indices = mean_scores.argsort()[::-1][:max(top_n, 0)]
    return [(class_names[i], mean_scores[i]) for i in top_indices]

# Heuristic function to determine voicemail type
def guess_voicemail_type(top_classes):
    """Guess the voicemail type from exact-name class scores.

    Rules, first match wins:
      1. synthetic speech, or combined telephone/beep/keyboard evidence -> machine
      2. speech together with narration or conversation -> human voicemail
      3. strong speech with almost no synthetic speech -> human voice

    Args:
        top_classes: iterable of `(class_name, score)` pairs.

    Returns:
        One of four fixed "Detected: ..." / "Unable to determine ..." strings.
    """
    class_score_dict = {name: score for name, score in top_classes}

    speech = class_score_dict.get('Speech', 0)
    synth = class_score_dict.get('Speech synthesizer', 0)
    telephone = class_score_dict.get('Telephone', 0)
    # The class map spells this class "Beep, bleep"; fall back to that name
    # because a bare 'Beep' lookup alone can never match it.
    beep = class_score_dict.get('Beep', class_score_dict.get('Beep, bleep', 0))
    computer = class_score_dict.get('Computer keyboard', 0)
    narration = class_score_dict.get('Narration, monologue', 0)
    conversation = class_score_dict.get('Conversation', 0)

    # Rules
    if synth > 0.1 or (telephone + beep + computer) > 0.2:
        return "Detected: Machine voicemail"
    elif speech > 0.4 and (narration > 0.05 or conversation > 0.05):
        return "Detected: Human voicemail"
    elif speech > 0.6 and synth < 0.05:
        return "Detected: Human voice"
    else:
        return "Unable to determine voicemail type."

# Main function
def predict_audio_file(audio_path):
    """Classify a WAV file with YAMNet and print the top classes and a voicemail guess.

    Args:
        audio_path: path to a WAV file. Samples are scaled by the int16
            maximum, so 16-bit PCM input is assumed.

    Returns:
        None; the ranked classes and the guess are printed.
    """
    sr, samples = wavfile.read(audio_path)
    sr, samples = ensure_sample_rate(sr, samples)

    # Scale by the largest int16 value.
    int16_peak = np.iinfo(np.int16).max
    waveform = samples / int16_peak

    # A 2-D array means several channels; only the first one is classified.
    is_multichannel = len(waveform.shape) > 1
    if is_multichannel:
        waveform = waveform[:, 0]

    scores, embeddings, spectrogram = model(waveform)

    ranked = get_top_classes(scores.numpy(), class_names, top_n=5)

    print(f"\nTop predicted classes and scores for '{audio_path}':")
    for label, value in ranked:
        print(f"{label}: {value:.3f}")

    print(guess_voicemail_type(ranked))

# Example usage
# NOTE(review): this is an absolute '/content/...' path, so the cell only runs
# where that file exists.
audio_file_path = '/content/20250422-211457-12692086177.wav'  # Replace this with your .wav file path
predict_audio_file(audio_file_path)

The code below is only for testing: it runs the YAMNet model on Google's sample audio files, which the model classifies correctly.

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import csv

import matplotlib.pyplot as plt
from IPython.display import Audio
from scipy.io import wavfile

In [None]:
# Load the YAMNet model from its TensorFlow Hub handle.
# NOTE(review): earlier cells already load this same handle; this call rebinds
# the notebook-wide `model` to a fresh load.
model = hub.load('https://tfhub.dev/google/yamnet/1')

In [None]:
# Find the name of the class with the top score when mean-aggregated across frames.
def class_names_from_csv(class_map_csv_text):
  """Returns the `display_name` of each CSV row, in file order.

  The path is opened through `tf.io.gfile.GFile`; the resulting list is meant
  to be indexed by position in the score vector.
  """
  with tf.io.gfile.GFile(class_map_csv_text) as handle:
    return [record['display_name'] for record in csv.DictReader(handle)]

# Ask the loaded model where its class-map CSV lives, then read the display
# names from it. `class_names` is later indexed by class position.
class_map_path = model.class_map_path().numpy()
class_names = class_names_from_csv(class_map_path)

In [None]:
def ensure_sample_rate(original_sample_rate, waveform,
                       desired_sample_rate=16000):
  """Resample waveform if required.

  Args:
    original_sample_rate: rate the samples were recorded at.
    waveform: sample array; its first-axis length sets the output length.
    desired_sample_rate: rate to convert to.

  Returns:
    `(desired_sample_rate, waveform)`; the input array itself when the rates
    already match.
  """
  if original_sample_rate != desired_sample_rate:
    # Imported here because this section's import cell only brings in
    # `scipy.io.wavfile`; without it the resample path relies on an earlier
    # cell having imported `scipy.signal`.
    import scipy.signal

    desired_length = int(round(float(len(waveform)) /
                               original_sample_rate * desired_sample_rate))
    waveform = scipy.signal.resample(waveform, desired_length)
  return desired_sample_rate, waveform

In [None]:
# Download the sample clip into the current working directory (curl -O keeps the remote file name).
!curl -O https://storage.googleapis.com/audioset/speech_whistling2.wav

In [None]:
# Download the sample clip into the current working directory (curl -O keeps the remote file name).
!curl -O https://storage.googleapis.com/audioset/miaow_16k.wav

In [None]:
# Pick one of the downloaded sample clips.
# wav_file_name = 'speech_whistling2.wav'
wav_file_name = 'miaow_16k.wav'
# scipy's wavfile.read takes `mmap` as its second positional parameter, not a
# file mode; the former 'rb' argument silently enabled memory-mapping.
sample_rate, wav_data = wavfile.read(wav_file_name)
# Resample to the default 16 kHz target if the file is at another rate.
sample_rate, wav_data = ensure_sample_rate(sample_rate, wav_data)

# Show some basic information about the audio.
duration = len(wav_data)/sample_rate
print(f'Sample rate: {sample_rate} Hz')
print(f'Total duration: {duration:.2f}s')
print(f'Size of the input: {len(wav_data)}')

# Listening to the wav file.
Audio(wav_data, rate=sample_rate)

In [None]:
# Scale the samples by the int16 maximum.
# NOTE(review): this lands in roughly [-1.0, 1.0] only if the WAV holds 16-bit
# PCM samples - confirm for other inputs.
waveform = wav_data / tf.int16.max

In [None]:
# Run the model on the normalized waveform; its three outputs are bound, in
# order, to `scores`, `embeddings` and `spectrogram` for the cells below.
scores, embeddings, spectrogram = model(waveform)

In [None]:
# Convert the model outputs to NumPy arrays for indexing and plotting.
scores_np = scores.numpy()
spectrogram_np = spectrogram.numpy()
# Average the scores over axis 0 and report the class with the highest mean.
infered_class = class_names[scores_np.mean(axis=0).argmax()]
print(f'The main sound is: {infered_class}')

In [None]:
# Three stacked panels: waveform, spectrogram, and top-class scores.
plt.figure(figsize=(10, 6))

# Plot the waveform.
plt.subplot(3, 1, 1)
plt.plot(waveform)
plt.xlim([0, len(waveform)])

# Plot the log-mel spectrogram (returned by the model).
# Transposed so that axis 0 of the array runs along the x-axis.
plt.subplot(3, 1, 2)
plt.imshow(spectrogram_np.T, aspect='auto', interpolation='nearest', origin='lower')

# Plot and label the model output scores for the top-scoring classes.
mean_scores = np.mean(scores, axis=0)
top_n = 10
# Indices of the top_n classes, highest mean score first.
top_class_indices = np.argsort(mean_scores)[::-1][:top_n]
plt.subplot(3, 1, 3)
plt.imshow(scores_np[:, top_class_indices].T, aspect='auto', interpolation='nearest', cmap='gray_r')

# patch_padding = (PATCH_WINDOW_SECONDS / 2) / PATCH_HOP_SECONDS
# values from the model documentation
patch_padding = (0.025 / 2) / 0.01
# Widen the x-range by the padding on both sides of the score rows.
plt.xlim([-patch_padding-0.5, scores.shape[0] + patch_padding-0.5])
# Label the top_N classes.
yticks = range(0, top_n, 1)
plt.yticks(yticks, [class_names[top_class_indices[x]] for x in yticks])
# Inverted y-limits put the best-scoring class on the top row.
_ = plt.ylim(-0.5 + np.array([top_n, 0]))