In [11]:
# Vocals-only track analysed by the cells below; the path is relative to the
# directory the notebook kernel runs in.
audio_path = "./vocals_only/test_1_vocals.wav"

In [12]:
# import librosa
# import numpy as np
# import matplotlib.pyplot as plt


# def voice_activity_detection(audio_path, threshold=0.01):
#     # Load the audio file
#     y, sr = librosa.load(audio_path, sr=None)

#     # Compute the STFT magnitude, then the root-mean-square (RMS) energy of each frame
#     rms = librosa.feature.rms(
#         S=np.abs(librosa.stft(y, n_fft=512)), frame_length=512, hop_length=128
#     )

#     # Normalize the RMS energy
#     rms_normalized = rms / np.max(rms)
#     del rms

#     # Detect voice activity based on the threshold
#     voice_activity = rms_normalized > threshold

#     # Plot the results
#     plt.figure(figsize=(21, 8), dpi=1000)
#     plt.subplot(2, 1, 1)
#     librosa.display.waveshow(y, sr=sr)
#     plt.title("Waveform")
#     plt.subplot(2, 1, 2)
#     plt.plot(rms_normalized[0], label="RMS Energy")
#     plt.plot(voice_activity[0], label="Voice Activity", color="r")
#     plt.title("Voice Activity Detection")
#     plt.xlabel("Frames")
#     plt.ylabel("Normalized RMS Energy")
#     plt.legend()
#     plt.tight_layout()
#     plt.show()

#     return voice_activity


# # Example usage
# audio_path = "./vocals_only/test_1_vocals.wav"
# voice_activity = voice_activity_detection(audio_path)

In [13]:
# import librosa
# import numpy as np
# import matplotlib.pyplot as plt
# import pandas as pd


# def voice_activity_detection(audio_path, threshold=0.01, min_silence_duration=1.5):
#     # Load the audio file
#     y, sr = librosa.load(audio_path, sr=None)

#     # Compute the Short-Time Fourier Transform and RMS energy
#     rms = librosa.feature.rms(
#         S=np.abs(librosa.stft(y, n_fft=512)), frame_length=512, hop_length=128
#     )

#     # Normalize the RMS energy
#     rms_normalized = rms / np.max(rms)

#     # Detect voice activity based on the threshold
#     voice_activity = rms_normalized > threshold

#     # Convert frame indices to time
#     times = librosa.frames_to_time(
#         np.arange(len(voice_activity[0])), sr=sr, hop_length=128
#     )

#     # Find intervals of voice activity
#     intervals = []
#     start_time = None
#     for i, active in enumerate(voice_activity[0]):
#         if active and start_time is None:
#             start_time = times[i]
#         elif not active and start_time is not None:
#             end_time = times[i]
#             intervals.append((start_time, end_time))
#             start_time = None

#     # Concatenate intervals with small gaps
#     concatenated_intervals = []
#     for start, end in intervals:
#         if (
#             concatenated_intervals
#             and start - concatenated_intervals[-1][1] < min_silence_duration
#         ):
#             concatenated_intervals[-1] = (concatenated_intervals[-1][0], end)
#         else:
#             concatenated_intervals.append((start, end))

#     # Prepare data for CSV
#     data = []
#     for start, end in concatenated_intervals:
#         avg_energy = np.mean(
#             rms_normalized[0][int(start * sr / 128) : int(end * sr / 128)]
#         )
#         data.append(
#             {
#                 "Start": librosa.time_to_frames(start, sr=sr, hop_length=128),
#                 "End": librosa.time_to_frames(end, sr=sr, hop_length=128),
#                 "Energy": avg_energy,
#                 "Threshold": threshold,
#             }
#         )

#     # Create a DataFrame and save to CSV
#     df = pd.DataFrame(data)
#     df.to_csv("voice_activity_intervals.csv", index=False)

#     # Plot the results
#     plt.figure(figsize=(21, 8), dpi=1000)
#     plt.subplot(2, 1, 1)
#     librosa.display.waveshow(y, sr=sr)
#     plt.title("Waveform")
#     plt.subplot(2, 1, 2)
#     plt.plot(rms_normalized[0], label="RMS Energy")
#     plt.plot(voice_activity[0], label="Voice Activity", color="r")
#     plt.title("Voice Activity Detection")
#     plt.xlabel("Frames")
#     plt.ylabel("Normalized RMS Energy")
#     plt.legend()
#     plt.tight_layout()
#     plt.show()

#     return voice_activity


# voice_activity = voice_activity_detection(audio_path)

In [14]:
import librosa
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from datetime import timedelta


def voice_activity_detection(
    audio_path,
    threshold=0.01,
    min_silence_duration=1.5,
    output_csv="voice_activity_intervals.csv",
):
    """Detect voiced intervals in an audio file and write them to a CSV file.

    The per-frame RMS energy (taken from an STFT with ``n_fft=512`` and a hop
    of 128 samples) is normalised by its peak. Frames whose normalised energy
    exceeds ``threshold`` count as active, and consecutive active runs that
    are separated by less than ``min_silence_duration`` seconds are merged
    into one interval.

    One CSV row is written per interval with the columns ``Start`` and ``End``
    (``h:mm:ss``, truncated to whole seconds), ``Energy`` (mean normalised RMS
    over the interval, including any merged pauses) and ``Threshold``. The
    header row is written even when no interval is found.

    Args:
        audio_path: Path of the audio file to analyse.
        threshold: Normalised RMS level (0..1) above which a frame is active.
        min_silence_duration: Pauses shorter than this many seconds do not
            split an interval.
        output_csv: Destination of the CSV file.

    Returns:
        None. The result is the CSV file.
    """
    n_fft = 512
    hop_length = 128  # n_fft // 4, i.e. the hop librosa.stft uses by default
    columns = ["Start", "End", "Energy", "Threshold"]

    # sr=None keeps the file's native sampling rate instead of resampling.
    y, sr = librosa.load(audio_path, sr=None)

    # Per-frame RMS energy derived from the STFT magnitude; [0] drops the
    # leading axis of length 1 that librosa.feature.rms returns.
    magnitude = np.abs(librosa.stft(y, n_fft=n_fft, hop_length=hop_length))
    rms = librosa.feature.rms(
        S=magnitude, frame_length=n_fft, hop_length=hop_length
    )[0]

    peak = np.max(rms) if rms.size else 0.0
    if peak > 0:
        energy = rms / peak
        active = energy > threshold
    else:
        # Digital silence (or no frames at all): dividing by the zero peak
        # would produce NaN, so report no activity instead.
        energy = np.zeros_like(rms)
        active = np.zeros(rms.shape, dtype=bool)

    # times[k] is the time of frame k. The extra entry at index n_frames is
    # the boundary used to close an interval that is still active when the
    # audio ends.
    n_frames = active.size
    times = librosa.frames_to_time(
        np.arange(n_frames + 1), sr=sr, hop_length=hop_length
    )

    # Padding with False on both sides turns every active run into exactly one
    # rising and one falling edge, so edges alternate
    # (first active frame, first inactive frame after the run).
    padded = np.concatenate(([False], active, [False]))
    edges = np.flatnonzero(padded[1:] != padded[:-1])
    runs = zip(edges[::2].tolist(), edges[1::2].tolist())

    # Merge runs separated by a pause shorter than min_silence_duration.
    merged = []
    for first, stop in runs:
        if merged and times[first] - times[merged[-1][1]] < min_silence_duration:
            merged[-1] = (merged[-1][0], stop)
        else:
            merged.append((first, stop))

    # Frame indices are used directly for the energy slice; converting the
    # times back to frames can be off by one through float rounding.
    rows = [
        {
            "Start": str(timedelta(seconds=int(times[first]))),
            "End": str(timedelta(seconds=int(times[stop]))),
            "Energy": np.mean(energy[first:stop]),
            "Threshold": threshold,
        }
        for first, stop in merged
    ]

    # Explicit columns keep the header row even when rows is empty.
    pd.DataFrame(rows, columns=columns).to_csv(output_csv, index=False)


# Run the detector on the vocals track. The detected intervals are written to
# voice_activity_intervals.csv; the function returns nothing, so the cell
# shows no output.
voice_activity_detection(audio_path=audio_path)

In [15]:
# import librosa
# import numpy as np
# import matplotlib.pyplot as plt

# # Load the audio file
# y, sr = librosa.load(
#     audio_path, sr=None, duration=300
# )  # Load first 5 minutes (300 seconds)

# # Calculate the RMS energy
# frame_length = 512
# hop_length = 128
# rms = librosa.feature.rms(y=y, frame_length=frame_length, hop_length=hop_length)[0]

# # Convert RMS to decibels
# rms_db = librosa.amplitude_to_db(rms, ref=np.max)

# # Plot the RMS energy
# plt.figure(figsize=(26, 7), dpi=800)
# plt.plot(rms_db, label="RMS Energy (dB)")
# plt.axhline(y=-40, color="r", linestyle="--", label="Silence Threshold")
# plt.title("RMS Energy of Audio")
# plt.xlabel("Frames")
# plt.ylabel("RMS Energy (dB)")
# plt.legend()
# plt.show()

# # Determine silence vs conversation
# silence_threshold = -40  # Adjust this threshold based on your audio characteristics
# silence_frames = rms_db < silence_threshold

# # Print results
# print(f"Number of silent frames: {np.sum(silence_frames)}")
# print(f"Number of conversation frames: {len(silence_frames) - np.sum(silence_frames)}")

In [16]:
# import librosa
# import soundfile as sf
# import numpy as np
# import datetime


# def analyze_audio_decibels(
#     audio_path, output_file, duration_limit_seconds=240
# ):  # Limit to 4 minutes
#     """
#     Analyzes an audio file, calculates the RMS decibels for each second,
#     and stores the results in a text file.

#     Args:
#         audio_path: Path to the audio file.
#         output_file: Path to the output text file.
#         duration_limit_seconds: Maximum duration to analyze in seconds.
#     """
#     try:
#         y, sr = librosa.load(
#             audio_path, sr=None, duration=duration_limit_seconds
#         )  # Load with original sampling rate and apply duration limit
#     except librosa.LibrosaError as e:
#         print(f"Error loading audio file: {e}")
#         return
#     except sf.SoundFileError as e:
#         print(f"Error reading audio file: {e}")
#         return
#     except Exception as e:
#         print(f"An unexpected error occurred: {e}")
#         return

#     with open(output_file, "w") as f:
#         f.write("Time - Db\n")  # Header

#         for second in range(
#             int(min(len(y) / sr, duration_limit_seconds))
#         ):  # Iterate up to the duration limit or the audio length
#             start_sample = second * sr
#             end_sample = (second + 1) * sr

#             # Ensure we don't go beyond the audio length
#             end_sample = min(end_sample, len(y))

#             if start_sample >= len(y):
#                 break  # Exit loop if we've reached the end of the audio

#             frame = y[int(start_sample) : int(end_sample)]

#             if len(frame) == 0:
#                 db = -np.inf  # If frame is empty assign -inf dB
#             else:
#                 rms = np.sqrt(np.mean(np.square(frame)))
#                 if rms > 0:
#                     db = 20 * np.log10(rms)
#                 else:
#                     db = -np.inf  # Handle silence to avoid log of zero

#             time_str = str(datetime.timedelta(seconds=second))
#             f.write(f"{time_str} - Db = {db:.2f}\n")


# output_text_file = "voice_activity_decibels.txt"
# analyze_audio_decibels(audio_path, output_text_file)
# print(f"Decibel analysis complete. Results saved to {output_text_file}")


import librosa
import soundfile as sf
import numpy as np
import datetime


def _rms_decibels(frame):
    """Return the RMS level of ``frame`` in dB, or ``-inf`` if it is empty or all zeros."""
    if len(frame) == 0:
        return -np.inf
    rms = np.sqrt(np.mean(np.square(frame)))
    # log10(0) is undefined, so digital silence is reported as -inf.
    return 20 * np.log10(rms) if rms > 0 else -np.inf


def _group_active_seconds(active_seconds, min_interval_gap):
    """Group ascending second indices into inclusive ``(first, last)`` intervals.

    A new interval starts only when two consecutive active seconds differ by
    more than ``min_interval_gap``; smaller jumps are bridged.
    """
    intervals = []
    first = last = None
    for second in active_seconds:
        if first is None:
            first = second
        elif second - last > min_interval_gap:
            intervals.append((first, last))
            first = second
        last = second
    if first is not None:
        # Close the interval that is still open after the final active second.
        intervals.append((first, last))
    return intervals


def analyze_audio_decibels(
    audio_path,
    output_file_db,
    output_file_intervals,
    threshold_db=-40,
    min_interval_gap=2,
):
    """Write per-second RMS levels and the intervals louder than a threshold.

    The audio is cut into consecutive one-second frames; a trailing partial
    second is ignored. For each frame the RMS level in dB (relative to full
    scale 1.0) is written to ``output_file_db``. Seconds louder than
    ``threshold_db`` are grouped into intervals, which are written to
    ``output_file_intervals`` as ``start - end`` lines, where both values are
    the ``h:mm:ss`` stamps of the first and the last active second.

    If the audio cannot be loaded, the error is printed and the function
    returns without touching either output file.

    Args:
        audio_path: Path to the audio file.
        output_file_db: Path to the output text file for decibel values.
        output_file_intervals: Path to the output text file for the intervals
            above the threshold.
        threshold_db: Level in dB a second must exceed to count as active.
        min_interval_gap: Two active seconds whose indices differ by at most
            this many seconds end up in the same interval; a larger jump
            starts a new interval. Adjacent seconds differ by 1, so the
            default of 2 bridges a single quiet second.

    Returns:
        None.
    """
    try:
        # sr=None keeps the file's native sampling rate.
        y, sr = librosa.load(audio_path, sr=None)
    except librosa.LibrosaError as e:
        print(f"Error loading audio file: {e}")
        return
    except sf.SoundFileError as e:
        print(f"Error reading audio file: {e}")
        return
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
        return

    # Only whole seconds are analysed, so every frame below is non-empty.
    whole_seconds = int(len(y) / sr)
    levels = [
        _rms_decibels(y[int(second * sr) : int((second + 1) * sr)])
        for second in range(whole_seconds)
    ]

    active_seconds = [
        second for second, db in enumerate(levels) if db > threshold_db
    ]
    intervals = _group_active_seconds(active_seconds, min_interval_gap)

    # The files are opened only after the analysis succeeded, so a failure
    # above cannot leave truncated output behind.
    with open(output_file_db, "w") as f_db:
        f_db.write("Time - Db\n")
        for second, db in enumerate(levels):
            time_str = str(datetime.timedelta(seconds=second))
            f_db.write(f"{time_str} - Db = {db:.2f}\n")

    with open(output_file_intervals, "w") as f_intervals:
        f_intervals.write("Start - End\n")
        for start, end in intervals:
            start_time_str = str(datetime.timedelta(seconds=start))
            end_time_str = str(datetime.timedelta(seconds=end))
            f_intervals.write(f"{start_time_str} - {end_time_str}\n")


# Output locations: one file for the per-second dB values, one for the
# intervals above the threshold.
output_db_file = "voice_activity_decibels.txt"
output_intervals_file = "voice_activity_decibels_ACTIVE.txt"
analyze_audio_decibels(audio_path, output_db_file, output_intervals_file)
# NOTE(review): this message is printed unconditionally. analyze_audio_decibels
# prints the error and returns early when the audio cannot be loaded, so
# "Analysis complete" also appears in that case.
print(
    f"Analysis complete. dB values saved to {output_db_file}, active intervals to {output_intervals_file}"
)

Analysis complete. dB values saved to voice_activity_decibels.txt, active intervals to voice_activity_decibels_ACTIVE.txt
