In [None]:
import sys

# Location of the sample video on each supported platform; any platform that
# is not listed falls back to the macOS-style path.
video_path_by_platform = {
    'win32': 'C:\\videos\\TheGreat_2_2001_169_2398_ProRes422HQ_ENG20_ENG51_BPO20_BPO51_Primary_A17913780.mov',
    'linux': '/mnt/c/videos/TheGreat_2_2001_169_2398_ProRes422HQ_ENG20_ENG51_BPO20_BPO51_Primary_A17913780.mov',
}
file_path = video_path_by_platform.get(
    sys.platform,
    '/Users/leandro.correia/Documents/videos/TheGreat_2_2001_169_2398_ProRes422HQ_ENG20_ENG51_BPO20_BPO51_Primary_A17913780.mov',
)

In [None]:
from pymediainfo import MediaInfo
import numpy as np


# Parse the container once, then keep only the tracks flagged as audio
media_info = MediaInfo.parse(file_path)
audio_tracks = [
    candidate for candidate in media_info.tracks
    if candidate.track_type == "Audio"
]
# Dictionary form of each audio track, in the same order as audio_tracks
audio_metadata = [audio_track.to_data() for audio_track in audio_tracks]

# Summarise what was found
print(f"Number of audio tracks: {len(audio_tracks)}")
print("Audio track metadata:")
for idx, metadata in enumerate(audio_metadata, start=1):
    print(f"Track {idx}: {metadata}")

In [None]:
import sys
import os
import numpy as np
# Put the parent of the current working directory at the front of sys.path.
# Presumably that is where the local `extractor` package imported below lives
# -- verify; this only works when the kernel is started from this folder.
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '..')))

from extractor.audio_ffmpeg import AudioFFmpegExtractor
from pymediainfo import MediaInfo


# Sample rate (Hz) requested from the extractor for every track
SAMPLING_RATE = 16000

# Parse the container and prepare the extractor
media_info = MediaInfo.parse(file_path)
ffmpeg_extractor = AudioFFmpegExtractor()

# One entry per audio track: stream id, normalised samples, channel layout
audio_tracks = []
for track in media_info.tracks:
    if track.track_type != "Audio":
        continue

    # Read the track's dictionary form once and reuse it below
    track_info = track.to_data()
    stream_id = track_info.get('stream_identifier')

    buffer = ffmpeg_extractor.extract(file_path, stream_id, SAMPLING_RATE)

    # Flatten to 1D and scale by 2**15.
    # NOTE(review): presumably the extractor yields signed 16-bit PCM, which
    # would map the samples into [-1.0, 1.0) -- confirm against the extractor.
    norm_flat_buffer = buffer.flatten().astype(np.float32) / 32768.0

    audio_tracks.append({
        'id': stream_id,
        'data': norm_flat_buffer,
        'channel_layout': track_info.get('channel_layout'),
    })

# Show which streams were extracted
for audio in audio_tracks:
    print(audio.get('id'), audio.get('channel_layout'))

In [None]:
import numpy as np
from dataclasses import dataclass
from typing import List, Tuple
from datetime import timedelta


@dataclass
class Silence:
    """One contiguous run of silent samples found in an audio buffer."""

    # Offset of the first silent sample from the beginning of the buffer.
    start: timedelta
    # Length of the silent run.
    duration: timedelta
    # Index of the first silent sample, or -1 when no silence was found.
    index_start: int
    # Index of the last silent sample (inclusive), or -1 when no silence was found.
    index_end: int


class SilenceProcessor:
    """Locate runs of quiet samples in an audio buffer.

    Loudness is measured per sample as ``20 * log10(|amplitude|)``, i.e. in
    decibels relative to an amplitude of 1.0. A run only counts as a silence
    when it lasts at least ``min_silence``.
    """

    def is_silence(self, amplitude: float, threshold_db: int) -> bool:
        """Return True when a single sample is quieter than ``threshold_db``.

        Args:
            amplitude: Sample value; only its magnitude matters.
            threshold_db: Level in dB below which the sample is silent.

        Returns:
            True for an amplitude of exactly zero, otherwise whether
            ``20 * log10(|amplitude|)`` is below ``threshold_db``.
        """
        if amplitude == 0:
            return True
        level_db = 20 * np.log10(abs(amplitude))
        return bool(level_db < threshold_db)

    @staticmethod
    def _to_mono(samples: np.ndarray) -> np.ndarray:
        """Average a 2D (channels x samples) buffer down to a single channel."""
        samples = np.asarray(samples)
        if samples.ndim == 2:
            samples = samples.mean(axis=0)
        return samples

    @staticmethod
    def _min_sample_count(min_silence: timedelta, sample_rate: int) -> int:
        """Convert the minimum silence duration into a number of samples."""
        return int(min_silence.total_seconds() * sample_rate)

    @staticmethod
    def _to_timedelta(sample_count: int, sample_rate: int) -> timedelta:
        """Convert a sample count (or sample index) into a time offset."""
        return timedelta(seconds=sample_count / sample_rate)

    @staticmethod
    def _silent_runs(
        samples: np.ndarray,
        threshold_db: int,
        min_count: int
    ) -> Tuple[np.ndarray, np.ndarray]:
        """Return (starts, lengths) of every silent run of at least ``min_count`` samples."""
        # Same decibel test as is_silence(), applied to the whole buffer at
        # once. log10(0) is -inf, so the divide warning is suppressed and
        # exact zeros are flagged explicitly.
        with np.errstate(divide='ignore'):
            level_db = 20 * np.log10(np.abs(samples))
        silent = (samples == 0) | (level_db < threshold_db)

        # Pad with False on both sides so every run has a rising and a falling
        # edge; edges then alternate start, end (exclusive), start, end, ...
        padded = np.concatenate(([False], silent, [False]))
        edges = np.flatnonzero(padded[1:] != padded[:-1])
        starts = edges[0::2]
        lengths = edges[1::2] - starts

        long_enough = lengths >= min_count
        return starts[long_enough], lengths[long_enough]

    def get_silence_duration(
        self,
        samples: np.ndarray,  # 1D, or 2D shaped (channels, samples)
        sample_rate: int,
        channels: int,
        min_silence: timedelta,
        silence_threshold_db: int = -40
    ) -> Tuple[timedelta, timedelta, int, int]:
        """Find the first silent run lasting at least ``min_silence``.

        A run that reaches the end of the buffer is subject to the same
        minimum length as any other run.

        Args:
            samples: 1D sample buffer, or a 2D (channels, samples) buffer that
                is averaged across channels first. Interleaved 1D
                multi-channel data is not de-interleaved.
            sample_rate: Samples per second of ``samples``.
            channels: Accepted for interface compatibility; not used.
            min_silence: Minimum duration a run must have to be reported.
            silence_threshold_db: Level in dB below which a sample is silent.

        Returns:
            ``(start, duration, index_start, sample_count)`` of the first
            qualifying run, or ``(timedelta(milliseconds=-1),
            timedelta(milliseconds=-1), -1, -1)`` when there is none.
        """
        mono = self._to_mono(samples)
        min_count = self._min_sample_count(min_silence, sample_rate)
        starts, lengths = self._silent_runs(mono, silence_threshold_db, min_count)

        if starts.size == 0:
            return timedelta(milliseconds=-1), timedelta(milliseconds=-1), -1, -1

        first_start = int(starts[0])
        first_length = int(lengths[0])
        return (
            self._to_timedelta(first_start, sample_rate),
            self._to_timedelta(first_length, sample_rate),
            first_start,
            first_length
        )

    def get_silence(
        self,
        samples: np.ndarray,
        sample_rate: int,
        channels: int,
        min_silence: timedelta,
        silence_threshold_db: int = -40
    ) -> Silence:
        """Return the first qualifying silent run as a ``Silence``.

        Takes the same arguments as ``get_silence_duration``. When nothing is
        found, the result has ``index_start == index_end == -1`` and
        negative ``start``/``duration``.
        """
        start, duration, index_start, index_count = self.get_silence_duration(
            samples, sample_rate, channels, min_silence, silence_threshold_db
        )
        # Keep the "not found" sentinel at -1 instead of deriving -3 from it.
        index_end = index_start + index_count - 1 if index_start >= 0 else -1
        return Silence(
            start=start,
            duration=duration,
            index_start=index_start,
            index_end=index_end
        )

    def get_all_silences(
        self,
        samples: np.ndarray,
        sample_rate: int,
        channels: int,
        min_silence: timedelta,
        silence_threshold_db: int = -40
    ) -> List[Silence]:
        """Return every silent run lasting at least ``min_silence``, in order.

        Takes the same arguments as ``get_silence_duration``. The buffer is
        reduced to mono and scanned once; each start time is computed directly
        from the run's sample index, so rounding does not accumulate from one
        silence to the next.
        """
        mono = self._to_mono(samples)
        min_count = self._min_sample_count(min_silence, sample_rate)
        starts, lengths = self._silent_runs(mono, silence_threshold_db, min_count)

        return [
            Silence(
                start=self._to_timedelta(run_start, sample_rate),
                duration=self._to_timedelta(run_length, sample_rate),
                index_start=run_start,
                index_end=run_start + run_length - 1
            )
            for run_start, run_length in zip(starts.tolist(), lengths.tolist())
        ]


In [None]:
# Grab the first extracted track's sample array. Despite the name, these are
# the normalised float32 samples built in the extraction cell, not raw bytes.
raw_bytes = audio_tracks[0].get('data')
# Display the type of a single sample as the cell output
type(raw_bytes[0])

In [None]:
# Scan the first extracted track for silences of at least half a second
processor = SilenceProcessor()
silences = processor.get_all_silences(
    audio_tracks[0].get('data'),
    # Use the same constant the samples were extracted with, so the reported
    # timings stay correct if the extraction rate is ever changed.
    sample_rate=SAMPLING_RATE,
    channels=1,
    min_silence=timedelta(milliseconds=500),
    silence_threshold_db=-40
)

for silence in silences:
    print(silence)

In [None]:
# Report the gap separating each detected silence from the one before it
for i, (previous, current) in enumerate(zip(silences, silences[1:]), start=1):
    prev_end = previous.start + previous.duration
    curr_start = current.start
    distance = (curr_start - prev_end).total_seconds()
    print(f"Distance between silence {i-1} and {i}: {distance:.3f} seconds")

In [None]:
import matplotlib.pyplot as plt

# Waveform of the first track, with time in seconds on the x axis
audio_data = audio_tracks[0].get('data')
time_axis = np.arange(len(audio_data)) / SAMPLING_RATE

fig, ax = plt.subplots(figsize=(15, 4))
ax.plot(time_axis, audio_data, label='Audio waveform', alpha=0.7)

# Shade every detected silence; only the first span carries a label so the
# legend shows a single 'Silence' entry.
for span_number, silence in enumerate(silences):
    start_sec = silence.start.total_seconds()
    end_sec = (silence.start + silence.duration).total_seconds()
    span_label = 'Silence' if span_number == 0 else ""
    ax.axvspan(start_sec, end_sec, color='red', alpha=0.3, label=span_label)

ax.set_xlabel('Time (seconds)')
ax.set_ylabel('Amplitude')
ax.set_title('Audio waveform with detected silences')
ax.legend()
fig.tight_layout()
plt.show()