## Using mediainfo to load media technical information

In [None]:
from pymediainfo import MediaInfo
import numpy as np

file_path = 'C:\\videos\\6230046.mxf'

# Parse the file once; the result exposes its tracks via `media_info.tracks`.
media_info = MediaInfo.parse(file_path)

# Keep only the tracks whose type is "Audio", then take the dict form of each
# one (in the same order) so the fields can be inspected with plain dict access.
audio_tracks = [trk for trk in media_info.tracks if trk.track_type == "Audio"]
audio_metadata = [trk.to_data() for trk in audio_tracks]

# Report the audio track count, followed by one line of metadata per track
# (numbered from 1).
print(f"Number of audio tracks: {len(audio_tracks)}")
print("Audio track metadata:")
for idx, metadata in enumerate(audio_metadata, start=1):
    print(f"Track {idx}: {metadata}")

## The mediainfo audio structure

In [None]:
# Dump every field of the first audio track, one "name: value" pair per line.
first_track_fields = audio_metadata[0]
for key in first_track_fields:
    value = first_track_fields[key]
    print(f"{key}: {value}")

## Important properties for checking the quality of the data importer

In [None]:
# Properties of the first audio track that are used to sanity-check the data
# importer. The trailing comment on each key is the value observed for the
# sample file.
#
# Each value is printed explicitly: a bare `.get(...)` expression discards its
# result, and a notebook cell only echoes the last expression, so listing the
# lookups one per line would display a single property.
first_audio = audio_metadata[0]
for prop in (
    'format',  # PCM
    'format_settings__endianness',  # Little
    'format_settings__wrapping_mode',  # Frame (AES)
    'codec_id',  # 0D01030102060300
    'duration',  # 691558
    'bit_rate_mode',  # CBR
    'bit_rate',  # 1152000
    'samples_per_frame',  # 1601.6
    'sampling_rate',  # 48000
    'samples_count',  # 33194784
    'frame_rate',  # 29.970
    'frame_count',  # 20726
    'bit_depth',  # 24
    'delay_dropframe',  # Yes
    'stream_size',  # 99584352
    'blockalignment',  # 3
):
    # .get() yields None for a property that is absent from the track.
    print(f"{prop}: {first_audio.get(prop)}")

## Extract audio track

#### Extract the audio data from the video in memory and load the raw data into a byte array

In [None]:
import sys
import os
# Put the parent of the current working directory at the front of the import
# path. This must run before the `extractor` import below.
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '..')))

from extractor.audio_ffmpeg import AudioFFmpegExtractor

# Value passed to the extractor as its third argument.
# NOTE(review): presumably the target sampling rate in Hz — confirm against
# AudioFFmpegExtractor.extract.
sr = 16000

ffmpeg_extractor = AudioFFmpegExtractor()

# NOTE(review): the second argument (0) presumably selects the first audio
# stream of `file_path` — confirm against AudioFFmpegExtractor.extract.
buffer = ffmpeg_extractor.extract(file_path, 0, int(sr))

In [None]:
# Show the type of the extracted buffer and the type of its first element.
type(buffer), type(buffer[0])

## Get the array item size

In [None]:
# Element type of the extracted buffer, taken from its first item.
dtype = type(buffer[0])
# Width of one element in bytes, as reported by the matching NumPy dtype.
sample_size_bytes = np.dtype(dtype).itemsize
# Same width expressed in bits (8 bits per byte).
sample_size_bits = sample_size_bytes * 8
print(dtype, "sample size in bytes:", sample_size_bytes, "bits:", sample_size_bits)

In [None]:
def check_bitrate(sr, bits, ch, ms=None):
    """
    Compute the bit rate of uncompressed audio in kilobits per second.

    Args:
        sr (int): Sampling rate, in samples per second.
        bits (int): Size of one sample, in bits.
        ch (int): Number of channels.
        ms: Unused. The rate does not depend on the duration; the parameter
            is kept (and is now optional) so existing four-argument calls
            keep working.

    Returns:
        float: ``sr * bits * ch / 1000``.
    """
    bit_rate = sr * bits * ch
    return bit_rate / 1000

def check_duration(raw_data, sr):
    """
    Format the length of a sample buffer as ``minutes:seconds``.

    Args:
        raw_data: Sized sequence of samples; only its length is used.
        sr (int): Number of samples per second.

    Returns:
        str: ``"M:SS"`` with the seconds zero-padded to two digits and any
            fractional second truncated. Minutes are not wrapped into hours.
    """
    # divmod yields the same floor-quotient / remainder pair as `//` and `%`.
    whole_minutes, leftover_seconds = divmod(len(raw_data) / sr, 60)
    minutes = int(whole_minutes)
    seconds = int(leftover_seconds)
    return f"{minutes}:{seconds:02d}"
    

# Duration reported by the container metadata for the first audio track.
# check_bitrate accepts it as its last argument but does not use it.
total_duration = audio_metadata[0].get('duration')

# One channel is assumed for the extracted buffer.
print("Bitrate in kbps:", check_bitrate(sr, sample_size_bits, 1, total_duration))
# check_duration returns a "minutes:seconds" string rather than a millisecond
# count, so the label says m:ss.
print("Duration (m:ss):", check_duration(buffer, sr))

In [None]:
# Collapse the extracted buffer to one dimension, convert it to float32 and
# scale it by 1 / 32768.
# NOTE(review): 32768 is the int16 magnitude, so this presumably expects
# 16-bit samples from the extractor — confirm.
flat_samples = buffer.flatten()
norm_flat_buffer = flat_samples.astype(np.float32) / 32768.0

## Load data into librosa to draw a mel spectrogram

In [None]:
import numpy as np
import librosa

import librosa.display
import matplotlib.pyplot as plt

# Mel spectrogram of the normalised signal: 128 mel bands, with the upper
# frequency bound set to half the sampling rate.
mel_options = dict(sr=sr, n_mels=128, fmax=sr // 2)
S = librosa.feature.melspectrogram(y=norm_flat_buffer, **mel_options)

# Express the spectrogram in decibels, using its maximum as the reference.
S_dB = librosa.power_to_db(S, ref=np.max)

## Draw the spectrogram using matplotlib

In [None]:
# Render the dB-scaled mel spectrogram on a single 10x4 inch axes.
fig, ax = plt.subplots(figsize=(10, 4))
mel_image = librosa.display.specshow(
    S_dB, sr=sr, x_axis='time', y_axis='mel', fmax=sr // 2, ax=ax
)
# Colour scale for the image, with tick labels formatted as signed dB values.
fig.colorbar(mel_image, ax=ax, format='%+2.0f dB')
ax.set_title('Mel Spectrogram')
fig.tight_layout()
plt.show()

In [None]:
import librosa.display

# Render the normalised signal as a waveform on a single 10x4 inch axes.
fig, ax = plt.subplots(figsize=(10, 4))
librosa.display.waveshow(norm_flat_buffer, sr=sr, alpha=0.5, ax=ax)
ax.set_title('Waveform')
ax.set_xlabel('Time (s)')
ax.set_ylabel('Amplitude')
fig.tight_layout()
plt.show()

In [None]:
def draw_waveforms(data_list, sr):
    """
    Draw a list of waveforms in a pyplot subplot format.

    The waveforms are placed on a grid two columns wide, filled row by row,
    and each subplot is titled with the zero-based position of its waveform
    in ``data_list``. Nothing is drawn when ``data_list`` is empty.

    Args:
        data_list (list of np.ndarray): List of audio data arrays.
        sr (int): Sampling rate of the audio data.
    """
    num_waveforms = len(data_list)
    if num_waveforms == 0:
        # Zero rows would give the figure a height of 0, which matplotlib
        # rejects, so there is nothing sensible to draw.
        return

    rows = (num_waveforms + 1) // 2  # Two waveforms per row, rounded up

    plt.figure(figsize=(12, rows * 3))

    for idx, data in enumerate(data_list):
        # Subplot positions are 1-based.
        plt.subplot(rows, 2, idx + 1)
        librosa.display.waveshow(data, sr=sr, alpha=0.5)
        plt.title(f'Waveform {idx}')
        plt.xlabel('Time (s)')
        plt.ylabel('Amplitude')

    plt.tight_layout()
    plt.show()

## Draw all audio tracks to expose silent ones

In [None]:
import sys
import os
import numpy as np
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '..')))

from extractor.audio_ffmpeg import AudioFFmpegExtractor
from pymediainfo import MediaInfo


SAMPLING_RATE = 16000
file_path = 'C:\\videos\\6230046.mxf'

# Parse the file to enumerate its tracks, and create the extractor that is
# used to pull the samples of each audio track.
media_info = MediaInfo.parse(file_path)
ffmpeg_extractor = AudioFFmpegExtractor()

# One normalised, flattened waveform per audio track, in track order.
audio_tracks = []
for track in media_info.tracks:
    if track.track_type != "Audio":
        continue  # Only audio tracks are extracted

    # The track's 'stream_identifier' field selects the stream to extract.
    stream_id = track.to_data().get('stream_identifier')
    buffer = ffmpeg_extractor.extract(file_path, stream_id, SAMPLING_RATE)
    # Scale by 1 / 32768.
    # NOTE(review): presumably expects 16-bit samples — confirm.
    norm_flat_buffer = buffer.flatten().astype(np.float32) / 32768.0
    audio_tracks.append(norm_flat_buffer)

draw_waveforms(audio_tracks, SAMPLING_RATE)