In [16]:
!pip install python-ffmpeg pyannote.audio python-dotenv git+https://github.com/openai/whisper.git --quiet

In [2]:
import glob
import json
import os
import pickle
import shutil
import subprocess

import dotenv
import matplotlib.pyplot as plt
import numpy as np
import torch
import torchaudio
import tqdm.auto as tqdm
import whisper
from ffmpeg import FFmpeg
from IPython.display import Video
from pyannote.audio import Pipeline as PyannotePipeline
from pyannote.audio.pipelines.utils.hook import ProgressHook

%load_ext autoreload
%autoreload 2
from utils import *

dotenv.load_dotenv()

device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")

## Set constants


In [None]:
# Root directory of the source videos; the audio-extraction cell scans
# `{video_root}/*/*` for input files.
video_root = "videos"

## Extract audio from videos


In [None]:
# Extract the audio track of every source video into audio_tracks/<stem>.mp3.
# Existing outputs are skipped, so the cell can be re-run cheaply.
os.makedirs("audio_tracks", exist_ok=True)

# Videos are looked up exactly one directory level below `video_root`.
# (`recursive=True` only matters for `**` patterns, so it is not passed.)
# Directories are filtered out because ffmpeg cannot read them, and the list
# is sorted so the processing order is stable between runs.
video_paths = sorted(
    path for path in glob.glob(os.path.join(video_root, "*", "*")) if os.path.isfile(path)
)
for video_path in tqdm.tqdm(video_paths, desc="Extracting audio"):
    # splitext drops only the final extension and leaves extension-less
    # names intact (the manual split/join produced an empty stem for those).
    video_stem = os.path.splitext(os.path.basename(video_path))[0]
    audio_path = os.path.join("audio_tracks", video_stem + ".mp3")
    if os.path.exists(audio_path):
        continue  # already extracted on a previous run
    try:
        FFmpeg().input(video_path).output(audio_path).execute()
    except BaseException:
        # A failed or interrupted run can leave a partial file behind, which
        # the exists-check above would then skip forever. Remove it.
        if os.path.exists(audio_path):
            os.remove(audio_path)
        raise

## Diarize


In [None]:
# Run speaker diarization on every extracted audio track and cache each
# result as diarizations/<stem>.pkl. Cached results are skipped on re-run.
diarization_pipeline = PyannotePipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1", use_auth_token=os.getenv("HUGGINGFACE_TOKEN")
)
# NOTE(review): pyannote.audio 3.x is understood to return None (after
# printing a hint) when the gated model cannot be downloaded — confirm.
# Failing here gives a clearer error than an AttributeError on `.to(...)`.
if diarization_pipeline is None:
    raise RuntimeError(
        "Could not load pyannote/speaker-diarization-3.1; check that HUGGINGFACE_TOKEN "
        "is set and that the model's user conditions have been accepted."
    )
diarization_pipeline = diarization_pipeline.to(device)

os.makedirs("diarizations", exist_ok=True)
audio_paths = sorted(glob.glob("audio_tracks/*.mp3"))
for audio_path in tqdm.tqdm(audio_paths, desc="Diarizing"):
    audio_stem = os.path.splitext(os.path.basename(audio_path))[0]
    diarization_path = os.path.join("diarizations", audio_stem + ".pkl")
    if os.path.exists(diarization_path):
        continue  # already diarized on a previous run
    waveform, sample_rate = torchaudio.load(audio_path)
    with ProgressHook() as hook:
        diarization = diarization_pipeline({"waveform": waveform, "sample_rate": sample_rate}, hook=hook)
    # Write to a side file and rename, so an interrupted dump never leaves a
    # truncated .pkl that later cells would try to load.
    partial_path = diarization_path + ".part"
    with open(partial_path, "wb") as f:
        pickle.dump(diarization, f)
    os.replace(partial_path, diarization_path)

## Identify speakers


### Generate speech samples for manual annotation


In [None]:
# For every diarized speaker with enough total speaking time, cut their
# longest turns out of the video and join them into one short "supercut"
# that a human can use to put a name to the diarization label.
diarization_paths = sorted(glob.glob("diarizations/*.pkl"))
num_clips = 4  # number of clips joined into each supercut
# Minimum total speaking time a speaker needs to get a supercut, in the unit
# of the diarization timestamps (presumably seconds — confirm).
minimum_speaking_time = 50

os.makedirs("tmp/supercuts", exist_ok=True)

supercuts = []
for diarization_path in diarization_paths:
    # NOTE: pickle.load can execute arbitrary code; only load diarization
    # files produced by this notebook.
    with open(diarization_path, "rb") as f:
        diarization = pickle.load(f)
    # The id is everything before the first dot. It is kept in this form
    # because it is passed on with each supercut to the annotation step.
    video_id = os.path.basename(diarization_path).split(".")[0]
    # Search `video_root` at any depth: audio extraction reads videos from
    # sub-directories, so a flat `videos/<id>.*` lookup can miss them.
    video_matches = sorted(
        path
        for path in glob.glob(
            os.path.join(glob.escape(video_root), "**", glob.escape(video_id) + ".*"), recursive=True
        )
        if os.path.isfile(path)
    )
    if not video_matches:
        raise FileNotFoundError(f"No video file found for {video_id!r} under {video_root!r}")
    video_path = video_matches[0]

    # Group (start, end) pairs by diarization label.
    intervals_by_speaker = {}
    for segment, _track, speaker in diarization.itertracks(yield_label=True):
        intervals_by_speaker.setdefault(speaker, []).append((segment.start, segment.end))
    total_time_by_speaker = {
        speaker: sum(end - start for start, end in intervals)
        for speaker, intervals in intervals_by_speaker.items()
    }

    for speaker in tqdm.tqdm(diarization.labels(), desc=f"Generating super cuts for each speaker in video {video_id}"):
        speaker_intervals = intervals_by_speaker.get(speaker, [])
        if not speaker_intervals or total_time_by_speaker[speaker] < minimum_speaking_time:
            continue
        # Longest turns first.
        intervals_by_speaker[speaker] = sorted(
            speaker_intervals, key=lambda interval: interval[1] - interval[0], reverse=True
        )
        extracted_interval_paths = []
        # Slice instead of indexing range(num_clips): a speaker may have
        # fewer than num_clips turns, which used to raise IndexError before
        # the padding below could run.
        for interval_index, interval in enumerate(intervals_by_speaker[speaker][:num_clips]):
            clip_path = f"tmp/{video_id}_{speaker}_{interval_index:02d}.mp4"
            extract_interval(video_path, interval, clip_path)
            extracted_interval_paths.append(clip_path)
        # Pad with the last clip so every supercut is built from num_clips parts.
        if len(extracted_interval_paths) < num_clips:
            extracted_interval_paths += [extracted_interval_paths[-1]] * (num_clips - len(extracted_interval_paths))
        output_file = f"tmp/supercuts/{video_id}_{speaker}.mp4"
        concatenate_videos(extracted_interval_paths, output_file)
        supercuts.append((video_id, speaker, output_file))

### Manually annotate speech samples


In [None]:
# Show the manual annotation form for the (video_id, speaker, supercut path)
# tuples collected above. The helper is not defined in this notebook;
# presumably it is provided by the `utils` import.
# NOTE(review): this is presumably the step that produces
# diarization_mappings.json, which the next section reads — confirm in utils.
display_speaker_identification_form(supercuts)

### Generate supercut for each true speaker


In [None]:
# Translate diarization labels into real speaker names using the manual
# annotations, save every named interval to intervals_by_speaker.json, and
# build one supercut per named speaker in supercuts/<speaker>.mp4.
with open("diarization_mappings.json", "r") as f:
    diarization_mappings = json.load(f)

intervals_by_speaker = {}

# Sorted so the interval (and therefore clip) order is stable between runs.
diarization_paths = sorted(glob.glob("diarizations/*.pkl"))
for diarization_path in diarization_paths:
    # NOTE: pickle.load can execute arbitrary code; only load diarization
    # files produced by this notebook.
    with open(diarization_path, "rb") as f:
        diarization = pickle.load(f)
    video_id = os.path.basename(diarization_path).split(".")[0]
    # Videos missing from the mappings file are skipped, just like
    # unannotated speakers, instead of raising KeyError.
    speaker_names = diarization_mappings.get(video_id, {})
    for segment, _track, speaker in diarization.itertracks(yield_label=True):
        if speaker not in speaker_names:
            continue
        record = {
            "video_id": video_id,
            "start": segment.start,
            "end": segment.end,
        }
        intervals_by_speaker.setdefault(speaker_names[speaker], []).append(record)

with open("intervals_by_speaker.json", "w") as f:
    json.dump(intervals_by_speaker, f)

os.makedirs("supercuts", exist_ok=True)
video_path_by_id = {}  # resolve each source video once, not once per clip
for speaker, records in intervals_by_speaker.items():
    clip_dir = f"tmp/speaker_clips/{speaker}"
    os.makedirs(clip_dir, exist_ok=True)
    clip_paths = []
    for record in tqdm.tqdm(records, desc=f"Generating supercut for {speaker}"):
        video_id = record["video_id"]
        if video_id not in video_path_by_id:
            # Search `video_root` at any depth: audio extraction reads videos
            # from sub-directories, so a flat lookup can miss them.
            video_matches = sorted(
                path
                for path in glob.glob(
                    os.path.join(glob.escape(video_root), "**", glob.escape(video_id) + ".*"), recursive=True
                )
                if os.path.isfile(path)
            )
            if not video_matches:
                raise FileNotFoundError(f"No video file found for {video_id!r} under {video_root!r}")
            video_path_by_id[video_id] = video_matches[0]
        interval = (record["start"], record["end"])
        clip_path = f"{clip_dir}/{video_id}_{interval[0]:.2f}_{interval[1]:.2f}.mp4"
        extract_interval(video_path_by_id[video_id], interval, clip_path)
        clip_paths.append(clip_path)
    # Concatenate exactly the clips generated above, in interval order.
    # Globbing the directory gave an arbitrary order and also picked up stale
    # clips from earlier runs. dict.fromkeys drops duplicate paths (two
    # intervals that round to the same file name) while keeping order.
    concatenate_videos(list(dict.fromkeys(clip_paths)), f"supercuts/{speaker}.mp4")

### Extract audio from supercuts


In [12]:
# Extract the audio track of each speaker supercut into
# tmp/supercut_audio/<speaker>.mp3.
# tmp/ is wiped first, which also discards any intermediate clips left there.
shutil.rmtree("tmp", ignore_errors=True)
os.makedirs("tmp/supercut_audio", exist_ok=True)
supercut_video_paths = sorted(path for path in glob.glob("supercuts/*") if os.path.isfile(path))
for supercut_video_path in tqdm.tqdm(supercut_video_paths, desc="Extracting supercut audio"):
    # splitext removes only the final extension; splitting on the first "."
    # truncated speaker names that contain a period.
    speaker_name = os.path.splitext(os.path.basename(supercut_video_path))[0]
    supercut_audio_path = os.path.join("tmp", "supercut_audio", speaker_name + ".mp3")
    # tmp/ was just recreated, so this only triggers when two supercut files
    # share the same stem.
    if os.path.exists(supercut_audio_path):
        continue
    FFmpeg().input(supercut_video_path).output(supercut_audio_path).execute()

  0%|          | 0/10 [00:00<?, ?it/s]

## Transcribe


In [20]:
!git clone https://github.com/ggerganov/whisper.cpp.git > /dev/null 2>&1
!bash ./whisper.cpp/models/download-ggml-model.sh base.en
!cd whisper.cpp && make -j10 > /dev/null 2>&1


Downloading ggml model base.en from 'https://huggingface.co/ggerganov/whisper.cpp' ...
Model base.en already exists. Skipping download.
whisper_init_from_file_with_params_no_state: loading model from 'models/ggml-base.en.bin'
whisper_init_from_file_with_params_no_state: failed to open 'models/ggml-base.en.bin'
error: failed to initialize whisper context


In [4]:
# Transcribe each speaker's supercut audio with whisper.cpp and store the
# text as speaker_transcripts/<speaker>.txt.
os.makedirs("speaker_transcripts", exist_ok=True)
os.makedirs("tmp", exist_ok=True)
# NOTE(review): never appended to in this cell; kept in case other cells
# reference it.
results = []
# One scratch WAV is reused for every speaker. The path is absolute because
# whisper.cpp runs with a different working directory.
wav_path = os.path.abspath(os.path.join("tmp", "output.wav"))
for supercut_audio_path in tqdm.tqdm(sorted(glob.glob("tmp/supercut_audio/*.mp3")), desc="Transcribing"):
    # splitext removes only the final extension; splitting on the first "."
    # truncated speaker names that contain a period.
    speaker_name = os.path.splitext(os.path.basename(supercut_audio_path))[0]
    output_path = os.path.join("speaker_transcripts", speaker_name + ".txt")
    # Argument lists (no shell) keep speaker names containing quotes or `$`
    # from being interpreted by a shell. check=True turns a failed command
    # into an exception instead of a silent skip.
    # Convert to 16 kHz, mono, 16-bit PCM WAV before handing it to whisper.cpp.
    subprocess.run(
        ["ffmpeg", "-y", "-i", supercut_audio_path, "-ar", "16000", "-ac", "1", "-c:a", "pcm_s16le", wav_path],
        check=True,
        capture_output=True,
    )
    # Run from inside whisper.cpp/ as the shell version did (`cd whisper.cpp`).
    subprocess.run(
        ["./main", "-otxt", "-f", wav_path],
        cwd="whisper.cpp",
        check=True,
        capture_output=True,
    )
    # The transcript is expected next to the input as <wav_path>.txt.
    shutil.move(wav_path + ".txt", output_path)

  0%|          | 0/10 [00:00<?, ?it/s]

tmp/supercut_audio/Akashia.mp3
tmp/supercut_audio/Victoria Porkchop Parker.mp3
tmp/supercut_audio/Shannel.mp3
tmp/supercut_audio/RuPaul.mp3
tmp/supercut_audio/Tammie Brown.mp3
tmp/supercut_audio/Nina Flowers.mp3
tmp/supercut_audio/BeBe Zahara Benet.mp3
tmp/supercut_audio/Jade.mp3
tmp/supercut_audio/Rebecca Glasscock.mp3
tmp/supercut_audio/Ongina.mp3
