In [None]:
from urllib.parse import parse_qs, urlparse

import pandas as pd
from youtube_transcript_api import YouTubeTranscriptApi

pd.set_option('display.width', 10000)

url = "https://www.youtube.com/watch?v=wDAmezoNHJY"

# Read the video id from the "v" query parameter so that any extra
# parameters in the URL (e.g. "&t=42s", "&list=...") do not end up in the id.
vid_id = parse_qs(urlparse(url).query)["v"][0]

# Retrieve the available transcripts and pick the manually created French one.
# find_manually_created_transcript() raises NoTranscriptFound when nothing
# matches (it never returns ""), so reaching the next statement means a
# transcript object exists and no emptiness check is needed.
transcript_list = YouTubeTranscriptApi.list_transcripts(vid_id)
transcript = transcript_list.find_manually_created_transcript(['fr', 'fr-FR'])
df = pd.DataFrame(transcript.fetch())
#df.to_csv("transcript.csv", index=False)

In [None]:
from os import path
from yt_dlp import YoutubeDL
from pydub import AudioSegment
from subprocess import STDOUT, DEVNULL, call, run

# Fetch best quality audio, re-encode to mono wav

def fetch_encode_wav(url):
    """Download the audio track of a video and store it as a WAV file.

    A metadata-only lookup is performed first; the 'webpage_url' it reports
    is then handed to a second YoutubeDL instance configured with the
    FFmpegExtractAudio post-processor. The output template is fixed to
    'audio/audio.wav'.

    Args:
        url: Address of the video to fetch.
    """
    # Post-processing step: extract the audio with ffmpeg, WAV preferred.
    extract_audio = {
        'key': 'FFmpegExtractAudio',
        'preferredcodec': 'wav',
        'preferredquality': '320',
    }
    # Extra ffmpeg arguments: 16000 Hz sample rate (-ar), one channel (-ac).
    ffmpeg_args = ['-ar', '16000', '-ac', '1']

    ydl_opts = {
        'format': 'bestaudio/best',
        'postprocessors': [extract_audio],
        'postprocessor_args': ffmpeg_args,
        'prefer_ffmpeg': True,
        'keepvideo': False,
        'outtmpl': 'audio/audio.wav',
    }

    # Resolve the metadata only (download=False) to obtain the page URL.
    metadata = YoutubeDL().extract_info(url=url, download=False)
    page_url = metadata['webpage_url']

    with YoutubeDL(ydl_opts) as downloader:
        downloader.download([page_url])

    print("Download complete !")

    # Note: raw "-f s16le" output is not supported by librosa.

# Run the download + WAV conversion for the source video.
fetch_encode_wav("https://www.youtube.com/watch?v=wDAmezoNHJY")

In [None]:
from subprocess import run
from pydub import AudioSegment
import numpy as np
import pandas as pd
import soundfile as sf
import librosa
from pydub import AudioSegment
from pydub.playback import play
import os.path

# Keep only clips containing specific word

pd.set_option('display.width', 10000)

#df = pd.read_csv("transcript.csv")
# Add each caption's end time right after 'start'. DataFrame.insert() raises
# ValueError when the column already exists, so the guard keeps this cell
# re-runnable without having to rebuild `df` first.
if 'stop' not in df.columns:
    df.insert(2, 'stop', df['start'] + df['duration'], False)

# Select rows whose text contains the whole word "photo". na=False makes
# captions with missing text count as non-matching instead of putting NaN
# into the boolean mask.
df2 = df[df['text'].str.contains(r"\bphoto\b", regex=True, na=False)]
print(f"{df2.shape[0]} occurrence(s)")
#print(df2)
# Make audio, generate SRT

def cut_merge(cuts, audio_in, audio_out):
    """Extract time ranges from a WAV file and write them back-to-back.

    Args:
        cuts: List of (start, end) pairs expressed in seconds, in the order
            they should appear in the output. The list is not modified.
        audio_in: Path of the WAV file to read.
        audio_out: Path of the WAV file to write.

    Raises:
        ValueError: If ``cuts`` is empty, since there is nothing to export.
    """
    print(audio_in)
    print(audio_out)

    # Fuse every run of back-to-back cuts (one ends exactly where the next
    # begins) into a single cut. A new list is built instead of popping from
    # ``cuts`` while indexing it: popping shifted the remaining items under
    # the loop index, so runs of three or more cuts were only partly merged,
    # and the caller's list was altered as a side effect.
    merged = []
    for start, end in cuts:
        if merged and merged[-1][1] == start:
            merged[-1] = (merged[-1][0], end)
        else:
            merged.append((start, end))

    print(f"total cuts : {len(merged)}, removed {len(cuts) - len(merged)} useless cuts")

    if not merged:
        # sum([]) would be the int 0, which has no export() method.
        raise ValueError("cut_merge() needs at least one (start, end) cut")

    # Opening file and extracting segments
    segment = AudioSegment.from_wav(audio_in)
    # Slice bounds are expressed in ms, hence the conversion from seconds.
    clips = [segment[start * 1000:end * 1000] for start, end in merged]

    merged_clips = sum(clips)
    merged_clips.export(audio_out, format="wav")

def rm_silence(audio_in, audio_out, top_db=30):
    """Strip silent passages from an audio file and write what remains.

    Args:
        audio_in: Path of the audio file to read.
        audio_out: Path of the file to write.
        top_db: Threshold forwarded to ``librosa.effects.split``. Defaults to
            30, the value that used to be hard-coded here.
    """
    # remove silence with librosa
    samples, sr = librosa.load(audio_in, mono=True, sr=None, duration=None, offset=0.0)
    intervals = librosa.effects.split(samples, top_db=top_db)

    # Audio files should ideally be normalized before removing silence;
    # otherwise top_db may have to be tuned per file.

    if len(intervals):
        # Join the kept slices as one array rather than copying every sample
        # into a Python list.
        voiced = np.concatenate([samples[begin:end] for begin, end in intervals])
    else:
        # Nothing was kept: write an empty signal of the same dtype.
        voiced = samples[:0]
    sf.write(audio_out, voiced, sr)

# Cut the matching caption ranges out of the downloaded audio, then write a
# second copy of the clip with the silences removed.
cut_merge(
    list(zip(df2['start'], df2["stop"])),
    "audio/audio.wav",
    "audio/audio_clip.wav",
)
rm_silence("audio/audio_clip.wav", "audio/audio_clip_silenced.wav")

In [None]:
# Use vosk to get words timestamp

from vosk import Model, KaldiRecognizer, SetLogLevel
import sys
import os
import wave
import subprocess
import srt
import json
import datetime

SetLogLevel(-1)

# Report a missing model by raising: exit() is the interactive-shell helper
# added by the `site` module and is not meant to be used in programs.
if not os.path.exists("model"):
    raise FileNotFoundError(
        "Please download the model from https://alphacephei.com/vosk/models and unpack as 'model' in the current folder."
    )

sample_rate=16000
model = Model("model")
rec = KaldiRecognizer(model, sample_rate)
# transcribe() reads the per-word entries of each result, so word output is enabled.
rec.SetWords(True)
audio_clip = "audio/audio_clip.wav"

# ffmpeg decodes the clip and streams it on stdout as raw s16le audio,
# one channel, at `sample_rate`; transcribe() consumes this pipe.
process = subprocess.Popen(['ffmpeg', '-loglevel', 'quiet', '-i',
                            audio_clip,
                            '-ar', str(sample_rate) , '-ac', '1', '-f', 's16le', '-'],
                            stdout=subprocess.PIPE)

# Number of recognised words grouped into one subtitle entry.
WORDS_PER_LINE = 1

def transcribe():
    """Feed the ffmpeg output to the recognizer and return subtitle entries.

    Reads the module-level ``process`` stdout in 4000-byte chunks until it is
    exhausted, passes each chunk to the module-level ``rec`` and collects the
    JSON results. Every group of ``WORDS_PER_LINE`` recognised words becomes
    one ``srt.Subtitle``.

    Returns:
        list: The subtitles in recognition order. ``start`` and ``end`` are
        the values found in the recognizer's word entries, passed through
        unchanged (presumably seconds as floats -- confirm against vosk).
    """
    raw_results = []
    # iter() with a sentinel stops at the first empty read (end of stream).
    for chunk in iter(lambda: process.stdout.read(4000), b""):
        if rec.AcceptWaveform(chunk):
            raw_results.append(rec.Result())
    raw_results.append(rec.FinalResult())

    subtitles = []
    for raw in raw_results:
        parsed = json.loads(raw)
        # Results without a 'result' key carry no word entries.
        if 'result' in parsed:
            tokens = parsed['result']
            for offset in range(0, len(tokens), WORDS_PER_LINE):
                group = tokens[offset:offset + WORDS_PER_LINE]
                subtitles.append(srt.Subtitle(
                    index=len(subtitles),
                    content=" ".join(token['word'] for token in group),
                    start=group[0]['start'],
                    end=group[-1]['end'],
                ))
    return subtitles

# Run the recognizer; `words` holds one subtitle entry per group of WORDS_PER_LINE words.
words = transcribe()

In [None]:
# Keep the time range of every recognised word that is exactly "photo".
labels = [(word.start, word.end) for word in words if word.content == "photo"]

# Count before cut_merge() runs, so the figure is the number of matched
# words rather than the number of ranges left after adjacent ones are merged.
word_count = len(labels)
print(f"{word_count} words transcribed !")

# With no match there is nothing to cut or play.
if labels:
    cut_merge(labels, "audio/audio_clip.wav", "audio/audio_final.wav")
    print("playing final audio, turn volume up...")
    final_segment = AudioSegment.from_wav("audio/audio_final.wav")
    play(final_segment)

In [None]:
# AVERAGE RUN TIME: ~50 s, with (18-21)/26 occurrences transcribed, i.e. ~30 % error (transcribed/occurrences)

# Throughput: roughly one 10-minute video per minute, i.e. 60 videos per hour; if each 10-minute video yields 5 transcribed words, that is 60 * 5 = 300 new words per hour
# Results are worse when transcribing the unsilenced clip
# There are still other vosk models to try!