In [None]:
!pip install huggingsound moviepy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting huggingsound
  Downloading huggingsound-0.1.6-py3-none-any.whl (28 kB)
Collecting datasets<3.0.0,>=2.6.1 (from huggingsound)
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m18.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting jiwer<3.0.0,>=2.5.1 (from huggingsound)
  Downloading jiwer-2.6.0-py3-none-any.whl (20 kB)
Collecting librosa<0.10.0,>=0.9.2 (from huggingsound)
  Downloading librosa-0.9.2-py3-none-any.whl (214 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m214.3/214.3 kB[0m [31m24.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torch!=1.12.0,<1.13.0,>=1.7 (from huggingsound)
  Downloading torch-1.12.1-cp310-cp310-manylinux1_x86_64.whl (776.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m776.3/776.3 MB[0m [31m1.8 MB/s[0m eta [36m0:00:

In [None]:
def convert_video_file_to_audio_file(video_file_path):
  """Extract the audio track of a video file into a sibling ``.wav`` file.

  The output path is the input path with its extension replaced by
  ``_audio.wav`` (for example ``talk.mp4`` -> ``talk_audio.wav``).

  Args:
    video_file_path: Path of the video file to read.

  Returns:
    The path of the audio file that was written.

  Raises:
    ValueError: If the opened clip exposes no audio track.
  """
  import os
  import moviepy.editor as mp

  # splitext copes with extensions of any length (".webm", ".mov", ...);
  # slicing off a fixed four characters is only right for ".xyz" names.
  base_path, _extension = os.path.splitext(video_file_path)
  audio_file_path = base_path + "_audio.wav"

  clip = mp.VideoFileClip(video_file_path)
  try:
    if clip.audio is None:
      raise ValueError(f"No audio track found in {video_file_path!r}")
    # NOTE(review): bitrate="16k" is kept from the original call; whether it
    # has any effect on a .wav output should be confirmed against moviepy.
    clip.audio.write_audiofile(audio_file_path, bitrate="16k")
  finally:
    # Always release whatever the clip holds open, even if writing failed.
    clip.close()

  return audio_file_path

def get_model(model_name="jonatasgrosman/wav2vec2-large-xlsr-53-english"):
  """Build a huggingsound ``SpeechRecognitionModel``.

  Args:
    model_name: Identifier handed to ``SpeechRecognitionModel``. Defaults to
      the English wav2vec2 checkpoint this notebook was written against, so
      calling ``get_model()`` with no arguments behaves as before.

  Returns:
    The constructed ``SpeechRecognitionModel`` instance.
  """
  from huggingsound import SpeechRecognitionModel
  return SpeechRecognitionModel(model_name)

def get_transcription(audio_file, model):
  """Transcribe one audio file and return its single result.

  ``model.transcribe`` is called with a one-element list containing
  ``audio_file``; the first item of whatever it returns is handed back.
  """
  return model.transcribe([audio_file])[0]

def get_spans_of_text(keyword, transcription):
  """Locate every occurrence of ``keyword`` in the transcribed text.

  ``keyword`` is used as a regular-expression pattern and searched for in
  ``transcription['transcription']``.

  Returns:
    A list of ``((start, end), matched_text)`` tuples, one per match in
    order of appearance; an empty list when nothing matches.
  """
  import re
  haystack = transcription['transcription']
  return [(hit.span(), hit.group()) for hit in re.finditer(keyword, haystack)]

def convert_millisecond_to_second(millis):
  """Convert a millisecond count to seconds.

  Args:
    millis: Millisecond value; anything ``int()`` accepts (fractional
      input is truncated to whole milliseconds by that conversion).

  Returns:
    The same duration in seconds, as a float.
  """
  # Deliberately no "% 60": wrapping at one minute would make 75 s and 15 s
  # indistinguishable, which corrupts positions in anything longer than 60 s.
  return int(millis) / 1000

def get_time_frames(transcription, text_spans):
  """Map matched text spans onto (start, end) times.

  Args:
    transcription: Mapping with ``'start_timestamps'`` and
      ``'end_timestamps'`` sequences. They are indexed with the character
      offsets from ``text_spans``, so they are assumed to hold one entry per
      character of the transcribed text -- TODO confirm against the
      transcriber's output format.
    text_spans: Iterable of ``((start, end), text)`` pairs, where ``end`` is
      exclusive (as produced by ``re.Match.span``).

  Returns:
    A list of ``(start_time, end_time)`` tuples, each value converted with
    ``convert_millisecond_to_second``. Zero-width spans cover no characters
    and therefore contribute no entry.
  """
  start_timestamps = transcription['start_timestamps']
  end_timestamps = transcription['end_timestamps']
  time_frames = []
  for span, _matched_text in text_spans:
    first, stop = span[0], span[1]
    if stop <= first:
      # An empty match has no last character to take an end time from.
      continue
    start_time = convert_millisecond_to_second(start_timestamps[first])
    # `stop` is exclusive: the last matched character sits at stop - 1.
    # Indexing with `stop` itself reads the following character and raises
    # IndexError when the match reaches the end of the text.
    end_time = convert_millisecond_to_second(end_timestamps[stop - 1])
    time_frames.append((start_time, end_time))
  return time_frames

In [None]:
# End-to-end run: find when `keyword` is spoken in the video's audio track.
# Inputs: the video to analyse and the pattern to look for in its transcript.
video_file_path = "sample_video.mp4"
keyword = "layers and models"
# Build the speech-recognition model via get_model().
model = get_model()
print("Model is loaded..........")
# Write the video's audio track to a .wav file next to the video.
audio_file = convert_video_file_to_audio_file(video_file_path)
print("Audio File is generated...........")
# Transcribe the audio; the result dict is read for its 'transcription',
# 'start_timestamps' and 'end_timestamps' entries in the steps below.
transcription = get_transcription(audio_file, model)
print("Transcription is predicted.........")
# Character spans of every keyword match in the transcribed text.
text_spans = get_spans_of_text(keyword=keyword, transcription=transcription)
print("Text spans are created..........")
# Translate those character spans into (start, end) times.
time_frames = get_time_frames(transcription, text_spans)
print(f"Time frames are of keyword : {keyword} \n {time_frames}")

INFO:huggingsound.speech_recognition.model:Loading model...


Model is loaded..........
MoviePy - Writing audio in sample_video_audio.wav




MoviePy - Done.
Audio File is generated...........


100%|██████████| 1/1 [00:29<00:00, 29.97s/it]

Transcription is predicted.........
Text spans are created..........
Time frames are of keyword : layers and models 
 [(14.38, 15.38)]



