In [1]:
# !pip install -U -q openai-whisper

# NOTE(review): the 'ignore' filter below hides every warning raised later in
# this kernel, deprecation notices from the audio/ML libraries included.
# Consider narrowing it to the specific categories that are known noise.
import warnings
warnings.filterwarnings('ignore')

In [10]:
import whisper
import torch 
import librosa

import numpy as np

# Whisper inference is pinned to the CPU. Handing the device to load_model
# places the weights there directly, instead of loading them on the library's
# default device first and copying them over afterwards.
# Change "cpu" to "cuda" here to run on a GPU.
model = whisper.load_model("base", device="cpu")

In [31]:
def get_phrases(segments):
  """Merge consecutive transcription segments into sentence-like phrases.

  Segment texts are concatenated until one of them contains a period; the
  accumulated text is then emitted as a single phrase that spans from the
  start of its first segment to the end of its last one. Text that is still
  pending when the input runs out (no closing period) is emitted as a final
  phrase instead of being silently discarded.

  Args:
    segments: Iterable of mappings with 'text', 'start' and 'end' keys.

  Returns:
    A list of dicts with the keys 'text' (whitespace-stripped), 'start' and
    'end' (both rounded to two decimals), in input order.
  """
  phrases = []
  pending, start, last_seg = '', None, None

  for seg in segments:
    text = seg['text']
    last_seg = seg

    if not pending:
      # First segment of a new phrase: remember where the phrase begins.
      start = round(seg['start'], 2)
    pending += text

    # Any period closes the phrase, including ones inside abbreviations
    # or decimal numbers.
    if '.' in text:
      phrases.append({
          'text': pending.strip(),
          'start': start,
          'end': round(seg['end'], 2)
      })
      pending = ''

  # Flush the unterminated tail so the end of the transcript is not lost.
  if pending.strip():
    phrases.append({
        'text': pending.strip(),
        'start': start,
        'end': round(last_seg['end'], 2)
    })

  return phrases


def transcribe(path):
  """Transcribe the media file at ``path`` into sentence-level phrases.

  The module-level Whisper ``model`` produces the raw transcription; its
  'segments' entry is then regrouped by ``get_phrases``.

  Args:
    path: File handed to ``model.transcribe``.

  Returns:
    The list of phrase dicts built by ``get_phrases``.
  """
  segments = model.transcribe(path)['segments']
  return get_phrases(segments)


def extract_mfcc(path, mfcc=True):
    """Compute a time-averaged MFCC vector for the audio stored at ``path``.

    The file is loaded with librosa's default settings, silence is trimmed
    with ``librosa.effects.trim``, and 64 MFCCs are averaged over all frames.

    Args:
        path: File handed to ``librosa.load``.
        mfcc: When false, no features are computed and ``None`` is returned
            (the file is still loaded and trimmed first).

    Returns:
        The per-coefficient mean of the MFCC matrix, or ``None`` when
        ``mfcc`` is false.
    """
    signal, rate = librosa.load(path)
    # Drop silence before computing features.
    trimmed, _ = librosa.effects.trim(signal)

    if not mfcc:
        return None

    coefficients = librosa.feature.mfcc(y=trimmed, sr=rate, n_mfcc=64)
    # Average each coefficient over the time axis.
    averaged = np.mean(coefficients.T, axis=0)
    return np.hstack((averaged))
    

In [5]:
# Source video to analyse; the path is relative to the working directory.
f_name = "pro_NBC.mp4"

# Sentence-level phrases with 'text', 'start' and 'end' keys, as returned by
# transcribe(). Later cells cut the video on these start/end values.
phrases = transcribe(f_name)

In [6]:
# Preview the first five phrases to sanity-check the sentence grouping.
phrases[ : 5]

[{'text': 'But we begin tonight with a tragedy that is unique to America.',
  'start': 0.0,
  'end': 4.52},
 {'text': 'Another school massacre, this time in Nashville, Tennessee, where a 28-year-old woman shot and killed three students and three staff members at a private Christian school.',
  'start': 4.52,
  'end': 15.32},
 {'text': 'The shooter was armed with two assault-style rifles and a handgun.',
  'start': 15.32,
  'end': 19.64},
 {'text': 'The Nashville Police Department said the shooter was a former student at the school who identifies as trans.',
  'start': 19.64,
  'end': 26.92},
 {'text': 'The children who were fatally shot are Evelyn Dijkhaus, Halley Scruggs, and William Kinney, all elementary school age.',
  'start': 26.92,
  'end': 35.32}]

In [32]:
import subprocess

def convert_wav(path, out_path="__.wav"):
    """Transcode the audio of ``path`` to 16-bit PCM with ffmpeg.

    The command is passed as an argument list without a shell, so paths that
    contain spaces or shell metacharacters reach ffmpeg verbatim. The default
    output uses a ``.wav`` name to match the ``pcm_s16le`` codec; the previous
    hard-coded ``__.mp3`` target paired that codec with an MP3 file name.

    Args:
        path: Input media file.
        out_path: Destination file; overwritten if it already exists (``-y``).

    Returns:
        ffmpeg's exit status (0 on success). ffmpeg runs with
        ``-loglevel quiet``, so this value is the only failure signal.
    """
    command = [
        "ffmpeg", "-loglevel", "quiet", "-y",
        "-i", str(path),
        "-acodec", "pcm_s16le",
        str(out_path),
    ]
    return subprocess.run(command).returncode


# One MFCC vector per phrase, in the same order as `phrases`.
features = []

for phrase in phrases:
    # Cut the phrase's time span out of the source video (stream copy, no
    # re-encoding) into a scratch file that is overwritten on every iteration.
    cmd = ["ffmpeg", "-loglevel", "quiet", "-y", "-i", f_name, "-ss", str(phrase['start']), "-to", str(phrase['end']), "-c", "copy", "__.mp4"]
    # check=True: a failed cut must not pass unnoticed, otherwise the scratch
    # file left by the previous phrase would be featurized again and the
    # features would no longer line up with their phrases.
    subprocess.run(cmd, check=True)

    features.append(extract_mfcc("__.mp4"))


In [34]:
# Sanity check: there should be exactly one feature vector per phrase.
print(len(features), len(phrases))

119 119


In [37]:
import json 

# Persist the stacked MFCC vectors; np.save adds the ".npy" suffix itself.
np.save("features", np.array(features))

# Store the phrases (text plus start/end) as compact JSON next to the features.
with open("phrases.json", 'w') as f:
    f.write(json.dumps(phrases))


In [39]:
# Shape of the stacked feature matrix: (number of phrases, n_mfcc).
np.array(features).shape

(119, 64)