# Building Speech Summarization Web App with Hugging Face Transformers and Gradio

## 1. Define extract audio, speech recognize and summary functions

In [1]:
import os
import pafy
import torch
import librosa
import transformers
import gradio as gr
import moviepy.editor as mp

import warnings
import logging

from pathlib import Path
from gradio.mix import Series
from pydub import AudioSegment
from transformers import Wav2Vec2ForMaskedLM, Wav2Vec2Tokenizer

# warnings.filterwarnings("ignore")

### 1.1 Preprocessing data

In [2]:
def audio_from_url(url, dst_dir='data', name=None, format='wav'):
    """
    Download video from url and save the audio from video
    
    Parameters
    ----------
    
    url : str
        The video url
    dst_dir : str
        Download video in this directory
    name : str
        Audiofile's name, if none, assign the name as the video's title
    format :
        Format type for audio file, such as 'wav', 'mp3'. WAV preferred.
        
    Return
    -------
    str
        audiofile's file path, pathlib instance
    """
    try:
        os.mkdir(dst_dir)
    except(FileExistsError):
        pass
    os.chdir(dst_dir)
    
    # download youtube video
    video = pafy.new(url)
    video = video.getbest()
    video.download()
    
    if not name:
        name = video.title
    f_name = fr'{name}.{format}'
    
    video = mp.VideoFileClip(fr'{video.filename}', verbose=False)
    
    # save audio file
    video.audio.write_audiofile(f_name, verbose=False, logger=None)
    video.close()
    os.remove(video.filename)
    os.chdir('..')
    
    return Path(f"{dst_dir}/{f_name}")

In [3]:
def split_audio_by_sec(src, start, end, dst_dir='audio_chunks', filename='test'):
    """
    Split audio by seconds, from start point to end point.
    Then save the audio clip in destination directory by assigned filename.
    
    Parameters
    ----------
    
    src : str
        audio file path
    
    start : int
        The starting position of the clip.
    
    end : int
        The ending position of the clip.
    
    dst_dir : string
        Name for destination directory, save clip under this folder.
    
    filename : string
        File name for the clip.
    """
    
    t1 = start * 1000
    t2 = end * 1000
    clip = src[t1:t2]
    
    dst_dir = Path(f"{dst_dir}/{filename}")
    
    clip.export(dst_dir, bitrate='192k', format='wav')

In [4]:
def split_audio(src, sec_per_split=20, dst_dir='audio_chunks'):
    """ Split audio into clips.
    """
    
    # make directory for audio clips
    try:
        os.mkdir(dst_dir)
    except(FileExistsError):
        pass
    
    # load in audio with 16k frame rate
    audio = AudioSegment.from_file(src).set_frame_rate(16000)
    
    total_secs = int(audio.duration_seconds)
    
    # Spliting audio
    for i in range(0, total_secs, sec_per_split):
        idx = str(i//sec_per_split).zfill(3)
        f_name = f"chunk_{idx}.wav"
        split_audio_by_sec(audio, i, i+sec_per_split, dst_dir=dst_dir, filename=f_name)


### 1.2 Load tokenizer and model, define recognition functions

In [5]:
tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-large-960h-lv60-self")

model = Wav2Vec2ForMaskedLM.from_pretrained("facebook/wav2vec2-large-960h-lv60-self")

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'Wav2Vec2CTCTokenizer'. 
The class this function is called from is 'Wav2Vec2Tokenizer'.
Some weights of Wav2Vec2ForMaskedLM were not initialized from the model checkpoint at facebook/wav2vec2-large-960h-lv60-self and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
def transcript_audio_clips(src='audio_chunks'):
    """ Speech recognition on all the audio clips. """
    
    result = ''
            
    for file in os.listdir(src):
        # load audio data
        file = Path(f"{src}/{file}")
        audio, _ = librosa.load(file, sr=16000)
        
        # speech recognition with pretrained model
        input_values = tokenizer(audio, return_tensors="pt").input_values
        logits = model(input_values).logits
        prediction = torch.argmax(logits, dim=-1)
        transcription = tokenizer.batch_decode(prediction)[0]
        
        # concatenate transcripts
        result += transcription.lower() + ' '
        os.remove(file)
        
    return result

In [7]:
def transcript_audio(url):
    """ Perform speech recognition on url linked video. """
    
    audio = audio_from_url(url)
    
    # directory for audio chunks/clips
    dst_dir = audio.with_suffix('')
    
    # split audio by certain duration(/sec)
    # if encounter memory crash, adjust sec_per_split to smaller number
    split_audio(src=audio, sec_per_split=10, dst_dir=dst_dir)
    
    # recognize speech through all the clips, obtain result for whole speech
    transcript = transcript_audio_clips(src=dst_dir)
    
    os.remove(audio)
    os.rmdir(dst_dir)
    
    return transcript

In [8]:
speech_recognizer = gr.Interface(
    transcript_audio,
    inputs=gr.inputs.Textbox(),
    outputs='text'
)

summarizer = gr.Interface.load("sshleifer/distilbart-cnn-12-6", src='huggingface')

## 2. Launch gradio

In [9]:
Series(
    speech_recognizer, 
    summarizer,
    title="Video Speech Summarization",
    description="Given a video url, generate a summary on the video\'s speech.",
    inputs=gr.inputs.Textbox(lines=2, placeholder='Paste video url here...', label='URL'),
    outputs=gr.outputs.Textbox(label="English Summary")
).launch()

Running locally at: http://127.0.0.1:7860/
To create a public link, set `share=True` in `launch()`.
Interface loading below...


(<Flask 'gradio.networking'>, 'http://127.0.0.1:7860/', None)