# Building a Speech Summarization Web App with Hugging Face Transformers and Gradio

## 1. Define audio extraction, speech recognition, and summarization functions

In [1]:
import os
import subprocess
import time

import gradio as gr
import pafy
import validators
import whisper
from gradio.mix import Series
from transformers import pipeline
from wordcloud import WordCloud, STOPWORDS

In [2]:
# Identifiers of the two models the app relies on
ASR_MODEL_NAME = 'base.en'
SUMMARIZER_MODEL_NAME = "facebook/bart-large-cnn"

# Whisper model used for automatic speech recognition (ASR)
asr_model = whisper.load_model(ASR_MODEL_NAME)

# Summarization model fetched from Hugging Face and wrapped as a callable Gradio interface
summarizer = gr.Interface.load(SUMMARIZER_MODEL_NAME, src='huggingface')

Fetching model from: https://huggingface.co/facebook/bart-large-cnn


### 1.1 Preprocessing data

In [3]:
def audio_from_url(url, dst_dir='data', name=None, format='wav'):
    """ Download the audio track of an online video with yt-dlp

    :param url: str, the video url
    :param dst_dir: destination directory for the saved audio (created if missing)
    :param name: audio file's base name without extension; if none, 'audio' is used
    :param format: format type for audio file, such as 'wav', 'mp3'. WAV is preferred.
    :return: path of audio, or None if the url is empty/invalid or the download failed
    """

    # empty textbox values and malformed urls are rejected before anything is executed
    if not url or not validators.url(url):
        return None

    os.makedirs(dst_dir, exist_ok=True)
    path = os.path.join(dst_dir, f"{name or 'audio'}.{format}")

    # Arguments are passed as a list (no shell involved), so characters such as
    # '&' or ';' in the url are never interpreted as shell syntax.
    command = [
        "yt-dlp",
        "-f", "ba",
        "-x",
        "--audio-format", format,
        "-o", path,
        "--quiet",
        url,
    ]

    try:
        completed = subprocess.run(command, check=False)
    except OSError:
        # yt-dlp is not installed or could not be started
        return None

    # do not hand a path to the caller when the download did not produce a file
    if completed.returncode != 0 or not os.path.isfile(path):
        return None

    return path

### 1.2 Define speech recognition, summarization, and word cloud functions

In [4]:
def speech_to_text(audio, beam_size=5, best_of=5, language='en'):
    """ Transcribe speech with the module-level whisper model `asr_model`

    :param audio: audio input forwarded to `asr_model.transcribe` (the app passes a file path)
    :param beam_size: forwarded unchanged to `asr_model.transcribe`
    :param best_of: forwarded unchanged to `asr_model.transcribe`
    :param language: language code forwarded to `asr_model.transcribe`
    :return: the 'text' entry of the transcription result, or '' if no audio was given
    """

    # Nothing to transcribe, e.g. the button was clicked before any audio was loaded.
    # The isinstance check keeps the emptiness test away from non-string inputs.
    if audio is None or (isinstance(audio, str) and not audio):
        return ''

    result = asr_model.transcribe(audio, language=language, beam_size=beam_size, best_of=best_of)

    return result['text']

In [5]:
def text_summarization(text):
    """ Summarize text with the module-level `summarizer`

    :param text: str, the text to summarize
    :return: whatever `summarizer` returns for the text, or '' if the text is empty or blank
    """

    # do not call the summarizer when there is nothing to summarize
    if not text or not text.strip():
        return ''

    return summarizer(text)

In [6]:
def wordcloud_func(text: str, out_path='wordcloud_output.png'):
    """ generate wordcloud based on text

    :param text: str, the text the word cloud is built from
    :param out_path: path of the image file the word cloud is written to
    :return: out_path, or None if the text is empty/blank or contains no word to plot
    """

    # None, empty and whitespace-only text cannot produce a word cloud
    if not text or not text.strip():
        return None

    wc = WordCloud(
        background_color='white',
        stopwords=STOPWORDS,
        height=600,
        width=600
    )

    try:
        wc.generate(text)
    except ValueError:
        # raised by WordCloud when no word is left to plot,
        # e.g. the text consists only of stopwords or punctuation
        return None

    wc.to_file(out_path)
    return out_path

## 2. Build and launch the Gradio app

In [7]:
# Top-level Blocks container of the app; every component and event handler
# below is registered inside its context.
demo = gr.Blocks(title="Speech Summarization")

with demo:
    # data preparation
    # Row with the URL input (plus its clear button) next to the audio component.
    with gr.Row():
        with gr.Column():
            url = gr.Textbox(label="URL", placeholder="video url")

            # the clear button overwrites the URL textbox with an empty string
            b0 = gr.Button("clear")
            b0.click(lambda x: '', inputs=url, outputs=url)

        # configured with type="filepath"; also receives the path returned by audio_from_url
        speech = gr.Audio(label="speech", type="filepath")

        # every change of the URL textbox calls audio_from_url and sends its result to `speech`
        url.change(audio_from_url, inputs=url, outputs=speech)

    # ASR
    text = gr.Textbox(label="Transcription", placeholder="transcription")

    # decoding parameters passed to speech_to_text
    with gr.Row():
        beam_size = gr.Slider(1, 10, value=5, step=1, label="param: beam_size")
        best_of = gr.Slider(1, 10, value=5, step=1, label="param: best_of")
        
    with gr.Row():
        # the clear button overwrites the transcription with an empty string
        b1_0 = gr.Button("clear")
        b1_0.click(lambda x: '', inputs=text, outputs=text)
        # `language` is not wired to any component, so speech_to_text uses its default
        b1 = gr.Button("Recognize Speech")
        b1.click(speech_to_text, inputs=[speech, beam_size, best_of], outputs=text)

    # summary
    summary = gr.Textbox(label="Summarization")
    
    with gr.Row():
        # the clear button overwrites the summary with an empty string
        b2_0 = gr.Button("clear")
        b2_0.click(lambda x: '', inputs=summary, outputs=summary)
        # summarizes the current content of the transcription textbox
        b2 = gr.Button("Summarize")
        b2.click(text_summarization, inputs=text, outputs=summary)

    # wordcloud
    image = gr.Image(label="wordcloud", shape=(400,400), show_label=False)

    # every change of the transcription calls wordcloud_func and sends its result to `image`
    text.change(wordcloud_func, inputs=text, outputs=image)

    # example video URLs offered as inputs for the URL textbox
    examples = gr.Examples(examples=["https://www.youtube.com/watch?v=DuX4K4eeTz8",
                                     "https://www.youtube.com/watch?v=nepOSEGHHCQ"],
                           inputs=[url])

# start the web app
demo.launch()

Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


(<gradio.routes.App at 0x7f95102f3d90>, 'http://127.0.0.1:7860/', None)