In [2]:
# !pip install pyaudio
# !pip install wave  # NOTE: `wave` ships with the Python standard library, so this install is normally unnecessary
# !pip install transformers torchaudio soundfile

## Speech to Text

In [1]:
import torch
import torchaudio
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer, T5ForConditionalGeneration, T5Tokenizer
import os 


  from .autonotebook import tqdm as notebook_tqdm


In [24]:
# Load the pre-trained speech-to-text model and tokenizer, keeping a local
# copy so later runs do not need to download them again.
local_model_path = "./wav2vec2-large-960h"
local_tokenizer_path = "./wav2vec2-large-960h"
model_name = "facebook/wav2vec2-large-960h"

# Check for the saved files rather than for the bare directory: if an earlier
# run created the directory but failed before saving, a directory-only check
# would try to load from an empty folder on every later run.
model_cached = os.path.isfile(os.path.join(local_model_path, "config.json"))
tokenizer_cached = os.path.isfile(os.path.join(local_tokenizer_path, "vocab.json"))

if model_cached:
    model = Wav2Vec2ForCTC.from_pretrained(local_model_path)
else:
    os.makedirs(local_model_path, exist_ok=True)
    model = Wav2Vec2ForCTC.from_pretrained(model_name)
    model.save_pretrained(local_model_path)

if tokenizer_cached:
    tokenizer = Wav2Vec2Tokenizer.from_pretrained(local_tokenizer_path)
else:
    os.makedirs(local_tokenizer_path, exist_ok=True)
    tokenizer = Wav2Vec2Tokenizer.from_pretrained(model_name)
    tokenizer.save_pretrained(local_tokenizer_path)


In [25]:

def transcribe_audio(file_path):
    """Transcribe an audio file to text using the module-level `model` and `tokenizer`.

    The audio is loaded with torchaudio, down-mixed to one channel, resampled
    to 16 kHz when its sample rate differs, and decoded greedily by taking the
    arg-max over the model's logits.

    Args:
        file_path: Path of the audio file to transcribe.

    Returns:
        The transcription string, or None if any step fails (the error is
        printed rather than raised).
    """
    target_sample_rate = 16000
    try:
        waveform, sample_rate = torchaudio.load(file_path)
        # torchaudio.load returns a (channels, frames) tensor. Average the
        # channels into one mono signal: squeeze() alone leaves stereo input
        # 2-D, while only element [0] of the decoded batch is returned below.
        if waveform.dim() > 1:
            waveform = waveform.mean(dim=0)
        if sample_rate != target_sample_rate:
            transform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=target_sample_rate)
            waveform = transform(waveform)
        input_values = tokenizer(waveform.numpy(), return_tensors="pt").input_values
        # Inference only: skip autograd bookkeeping to save memory and time.
        with torch.no_grad():
            logits = model(input_values).logits
        predicted_ids = torch.argmax(logits, dim=-1)
        transcription = tokenizer.batch_decode(predicted_ids)[0]
        return transcription
    except Exception as e:
        # Best-effort contract: report the problem and signal failure with None.
        print(f"Error processing file {file_path}: {e}")
        return None

## Text Summarization

In [26]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
# Holds the loaded (tokenizer, model) pair so that repeated calls to
# summarize_text() do not reload them from disk every time.
_SUMMARIZER_CACHE = {}


def _load_summarizer():
    """Return the T5 (tokenizer, model) pair, loading it only on first use."""
    if "pair" not in _SUMMARIZER_CACHE:
        _SUMMARIZER_CACHE["pair"] = (
            T5Tokenizer.from_pretrained('t5-base'),
            T5ForConditionalGeneration.from_pretrained('./t5_base_trained'),
        )
    return _SUMMARIZER_CACHE["pair"]


def summarize_text(text):
    """Summarize `text` with the T5 model stored in './t5_base_trained'.

    Args:
        text: The text (e.g. a dialogue transcription) to summarize.

    Returns:
        The generated summary string. Empty or whitespace-only input yields an
        empty string without loading or running the model.
    """
    # Nothing to summarize: avoid generating text from the bare prompt.
    if not text or not text.strip():
        return ""

    t5_tokenizer, t5_model = _load_summarizer()

    # Tokenize the input with the task prefix; input is truncated to 512 tokens.
    inputs = t5_tokenizer.encode("summarize: " + text, return_tensors="pt", max_length=512, truncation=True)

    # Generate the summary with beam search.
    summary_ids = t5_model.generate(inputs, max_length=150, num_beams=4, length_penalty=2.0, early_stopping=True)
    generated_summary = t5_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
    return generated_summary

In [28]:
def transcribe_and_summarize(file_path):
    """Transcribe an audio file and summarize the resulting text.

    Args:
        file_path: Path of the audio file to process.

    Returns:
        A (transcription, summary) tuple, or (None, None) when the
        transcription is None or empty.
    """
    text = transcribe_audio(file_path)
    # Guard clause: without a usable transcription there is nothing to summarize.
    if not text:
        return None, None
    return text, summarize_text(text)

# Example usage


In [30]:
# Run the full pipeline on a sample recording and show both results.
file_path = "Recording3.mp3"
transcription, summary = transcribe_and_summarize(file_path)
for label, value in (("Transcription:", transcription), ("Summary:", summary)):
    print(label, value)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Transcription: ALEX HAY BOB ARE YOU HEADING TO THE GIM TO DAY BOB I WAS PLANNING TO BUT THE WEATHER IS SO HUMAN AND WET IT'S ALMOST UNBEARABLE ALEX TELL ME ABOUT IT I TEPED OUTSIDE FOR A MINUTE AND FELT LIKE I WAS IN ASANA BOB EXACTLY I DON'T THINK I CAN HANDLE A WORK OUT IN THIS KIND OF WEATHER IT'S JUST TOO UNCOMFORTABLE ALEX SAME HERE MAYBE WE SHOULD JUST SKIP THE JIM TO DAY AND DO SOME LIGHT EXERCISES AT HOME INSTEAD BOB THAT SOUNDS LIKE A GOOD IDEA WE CAN STILL STAY ACTIVE WITHOUT SUFFERING IN THIS HUMIITY ALEX AGREED I'LL DO SOME YOGA AND BODYWEIGHT EXERCISES WHAT ABOUT YOU BOB I MIGHT DO THE SAME PLUS IT GIVES US AN EXCUSE TO AVOID THE WEATHER ALEX TRUE LET'S STAY COOL AND AVOID THE HUMIDITY BOB DEFINITELY SEE YOU TOMORROW HOPEFULLY WITH BETTER WEATHER ALEX FEGER'S CROSSED TAKE CARE BOB BOB YOU TOO ALEX
Summary: ALEX HAY IS HEADING TO THE GIM TO DAY. He TEPED OUTSIDE for a minute and felt like he was in ASANA.
