In [None]:
%matplotlib inline

In [None]:
from google.colab import drive 
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%cd  /content/drive/MyDrive/speech2/punctuation-restoration-master/src

/content/drive/MyDrive/speech2/punctuation-restoration-master/src


In [None]:
!pip install webrtcvad
!pip install transformers==4.18.0
!pip install TorchCRF
!pip install https://github.com/kpu/kenlm/archive/master.zip 
!pip install pyctcdecode==0.3.0
!pip install datasets==2.0.0 



# Speech Recognition with Wav2Vec2

**Author**: [Moto Hira](mailto:moto@fb.com)

This tutorial shows how to perform speech recognition using
pre-trained models from wav2vec 2.0
([paper](https://arxiv.org/abs/2006.11477)).


## Overview

The process of speech recognition looks like the following.

1. Extract the acoustic features from audio waveform

2. Estimate the class of the acoustic features frame-by-frame

3. Generate hypothesis from the sequence of the class probabilities

Torchaudio provides easy access to the pre-trained weights and
associated information, such as the expected sample rate and class
labels. They are bundled together and available under the
`torchaudio.pipelines` module.




In [None]:
 # %matplotlib inline

import os
import librosa
import IPython
import tensorflow
import matplotlib
import matplotlib.pyplot as plt
import requests
import torch
import torchaudio
from transformers import AutoProcessor, AutoModelForCTC

# Load the processor and the CTC model from the same Hub checkpoint so
# that their settings match. Both names are module-level globals that
# later cells (e.g. `transcript_`) read directly.
# NOTE(review): the checkpoint name suggests a 4-gram language-model
# variant — confirm against the model card.
processor = AutoProcessor.from_pretrained("patrickvonplaten/wav2vec2-base-960h-4-gram")

model = AutoModelForCTC.from_pretrained("patrickvonplaten/wav2vec2-base-960h-4-gram")


In [None]:
import collections
import contextlib
import sys
import wave
import librosa
import webrtcvad


def read_wave(path):
    """Load a mono, 16-bit PCM .wav file.

    Returns a ``(pcm_bytes, sample_rate)`` tuple. The file must have
    exactly one channel, 2-byte samples and a sample rate of 8000,
    16000, 32000 or 48000 Hz; any other format trips an assertion.
    """
    allowed_rates = (8000, 16000, 32000, 48000)
    with wave.open(path, 'rb') as reader:
        # Validate the format in the same order: channels, width, rate.
        assert reader.getnchannels() == 1
        assert reader.getsampwidth() == 2
        rate = reader.getframerate()
        assert rate in allowed_rates
        frame_count = reader.getnframes()
        return reader.readframes(frame_count), rate


def write_wave(path, audio, sample_rate):
    """Save raw PCM bytes as a mono, 16-bit .wav file.

    ``audio`` is written verbatim as the frame data of the file at
    ``path``, tagged with the given ``sample_rate``.
    """
    with wave.open(path, 'wb') as writer:
        # Header fields: one channel, two bytes per sample.
        writer.setnchannels(1)
        writer.setsampwidth(2)
        writer.setframerate(sample_rate)
        writer.writeframes(audio)


class Frame:
    """A single slice of PCM audio.

    Holds the raw ``bytes`` of the slice together with the ``timestamp``
    and ``duration`` values it was constructed with.
    """

    def __init__(self, bytes, timestamp, duration):
        self.bytes, self.timestamp, self.duration = bytes, timestamp, duration


def frame_generator(frame_duration_ms, audio, sample_rate):
    """Generates audio frames from PCM audio data.

    Takes the desired frame duration in milliseconds, the PCM data, and
    the sample rate. The frame size is computed assuming 2 bytes per
    sample (16-bit mono), which is what ``read_wave`` enforces.

    Yields Frames of the requested duration, each carrying its start
    timestamp and duration in seconds. Trailing bytes that do not fill
    a whole frame are dropped.

    Raises ValueError (on first iteration) if the requested duration
    corresponds to a frame of zero bytes, which would otherwise never
    advance through the buffer.
    """
    # Bytes per frame: samples per frame * 2 bytes per sample.
    n = int(sample_rate * (frame_duration_ms / 1000.0) * 2)
    if n <= 0:
        raise ValueError(
            'frame_duration_ms=%r at sample_rate=%r gives an empty frame'
            % (frame_duration_ms, sample_rate))
    duration = (float(n) / sample_rate) / 2.0
    timestamp = 0.0
    # The range stops at the last offset where a complete frame still
    # fits, *including* a frame that ends exactly at the end of the
    # buffer (a strict `<` comparison would silently drop it).
    for offset in range(0, len(audio) - n + 1, n):
        yield Frame(audio[offset:offset + n], timestamp, duration)
        timestamp += duration


def vad_collector(sample_rate, frame_duration_ms,
                  padding_duration_ms, vad, frames, min_segment_s=53):
    """Filters out non-voiced audio frames.

    Given a webrtcvad.Vad and a source of audio frames, yields only
    the voiced audio.

    Uses a padded, sliding window algorithm over the audio frames.
    When more than 90% of the frames in the window are voiced (as
    reported by the VAD), the collector triggers and begins collecting
    audio frames. Then the collector waits until more than 90% of the
    frames in the window are unvoiced to detrigger.

    A segment is only closed once more than ``min_segment_s`` seconds of
    input frames (voiced or not) have been consumed since the previous
    segment was emitted, or since the start for the first one. Until
    then the collector stays triggered and keeps appending frames, so
    short pauses do not split the output.

    The window is padded at the front and back to provide a small
    amount of silence or the beginnings/endings of speech around the
    voiced frames.

    Arguments:

    sample_rate - The audio sample rate, in Hz.
    frame_duration_ms - The frame duration in milliseconds.
    padding_duration_ms - The amount to pad the window, in milliseconds.
    vad - An instance of webrtcvad.Vad (anything with a compatible
        ``is_speech(bytes, sample_rate)`` method works).
    frames - a source of audio frames (sequence or generator).
    min_segment_s - Minimum input time, in seconds, that must elapse
        before a segment may be closed. Defaults to 53.

    Returns: A generator that yields PCM audio data.

    Side effect: writes a '1'/'0' trace per frame plus '+(t)'/'-(t)'
    trigger markers to stdout.
    """
    num_padding_frames = int(padding_duration_ms / frame_duration_ms)
    # We use a deque for our sliding window/ring buffer.
    ring_buffer = collections.deque(maxlen=num_padding_frames)
    # Input time covered by one frame, derived from the actual frame
    # duration rather than assuming 30 ms frames.
    frame_seconds = frame_duration_ms / 1000.0
    # We have two states: TRIGGERED and NOTTRIGGERED. We start in the
    # NOTTRIGGERED state.
    triggered = False

    voiced_frames = []
    # Seconds of input consumed since the last emitted segment.
    elapsed = 0.0
    frame = None
    for frame in frames:
        elapsed += frame_seconds
        is_speech = vad.is_speech(frame.bytes, sample_rate)

        sys.stdout.write('1' if is_speech else '0')
        if not triggered:
            ring_buffer.append((frame, is_speech))
            num_voiced = sum(1 for _, speech in ring_buffer if speech)
            # If we're NOTTRIGGERED and more than 90% of the frames in
            # the ring buffer are voiced frames, then enter the
            # TRIGGERED state.
            if num_voiced > 0.9 * ring_buffer.maxlen:
                triggered = True
                sys.stdout.write('+(%s)' % (ring_buffer[0][0].timestamp,))
                # We want to yield all the audio we see from now until
                # we are NOTTRIGGERED, but we have to start with the
                # audio that's already in the ring buffer.
                voiced_frames.extend(f for f, _ in ring_buffer)
                ring_buffer.clear()
        else:
            # We're in the TRIGGERED state, so collect the audio data
            # and add it to the ring buffer.
            voiced_frames.append(frame)
            ring_buffer.append((frame, is_speech))
            num_unvoiced = sum(1 for _, speech in ring_buffer if not speech)
            # If more than 90% of the frames in the ring buffer are
            # unvoiced and the segment is long enough, enter
            # NOTTRIGGERED and yield whatever audio we've collected.
            if (num_unvoiced > 0.9 * ring_buffer.maxlen
                    and elapsed > min_segment_s):
                elapsed = 0.0
                sys.stdout.write('-(%s)' % (frame.timestamp + frame.duration))
                triggered = False
                yield b''.join(f.bytes for f in voiced_frames)
                ring_buffer.clear()
                voiced_frames = []
    if triggered:
        sys.stdout.write('-(%s)' % (frame.timestamp + frame.duration))
    sys.stdout.write('\n')
    # If we have any leftover voiced audio when we run out of input,
    # yield it.
    if voiced_frames:
        yield b''.join(f.bytes for f in voiced_frames)

# Paths of the chunk files written below, in order; later cells iterate
# over this list to transcribe each chunk.
chunks=[]


# Read the recording as raw PCM bytes plus its sample rate.
audio, sample_rate = read_wave("/content/drive/MyDrive/speech2/difficult03.wav")
# NOTE(review): 3 is presumably the VAD aggressiveness mode — confirm
# against the webrtcvad documentation.
vad = webrtcvad.Vad(3)
# Cut the PCM data into 30 ms frames and materialise them as a list
# before handing them to the collector.
frames = frame_generator(30, audio, sample_rate)
frames_ = list(frames)
# 30 ms frames with a 300 ms padding window; `segments` is a lazy
# generator, so the VAD only runs while the loop below consumes it.
segments = vad_collector(sample_rate, 30, 300, vad, frames_)

# Write every segment to its own numbered .wav file in the current
# working directory and remember the path.
for i, segment in enumerate(segments):
    path = 'chunk-%002d.wav' % (i,)
    chunks.append(path)
    print(' Writing %s' % (path,))
    write_wave(path, segment, sample_rate)
# List the chunk files that were produced.
for ch in chunks:
    print (ch)
    





In [None]:


def transcript_(SPEECH_FILE):
    """Transcribe one audio sample with the notebook's `processor` and `model`.

    SPEECH_FILE is a mapping whose ``["audio"]["array"]`` entry holds the
    waveform. The waveform is passed to the processor as 16 kHz audio
    regardless of any ``"sampling_rate"`` entry in the mapping, so
    callers must resample beforehand.

    Returns the first decoded transcription, lower-cased.

    Relies on the module-level ``processor`` and ``model`` objects.
    """
    inputs = processor(SPEECH_FILE["audio"]["array"], sampling_rate=16_000,
                       return_tensors="pt")
    # Inference only: skip gradient tracking for the forward pass.
    with torch.no_grad():
        logits = model(**inputs).logits
    # batch_decode is given the logits as a NumPy array; its result
    # exposes the decoded strings on `.text`.
    transcription = processor.batch_decode(logits.numpy()).text

    return transcription[0].lower()

In [None]:
# Sanity check: transcribe only the first chunk, inline, and print the
# raw (not lower-cased) decoder output.
audio_text=[]
# Debug output: prints the length of the file-name string itself.
print (len("chunk-00.wav"))
file_path="chunk-00.wav"
# Load the chunk with librosa, requesting a 16 kHz sample rate.
speech, rate = librosa.load(file_path,sr=16000)
# Wrap the waveform in a nested dict with the same keys that
# `transcript_` reads (["audio"]["array"]).
# NOTE: this rebinds the global `audio`, which the VAD cell used for
# the raw PCM bytes.
audio={'file': file_path,
'audio': {'path': file_path,
'array': speech,
'sampling_rate': 16000}
}
inputs = processor(audio["audio"]["array"] , sampling_rate=16_000 ,return_tensors="pt" )
# Inference only: no gradient tracking for the forward pass.
with torch.no_grad():
  logits = model(**inputs).logits
# The next two bare expressions are evaluated and discarded: only the
# last expression of a cell is displayed, and neither is last.
logits.shape
" ".join(sorted(processor.tokenizer.get_vocab()))
# `.text` holds the decoded strings, one per batch item.
transcription = processor.batch_decode(logits.numpy()).text
print (transcription)



In [None]:
# Transcribe every chunk written by the VAD cell and store the combined
# text in transcript.txt for the punctuation-restoration step.
audio_text=[]
print (len(chunks))
for file_path in chunks:
    # Load each chunk with librosa, requesting a 16 kHz sample rate.
    speech, rate = librosa.load(file_path,sr=16000)
    # Same nested layout that `transcript_` reads (["audio"]["array"]).
    audio={'file': file_path,
    'audio': {'path': file_path,
    'array': speech,
    'sampling_rate': 16000}
    }
    audio_text.append(transcript_(audio))
# Join the per-chunk transcripts with a single space. Plain
# concatenation would fuse the last word of one chunk with the first
# word of the next. Empty transcripts are skipped to avoid double spaces.
final_text = " ".join(text for text in audio_text if text)
print ("final text:")
print(final_text)
# The `with` block closes the file; no explicit close() is needed.
with open('transcript.txt', 'w') as f:
    f.write(final_text)



In [None]:
!pip install transformers==v2.11.0


In [None]:
!python inference.py --pretrained-model=roberta-large --weight-path=roberta-large-en.pt --language=en --file=transcript.txt --out-file=test_en_out.txt

In [None]:
!pip install https://github.com/kpu/kenlm/archive/master.zip 
!pip install pyctcdecode==0.3.0
!pip install datasets==2.0.0
!pip install transformers==4.18.0

In [None]:
from datasets import load_dataset

# Load the "clean" configuration, validation split, of a small demo
# speech dataset from the Hub.
dataset = load_dataset("hf-internal-testing/librispeech_asr_demo", "clean", split="validation")
# Bare expression in the middle of the cell: evaluated but not
# displayed, because only a cell's last expression is shown.
dataset
# NOTE(review): `ipd` is not used anywhere in the visible notebook.
import IPython.display as ipd

# Keep the example at index 2 for inspection in the next cell.
audio_sample = dataset[2]

In [None]:
audio_sample

In [None]:
# Same imports and checkpoint as the loading cell near the top of the
# notebook; running this cell rebinds the globals `processor` and
# `model`.
from transformers import AutoProcessor, AutoModelForCTC

processor = AutoProcessor.from_pretrained("patrickvonplaten/wav2vec2-base-960h-4-gram")

model = AutoModelForCTC.from_pretrained("patrickvonplaten/wav2vec2-base-960h-4-gram")

In [None]:
import librosa
# Single test recording on the mounted Drive (absolute Colab path).
file_path = "/content/drive/MyDrive/speech2/average06.wav"

# Load the recording with librosa, requesting a 16 kHz sample rate.
speech, rate = librosa.load(file_path,sr=16000)
# Prints the loaded waveform object itself.
print(speech)

In [None]:
# Wrap the loaded waveform in a nested dict; the inference cell below
# reads audio["audio"]["array"]. Uses `file_path` and `speech` from the
# previous cell.
audio={'file': file_path,
 'audio': {'path': file_path,
  'array': speech,
  'sampling_rate': 16000}
  }

In [None]:
print(audio["audio"]["array"])

In [None]:
import torch
# Turn the waveform into PyTorch model inputs, declaring it as 16 kHz.
inputs = processor(audio["audio"]["array"], sampling_rate=16_000, return_tensors="pt")
# Inference only: no gradient tracking for the forward pass.
with torch.no_grad():
  logits = model(**inputs).logits
# The next two bare expressions are evaluated and discarded: only the
# last expression of a cell is displayed, and neither is last.
logits.shape
" ".join(sorted(processor.tokenizer.get_vocab()))
# batch_decode receives the logits as a NumPy array; the decoded strings
# are on `.text`.
transcription = processor.batch_decode(logits.numpy()).text
# Last expression of the cell: displays the first transcription,
# lower-cased.
transcription[0].lower()

'i had that curiosity beside me at this moment'

In [None]:
 # Greedy (argmax) CTC decoding with the plain facebook/wav2vec2-base-960h
 # checkpoint, to compare against the 4-gram checkpoint used earlier.
 # NOTE: this cell rebinds the globals `processor` and `model`.
 import librosa  # imported here so the cell also runs on a fresh kernel
 import torch
 from datasets import load_dataset
 from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC

 # load model and tokenizer
 processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
 model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")

 # load dummy dataset
 # NOTE(review): `ds` is not used below — the audio comes from
 # `file_path`. Kept so the global stays defined; consider removing.
 ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")
 # load audio file from folder of choice
 file_path = "/content/drive/MyDrive/speech2/average06.wav"

 # Load the recording with librosa, requesting a 16 kHz sample rate.
 speech, rate = librosa.load(file_path,sr=16000)
 # tokenize; the sampling rate is passed explicitly so the processor can
 # check it against what the model expects
 input_values = processor(speech, sampling_rate=16_000, return_tensors="pt", padding="longest").input_values  # Batch size 1

 # retrieve logits; inference only, so skip gradient tracking
 with torch.no_grad():
     logits = model(input_values).logits

 # take argmax and decode
 predicted_ids = torch.argmax(logits, dim=-1)
 transcription = processor.batch_decode(predicted_ids)



Downloading:   0%|          | 0.00/159 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/163 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.56k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/291 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/360M [00:00<?, ?B/s]

Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at facebook/wav2vec2-base-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading builder script:   0%|          | 0.00/5.16k [00:00<?, ?B/s]

Downloading and preparing dataset librispeech_asr/clean to /root/.cache/huggingface/datasets/patrickvonplaten___librispeech_asr/clean/2.1.0/f2c70a4d03ab4410954901bde48c54b85ca1b7f9bf7d616e7e2a72b5ee6ddbfc...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Dataset librispeech_asr downloaded and prepared to /root/.cache/huggingface/datasets/patrickvonplaten___librispeech_asr/clean/2.1.0/f2c70a4d03ab4410954901bde48c54b85ca1b7f9bf7d616e7e2a72b5ee6ddbfc. Subsequent calls will reuse this data.


NameError: ignored

In [None]:
transcription

['I HAD THAT CURIOSITY BESIDE ME AT THIS MOMENT']

In [None]:
pip install jiwer

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting jiwer
  Downloading jiwer-2.5.1-py3-none-any.whl (15 kB)
Collecting levenshtein==0.20.2
  Downloading Levenshtein-0.20.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.4 MB)
[K     |████████████████████████████████| 1.4 MB 33.3 MB/s 
[?25hCollecting rapidfuzz<3.0.0,>=2.3.0
  Downloading rapidfuzz-2.11.1-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.2 MB)
[K     |████████████████████████████████| 2.2 MB 59.3 MB/s 
[?25hInstalling collected packages: rapidfuzz, levenshtein, jiwer
Successfully installed jiwer-2.5.1 levenshtein-0.20.2 rapidfuzz-2.11.1
