In [None]:
# Mount Google Drive at /content/drive (Colab only).

from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Installing dependencies

In [None]:
!pip install --upgrade transformers
!pip install pydub



### Import Libraries

In [None]:
import os
import os.path
from os import path

import seaborn as sns

import soundfile as sf
import librosa
from pydub import AudioSegment as am

import pandas as pd
import numpy as np

import torch

import transformers

from transformers import (
    Wav2Vec2ForCTC,
    Wav2Vec2Processor,
    AutoTokenizer, 
    AutoModelWithLMHead 
)

In [None]:
# Display the installed transformers version for reproducibility.
transformers.__version__

'4.9.1'

### Load HuBERT model

In [None]:
from transformers import HubertForCTC

# Hugging Face checkpoint shared by the processor and the CTC model.
hubert_checkpoint = "facebook/hubert-xlarge-ls960-ft"

# Processor: prepares raw audio for the model and decodes token ids back to text.
processor = Wav2Vec2Processor.from_pretrained(hubert_checkpoint)
# Acoustic model with a CTC head.
model = HubertForCTC.from_pretrained(hubert_checkpoint)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=212.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=292.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=138.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=85.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1404.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=3850480983.0, style=ProgressStyle(descr…




### Resampling to 16kHz

In [None]:
def pre_proc(input_file, sr=16000):
  """Resample a WAV file in place to `sr` Hz if it is not already at that rate.

  Args:
    input_file: Path to a WAV file. The file is overwritten when resampling
      is needed.
    sr: Target sampling rate in Hz (default 16000).

  Returns:
    None. Prints a message when the file was resampled.
  """
  # If current file is not at the target rate, resample
  if librosa.get_samplerate(input_file) != sr:
    # The source rate is read from the WAV header; pydub's `frame_rate`
    # keyword only applies to raw/pcm input, so it is not passed here.
    sound = am.from_file(input_file, format='wav')
    sound = sound.set_frame_rate(sr)
    sound.export(input_file, format='wav')
    print(f'Resampled to {sr}')


### Setting up CTC beam search decoding

In [None]:
# https://github.com/flashlight/wav2letter/tree/master/recipes/sota/2019

# Set n=4 to use a 4-gram LM to decoding 
n=4

# Download the 4-gram LM from openslr
if n == 4:
  if not path.exists("/content/4-gram.arpa.gz"):
    !wget https://www.openslr.org/resources/11/4-gram.arpa.gz
    !gunzip '/content/4-gram.arpa.gz'
    mp = '/content/4-gram.arpa'
  else:
    mp = '/content/4-gram.arpa'
elif n == 3:
  if not path.exists("/content/3-gram.pruned.3e-7.arpa.gz"):
    !wget https://www.openslr.org/resources/11/3-gram.pruned.3e-7.arpa.gz
    !gunzip '/content/3-gram.pruned.3e-7.arpa.gz'
    mp = '/content/3-gram.pruned.3e-7.arpa'
  else:
    mp = '/content/3-gram.pruned.3e-7.arpa'

--2021-07-28 16:17:18--  https://www.openslr.org/resources/11/4-gram.arpa.gz
Resolving www.openslr.org (www.openslr.org)... 46.101.158.64
Connecting to www.openslr.org (www.openslr.org)|46.101.158.64|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1355172078 (1.3G) [application/x-gzip]
Saving to: ‘4-gram.arpa.gz’


2021-07-28 16:18:09 (25.9 MB/s) - ‘4-gram.arpa.gz’ saved [1355172078/1355172078]



In [None]:
# Install CTC beam search decoding

if not path.exists("/content/ctcdecode"):
  %cd /content 
  !git clone --recursive https://github.com/parlance/ctcdecode.git  
  !pip install /content/ctcdecode #takes about 5 minutes on google colab

  %cd ctcdecode
  !pip install .

/content
Cloning into 'ctcdecode'...
remote: Enumerating objects: 1102, done.[K
remote: Counting objects: 100% (39/39), done.[K
remote: Compressing objects: 100% (26/26), done.[K
remote: Total 1102 (delta 16), reused 28 (delta 13), pack-reused 1063[K
Receiving objects: 100% (1102/1102), 780.91 KiB | 6.45 MiB/s, done.
Resolving deltas: 100% (529/529), done.
Submodule 'third_party/ThreadPool' (https://github.com/progschj/ThreadPool.git) registered for path 'third_party/ThreadPool'
Submodule 'third_party/kenlm' (https://github.com/kpu/kenlm.git) registered for path 'third_party/kenlm'
Cloning into '/content/ctcdecode/third_party/ThreadPool'...
remote: Enumerating objects: 82, done.        
remote: Total 82 (delta 0), reused 0 (delta 0), pack-reused 82        
Cloning into '/content/ctcdecode/third_party/kenlm'...
remote: Enumerating objects: 14047, done.        
remote: Counting objects: 100% (360/360), done.        
remote: Compressing objects: 100% (292/292), done.        
remote: T

In [None]:
cd ctcdecode

/content/ctcdecode/ctcdecode


In [None]:
# Import CTC beam search decoder

from ctcdecode import CTCBeamDecoder

# Token -> id mapping exposed by the processor's tokenizer.
vocab_dict = processor.tokenizer.get_vocab()

# (id, token) pairs in ascending id order, so list position equals token id.
sort_vocab = sorted(zip(vocab_dict.values(), vocab_dict.keys()))

# Label list for the decoder; "|" in a token is replaced by a space.
vocab = [token.replace("|", " ") for _id, token in sort_vocab]


In [None]:
# CTC settings

# alpha: LM Weight, beta: LM Usage Reward
alpha, beta = 1, 2

# Keyword options for the beam-search decoder, gathered in one place.
beam_search_options = {
    "model_path": mp,        # path to the n-gram LM (.arpa) chosen above
    "alpha": alpha,
    "beta": beta,
    "cutoff_top_n": 40,
    "cutoff_prob": 1.0,
    "beam_width": 128,
    "num_processes": 4,
    "blank_id": 0,
    "log_probs_input": True,
}

decoder = CTCBeamDecoder(vocab, **beam_search_options)

### Generate transcript functions

In [None]:
# Function used to generate text transcripts

def asr_transcript_ngram(processor, model, input_file):
  """Transcribe a WAV file with the acoustic model plus n-gram beam search.

  The file is resampled in place to 16 kHz if needed (via `pre_proc`), then
  streamed in blocks of 15 frames of 16000 samples each. Each block is run
  through `model`, decoded with the module-level `decoder`, and the resulting
  text is appended to the transcript followed by '. '.

  Args:
    processor: Processor used to prepare audio and decode token ids to text.
    model: CTC model whose output exposes `.logits`.
    input_file: Path to the WAV file to transcribe.

  Returns:
    The accumulated transcript string (empty if the stream yields no blocks).
  """
  sr = 16000
  transcript = ""

  # Pre-processing
  pre_proc(input_file)

  # Stream over fixed-size chunks rather than load the full file
  stream = librosa.stream(
      input_file,
      block_length=15,
      frame_length=16000,
      hop_length=16000
      )

  for speech in stream:
    # librosa.stream down-mixes to mono by default, so this guard is expected
    # to be a no-op; it is kept as in the original for multi-channel blocks.
    if len(speech.shape) > 1:
      speech = speech[:, 0] + speech[:, 1]

    # Tokenize
    input_values = processor(speech, sampling_rate=sr, return_tensors="pt").input_values
    # input_values = input_values.to('cuda')

    # Retrieve Logits
    with torch.no_grad():
      logits = model(input_values).logits

    # The decoder is configured with log_probs_input=True, so it must be fed
    # log-probabilities rather than unnormalised logits.
    log_probs = torch.nn.functional.log_softmax(logits, dim=-1)

    # Beam Search
    beam_results, beam_scores, timesteps, out_lens = decoder.decode(log_probs)

    # Keep only the top beam of each batch item, cut to its real length.
    # Entries past out_lens are padding and must not be decoded.
    top_beams = [
        beam_results[b, 0, :int(out_lens[b, 0])].tolist()
        for b in range(beam_results.shape[0])
    ]

    # Decode
    transcription = processor.batch_decode(top_beams,
                                          skip_special_tokens=True,
                                          clean_up_tokenization_spaces=False,
                                          group_tokens=False,
                                          )

    for text in transcription:
      transcript += text + '. '
    print(transcript)

  return transcript

###### Function to drop segment rows < threshold

In [None]:
def remove_seg(df, threshold=0.1):
  """Drop segment rows whose duration is below `threshold`, in place.

  Adds (or overwrites) a 'length' column computed as (end - begin) / 1000,
  removes rows whose length is below the threshold, and resets the index to
  0..n-1 so callers can iterate rows by position.

  Args:
    df: DataFrame with numeric 'begin' and 'end' columns
      (presumably milliseconds, making 'length' seconds — verify against the
      segmentation files).
    threshold: Minimum 'length' a row needs to be kept (default 0.1).

  Returns:
    The same DataFrame object, modified in place.
  """
  # Calculate the length of every segment at once
  df['length'] = (df['end'] - df['begin']) / 1000

  # Drop rows shorter than the threshold (in place: callers ignore the return value)
  too_short = df.index[df['length'] < threshold]
  df.drop(too_short, inplace=True)

  # Re-number the rows so the index has no gaps after dropping
  df.reset_index(drop=True, inplace=True)

  return df

###### Function to generate text for each segment

In [None]:
def segment_trans(df):
  """Transcribe every segment listed in `df` and store the text per row.

  For each row, the module-level audio object `a` is sliced from 'begin' to
  'end', exported to a temporary WAV file, and transcribed with
  `asr_transcript_ngram` using the module-level `processor` and `model`.
  The result is written to the row's 'transcript' column.

  Args:
    df: DataFrame with 'begin' and 'end' columns giving slice bounds into `a`.

  Returns:
    The same DataFrame object, with a 'transcript' column filled in.
  """
  # Temp wav file reused for every segment
  temp_segment = '/content/temp'

  # Iterate over the actual index labels: rows may have been dropped upstream,
  # so labels are not guaranteed to be 0..n-1.
  for i in df.index:
    # Split the audio file into segments
    split = a[df.loc[i, 'begin']:df.loc[i, 'end']]

    # Store the segmented audio file as temp wav file
    split.export(temp_segment, format='wav')

    print("Transcribing segment...")

    # Transcribing the segmented audio file
    transcript = asr_transcript_ngram(processor, model, temp_segment)

    # Appending the transcript based on the segmented parts
    df.loc[i, 'transcript'] = transcript

  return df

### Generate Transcripts

In [None]:
# Create output folders for transcriptions

# makedirs(exist_ok=True) also creates missing parents and does nothing when a
# folder already exists, so this cell can be re-run safely.
transcribed_root = '/content/drive/MyDrive/ADReSSo21/diagnosis/train/hubert/transcribed/'

try:
    for label_dir in ('ad/', 'cn/'):
        os.makedirs(transcribed_root + label_dir, exist_ok=True)
except OSError as err:
    # Show the underlying reason instead of hiding it.
    print ("Creation of the directory failed", err)
else:
    print ("Successfully created the directory")

Successfully created the directory


In [None]:

# Generate transcripts
audio_files = "/content/drive/MyDrive/ADReSSo21/diagnosis/train/audio/"
segment_files = "/content/drive/MyDrive/ADReSSo21/diagnosis/train/segmentation/"

# Walk every class folder, then every segmentation CSV inside it, in sorted order.
for folder in sorted(os.listdir(segment_files)):
  segment_dir = os.path.join(segment_files, folder)

  for file in sorted(os.listdir(segment_dir)):
    # The audio file has the same stem as the segmentation file, with .wav
    audio = os.path.splitext(file)[0] + '.wav'
    print(f"Transcribing {audio}...")

    # Segment table and the matching recording.
    # `a` is module-level because segment_trans reads it.
    df = pd.read_csv(os.path.join(segment_dir, file))
    a = am.from_file(os.path.join(audio_files, folder, audio))

    # Drop too-short segments, then transcribe the rest (both modify df in place)
    remove_seg(df)
    segment_trans(df)

    # Export generated text transcripts
    df.to_csv(f'/content/drive/MyDrive/ADReSSo21/diagnosis/train/hubert/transcribed/{folder}/{file}')
    print(f"Transcription for {audio} done. File exported.")