In [29]:
import librosa
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer
import pandas as pd
import numpy as np
import os

In [30]:
# Loading Wav2Vec pretrained model
tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'Wav2Vec2CTCTokenizer'. 
The class this function is called from is 'Wav2Vec2Tokenizer'.
Some weights of the model checkpoint at facebook/wav2vec2-base-960h were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2

In [31]:
def load_audio(filename, sample_rate=16000):
    audio, rate = librosa.load(filename, sr = sample_rate)
    return audio

In [32]:
def transcribe_each(audio):
    # Taking an input value
    input_values = tokenizer(audio, return_tensors = "pt").input_values

    # Storing logits (non-normalized prediction values)
    logits = model(input_values).logits

    # Storing predicted ids
    prediction = torch.argmax(logits, dim = -1)

    # Passing the prediction to the tokenzer decode to get the transcription
    transcription = tokenizer.batch_decode(prediction)[0]
    
    return transcription

In [33]:
def transcribe(input_folder, output_excel):
    # List audio files in input folder
    audio_files = os.listdir(input_folder)

    # Create empty datadrame
    df = pd.DataFrame(columns=['File', 'Actual', 'Predicted'])
    files = []
    actuals = []
    predicted = []

    for audio_file in audio_files:
        input_audio_file = os.path.join(input_folder, audio_file)

        # Check if it's a wav file (not a folder)
        if os.path.isfile(input_audio_file) and ".wav" in input_audio_file:

            try: 
                # Transcribe .wav files 
                audio = load_audio(input_audio_file)
                transcription_each = transcribe_each(audio)
                files.append(input_audio_file)

                # Extract actual word
                #actual = input_audio_file[input_audio_file.rfind('_')+1:input_audio_file.rfind('.')]    # KIDTALK & AP
                actual = "xx"
                actuals.append(actual)
                predicted.append(transcription_each)
            except RuntimeError as error:
                print("Can't transcribe", audio_file)

    # Write to excel file 
    df['File'] = files
    df['Actual'] = actuals
    df['Predicted'] = predicted
    df.to_excel(output_excel)

In [34]:
def transcribe_all(folder_of_folders):
    folders = os.listdir(folder_of_folders)

    for folder in folders:
        dir = os.path.join(folder_of_folders, folder)
        if os.path.isdir(dir):
            excel = folder + ".xlsx"
            transcribe(dir, excel)

In [35]:
# transcribe_all("/Users/cogsci-lasrlab1/Downloads/MFA_data/KidTalk")

transcribe("Jan", "jan.xlsx")