In [None]:
!pip install transformers
!pip install jiwer

In [None]:
import pandas as pd
import re
import os

import librosa
import torch
from transformers import Wav2Vec2ForCTC, Wav2Vec2Tokenizer, Wav2Vec2Processor
from jiwer import wer

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Mount Google Drive so the files under /content/drive/MyDrive used below are reachable.
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [None]:
# Using pre-trained model

class ZeroShotTesting:
    """Transcribe the test audio with the pre-trained ``facebook/wav2vec2-base-960h`` model.

    Intended call order:
    ``get_label`` -> ``get_audio_filelist`` -> ``get_transcript`` ->
    ``get_accuracy`` and/or ``get_accuracy_jiwer`` -> ``printer``.

    Attributes set along the way:
        df_final: labels loaded by ``get_label``; later methods append columns to it.
        file_paths: audio file paths collected by ``get_audio_filelist``.
        tokenizer, model: loaded lazily by ``get_transcript``.
    """

    def __init__(self):
        self.df_label = pd.DataFrame()

    def get_label(self):
        """Load the test CSV and normalise its ``Label`` column.

        Punctuation is stripped, the text is lower-cased and a single trailing
        space is appended to every label.

        Returns:
            The loaded DataFrame (also stored as ``self.df_final``).
        """
        self.df_final = pd.read_csv('/content/drive/MyDrive/Mini_Project/Data (1000files)/Data/test.csv')

        def remove_special_characters(text):
            """Drop punctuation from ``text``, lower-case it and append a space."""
            # Raw string avoids invalid-escape warnings; matches the same
            # characters as before: , ? . ! - ; : "
            chars_to_ignore_regex = r'[,?.!\-;:"]'
            return re.sub(chars_to_ignore_regex, '', text).lower() + " "

        self.df_final['Label'] = self.df_final['Label'].map(remove_special_characters)

        return self.df_final

    def get_audio_filelist(self):
        """Collect the regular files in the test folder into ``self.file_paths``.

        Paths are sorted by file name.
        NOTE(review): ``get_transcript`` pairs these files with the CSV rows
        purely by position, so this presumably relies on the CSV being in the
        same file-name order -- confirm.
        """
        folder_path = '/content/drive/MyDrive/Mini_Project/Data (1000files)/Data/Test'
        candidates = (os.path.join(folder_path, name) for name in os.listdir(folder_path))
        # Keep regular files only (sub-directories are skipped).
        self.file_paths = sorted(
            (path for path in candidates if os.path.isfile(path)),
            key=os.path.basename,
        )

    def get_transcript(self):
        """Transcribe every file in ``self.file_paths`` and append the text to ``self.df_final``.

        Requires ``get_label`` and ``get_audio_filelist`` to have been called.

        Returns:
            ``self.df_final`` with an added ``'generated transcript from model'`` column.
        """
        # NOTE(review): recent transformers releases flag Wav2Vec2Tokenizer as
        # deprecated in favour of Wav2Vec2Processor -- consider migrating.
        self.tokenizer = Wav2Vec2Tokenizer.from_pretrained("facebook/wav2vec2-base-960h")
        self.model = Wav2Vec2ForCTC.from_pretrained("facebook/wav2vec2-base-960h")
        text = []
        # Inference only: no_grad() avoids building an autograd graph for every
        # file, which otherwise wastes memory and time.
        with torch.no_grad():
            for file_name in self.file_paths:
                # Audio is resampled to 16 kHz on load.
                input_audio, _ = librosa.load(file_name, sr=16000)
                input_values = self.tokenizer(input_audio, return_tensors='pt').input_values
                logits = self.model(input_values).logits
                # Greedy decoding: most likely token per frame.
                predicted_ids = torch.argmax(logits, dim=-1)
                text.append(self.tokenizer.batch_decode(predicted_ids)[0].lower())

        # Append the transcripts as a new column (aligned on the row index).
        transcripts = pd.DataFrame(text, columns=['generated transcript from model'])
        self.df_final = pd.concat([self.df_final, transcripts], axis=1)
        return self.df_final

    def get_accuracy(self):
        """Add a ``WER`` column: word error rate of each transcript, rounded to 2 decimals.

        WER is the word-level edit distance (substitutions + deletions +
        insertions) between label and transcript divided by the number of
        words in the label.
        """
        def calculate_wer(reference, hypothesis):
            """Return the word error rate of ``hypothesis`` against ``reference``."""
            ref_words = reference.split()
            hyp_words = hypothesis.split()

            # Levenshtein distance over words, keeping only two DP rows.
            # previous[j] = edits needed to turn the first i-1 reference words
            # into the first j hypothesis words.
            previous = list(range(len(hyp_words) + 1))
            for i, ref_word in enumerate(ref_words, start=1):
                current = [i]
                for j, hyp_word in enumerate(hyp_words, start=1):
                    cost = 0 if ref_word == hyp_word else 1
                    current.append(min(
                        previous[j] + 1,         # deletion
                        current[j - 1] + 1,      # insertion
                        previous[j - 1] + cost,  # match / substitution
                    ))
                previous = current

            # An empty reference is treated as length 1 to avoid dividing by zero.
            return previous[-1] / max(len(ref_words), 1)

        self.df_final['WER'] = [
            round(calculate_wer(label, transcript), 2)
            for label, transcript in zip(
                self.df_final['Label'],
                self.df_final['generated transcript from model'],
            )
        ]

    def get_accuracy_jiwer(self):
        """Add a ``JIWER`` column (jiwer's ``wer``, rounded to 2 decimals) and save the frame to CSV."""
        self.df_final['JIWER'] = self.df_final.apply(lambda x: wer(x['Label'], x['generated transcript from model']), axis=1)
        self.df_final['JIWER'] = self.df_final['JIWER'].apply(lambda x: round(x, 2))
        self.df_final.to_csv('speech_to_text_ZST_results_1000.csv')

    def printer(self):
        """Display ``self.df_final`` in the notebook and return it."""
        display(self.df_final)
        return self.df_final

In [None]:
# Run the zero-shot (pre-trained model) evaluation end to end.
zst = ZeroShotTesting()
zst.get_label()
zst.get_audio_filelist()
zst.get_transcript()
# zst.get_accuracy()
zst.get_accuracy_jiwer()
# printer() already display()s the frame; call it once and keep the result
# instead of rendering the whole table twice.
df1 = zst.printer()

In [None]:
#Fine-tuned model

class finetuned_model():
    """Transcribe the test audio with the fine-tuned ``beatrice-yap/wav2vec2-base-nsc-demo-3`` model.

    Intended call order:
    ``get_label`` -> ``get_audio_filelist`` -> ``get_transcript`` ->
    ``get_accuracy`` and/or ``get_accuracy_jiwer`` -> ``printer``.

    Attributes set along the way:
        df_final: labels loaded by ``get_label``; later methods append columns to it.
        file_paths: audio file paths collected by ``get_audio_filelist``.
        processor, model: loaded lazily by ``get_transcript``.
    """

    def __init__(self):
        self.df_label = pd.DataFrame()

    def get_label(self):
        """Load the test CSV and normalise its ``Label`` column.

        Punctuation is stripped, the text is lower-cased and a single trailing
        space is appended to every label.

        Returns:
            The loaded DataFrame (also stored as ``self.df_final``).
        """
        self.df_final = pd.read_csv('/content/drive/MyDrive/Mini_Project/Data (1000files)/Data/test.csv')

        def remove_special_characters(text):
            """Drop punctuation from ``text``, lower-case it and append a space."""
            # Raw string avoids invalid-escape warnings; matches the same
            # characters as before: , ? . ! - ; : "
            chars_to_ignore_regex = r'[,?.!\-;:"]'
            return re.sub(chars_to_ignore_regex, '', text).lower() + " "

        self.df_final['Label'] = self.df_final['Label'].map(remove_special_characters)

        return self.df_final

    def get_audio_filelist(self):
        """Collect the regular files in the test folder into ``self.file_paths``.

        Paths are sorted by file name.
        NOTE(review): ``get_transcript`` pairs these files with the CSV rows
        purely by position, so this presumably relies on the CSV being in the
        same file-name order -- confirm.
        """
        folder_path = '/content/drive/MyDrive/Mini_Project/Data (1000files)/Data/Test'
        candidates = (os.path.join(folder_path, name) for name in os.listdir(folder_path))
        # Keep regular files only (sub-directories are skipped).
        self.file_paths = sorted(
            (path for path in candidates if os.path.isfile(path)),
            key=os.path.basename,
        )

    def get_transcript(self):
        """Transcribe every file in ``self.file_paths`` and append the text to ``self.df_final``.

        Requires ``get_label`` and ``get_audio_filelist`` to have been called.

        Returns:
            ``self.df_final`` with an added ``'generated transcript from model'`` column.
        """
        # Local import: AutoModelForCTC is not part of the notebook's top-level imports.
        from transformers import AutoModelForCTC, Wav2Vec2Processor
        # Alternative (local checkpoint on Drive):
        # self.processor = Wav2Vec2Processor.from_pretrained("/content/drive/MyDrive/Mini_Project/models/2023-10-15-11-49/processor_35")
        # self.model = Wav2Vec2ForCTC.from_pretrained("/content/drive/MyDrive/Mini_Project/models/2023-10-15-11-49/model_35")
        # NOTE(review): the processor comes from the base checkpoint while the
        # model is the fine-tuned one; this presumably assumes both share the
        # same vocabulary -- confirm.
        self.processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
        self.model = AutoModelForCTC.from_pretrained('beatrice-yap/wav2vec2-base-nsc-demo-3')

        text = []
        # Inference only: no_grad() avoids building an autograd graph for every
        # file, which otherwise wastes memory and time.
        with torch.no_grad():
            for file_name in self.file_paths:
                # Audio is resampled to 16 kHz on load.
                input_audio, _ = librosa.load(file_name, sr=16000)
                input_values = self.processor(input_audio, return_tensors='pt').input_values
                logits = self.model(input_values).logits
                # Greedy decoding: most likely token per frame.
                predicted_ids = torch.argmax(logits, dim=-1)
                text.append(self.processor.batch_decode(predicted_ids)[0].lower())

        # Append the transcripts as a new column (aligned on the row index).
        transcripts = pd.DataFrame(text, columns=['generated transcript from model'])
        self.df_final = pd.concat([self.df_final, transcripts], axis=1)
        return self.df_final

    def get_accuracy(self):
        """Add a ``WER`` column: word error rate of each transcript, rounded to 2 decimals.

        WER is the word-level edit distance (substitutions + deletions +
        insertions) between label and transcript divided by the number of
        words in the label.
        """
        def calculate_wer(reference, hypothesis):
            """Return the word error rate of ``hypothesis`` against ``reference``."""
            ref_words = reference.split()
            hyp_words = hypothesis.split()

            # Levenshtein distance over words, keeping only two DP rows.
            # previous[j] = edits needed to turn the first i-1 reference words
            # into the first j hypothesis words.
            previous = list(range(len(hyp_words) + 1))
            for i, ref_word in enumerate(ref_words, start=1):
                current = [i]
                for j, hyp_word in enumerate(hyp_words, start=1):
                    cost = 0 if ref_word == hyp_word else 1
                    current.append(min(
                        previous[j] + 1,         # deletion
                        current[j - 1] + 1,      # insertion
                        previous[j - 1] + cost,  # match / substitution
                    ))
                previous = current

            # An empty reference is treated as length 1 to avoid dividing by zero.
            return previous[-1] / max(len(ref_words), 1)

        self.df_final['WER'] = [
            round(calculate_wer(label, transcript), 2)
            for label, transcript in zip(
                self.df_final['Label'],
                self.df_final['generated transcript from model'],
            )
        ]

    def get_accuracy_jiwer(self):
        """Add a ``JIWER`` column (jiwer's ``wer``, rounded to 2 decimals) and save the frame to CSV."""
        self.df_final['JIWER'] = self.df_final.apply(lambda x: wer(x['Label'], x['generated transcript from model']), axis=1)
        self.df_final['JIWER'] = self.df_final['JIWER'].apply(lambda x: round(x, 2))
        self.df_final.to_csv('speech_to_text_ZST_results_ft1000.csv')

    def printer(self):
        """Return ``self.df_final`` (nothing is displayed)."""
        return self.df_final

In [None]:
# Run the fine-tuned model evaluation end to end.
ft = finetuned_model()
ft.get_label()
ft.get_audio_filelist()
ft.get_transcript()
# ft.get_accuracy()
ft.get_accuracy_jiwer()
# printer() only returns the frame here, so a bare call would be a no-op;
# keep the result for the comparison below.
df2 = ft.printer()

In [None]:
# Side-by-side frame: pre-trained label/transcript/score plus the fine-tuned score.
# (The next cell repeats this statement before renaming the columns.)
pretrain_columns = df1[['Label', 'generated transcript from model', 'JIWER']]
df_combined = pd.concat([pretrain_columns, df2['JIWER']], axis=1)

In [None]:
# Put the pre-trained results next to the fine-tuned JIWER score, column-wise.
pretrain_view = df1[['Label', 'generated transcript from model', 'JIWER']]
finetuned_scores = df2['JIWER']
df_combined = pd.concat([pretrain_view, finetuned_scores], axis=1)

# Both score columns are called 'JIWER' after the concat; give them distinct names.
df_combined.columns = [
    'Label',
    'Generated Transcription',
    'JIWER_pretrain',
    'JIWER_finetuned',
]
df_combined

Unnamed: 0,Label,Generated Transcription,JIWER_pretrain,JIWER_finetuned
0,besides li's frail appearance he was seen to b...,is a least free appearance he was seen to me a...,0.62,0.31
1,he also noticed he could climb up by using som...,he also notice you could claim i by using some...,0.56,0.38
2,children need that sense of absolute security ...,duin need the sense of absolucicurity from kno...,0.60,0.20
3,he also underscored the importance of innovati...,ho's hone scot te importens ar innovation y bu...,0.94,0.62
4,members of the public can vote for their favou...,memoisel repoblic an fourt for their frevoic b...,0.77,0.31
...,...,...,...,...
295,spending is being supported by steady wage gai...,spending his wings of atten by stady reach gai...,0.55,0.27
296,as scavengers crab plays important roles in th...,ascavanges crat placed important rules in the ...,0.45,0.35
297,in some cases the cash raised from such produc...,in some cases the cash wreath from such produc...,0.20,0.33
298,it has since been restored to the original cha...,it has since been restored to the original charch,0.11,0.00


In [None]:
# Average per-utterance WER for each model, rounded to two decimals.
pretrain_scores = df_combined['JIWER_pretrain']
finetuned_scores = df_combined['JIWER_finetuned']

JIWER_pretrain = pretrain_scores.mean().round(2)
JIWER_finetuned = finetuned_scores.mean().round(2)

print('Mean WER Pretrain:', JIWER_pretrain)
print('Mean WER finetuned:', JIWER_finetuned)


Mean WER Pretrain: 0.27
Mean WER finetuned: 0.22


In [None]:
# Mean JIWER of the fine-tuned run, taken directly from df2 (rounded to 2 decimals).
df2.loc[:, 'JIWER'].mean().round(2)

0.19

In [None]:
# Reload the saved per-utterance results for both models.
# NOTE(review): these file names differ from the ones written by
# get_accuracy_jiwer() earlier in this notebook ('speech_to_text_ZST_results_1000.csv'
# and 'speech_to_text_ZST_results_ft1000.csv') -- presumably the files were renamed
# or uploaded by hand; confirm before relying on Restart & Run All.
pretrain_results_csv = '/content/speech_to_text_ZST_results_pretrain1000.csv'
finetune_results_csv = '/content/speech_to_text_ZST_results_finetuned1000.csv'

df_pretrain = pd.read_csv(pretrain_results_csv)
df_finetune = pd.read_csv(finetune_results_csv)

In [None]:
# Peek at the first five rows of the reloaded pre-trained results.
df_pretrain.head(n=5)

Unnamed: 0.1,Unnamed: 0,File Name,Data Split,File,Label,generated transcript from model,JIWER
0,0,000160342.WAV,test,160342,besides li's frail appearance he was seen to b...,is a least free appearance he was seen to me a...,0.62
1,1,000160345.WAV,test,160345,he also noticed he could climb up by using som...,he also notice you could claim i by using some...,0.56
2,2,000160346.WAV,test,160346,children need that sense of absolute security ...,duin need the sense of absolucicurity from kno...,0.6
3,3,000160347.WAV,test,160347,he also underscored the importance of innovati...,ho's hone scot te importens ar innovation y bu...,0.94
4,4,000160349.WAV,test,160349,members of the public can vote for their favou...,memoisel repoblic an fourt for their frevoic b...,0.77


In [None]:
# Number of consecutive rows that form one group
# (assumes every speaker contributes exactly this many consecutive rows -- TODO confirm)
rows_per_group = 30

# Tag each row of both result frames with a 1-based group number derived from its index
for results_frame in (df_pretrain, df_finetune):
    results_frame['speaker'] = results_frame.index // rows_per_group + 1

In [None]:
# Check the last five rows, including the new 'speaker' column.
df_pretrain.tail(n=5)

Unnamed: 0.1,Unnamed: 0,File Name,Data Split,File,Label,generated transcript from model,JIWER,speaker
295,295,100010395.WAV,test,100010395,spending is being supported by steady wage gai...,spending his wings of atten by stady reach gai...,0.55,10
296,296,100010396.WAV,test,100010396,as scavengers crab plays important roles in th...,ascavanges crat placed important rules in the ...,0.45,10
297,297,100010397.WAV,test,100010397,in some cases the cash raised from such produc...,in some cases the cash wreath from such produc...,0.2,10
298,298,100010398.WAV,test,100010398,it has since been restored to the original cha...,it has since been restored to the original charch,0.11,10
299,299,100010399.WAV,test,100010399,because of our selfishness our inconsiderate b...,because of our selfishness our inconsidate beh...,0.22,10


In [None]:
# Group the pre-trained results by speaker number.
pretrain_grouped = df_pretrain.groupby(by='speaker')

In [None]:
# Define your function here
def Mean_WER(group, column='JIWER'):
    """Return the mean word error rate of one group of rows.

    Args:
        group: DataFrame holding the rows of a single group, e.g. as passed
            by ``DataFrame.groupby(...).apply``.
        column: Name of the per-row WER column to average.

    Returns:
        A one-row DataFrame with a single ``WER_mean`` column.
    """
    # Average the WER scores themselves; averaging the 'speaker' grouping key
    # would just give back the speaker number.
    group_mean = group[column].mean()

    return pd.DataFrame({'WER_mean': [group_mean]})

# Split the pre-trained results by the 'speaker' column and run Mean_WER on each group
speaker_groups = df_pretrain.groupby('speaker')
pretrain_grouped = speaker_groups.apply(Mean_WER)