In [None]:
import contextlib
import sys

import jiwer
import numpy as np
import pandas as pd
from transformers import pipeline
# Build the speech-recognition pipeline once; the task is named explicitly.
# The stock "openai/whisper-medium" checkpoint was the previously used alternative.
ASR_MODEL_ID = "justanotherinternetguy/whisper-small-sep28"
pipe = pipeline(task="automatic-speech-recognition", model=ASR_MODEL_ID)

def transcribe(audio):
    """Run the module-level ASR pipeline on ``audio`` and return its text.

    ``audio`` is handed to ``pipe`` unchanged; only the ``"text"`` entry of
    the pipeline result is returned.
    """
    return pipe(audio)["text"]

def read_ground_truth(ground_truth_file):
    """Return the contents of a reference transcript file.

    Args:
        ground_truth_file: Path of the text file holding the reference
            transcript.

    Returns:
        The file's text with leading and trailing whitespace removed.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale default encoding.
    with open(ground_truth_file, "r", encoding="utf-8") as file:
        return file.read().strip()

def calculate_wer(transcribed_text, ground_truth_text):
    """Return the word error rate (WER) reported by ``jiwer.wer``.

    This function takes the transcription first, whereas ``jiwer.wer`` is
    called with the ground truth first.
    """
    reference, hypothesis = ground_truth_text, transcribed_text
    return jiwer.wer(reference, hypothesis)

def calculate_cer(transcribed_text, ground_truth_text):
    """Return the character error rate (CER) reported by ``jiwer.cer``.

    This function takes the transcription first, whereas ``jiwer.cer`` is
    called with the ground truth first.
    """
    reference, hypothesis = ground_truth_text, transcribed_text
    return jiwer.cer(reference, hypothesis)

def calculate_mer(transcribed_text, ground_truth_text):
    """Return the match error rate (MER) reported by ``jiwer.mer``.

    The value comes straight from jiwer; it is not derived from WER here.
    This function takes the transcription first, whereas ``jiwer.mer`` is
    called with the ground truth first.
    """
    reference, hypothesis = ground_truth_text, transcribed_text
    return jiwer.mer(reference, hypothesis)

def calculate_wil(transcribed_text, ground_truth_text):
    """Return the word information lost (WIL) reported by ``jiwer.wil``.

    This function takes the transcription first, whereas ``jiwer.wil`` is
    called with the ground truth first.
    """
    reference, hypothesis = ground_truth_text, transcribed_text
    return jiwer.wil(reference, hypothesis)
    

def calculate_wip(transcribed_text, ground_truth_text):
    """Return the word information preserved (WIP) reported by ``jiwer.wip``.

    This function takes the transcription first, whereas ``jiwer.wip`` is
    called with the ground truth first.
    """
    reference, hypothesis = ground_truth_text, transcribed_text
    return jiwer.wip(reference, hypothesis)
    

# Path to the CSV containing file paths
csv_path = '/home/alien/Git/XSpeech/data_processing/Libristutter_16hkz_fps.csv'

# Read the CSV file; the loop below reads its 'stuttered_fp' and
# 'transcript_fp' columns.
df = pd.read_csv(csv_path)

# Per-file scores, kept so the means can be reported after the loop
wers = []
mers = []
wils = []
wips = []
cers = []

# Everything printed inside this block goes to the results file.
# - The file is written as UTF-8 so the output does not depend on the locale.
# - redirect_stdout puts back whichever stream was active beforehand, even if
#   an exception is raised part-way through. Assigning sys.__stdout__ instead
#   would not restore a replaced stdout (e.g. inside a notebook kernel).
with open('transcription_results.txt', 'w', encoding='utf-8') as output_file, \
        contextlib.redirect_stdout(output_file):
    # Loop through each row in the dataframe
    for index, row in df.iterrows():
        audio_path = row['stuttered_fp']  # Column containing audio file paths
        ground_truth_path = row['transcript_fp']  # Column containing ground truth file paths

        # Transcribe the audio and load the matching reference text
        transcribed_text = transcribe(audio_path)
        ground_truth_text = read_ground_truth(ground_truth_path)

        # Score this file with every metric
        wer = calculate_wer(transcribed_text, ground_truth_text)
        mer = calculate_mer(transcribed_text, ground_truth_text)
        wil = calculate_wil(transcribed_text, ground_truth_text)
        wip = calculate_wip(transcribed_text, ground_truth_text)
        cer = calculate_cer(transcribed_text, ground_truth_text)

        wers.append(wer)
        mers.append(mer)
        wils.append(wil)
        wips.append(wip)
        cers.append(cer)

        # Print results for each file
        print(f"Audio File: {audio_path}")
        print(f"Transcribed Text: {transcribed_text}")
        print(f"Ground Truth Text: {ground_truth_text}")
        print(f"WER: {wer}")
        print(f"MER: {mer}")
        print(f"WIL: {wil}")
        print(f"WIP: {wip}")
        print(f"CER: {cer}")
        print("="*50)

    # Unweighted mean of the per-file scores (each file counts equally)
    mean_wer = np.mean(np.array(wers))
    mean_mer = np.mean(np.array(mers))
    mean_wil = np.mean(np.array(wils))
    mean_wip = np.mean(np.array(wips))
    mean_cer = np.mean(np.array(cers))

    # Print the mean results
    print(f"Mean WER: {mean_wer}")
    print(f"Mean MER: {mean_mer}")
    print(f"Mean WIL: {mean_wil}")
    print(f"Mean WIP: {mean_wip}")
    print(f"Mean CER: {mean_cer}")


model.safetensors:   2%|2         | 62.9M/3.06G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.75k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/283k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.19k [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

Device set to use cuda:0
Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
