In [1]:
import os

In [2]:
transcriptions_file_name = 'manual_transcriptions.csv'
transcriptions_file_path = '../transcriptions/'
transcriptions_file = os.path.join(transcriptions_file_path, transcriptions_file_name)
transcriptions_file

'../transcriptions/manual_transcriptions.csv'

In [3]:
if not os.path.exists(transcriptions_file) and os.path.exists(transcriptions_file.replace('.csv', '.ods')):
    print("File does not exist. Please convert .ods file to .csv")
    raise RuntimeError

In [4]:
import pandas as pd

In [5]:
transcriptions_df = pd.read_csv(transcriptions_file, sep='\t')
transcriptions_df

Unnamed: 0,Sound_file,Length (s),Transcription,nr_words,words per second
0,016_20140701.wav.pcm,22,I would like to propose on behalf of the snd g...,33,1.500000
1,017_20140701.wav.pcm,27,"no, I ask then the nominated candidate whether...",40,1.481481
2,018_20140701.wav.pcm,15,"and as there is only one candidate, I would li...",43,2.866667
3,019_20140701.wav.pcm,23,"so lets do it by acclamation, congratulation m...",24,1.043478
4,020_20140701.wav.pcm,28,thank you very much colleagues,5,0.178571
...,...,...,...,...,...
96,144_20141120.wav.pcm,25,the commissions commitment to putting an indep...,67,2.680000
97,145_20141120.wav.pcm,16,It has to be immune from European and national...,38,2.375000
98,146_20141120.wav.pcm,22,"Therefore, the appointment procedure for them ...",59,2.681818
99,147_20141120.wav.pcm,17,It is important to recall that at the local le...,56,3.294118


#### Create 4 columns:

* preprocessed manual transcription
* ASR output
* preprocessed ASR output
* WER


In [6]:
import re
import numpy as np
def preprocess_text(transcription):
    if not type(transcription) is str:
#         input(transcription)
        return ''
    transcription = transcription.lower()
    
    transcription = re.sub(r'[^\w\s]',' ',transcription)
    transcription = re.sub(r'\s+',' ',transcription)
    return transcription

In [7]:
transcriptions_df["preprocessed_transcription"] = transcriptions_df.Transcription.apply(preprocess_text)
transcriptions_df.preprocessed_transcription

0      i would like to propose on behalf of the snd g...
1      no i ask then the nominated candidate whether ...
2      and as there is only one candidate i would lik...
3      so lets do it by acclamation congratulation mi...
4                         thank you very much colleagues
                             ...                        
96     the commissions commitment to putting an indep...
97     it has to be immune from european and national...
98     therefore the appointment procedure for them m...
99     it is important to recall that at the local le...
100    now with regard to ehm ehm ehm ehm protection ...
Name: preprocessed_transcription, Length: 101, dtype: object

In [8]:
from zipfile import ZipFile
zipfiles_path = '../ASR_output/the_transcripts'


def get_filename_of_output(sound_file_name):
    filename = sound_file_name
    bare_filename = filename.split('.')[0]
    id_1, id_2 = bare_filename.split('_')
    zipfile = f'{id_2}.zip'
    zipfile_path = os.path.join(zipfiles_path, zipfile)
    member = os.path.join(id_2, f"{id_1}_{id_2}.wav.pcm.txt")
    return {
        'zipfile': zipfile_path,
        'member': member
    }

def get_asr_output(sound_file_name):
    file_info = get_filename_of_output(sound_file_name)
    
    try:
        with ZipFile(file_info['zipfile'], 'r') as zipobj:
            text = zipobj.read(file_info['member']).decode("utf-8")
    except FileNotFoundError:
        text = ''
    
    return text
    

In [9]:
transcriptions_df['asr_output'] = transcriptions_df.Sound_file.apply(get_asr_output)
transcriptions_df.head()

Unnamed: 0,Sound_file,Length (s),Transcription,nr_words,words per second,preprocessed_transcription,asr_output
0,016_20140701.wav.pcm,22,I would like to propose on behalf of the snd g...,33,1.5,i would like to propose on behalf of the snd g...,I WOULD LIKE TO PROPOSE ON BEHALF OF THIES AND...
1,017_20140701.wav.pcm,27,"no, I ask then the nominated candidate whether...",40,1.481481,no i ask then the nominated candidate whether ...,NO I ASK THAN A A DENOMINATED CANDIDE WETHER H...
2,018_20140701.wav.pcm,15,"and as there is only one candidate, I would li...",43,2.866667,and as there is only one candidate i would lik...,AND AS THERE IS ONLY ONE CANDIDATE I WOULD LIK...
3,019_20140701.wav.pcm,23,"so lets do it by acclamation, congratulation m...",24,1.043478,so lets do it by acclamation congratulation mi...,SO A LEX DOID BY ACCLAMATION CONGRATULATION ST...
4,020_20140701.wav.pcm,28,thank you very much colleagues,5,0.178571,thank you very much colleagues,THANK YOU VERY MUCH GOEX OO


In [10]:
transcriptions_df['asr_output_preprocessed'] = transcriptions_df.asr_output.apply(preprocess_text)
transcriptions_df.head()

Unnamed: 0,Sound_file,Length (s),Transcription,nr_words,words per second,preprocessed_transcription,asr_output,asr_output_preprocessed
0,016_20140701.wav.pcm,22,I would like to propose on behalf of the snd g...,33,1.5,i would like to propose on behalf of the snd g...,I WOULD LIKE TO PROPOSE ON BEHALF OF THIES AND...,i would like to propose on behalf of thies and...
1,017_20140701.wav.pcm,27,"no, I ask then the nominated candidate whether...",40,1.481481,no i ask then the nominated candidate whether ...,NO I ASK THAN A A DENOMINATED CANDIDE WETHER H...,no i ask than a a denominated candide wether h...
2,018_20140701.wav.pcm,15,"and as there is only one candidate, I would li...",43,2.866667,and as there is only one candidate i would lik...,AND AS THERE IS ONLY ONE CANDIDATE I WOULD LIK...,and as there is only one candidate i would lik...
3,019_20140701.wav.pcm,23,"so lets do it by acclamation, congratulation m...",24,1.043478,so lets do it by acclamation congratulation mi...,SO A LEX DOID BY ACCLAMATION CONGRATULATION ST...,so a lex doid by acclamation congratulation st...
4,020_20140701.wav.pcm,28,thank you very much colleagues,5,0.178571,thank you very much colleagues,THANK YOU VERY MUCH GOEX OO,thank you very much goex oo


In [11]:
from jiwer import wer
import jiwer

# transformation = jiwer.Compose([
#     jiwer.ToLowerCase(),
#     jiwer.RemoveMultipleSpaces(),
#     jiwer.RemoveWhiteSpace(replace_by_space=False),
#     jiwer.SentencesToListOfWords(word_delimiter=" ")
# ]) 

def calculate_wer(row):
    ground_truth = row.Transcription
    hypothesis = row.asr_output_preprocessed
    if not type(ground_truth) is str or not type(hypothesis) is str:
        print('return nan')
        return np.NaN
#     print(type(ground_truth), type(hypothesis))
#     WordErrRate = wer(ground_truth, hypothesis, truth_transform=transformation, 
#     hypothesis_transform=transformation)
    WordErrRate = wer(ground_truth, hypothesis)
    return WordErrRate
    

In [12]:
transcriptions_df['jiWER'] = transcriptions_df.apply(calculate_wer, axis=1)
transcriptions_df.head()

return nan
return nan


Unnamed: 0,Sound_file,Length (s),Transcription,nr_words,words per second,preprocessed_transcription,asr_output,asr_output_preprocessed,jiWER
0,016_20140701.wav.pcm,22,I would like to propose on behalf of the snd g...,33,1.5,i would like to propose on behalf of the snd g...,I WOULD LIKE TO PROPOSE ON BEHALF OF THIES AND...,i would like to propose on behalf of thies and...,0.606061
1,017_20140701.wav.pcm,27,"no, I ask then the nominated candidate whether...",40,1.481481,no i ask then the nominated candidate whether ...,NO I ASK THAN A A DENOMINATED CANDIDE WETHER H...,no i ask than a a denominated candide wether h...,0.525
2,018_20140701.wav.pcm,15,"and as there is only one candidate, I would li...",43,2.866667,and as there is only one candidate i would lik...,AND AS THERE IS ONLY ONE CANDIDATE I WOULD LIK...,and as there is only one candidate i would lik...,0.232558
3,019_20140701.wav.pcm,23,"so lets do it by acclamation, congratulation m...",24,1.043478,so lets do it by acclamation congratulation mi...,SO A LEX DOID BY ACCLAMATION CONGRATULATION ST...,so a lex doid by acclamation congratulation st...,0.5
4,020_20140701.wav.pcm,28,thank you very much colleagues,5,0.178571,thank you very much colleagues,THANK YOU VERY MUCH GOEX OO,thank you very much goex oo,0.4


In [13]:
def print_wers(row):
    print('truth :\n', row.preprocessed_transcription, end='\n---\n')
    print('hypothesis :\n', row.asr_output_preprocessed, end='\n---\n')
    print('WER:', f"{row.WER:.2f}")
    input()
    
# transcriptions_df.apply(print_wers, axis=1)

In [14]:
import pywer


def calculate_pywer(row):
    ground_truth = row.Transcription
    hypothesis = row.asr_output_preprocessed
    if not type(ground_truth) is str or not type(hypothesis) is str:
        print('return nan')
        return np.NaN
#     print(type(ground_truth), type(hypothesis))
    WordErrRate = pywer.wer([ground_truth], [hypothesis])/100
    return WordErrRate


In [15]:
transcriptions_df['pyWER'] = transcriptions_df.apply(calculate_pywer, axis=1)
transcriptions_df.head()


return nan
return nan


Unnamed: 0,Sound_file,Length (s),Transcription,nr_words,words per second,preprocessed_transcription,asr_output,asr_output_preprocessed,jiWER,pyWER
0,016_20140701.wav.pcm,22,I would like to propose on behalf of the snd g...,33,1.5,i would like to propose on behalf of the snd g...,I WOULD LIKE TO PROPOSE ON BEHALF OF THIES AND...,i would like to propose on behalf of thies and...,0.606061,0.606061
1,017_20140701.wav.pcm,27,"no, I ask then the nominated candidate whether...",40,1.481481,no i ask then the nominated candidate whether ...,NO I ASK THAN A A DENOMINATED CANDIDE WETHER H...,no i ask than a a denominated candide wether h...,0.525,0.525
2,018_20140701.wav.pcm,15,"and as there is only one candidate, I would li...",43,2.866667,and as there is only one candidate i would lik...,AND AS THERE IS ONLY ONE CANDIDATE I WOULD LIK...,and as there is only one candidate i would lik...,0.232558,0.232558
3,019_20140701.wav.pcm,23,"so lets do it by acclamation, congratulation m...",24,1.043478,so lets do it by acclamation congratulation mi...,SO A LEX DOID BY ACCLAMATION CONGRATULATION ST...,so a lex doid by acclamation congratulation st...,0.5,0.5
4,020_20140701.wav.pcm,28,thank you very much colleagues,5,0.178571,thank you very much colleagues,THANK YOU VERY MUCH GOEX OO,thank you very much goex oo,0.4,0.4


In [16]:
import sys
import numpy


def editDistance(r, h):
    '''
    This function is to calculate the edit distance of reference sentence and the hypothesis sentence.

    Main algorithm used is dynamic programming.

    Attributes: 
        r -> the list of words produced by splitting reference sentence.
        h -> the list of words produced by splitting hypothesis sentence.
    '''
    d = numpy.zeros((len(r)+1)*(len(h)+1), dtype=numpy.uint8).reshape((len(r)+1, len(h)+1))
    for i in range(len(r)+1):
        d[i][0] = i
    for j in range(len(h)+1):
        d[0][j] = j
    for i in range(1, len(r)+1):
        for j in range(1, len(h)+1):
            if r[i-1] == h[j-1]:
                d[i][j] = d[i-1][j-1]
            else:
                substitute = d[i-1][j-1] + 1
                insert = d[i][j-1] + 1
                delete = d[i-1][j] + 1
                d[i][j] = min(substitute, insert, delete)
    return d

def getStepList(r, h, d):
    '''
    This function is to get the list of steps in the process of dynamic programming.

    Attributes: 
        r -> the list of words produced by splitting reference sentence.
        h -> the list of words produced by splitting hypothesis sentence.
        d -> the matrix built when calulating the editting distance of h and r.
    '''
    x = len(r)
    y = len(h)
    list = []
    while True:
        if x == 0 and y == 0: 
            break
        elif x >= 1 and y >= 1 and d[x][y] == d[x-1][y-1] and r[x-1] == h[y-1]: 
            list.append("e")
            x = x - 1
            y = y - 1
        elif y >= 1 and d[x][y] == d[x][y-1]+1:
            list.append("i")
            x = x
            y = y - 1
        elif x >= 1 and y >= 1 and d[x][y] == d[x-1][y-1]+1:
            list.append("s")
            x = x - 1
            y = y - 1
        else:
            list.append("d")
            x = x - 1
            y = y
    return list[::-1]

def alignedPrint(list, r, h, result):
    '''
    This funcition is to print the result of comparing reference and hypothesis sentences in an aligned way.
    
    Attributes:
        list   -> the list of steps.
        r      -> the list of words produced by splitting reference sentence.
        h      -> the list of words produced by splitting hypothesis sentence.
        result -> the rate calculated based on edit distance.
    '''
    print("REF:", end=" ")
    for i in range(len(list)):
        if list[i] == "i":
            count = 0
            for j in range(i):
                if list[j] == "d":
                    count += 1
            index = i - count
            print(" "*(len(h[index])), end=" ")
        elif list[i] == "s":
            count1 = 0
            for j in range(i):
                if list[j] == "i":
                    count1 += 1
            index1 = i - count1
            count2 = 0
            for j in range(i):
                if list[j] == "d":
                    count2 += 1
            index2 = i - count2
            if len(r[index1]) < len(h[index2]):
                print(r[index1] + " " * (len(h[index2])-len(r[index1])), end=" ")
            else:
                print(r[index1], end=" "),
        else:
            count = 0
            for j in range(i):
                if list[j] == "i":
                    count += 1
            index = i - count
            print(r[index], end=" "),
    print("\nHYP:", end=" ")
    for i in range(len(list)):
        if list[i] == "d":
            count = 0
            for j in range(i):
                if list[j] == "i":
                    count += 1
            index = i - count
            print(" " * (len(r[index])), end=" ")
        elif list[i] == "s":
            count1 = 0
            for j in range(i):
                if list[j] == "i":
                    count1 += 1
            index1 = i - count1
            count2 = 0
            for j in range(i):
                if list[j] == "d":
                    count2 += 1
            index2 = i - count2
            if len(r[index1]) > len(h[index2]):
                print(h[index2] + " " * (len(r[index1])-len(h[index2])), end=" ")
            else:
                print(h[index2], end=" ")
        else:
            count = 0
            for j in range(i):
                if list[j] == "d":
                    count += 1
            index = i - count
            print(h[index], end=" ")
    print("\nEVA:", end=" ")
    for i in range(len(list)):
        if list[i] == "d":
            count = 0
            for j in range(i):
                if list[j] == "i":
                    count += 1
            index = i - count
            print("D" + " " * (len(r[index])-1), end=" ")
        elif list[i] == "i":
            count = 0
            for j in range(i):
                if list[j] == "d":
                    count += 1
            index = i - count
            print("I" + " " * (len(h[index])-1), end=" ")
        elif list[i] == "s":
            count1 = 0
            for j in range(i):
                if list[j] == "i":
                    count1 += 1
            index1 = i - count1
            count2 = 0
            for j in range(i):
                if list[j] == "d":
                    count2 += 1
            index2 = i - count2
            if len(r[index1]) > len(h[index2]):
                print("S" + " " * (len(r[index1])-1), end=" ")
            else:
                print("S" + " " * (len(h[index2])-1), end=" ")
        else:
            count = 0
            for j in range(i):
                if list[j] == "i":
                    count += 1
            index = i - count
            print(" " * (len(r[index])), end=" ")
    print("\nWER: " + result)

def mywer(r, h):
    """
    This is a function that calculate the word error rate in ASR.
    You can use it like this: wer("what is it".split(), "what is".split()) 
    """
    # build the matrix
    d = editDistance(r, h)

    # find out the manipulation steps
    list = getStepList(r, h, d)

    # print the result in aligned way
    result = float(d[len(r)][len(h)]) / len(r) * 100
    
    return result

In [17]:
def calculate_mywer(row):
    ground_truth = row.Transcription
    hypothesis = row.asr_output_preprocessed
    if not type(ground_truth) is str or not type(hypothesis) is str:
        print('return nan')
        return np.NaN
#     print(type(ground_truth), type(hypothesis))
    WordErrRate = mywer(ground_truth.split(' '), hypothesis.split(' '))/100
    return WordErrRate


In [18]:
from tqdm import tqdm
tqdm.pandas()
transcriptions_df['myWER'] = transcriptions_df.progress_apply(calculate_mywer, axis=1)
transcriptions_df.head()

  from pandas import Panel
  0%|          | 0/101 [00:00<?, ?it/s]

return nan


100%|██████████| 101/101 [00:02<00:00, 47.57it/s]

return nan





Unnamed: 0,Sound_file,Length (s),Transcription,nr_words,words per second,preprocessed_transcription,asr_output,asr_output_preprocessed,jiWER,pyWER,myWER
0,016_20140701.wav.pcm,22,I would like to propose on behalf of the snd g...,33,1.5,i would like to propose on behalf of the snd g...,I WOULD LIKE TO PROPOSE ON BEHALF OF THIES AND...,i would like to propose on behalf of thies and...,0.606061,0.606061,0.606061
1,017_20140701.wav.pcm,27,"no, I ask then the nominated candidate whether...",40,1.481481,no i ask then the nominated candidate whether ...,NO I ASK THAN A A DENOMINATED CANDIDE WETHER H...,no i ask than a a denominated candide wether h...,0.525,0.525,0.525
2,018_20140701.wav.pcm,15,"and as there is only one candidate, I would li...",43,2.866667,and as there is only one candidate i would lik...,AND AS THERE IS ONLY ONE CANDIDATE I WOULD LIK...,and as there is only one candidate i would lik...,0.232558,0.232558,0.232558
3,019_20140701.wav.pcm,23,"so lets do it by acclamation, congratulation m...",24,1.043478,so lets do it by acclamation congratulation mi...,SO A LEX DOID BY ACCLAMATION CONGRATULATION ST...,so a lex doid by acclamation congratulation st...,0.5,0.5,0.5
4,020_20140701.wav.pcm,28,thank you very much colleagues,5,0.178571,thank you very much colleagues,THANK YOU VERY MUCH GOEX OO,thank you very much goex oo,0.4,0.4,0.4


In [19]:
transcriptions_df.to_csv("evaluations.csv", sep='\t')
os.path.exists("evaluations.csv")

True

In [20]:
pywer.wer(transcriptions_df.preprocessed_transcription, transcriptions_df.asr_output_preprocessed)

35.092724679029956

In [21]:
pywer.cer(transcriptions_df.preprocessed_transcription, transcriptions_df.asr_output_preprocessed)

23.805329222638186

In [22]:
transcriptions_df_sorted = transcriptions_df.sort_values(by='jiWER', ascending=False)
transcriptions_df_sorted

Unnamed: 0,Sound_file,Length (s),Transcription,nr_words,words per second,preprocessed_transcription,asr_output,asr_output_preprocessed,jiWER,pyWER,myWER
22,066_20140723.wav.pcm,21,"misses brjork, well on behalf of the GUE ngl ...",11,0.523810,misses brjork well on behalf of the gue ngl g...,DEAR MISSUS BEORG AND YET EH YOU EVRONGUI WELL...,dear missus beorg and yet eh you evrongui well...,1.100000,1.100000,1.000000
41,002_20140915.wav.pcm,18,"yes, I would like to raise a point regarding p...",10,0.555556,yes i would like to raise a point regarding pr...,OPER SCULAT THAT'S IT INKEMURGED YES I WOULD L...,oper sculat that s it inkemurged yes i would l...,1.100000,1.100000,1.100000
40,001_20140915.wav.pcm,27,"so that’s good, okay colleagues, if there are ...",27,1.000000,so that s good okay colleagues if there are no...,HAVEN'T THEY SO THAT'S GOOD LOOK IT COLLEAGUES...,haven t they so that s good look it colleagues...,1.074074,1.074074,1.074074
100,232_20141211.wav.pcm,29,"now, with regard to ehm ehm ehm ehm, protectio...",60,2.068966,now with regard to ehm ehm ehm ehm protection ...,,,1.000000,1.000000,1.000000
99,147_20141120.wav.pcm,17,It is important to recall that at the local le...,56,3.294118,it is important to recall that at the local le...,,,1.000000,1.000000,0.982143
...,...,...,...,...,...,...,...,...,...,...,...
24,068_20140723.wav.pcm,21,"I am glad that the, inside the directive we ar...",38,1.809524,i am glad that the inside the directive we are...,I AM GLAD THAT THE INSIDE THE DIRECTIVE WE ARE...,i am glad that the inside the directive we are...,0.184211,0.184211,0.184211
87,175_20141110.wav.pcm,21,the next point on the agenda is the presention...,42,2.000000,the next point on the agenda is the presention...,THE NEXT POINT ON THE ADGENDER IS THE PRESENTI...,the next point on the adgender is the presenti...,0.166667,0.166667,0.166667
30,448_20140904.wav.pcm,28,"about fundamental rights, does it bring about ...",69,2.464286,about fundamental rights does it bring about a...,ABOUT FUNAMENAL RIGHTS DOES IT BRING ABOUT ANY...,about funamenal rights does it bring about any...,0.159420,0.159420,0.159420
5,021_20140701.wav.pcm,16,,0,0.000000,,,,,,


In [23]:
transcriptions_df_sorted.to_csv("evaluations_sorted.csv", sep='\t')
os.path.exists("evaluations_sorted.csv")

True