In [1]:
# Base code by Sile/Mustafa; see TextCompare.py
# Modified for multiple cases with single file by Elizabeth

import numpy as np
import re
import codecs
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from functools import reduce


# First-time setup: uncomment and run once to download the NLTK stop-word corpus.
# import nltk
# nltk.download('stopwords')

In [24]:
# Hyper-parameters
# Tokens found in this set are dropped before texts are compared.
stop_words = {*stopwords.words('english')}
# stop_words = set()   # Use this line instead to run without stop words

def get_transcripts(path, encoding=None):
    """
    Separates a master file of transcripts into individual cases.

    Every line of the file is treated as one transcript and is returned
    unchanged, including its trailing newline when it has one.

    :param path: the location of the file with all transcripts
    :param encoding: text encoding used to decode the file; ``None`` (the
        default) lets ``open`` pick the platform default, as before
    :return transcripts: a list of all the transcripts, in file order
    """
    # The context manager closes the handle even when reading a line raises
    # (e.g. a decoding error), which a bare open()/close() pair does not.
    with open(path, "r", encoding=encoding) as file:
        return list(file)
    
# Compare two texts: an original (reference) transcript and a recognized one
class TextComp(object):
    """
    Scores a recognized text against an original (reference) text.

    Both texts are reduced to lower-cased word tokens with stop words removed,
    then aligned with a Levenshtein-style edit distance over words.  ``WER``
    reports the word error rate and records the substitution/deletion/insertion
    counts in ``S``/``D``/``I``; ``Accuracy`` reads those counts, so it only
    makes sense directly after the matching ``WER`` call.

    The two paths may each hold a single text, or a "master file" with one
    transcript per line that can be scored individually via ``ind``.
    """

    def __init__(self, original_path, recognition_path, encoding='utf-8'):
        # original_path: path of the original text
        # recognition_path: path of the recognized text
        # encoding: specifies the encoding which is to be used for the file
        # all_originals: list of all individual transcripts if using master files
        # all_recog: list of all individual transcripts if using master files
        self.original_path = original_path
        self.recognition_path = recognition_path
        self.encoding = encoding
        # NOTE: get_transcripts receives only the path here, so `encoding` is
        # not applied when these per-line lists are read.
        self.all_originals = get_transcripts(original_path)
        self.all_recog = get_transcripts(recognition_path)
        # Insertion / substitution / deletion counts of the most recent WER() run.
        self.I = 0
        self.S = 0
        self.D = 0

    def Preprocess(self, path, one=False):
        """
        Turn a text into the list of tokens used for scoring.

        :param path: a file path when ``one`` is False, or the transcript text
            itself when ``one`` is True
        :param one: True if ``path`` already contains the text to process
        :return: list of lower-cased word tokens with stop words removed
        """
        if one:
            text = path
        else:
            with codecs.open(path, encoding=self.encoding) as f:
                text = f.read()
        # Both input kinds go through the same normalisation so that scoring a
        # transcript by index and scoring it as a whole file agree.
        return self._tokenize(text)

    @staticmethod
    def _tokenize(text):
        """Lower-case the text, drop commas, split into words, remove stop words."""
        # Commas are deleted rather than turned into spaces, so "1,000" stays a
        # single token ("1000") instead of splitting into "1" and "000".
        text = text.lower().replace(",", "")
        tokenizer = RegexpTokenizer(r'\w+')
        words = tokenizer.tokenize(text)
        return [w for w in words if w not in stop_words]

    @staticmethod
    def _error_rate(errors, ref_len):
        """Return errors / ref_len, with defined results for an empty reference."""
        if ref_len == 0:
            # No reference words: a matching empty hypothesis is a perfect
            # score; any inserted word makes the rate unbounded.
            return 0.0 if errors == 0 else float("inf")
        return errors / float(ref_len)

    def WER(self, debug=False, ind="all"):
        """
        Compute the word error rate (S + D + I) / (number of reference words).

        Also stores the substitution, deletion and insertion counts of this
        run in ``self.S``, ``self.D`` and ``self.I``.

        :param debug: if True, print the word-by-word alignment and the counts,
            and return the rate unrounded
        :param ind: ``"all"`` to compare the two files as whole texts, or the
            index of one transcript (one line) in the master files
        :return: the word error rate, rounded to 3 decimals unless ``debug`` is
            True.  With an empty reference the result is 0.0 when the
            hypothesis is empty too, and ``inf`` otherwise.
        """
        if ind == "all":
            r = self.Preprocess(self.original_path)
            h = self.Preprocess(self.recognition_path)
        else:
            r = self.Preprocess(self.all_originals[ind], True)
            h = self.Preprocess(self.all_recog[ind], True)
        # costs will hold the costs, like in the Levenshtein distance algorithm
        costs = [[0 for inner in range(len(h) + 1)] for outer in range(len(r) + 1)]
        # backtrace will hold the operations we've done.
        # so we could later backtrace, like the WER algorithm requires us to.
        backtrace = [[0 for inner in range(len(h) + 1)] for outer in range(len(r) + 1)]

        OP_OK = 0
        OP_SUB = 1
        OP_INS = 2
        OP_DEL = 3

        # First column represents the case where we achieve zero
        # hypothesis words by deleting all reference words.
        for i in range(1, len(r) + 1):
            costs[i][0] = i
            backtrace[i][0] = OP_DEL

        # First row represents the case where we achieve the hypothesis
        # by inserting all hypothesis words into a zero-length reference.
        for j in range(1, len(h) + 1):
            costs[0][j] = j
            backtrace[0][j] = OP_INS

        # computation
        for i in range(1, len(r) + 1):
            for j in range(1, len(h) + 1):
                if r[i - 1] == h[j - 1]:
                    costs[i][j] = costs[i - 1][j - 1]
                    backtrace[i][j] = OP_OK
                else:
                    substitutionCost = costs[i - 1][j - 1] + 1  # penalty is always 1
                    insertionCost = costs[i][j - 1] + 1  # penalty is always 1
                    deletionCost = costs[i - 1][j] + 1  # penalty is always 1

                    costs[i][j] = min(substitutionCost, insertionCost, deletionCost)
                    # On ties the order below decides: substitution first,
                    # then insertion, then deletion.
                    if costs[i][j] == substitutionCost:
                        backtrace[i][j] = OP_SUB
                    elif costs[i][j] == insertionCost:
                        backtrace[i][j] = OP_INS
                    else:
                        backtrace[i][j] = OP_DEL

        # back trace through the best route:
        i = len(r)
        j = len(h)
        self.S = 0
        self.D = 0
        self.I = 0
        numCor = 0
        lines = []
        if debug:
            print("OP\toriginal\trecognition")
        while i > 0 or j > 0:
            if backtrace[i][j] == OP_OK:
                numCor += 1
                i -= 1
                j -= 1
                if debug:
                    lines.append("OK\t" + r[i] + "\t" + h[j])
            elif backtrace[i][j] == OP_SUB:
                self.S += 1
                i -= 1
                j -= 1
                if debug:
                    lines.append("SUB\t" + r[i] + "\t" + h[j])
            elif backtrace[i][j] == OP_INS:
                self.I += 1
                j -= 1
                if debug:
                    lines.append("INS\t" + "****" + "\t" + h[j])
            elif backtrace[i][j] == OP_DEL:
                self.D += 1
                i -= 1
                if debug:
                    lines.append("DEL\t" + r[i] + "\t" + "****")

        wer_result = self._error_rate(self.S + self.D + self.I, len(r))
        if debug:
            # The alignment was collected end-to-start; print it in reading order.
            for line in reversed(lines):
                print(line)
            print("#cor " + str(numCor))
            print("#sub " + str(self.S))
            print("#del " + str(self.D))
            print("#ins " + str(self.I))
            return wer_result
        return round(wer_result, 3)

    def Accuracy(self, ind="all"):
        """
        Fraction of reference words that were neither deleted nor substituted.

        Uses the ``D`` and ``S`` counts left behind by the most recent ``WER``
        call, so call ``WER`` with the same ``ind`` immediately beforehand.

        :param ind: ``"all"`` for the whole original file, or the index of one
            transcript in the master file
        :return: (N - D - S) / N where N is the number of reference tokens;
            1.0 when the reference has no tokens (there was nothing to miss)
        """
        if ind == "all":
            reference = self.Preprocess(self.original_path)
        else:
            reference = self.Preprocess(self.all_originals[ind], True)
        ref_len = len(reference)
        if ref_len == 0:
            return 1.0
        return float(ref_len - self.D - self.S) / ref_len



In [25]:
# Example of how to use
def main(path="C:/Users/Student/OneDrive/Documents/Summer 2019 Research/Week 5-Clean and North Garden/NG625Eval/",
         num_transcripts=2):
    """
    Example driver: prints the accuracy of each transcript in a master file.

    Instructions: Modify path to be where your files are.
    "master file" = one text file with multiple transcripts, generally copied from a spreadsheet
    If you want the individual WERs of each transcript in a master file, use WER(ind=#) in a loop with # = index numbers,
    one for each transcript, as seen in the loop below.
    If you want the WER for a single file or the overall WER of many files, use WER() without arguments.
    You can also get the WER of one transcript from a file with many by just using WER(ind=#) with the index of what you want.
    -Above suggestions also apply to Accuracy; however, I, D, and S are set after the last WER, so you have to run
        Accuracy() directly after an individual text's WER() if you want the accuracy for just that text

    Notes: -debug=True shows the word-by-word breakdown of the comparison
           -making stop_words an empty set (see #hyper parameters) finds WER without taking stop words into account
           -you have to run WER() before you can correctly use Accuracy()

    Other usages:
        Per-transcript WER with the alignment printed (inside the loop):
            print("Recording 00" + str(i) + " WER: " + str(compare.WER(ind=i, debug=True)))
        Single WER for a pair of single-text files:
            compare = TextComp(path + "singletest.txt", path + "singletestapi.txt")
            print(str(compare.WER(debug=True)))

    :param path: folder prefix (with trailing slash) holding
        "mastertexttrue.txt" and "mastertextresults.txt"
    :param num_transcripts: how many transcripts, starting at index 0, to score
    """
    # Build the comparer once: its constructor reads both master files, so
    # re-creating it on every iteration only repeated that work.
    compare = TextComp(path + "mastertexttrue.txt", path + "mastertextresults.txt")

    # Multiple WERs
    for i in range(num_transcripts):
        # WER() must run first: it sets the S/D counts that Accuracy() reads.
        compare.WER(ind=i)
        print(str(compare.Accuracy(ind=i)))
    

In [26]:
# Run the example; it prints one accuracy value per scored transcript.
main()

0.8596491228070176
0.8773946360153256
