# INSTALLS AND IMPORTS

In [None]:
### INSTALL DEPENDENCIES
!pip install python-Levenshtein
!pip uninstall nltk
!pip install nltk==3.6.2



In [1]:
### IMPORT LIBRARIES
import pandas as pd
import numpy as np
import sys
import os
import Levenshtein as Lev
import pickle

# FUNCTIONS

In [18]:
def wer(s1, s2):
    """
    Word-level edit distance between two sentences.

    Both sentences are tokenized on whitespace, every distinct word is
    replaced by a unique single character, and the Levenshtein distance
    between the two resulting strings is returned. Note that the result is
    the raw edit count, not a rate: no division by sentence length happens
    here.

    Arguments:
        s1 (string): space-separated sentence
        s2 (string): space-separated sentence
    Returns:
        The value of ``Lev.distance`` on the two encoded sentences.
    """
    tokens_a = s1.split()
    tokens_b = s2.split()

    # Give every distinct word its own code point; the Levenshtein package
    # compares strings, so each word has to collapse to one character.
    vocabulary = set(tokens_a + tokens_b)
    symbol_for = {word: chr(index) for index, word in enumerate(vocabulary)}

    encoded_a = ''.join(symbol_for[token] for token in tokens_a)
    encoded_b = ''.join(symbol_for[token] for token in tokens_b)

    return Lev.distance(encoded_a, encoded_b)

def wer_normalized(s1, s2):
  """
  Case-insensitive word error rate of ``s1`` measured against ``s2``.

  The word-level edit distance from ``wer`` (computed on the lower-cased
  sentences) is divided by the number of words in ``s2``.

  Arguments:
      s1 (string): hypothesis sentence
      s2 (string): reference sentence; its word count is the denominator
  Returns:
      float: edit distance divided by the reference word count. If ``s2``
      contains no words the distance is returned undivided (denominator 1).
  """
  # Count reference words with the same whitespace tokenization `wer` uses
  # (str.split() with no argument). Splitting on a literal " " counts empty
  # strings for repeated/leading/trailing spaces and would deflate the rate.
  reference_word_count = len(s2.split())
  # max(..., 1) keeps an empty reference from dividing by zero.
  return wer(s1.lower(), s2.lower()) / max(reference_word_count, 1)
def calculate_wer(df):
  """
  Normalized WER of one record's corrected text against its reference.

  Arguments:
      df: object indexable by the keys 'corrected' and 'reference_text'.
          In this notebook it is called through ``DataFrame.apply(axis=1)``,
          so it receives a single row rather than a whole frame.
  Returns:
      The result of ``wer_normalized(corrected, reference_text)``.
  """
  hypothesis_text = df['corrected']
  reference_text = df['reference_text']
  return wer_normalized(hypothesis_text, reference_text)

In [4]:
def open_dataset(path, load_outputs=True):
    """
    Read a tab-separated, blank-line-delimited sentence file.

    Each non-empty line is one token record whose tab-separated fields are
    ``input<TAB>output``; sentences are separated by an empty line.

    Arguments:
        path: path of the file to read.
        load_outputs (bool): when False only the first column is returned.
    Returns:
        ``inputs`` (list of sentences, each a list of first-column strings)
        when ``load_outputs`` is False, otherwise the tuple
        ``(inputs, outputs)`` where ``outputs`` holds the second column.
    Raises:
        IndexError: if ``load_outputs`` is True and a line has no tab.
    """
    # Explicit encoding so the result does not depend on the platform default.
    # NOTE(review): assumes the data files are UTF-8 -- confirm.
    with open(path, encoding='utf-8') as f:
        raw_text = f.read()

    # Keep every non-empty chunk instead of blindly discarding the last one:
    # a file that does not end with a blank line would otherwise lose its
    # final sentence. Empty lines inside a chunk (e.g. a trailing newline)
    # are skipped so they do not become empty tokens.
    sentences = []
    for chunk in raw_text.split("\n\n"):
        lines = [line for line in chunk.split('\n') if line]
        if lines:
            sentences.append(lines)

    inputs = [[line.split('\t')[0] for line in sentence] for sentence in sentences]

    if not load_outputs:
        return inputs

    outputs = [[line.split('\t')[1] for line in sentence] for sentence in sentences]
    return inputs, outputs

# LOAD DATA and EVALUATE ON MultiLexNorm


In [7]:
### READ FILE
# open_dataset returns (first column, second column) per sentence; here the
# first column is used as the transcription and the second as the model's
# corrected output.
# The first column is bound to a real name instead of `_`: IPython uses `_`
# for the most recent cell output, so storing data there is fragile.
# NOTE(review): absolute, user-specific path -- the notebook only runs on this
# machine layout; consider a configurable data directory.
inputs, outputs = open_dataset('/Users/jenspt/Desktop/git/DL-ByT5/data/outputs_mln_ft.txt')
corrected = [' '.join(sentence) for sentence in outputs]
transcribed = [' '.join(sentence) for sentence in inputs]

In [10]:
# Load the pickled reference sentences and join each one into a single
# space-separated string.
# NOTE(review): pickle.load can execute arbitrary code -- only use it on files
# you produced yourself.
with open('/Users/jenspt/Desktop/git/DL-ByT5/data/mln_data_test_outputs.pkl', 'rb') as f:
    reference = [" ".join(tokens) for tokens in pickle.load(f)]

In [11]:
# The three lists are paired positionally, so they must describe the same
# sentences in the same order. Fail loudly on a length mismatch instead of
# letting the shortest list silently truncate the evaluation set.
if not (len(reference) == len(transcribed) == len(corrected)):
    raise ValueError(
        "Sentence counts differ: reference=%d, transcription=%d, corrected=%d"
        % (len(reference), len(transcribed), len(corrected))
    )

mln_df = pd.DataFrame({
    'reference_text': reference,
    'transcription': transcribed,
    'corrected': corrected,
})

In [12]:
# Preview the first five aligned rows as a sanity check.
mln_df.head(5)

Unnamed: 0,reference_text,transcription,corrected
0,gyset der har siddet sammenkrøbet i nakkeregio...,gyset der har siddet sammenkrøbet i nakke regi...,gylfi dér er sidde sammenkrøbene inden nakkere...
1,det er et enormt befolkningstal sammenlignet m...,det er et enormt befolkningstal sammenlignet m...,der er enormt enorm befolkningstal sammenligne...
2,de seks balletter er ikke alle avantgardestykk...,de seks balletter er ikke alle avangard stykke...,det seksballetter balletter har alle al avanga...
3,stakkels davedarling,stakkels dave darling,stakels davedarling darling
4,det får han osse,det får han også,dét for hr osse


# Evaluation



In [19]:
# Row-wise apply: every row is handed to calculate_wer, which scores the
# 'corrected' text against 'reference_text'.
mln_df['corrected_wer'] = mln_df.apply(calculate_wer, axis='columns')

In [20]:
# Mean of the per-sentence WER values, scaled to a percentage.
mean_wer_percent = mln_df['corrected_wer'].mean() * 100
print("MLN WER: ", mean_wer_percent)

MLN WER:  75.16467803146487


# BLEU & GLEU

In [None]:
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.gleu_score import sentence_gleu

In [21]:
def _token_pairs(df, hypothesis_column):
    """Yield ``(references, hypothesis)`` token lists for every row of ``df``.

    ``references`` is a one-element list holding the tokens of
    ``df['reference_text']``; ``hypothesis`` holds the tokens of
    ``df[hypothesis_column]``. Both texts are split on a single space.
    """
    for reference_text, hypothesis_text in zip(df['reference_text'], df[hypothesis_column]):
        yield [reference_text.split(" ")], hypothesis_text.split(" ")


def _mean_score(scores):
    """Return the arithmetic mean of ``scores``, or NaN when there are none."""
    scores = list(scores)
    if not scores:
        # An empty frame has no meaningful average; NaN avoids a
        # ZeroDivisionError and cannot be mistaken for a real score.
        return float("nan")
    return sum(scores) / len(scores)


def calculate_bleu_baseline_normalized(df, smoothing_function=None):
    """Mean sentence-level BLEU of ``df['transcription']`` vs ``df['reference_text']``.

    Arguments:
        df: DataFrame with string columns 'reference_text' and 'transcription'.
        smoothing_function: optional value forwarded to ``sentence_bleu``'s
            ``smoothing_function`` argument. The default ``None`` keeps the
            unsmoothed behaviour; without smoothing, sentences lacking some
            n-gram order make NLTK emit the "0 counts of N-gram overlaps"
            warning.
    Returns:
        float: the average of the per-row scores, NaN for an empty frame.
    """
    return _mean_score(
        sentence_bleu(refs, hyp, smoothing_function=smoothing_function)
        for refs, hyp in _token_pairs(df, 'transcription')
    )


def calculate_gleu_baseline_normalized(df):
    """Mean sentence-level GLEU of ``df['transcription']`` vs ``df['reference_text']``.

    Arguments:
        df: DataFrame with string columns 'reference_text' and 'transcription'.
    Returns:
        float: the average of the per-row scores, NaN for an empty frame.
    """
    return _mean_score(
        sentence_gleu(refs, hyp)
        for refs, hyp in _token_pairs(df, 'transcription')
    )


def calculate_bleu_normalized(df, smoothing_function=None):
    """Mean sentence-level BLEU of ``df['corrected']`` vs ``df['reference_text']``.

    Arguments:
        df: DataFrame with string columns 'reference_text' and 'corrected'.
        smoothing_function: optional value forwarded to ``sentence_bleu``'s
            ``smoothing_function`` argument; ``None`` (default) keeps the
            unsmoothed behaviour.
    Returns:
        float: the average of the per-row scores, NaN for an empty frame.
    """
    return _mean_score(
        sentence_bleu(refs, hyp, smoothing_function=smoothing_function)
        for refs, hyp in _token_pairs(df, 'corrected')
    )


def calculate_gleu_normalized(df):
    """Mean sentence-level GLEU of ``df['corrected']`` vs ``df['reference_text']``.

    Arguments:
        df: DataFrame with string columns 'reference_text' and 'corrected'.
    Returns:
        float: the average of the per-row scores, NaN for an empty frame.
    """
    return _mean_score(
        sentence_gleu(refs, hyp)
        for refs, hyp in _token_pairs(df, 'corrected')
    )

In [31]:
# Baseline scores: raw transcription vs reference, BLEU first and then GLEU.
for baseline_metric in (calculate_bleu_baseline_normalized,
                        calculate_gleu_baseline_normalized):
    print(baseline_metric(mln_df))

The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


0.6473379750494206
0.7097791013001765


In [None]:

# Scores for the model's corrected output against the reference.
# Disabled experiments kept as a note: a corpus-level BLEU
# (calculate_bleu_corpus) and calculate_gleu were tried here; the latter
# failed with "RuntimeError: generator raised StopIteration".
norm_bleu_score = calculate_bleu_normalized(mln_df)
print("\nNormalized bleu:", norm_bleu_score, sep="\n")

norm_gleu_score = calculate_gleu_normalized(mln_df)
print("\nNormalized gleu:", norm_gleu_score, sep="\n")

The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()



Normalized bleu:
0.006133337391333845

Normalized gleu:
0.1129387766622926


In [None]:
## GLEU SCORES
# Baseline (transcription)= 0.8534873003494211
# ByT5_lr_1e4             = 0.8952613840069592
# ByT5_ws3000             = 0.8873560947109903
# ByT5                    = 0.887649053358017
# ByT5_extra_layer        = 0.0
# MultiLexNorm_raw_data   = 0.7097791013001765
# MultiLexNorm_base       = 0.6818061444602583
# MultiLexNorm_fine_tuned = 0.1129387766622926


## BLEU SCORES (normalized)
# Baseline (transcription)= 0.7871290040412282
# ByT5                    = 0.8233903148477896
# ByT5_lr_1e4             = 0.8320419867936647
# ByT5_ws3000             = 0.8231682536463248
# ByT5_extra_layer        = 0.0
# MultiLexNorm_raw_data   = 0.6473379750494206
# MultiLexNorm_base       = 0.6190157431613983
# MultiLexNorm_fine_tuned = 0.006133337391333845
