# Transcription normalization

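This notebook normalizes the raw transcriptions: for each line it strips the speaker label, lowercases the text, replaces anonymized names, noise markers and annotated acts with uppercase `<TAG>` tokens, removes the remaining markup and punctuation, and collects the words that the Spanish spell checker does not recognize.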

In [10]:
import os
import re
import unidecode

In [11]:
# Path of the raw transcription file that the rest of the notebook reads.
fname = "/Users/javirando/Desktop/Universidad/3º/3 Trimestre/Speech/Final Project/models/es-es/corele_raw.transcription"

# Sanity check: count how many lines the raw file contains.
count = 0
with open(fname, 'r') as raw_file:
    count += sum(1 for _ in raw_file)
print("Total number of lines is:", count)

Total number of lines is: 913


In [12]:
import enchant
from enchant.checker import SpellChecker

# Dictionary for the "es_ES" language tag, wrapped in a SpellChecker.
# `my_checker` is the shared checker that find_errors() feeds sentences to.
my_dict = enchant.Dict("es_ES")
my_checker = SpellChecker(my_dict)

In [13]:
def find_errors(sentence, errors):
    """Spell-check ``sentence`` and record the flagged words in ``errors``.

    The text is loaded into the module-level ``my_checker``; the ``word``
    attribute of every item the checker yields while iterating is added to
    the ``errors`` set in place. Nothing is returned.
    """
    my_checker.set_text(sentence)
    errors.update(flagged.word for flagged in my_checker)

In [14]:
def replace_acts(line, acts_set):
    """Turn ``{%act: word}`` annotations into uppercase ``<WORD>`` tags.

    Args:
        line: Text that may contain ``{%act: word}`` annotations (a single
            run of word characters after ``{%act: ``).
        acts_set: Set updated in place with every tag that was produced.

    Returns:
        ``line`` with each annotation replaced by its ``<WORD>`` tag.
    """
    def to_tag(match):
        # Build the tag from the captured word and remember it for the caller.
        tag = '<' + match.group(1).upper() + '>'
        acts_set.add(tag)
        return tag

    return re.sub(r'\{\%act\: (\w+)\}', to_tag, line)

In [15]:
def replace_names(line):
    """Replace anonymized names (word characters ending in ``xxx``) by ``<XXX>``.

    The substitution is done in a single regex pass over ``line``. Replacing
    each found name with ``str.replace`` instead would corrupt a longer name
    that ends with a shorter one (``anaxxx`` inside ``marianaxxx`` would
    leave ``mari<XXX>`` behind).

    Args:
        line: Text that may contain tokens such as ``juanxxx``.

    Returns:
        ``line`` with every ``\\w+xxx`` match replaced by ``<XXX>``. A bare
        ``xxx`` with no preceding word character is left untouched.
    """
    return re.sub(r'\w+xxx', '<XXX>', line)

In [16]:
def normalize_sentence(line, errors, acts, noise_dict):
    """Normalize the utterance found between ``<s> `` and `` </s>`` in ``line``.

    Args:
        line: One line of the transcription file.
        errors: Set updated in place (through ``find_errors``) with the words
            the spell checker flags in the normalized utterance.
        acts: Set updated in place with every replacement value of
            ``noise_dict`` and every tag produced by ``replace_acts``.
        noise_dict: Mapping from noise marker (matched after lowercasing) to
            the tag that replaces it.

    Returns:
        ``line`` with only the utterance replaced by its normalized form;
        the text before and after the utterance is kept verbatim. A line
        without the ``<s> ... </s>`` markers is returned unchanged.
    """
    match = re.search('<s> (.*) </s>', line)
    if match is None:
        # Nothing to normalize (e.g. a blank line): pass it through instead
        # of failing on ``None.group``.
        return line
    sentence_original = match.group(1)

    # Remove the speaker label, e.g. "*FIN: "
    sentence = re.sub(r'\*\w{3}\:\s+', '', sentence_original)

    # Lowercase (done before inserting the uppercase tags below)
    sentence = sentence.lower()

    # Replace anonymized names
    sentence = replace_names(sentence)

    # Replace noise markers by their tags; every tag is registered in `acts`
    # whether or not it occurs in this sentence
    for abb, full in noise_dict.items():
        sentence = sentence.replace(abb, full)
        acts.add(full)

    # Replace {%act: ...} annotations by tags
    sentence = replace_acts(sentence, acts)

    # Remove whatever is left between curly braces (and any stray "}")
    sentence = re.sub(r'(\{[^\}]*)?\}', '', sentence)

    # Remove information between square brackets (and any stray "]")
    sentence = re.sub(r'(\[[^\]]*)?\]', '', sentence)

    # Remove the "@g" marker
    sentence = re.sub(r'@g', '', sentence)

    # Remove symbols, keeping word characters, whitespace and the tag
    # delimiters "<" and ">"
    sentence = re.sub(r'[^\w\s\<\>]', '', sentence)

    # Collapse runs of whitespace into a single space
    sentence = re.sub(r'[\s]+', ' ', sentence)

    # Collect the words the spell checker does not recognize
    find_errors(sentence, errors)

    # Splice by position: a plain ``line.replace(sentence_original, ...)``
    # would also rewrite any other occurrence of the utterance text in the
    # line (for a short utterance, even inside the "<s>" markers).
    return line[:match.start(1)] + sentence + line[match.end(1):]

In [17]:
# Noise markers and the tag that replaces each of them in normalize_sentence().
# Keys are lowercase because the sentence is lowercased before the plain
# substring replacement is applied.
noise_dict = {
    "&eh": "<EH>",
    "hhh": "<HHH>",
    "&mm": "<MM>",
}

In [18]:
# Normalize the raw transcription line by line into the output file, while
# collecting the flagged words (`errors`) and the tags in use (`acts`).
errors, acts = set(), set()
count = 0
with open(fname, 'r') as src, \
        open("/Users/javirando/Desktop/Universidad/3º/3 Trimestre/Speech/Final Project/models/es-es/corele.transcription", 'w') as dst:
    for line in src:
        normalized = normalize_sentence(line, errors, acts, noise_dict)
        dst.write(normalized)
        count += 1
print("Total number of lines is:", count)

Total number of lines is: 913


In [61]:
# Display the set of words flagged by the spell checker during normalization.
errors

{'ASSENT',
 'BLOW',
 'BLOWING',
 'BLOWS',
 'BREATHING',
 'CLICK',
 'DOUBT',
 'DOUBTING',
 'DOUBTS',
 'EXCLAMATION',
 'HHH',
 'INHALATION',
 'LAUGH',
 'NEGATION',
 'ONOMATOPOEIA',
 'QUESTION',
 'SIGH',
 'SIGHS',
 'SNORT',
 'THINKING',
 'WHISTLE',
 'XXX',
 'abordagem',
 'abur',
 'ac',
 'acordo',
 'advisa',
 'aficción',
 'agora',
 'aho',
 'aiora',
 'aj',
 'alemania',
 'alg',
 'alguns',
 'alimen',
 'almería',
 'almoço',
 'american',
 'américa',
 'andalucía',
 'anglófonis',
 'asegurancia',
 'assim',
 'asturias',
 'atración',
 'aune',
 'ba',
 'barcelona',
 'because',
 'beijin',
 'bemhumorada',
 'ber',
 'bergle',
 'berlín',
 'besitos',
 'bi',
 'bie',
 'bieno',
 'bilbao',
 'bilete',
 'bizcocio',
 'bocodi',
 'bok',
 'bom',
 'brasil',
 'bru',
 'brá',
 'brás',
 'bu',
 'building',
 'bur',
 'burger',
 'burgers',
 'burgue',
 'burguers',
 'burgues',
 'burguesería',
 'buru',
 'bélgica',
 'cabarero',
 'cabelo',
 'cafecito',
 'camar',
 'camare',
 'camarer',
 'camerero',
 'canadá',
 'cantabria',
 'canter

In [48]:
# Display the collected tags (noise tags and act tags) as a list.
list(acts)

['<LAUGH>',
 '<QUESTION>',
 '<BLOW>',
 '<BLOWS>',
 '<SIGHS>',
 '<EH>',
 '<SNORT>',
 '<XXX>',
 '<DOUBTS>',
 '<SIGH>',
 '<ASSENT>',
 '<NEGATION>',
 '<EXCLAMATION>',
 '<DOUBTING>',
 '<THINKING>',
 '<BREATHING>',
 '<WHISTLE>',
 '<HHH>',
 '<MM>',
 '<CLICK>',
 '<INHALATION>',
 '<ONOMATOPOEIA>',
 '<DOUBT>',
 '<BLOWING>']