# Clean Irish text

> "...preserving some punctuation, for silence alignment"

- toc: false
- branch: master
- hidden: true
- categories: [irish, cleaning, alignment]

In [8]:
def _ga_lc_word(text):
    if text[0:1] in "nt" and text[1:2] in "AÁEÉIÍOÓUÚ":
        return text[0:1] + "-" + text[1:].lower()
    else:
        return text.lower()
    
def ga_lower(text):
    """Lowercase Irish text one word at a time.

    The text is split on whitespace, every token is passed through
    `_ga_lc_word`, and the results are joined back with single spaces.
    """
    return " ".join(map(_ga_lc_word, text.split()))

In [9]:
# A prefixed "t" before a capital vowel gains a hyphen when lowercased.
test = "Cuairt an tAthair"
assert "cuairt an t-athair" == ga_lower(test)

In [50]:
import re
def clean_text(text):
    """Clean Irish text, keeping punctuation that may align with silence.

    Steps, in order:
      1. Typographic apostrophes are normalised to "'".
      2. Opening single quotes, double quotes and brackets are removed.
      3. Apostrophes at the start or end of the text, or next to a space,
         are removed, so only word-internal apostrophes survive.
      4. ``, ; . ! ?`` are split off with a preceding space.
      5. A spaced hyphen at the end of the text is dropped; other spaced
         hyphens are left in place.
      6. The result is lowercased with `ga_lower`.

    Args:
        text: the raw text to clean.

    Returns:
        The cleaned, lowercased text as returned by `ga_lower`.
    """
    # Normalise and drop quotes/brackets *before* looking for edge
    # apostrophes; otherwise an apostrophe hidden behind a curly quote or a
    # bracket (or written as a curly apostrophe) is never stripped.
    text = text.replace("’", "'")
    text = re.sub(r'[‘“”"()\[\]{}]', "", text)

    # keep only word-internal apostrophes
    text = re.sub(r"^'+", "", text)
    text = re.sub(r"'+$", "", text)
    text = text.replace("' ", " ").replace(" '", " ")

    # keep punctuation that can correspond to silence
    text = re.sub(r"([,;.!?])", r" \1", text)
    # leave spaced hyphens, which also can be silences, except at EOS;
    # trailing whitespace is tolerated because ga_lower discards it anyway
    text = re.sub(r" -\s*$", "", text)
    return ga_lower(text)

In [33]:
# Edge apostrophes, brackets and curly double quotes are all dropped.
test = "'cuairt (an) “tAthair”''"
assert "cuairt an t-athair" == clean_text(test)

In [48]:
# Kept punctuation is split off into its own space-separated token.
test = "'cuairt, (an) “tAthair”!"
assert "cuairt , an t-athair !" == clean_text(test)

In [51]:
# A spaced hyphen at the very end of the text is removed.
test = "'cuairt, (an) “tAthair”! -"
assert "cuairt , an t-athair !" == clean_text(test)

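Actually using it: clean every `.txt` transcript in `SRC` that has a matching `.wav` in `OUT`, and write the cleaned text to `OUT`.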

In [52]:
from pathlib import Path

In [53]:
# SRC: directory scanned for the raw "*.txt" transcripts.
SRC = Path("<SNIP>")
# OUT: directory checked for "<stem>.wav"; cleaned "<stem>.txt" is written here.
OUT = Path("<SNIP>")

In [58]:
# Clean every transcript in SRC that has a matching recording in OUT and
# write the cleaned text next to that recording.
for filename in SRC.glob("*.txt"):
    base = filename.stem
    wav = OUT / f"{base}.wav"
    # Transcripts without a recording are skipped.
    if not wav.is_file():
        continue
    out = OUT / f"{base}.txt"
    # Read the whole input before touching the output: opening with "w"
    # truncates, which would wipe the transcript if SRC and OUT are the same
    # directory. The encoding is pinned because the text contains accented
    # vowels and curly quotes; assumes the transcripts are UTF-8 -- TODO confirm.
    text = filename.read_text(encoding="utf-8")
    clean = clean_text(text)
    out.write_text(clean, encoding="utf-8")