In [11]:
import os

In [22]:
import math

In [8]:
from collections import defaultdict

In [2]:
# -*- coding: utf-8 -*-

import codecs

def get_basic_phoneme(phoneme):
	"""Return `phoneme` with a single trailing digit removed, if present.

	Only the last character is inspected, so at most one digit is stripped
	(e.g. 'AH0' -> 'AH'); presumably the digit is a stress marker -- TODO
	confirm against the lexicon format.

	An empty string is returned unchanged rather than raising IndexError,
	which can happen when a phoneme code contains an empty field such as a
	trailing ';'.
	"""
	if phoneme and phoneme[-1].isdigit():
		return phoneme[:-1]
	return phoneme

def get_phonemes(phonemes_code):
	"""Split a ';'-separated phoneme code into a tuple of basic phonemes.

	Each field is passed through get_basic_phoneme before being collected.
	"""
	return tuple(map(get_basic_phoneme, phonemes_code.split(';')))

def get_pg_pair(pg_pair_code):
	"""Parse a 'phonemes>grapheme' code into a (phoneme tuple, grapheme) pair.

	Raises ValueError (from the unpacking) unless the code contains exactly
	one '>' separator.
	"""
	pieces = pg_pair_code.split('>')
	phonemes_code, grapheme = pieces
	phonemes = get_phonemes(phonemes_code)
	return (phonemes, grapheme)

def get_mapping(mapping_code):
	"""Parse a '|'-separated mapping code into a tuple of phoneme/grapheme pairs.

	Every '|'-delimited field is decoded with get_pg_pair, preserving order.
	"""
	pairs = []
	for pair_code in mapping_code.split('|'):
		pairs.append(get_pg_pair(pair_code))
	return tuple(pairs)

def read_phonix(input_file_name):
	"""Read a phonix lexicon file into a list of (word, mapping) tuples.

	The file is decoded as UTF-8. Each non-blank line must consist of exactly
	two whitespace-separated fields: the word and its mapping code, which is
	decoded with get_mapping. Blank lines are skipped.

	Args:
		input_file_name: path of the lexicon file.

	Returns:
		A list of (word, mapping) tuples in file order.

	Raises:
		ValueError: if a non-blank line does not have exactly two fields
			(the message names the file and line number).
	"""
	phonix = []
	# The built-in open() replaces the legacy codecs.open(); the decoded text
	# is the same for ordinary newline-terminated files.
	with open(input_file_name, encoding='utf-8') as input_file:
		for line_number, line in enumerate(input_file, 1):
			# split() with no argument also tolerates tabs and repeated spaces.
			fields = line.split()
			if not fields:
				continue
			if len(fields) != 2:
				raise ValueError(
					"%s:%d: expected '<word> <mapping>', got %r"
					% (input_file_name, line_number, line.strip()))
			word, mapping_code = fields
			phonix.append((word, get_mapping(mapping_code)))
	return phonix

def pg_pair_to_str(pg_pair):
	"""Render a (phonemes, grapheme) pair as 'p1;p2>grapheme'.

	The phonemes are joined with ';' and separated from the grapheme by '>'.
	"""
	phonemes, grapheme = pg_pair
	phoneme_code = ';'.join(phonemes)
	return '%s>%s' % (phoneme_code, grapheme)

def mapping_to_str(mapping):
	"""Render a sequence of phoneme/grapheme pairs as a '|'-separated code.

	Each pair is formatted with pg_pair_to_str; an empty mapping yields ''.
	"""
	rendered_pairs = [pg_pair_to_str(pair) for pair in mapping]
	return '|'.join(rendered_pairs)

In [3]:
def read_freq_list(freq_file_name):
    """Read a word-frequency list into a dict mapping word -> float frequency.

    The file is decoded as UTF-8 (independent of the platform default) and
    each non-blank line must be '<word> <frequency>' separated by a single
    space. Blank lines are skipped, so a trailing empty line no longer
    crashes the tuple unpacking. If a word occurs more than once, the last
    occurrence wins.

    Args:
        freq_file_name: path of the frequency file.

    Returns:
        dict of {word: frequency as float}.

    Raises:
        ValueError: if a non-blank line does not have exactly two fields or
            the frequency is not a valid float.
    """
    wordfreqs = {}
    with open(freq_file_name, encoding='utf-8') as input_file:
        for line in input_file:
            line = line.strip()
            if not line:
                continue
            word, freq = line.split(' ')
            wordfreqs[word] = float(freq)
    return wordfreqs

In [4]:
# Load the aligned lexicon as a list of (word, mapping) tuples.
phonix = read_phonix('../data/phonix.txt')
# Word -> mapping lookup; if a word occurs more than once, the last entry wins.
phonix_dict = dict(phonix)
# Word -> frequency (float) table.
wordfreqs = read_freq_list('../data/word-freqs.txt')

In [5]:
def normalize(distr):
    """Scale the values of `distr` so that they sum to 1.

    Returns a new plain dict with the same keys; the input is not modified.
    Raises ZeroDivisionError when the values sum to zero and the mapping is
    non-empty.
    """
    total = sum(distr.values())
    scaled = {}
    for key, value in distr.items():
        scaled[key] = float(value) / total
    return scaled

In [6]:
def get_pg_freqs(wordfreqs, phonix):
    """Compute the relative frequency of every phoneme>grapheme pair.

    For each (word, mapping) entry whose word has a known frequency, that
    frequency is added once per pair occurrence in the mapping, keyed by the
    pair's string form. Words missing from `wordfreqs` are ignored. The
    totals are normalized to sum to 1 before being returned.
    """
    totals = {}
    for word, mapping in phonix:
        if word in wordfreqs:
            weight = wordfreqs[word]
            for pair in mapping:
                pair_key = pg_pair_to_str(pair)
                totals[pair_key] = totals.get(pair_key, 0.0) + weight
    return normalize(totals)

In [9]:
# Relative frequency of each 'phonemes>grapheme' pair, weighted by word frequency.
pg_freqs = get_pg_freqs(wordfreqs, phonix)

In [23]:
# Natural-log frequencies, so that per-pair scores can be summed along an alignment.
# Note: math.log raises ValueError if any frequency is exactly 0.
pg_logfreqs = {pg: math.log(freq) for pg, freq in pg_freqs.items()}

In [26]:
class LList:
    """Immutable singly linked list that grows at the end.

    Each node stores its last element (the "tip") and a reference to the list
    of everything before it (the "prefix"), so appending builds a new node
    that shares the existing prefix instead of copying it.

    Construction:
        LList()             -> the empty list
        LList(tip)          -> a one-element list
        LList(prefix, tip)  -> `prefix` (an LList) extended with `tip`
    Positional arguments beyond the second are ignored.
    """

    def __init__(self, *args):
        if 0 == len(args):
            # Empty list: no tip, no prefix.
            self._tip = None
            self._prefix = None
            self._len = 0
        elif 1 == len(args):
            self._tip = args[0]
            self._prefix = LList()
            self._len = 1
        else:
            self._tip = args[1]
            self._prefix = args[0]
            # Length is cached at construction time.
            self._len = 1 + len(self._prefix)

    def tip(self):
        """Return the last element (None for the empty list)."""
        return self._tip

    def prefix(self):
        """Return the list without its last element (None for the empty list)."""
        return self._prefix

    def nil(self):
        """Return True when the list is empty."""
        return self._len == 0

    def __len__(self):
        return self._len

    def toPy(self):
        """Return the elements as a new Python list, first element first.

        Walks the prefix chain iteratively so that long lists do not exhaust
        the interpreter's recursion limit.
        """
        pylist = []
        node = self
        while not node.nil():
            pylist.append(node._tip)
            node = node._prefix
        # Elements were collected tip-first; restore original order.
        pylist.reverse()
        return pylist


In [29]:
def possibleSegmentations(sequence, cutoff_len):
    """Enumerate every way to cut `sequence` into contiguous groups.

    Segmentations are built left to right: each new symbol either starts a
    new group or, while the current last group holds fewer than `cutoff_len`
    symbols, is appended to that group.

    Returns a list of segmentations, each a list of groups, each group a
    list of symbols. An empty sequence yields an empty list.
    """
    total = len(sequence)
    if total == 0:
        return []
    # Start with the single segmentation holding the first symbol alone.
    frontier = [LList(LList(sequence[0]))]
    position = 1
    while position < total:
        symbol = sequence[position]
        grown = []
        for partial in frontier:
            # Option 1: the symbol opens a new group.
            grown.append(LList(partial, LList(symbol)))
            # Option 2: the symbol extends the last group, if still short enough.
            last_group = partial.tip()
            if len(last_group) < cutoff_len:
                grown.append(LList(partial.prefix(), LList(last_group, symbol)))
        frontier = grown
        position += 1
    result = []
    for partial in frontier:
        result.append([group.toPy() for group in partial.toPy()])
    return result

def assortSegmentations(segmentations):
    """Group segmentations by their number of groups.

    Returns a defaultdict(list) mapping group count -> list of the
    segmentations with that many groups, in their original order.
    """
    by_group_count = defaultdict(list)
    for candidate in segmentations:
        bucket = by_group_count[len(candidate)]
        bucket.append(candidate)
    return by_group_count

def possibleAlignments(word, phonemes, w_cutoff = 4, p_cutoff = 3):
    """List every candidate alignment between a word's letters and its phonemes.

    The word is segmented into letter groups of at most `w_cutoff` letters
    and the pronunciation into phoneme groups of at most `p_cutoff` phonemes;
    segmentations with equal group counts are then paired up.
    """
    letter_segmentations = possibleSegmentations(word, w_cutoff)
    phoneme_segmentations = possibleSegmentations(phonemes, p_cutoff)
    return _possibleAlignments(letter_segmentations, phoneme_segmentations,
                               word, phonemes)

def _possibleAlignments(letrSegs, phSegs, word, phonemes):
    """Pair letter and phoneme segmentations that have the same group count.

    For every group count from 1 up to the shorter of `word` and `phonemes`,
    each letter segmentation is combined with each phoneme segmentation of
    that size. An alignment is a list of (phoneme tuple, grapheme string)
    pairs.
    """
    letters_by_count = assortSegmentations(letrSegs)
    phonemes_by_count = assortSegmentations(phSegs)
    max_groups = min(len(word), len(phonemes))
    collected = []
    for group_count in range(1, max_groups + 1):
        grapheme_options = [["".join(group) for group in seg]
                            for seg in letters_by_count.get(group_count, [])]
        phoneme_options = [[tuple(group) for group in seg]
                           for seg in phonemes_by_count.get(group_count, [])]
        # Letter segmentations vary slowest, phoneme segmentations fastest.
        collected.extend(list(zip(phoneme_groups, graphemes))
                         for graphemes in grapheme_options
                         for phoneme_groups in phoneme_options)
    return collected

In [17]:
TOWRE_FOLDER = '../data/TOWRE'
def readTowre(folder=None):
    """Read every '.csv' file in a folder into a word -> pronunciations dict.

    The first line of each file is skipped (presumably a header row -- TODO
    confirm). On every other line the first comma-separated cell is the word
    and each remaining non-empty cell is one pronunciation, written as
    '/'-separated phonemes.

    Files are decoded as UTF-8 because the pronunciations contain IPA
    symbols, and are processed in sorted name order so that, when a word
    appears in several files, the entry that wins is deterministic (the last
    one in that order). Rows with an empty word cell, including blank lines,
    are ignored.

    Args:
        folder: directory to scan; defaults to TOWRE_FOLDER when None.

    Returns:
        dict of {word: [pronunciation, ...]}, each pronunciation a list of
        phoneme strings.
    """
    if folder is None:
        folder = TOWRE_FOLDER
    word_pronunciations = {}
    for towre_file in sorted(os.listdir(folder)):
        if not towre_file.endswith('.csv'): continue
        with open(os.path.join(folder, towre_file), encoding='utf-8') as infile:
            for i, line in enumerate(infile):
                if 0 == i: continue
                cells = line.strip().split(',')
                word = cells[0].strip()
                if not word: continue
                pronunciations = []
                for pronunciation_record in cells[1:]:
                    pronunciation_record = pronunciation_record.strip()
                    if not pronunciation_record: continue
                    pronunciations.append(pronunciation_record.split('/'))
                word_pronunciations[word] = pronunciations
    return word_pronunciations

In [81]:
# Parse the TOWRE CSV files into a word -> list-of-pronunciations dict.
word_pronunciations = readTowre()

In [20]:
# Spot-check one item: each pronunciation is a list of phoneme strings.
word_pronunciations['trisk']

[['t', 'r', 'ɪ', 's', 'k']]

In [68]:
# Enumerate all candidate alignments for the first recorded pronunciation of one item.
possibleAlignments('jeltlic', word_pronunciations['jeltlic'][0])

[[(('dʒ',), 'j'), (('ɛ', 'l', 't'), 'el'), (('l', 'ɪ', 'c'), 'tlic')],
 [(('dʒ', 'ɛ'), 'j'), (('l', 't'), 'el'), (('l', 'ɪ', 'c'), 'tlic')],
 [(('dʒ', 'ɛ'), 'j'), (('l', 't', 'l'), 'el'), (('ɪ', 'c'), 'tlic')],
 [(('dʒ', 'ɛ', 'l'), 'j'), (('t',), 'el'), (('l', 'ɪ', 'c'), 'tlic')],
 [(('dʒ', 'ɛ', 'l'), 'j'), (('t', 'l'), 'el'), (('ɪ', 'c'), 'tlic')],
 [(('dʒ', 'ɛ', 'l'), 'j'), (('t', 'l', 'ɪ'), 'el'), (('c',), 'tlic')],
 [(('dʒ',), 'j'), (('ɛ', 'l', 't'), 'elt'), (('l', 'ɪ', 'c'), 'lic')],
 [(('dʒ', 'ɛ'), 'j'), (('l', 't'), 'elt'), (('l', 'ɪ', 'c'), 'lic')],
 [(('dʒ', 'ɛ'), 'j'), (('l', 't', 'l'), 'elt'), (('ɪ', 'c'), 'lic')],
 [(('dʒ', 'ɛ', 'l'), 'j'), (('t',), 'elt'), (('l', 'ɪ', 'c'), 'lic')],
 [(('dʒ', 'ɛ', 'l'), 'j'), (('t', 'l'), 'elt'), (('ɪ', 'c'), 'lic')],
 [(('dʒ', 'ɛ', 'l'), 'j'), (('t', 'l', 'ɪ'), 'elt'), (('c',), 'lic')],
 [(('dʒ',), 'j'), (('ɛ', 'l', 't'), 'eltl'), (('l', 'ɪ', 'c'), 'ic')],
 [(('dʒ', 'ɛ'), 'j'), (('l', 't'), 'eltl'), (('l', 'ɪ', 'c'), 'ic')],
 [(('dʒ', 'ɛ'

In [34]:
def alignmentScore(alignment):
    """Sum the log frequencies of every phoneme/grapheme pair in an alignment.

    Returns None as soon as a pair has no entry in `pg_logfreqs`; otherwise
    returns the accumulated score (0 for an empty alignment).
    """
    total = 0
    for pair in alignment:
        log_freq = pg_logfreqs.get(pg_pair_to_str(pair))
        if log_freq is None:
            return None
        total = total + log_freq
    return total

In [37]:
def bestAlignment(word, pronunciation):
    """Return the highest-scoring alignment between a word and a pronunciation.

    Candidates for which alignmentScore returns None are discarded. When
    none remain, a message is printed and None is returned. Ties are
    resolved in favour of the earliest candidate.
    """
    scored = []
    for candidate in possibleAlignments(word, pronunciation):
        candidate_score = alignmentScore(candidate)
        if candidate_score is None:
            continue
        scored.append((candidate_score, candidate))
    if len(scored) == 0:
        print("No alignment between %s and %s" % (word, ';'.join(pronunciation)))
        return None
    # max() keeps the first of several equally scored candidates.
    _, winner = max(scored, key = lambda item: item[0])
    return winner

In [80]:
# Sanity check: no scorable alignment exists here, so a message is printed and None is returned.
bestAlignment('happ', "h;æ;pp".split(';'))

No alignment between happ and h;æ;pp


In [39]:
def genTOWREphonix(word_pronunciations):
    """Build phonix entries (label, alignment) for every alignable pronunciation.

    A word with a single pronunciation is labelled with the word itself; a
    word with several is labelled 'word(i)', where i is the zero-based index
    of the pronunciation. Pronunciations for which bestAlignment returns
    None are left out.
    """
    entries = []
    for word, pronunciations in word_pronunciations.items():
        has_single_pronunciation = (1 == len(pronunciations))
        for index, pronunciation in enumerate(pronunciations):
            alignment = bestAlignment(word, pronunciation)
            if alignment is None:
                continue
            if has_single_pronunciation:
                label = word
            else:
                label = "%s(%d)" % (word, index)
            entries.append((label, alignment))
    return entries

In [82]:
# Align every TOWRE pronunciation; items without a scorable alignment are reported and skipped.
towre_phonix = genTOWREphonix(word_pronunciations)

No alignment between bevignuf and b;ɪ;v;ʌ;g;n;ʌ;f
No alignment between bevignuf and b;ɛ;v;ʌ;g;n;ʌ;f
No alignment between vuntilorst and v;ʌ;n;t;ʌ;l;ɝ;s;t
No alignment between vuntilorst and v;ʌ;n;t;æ;l;ɝ;s;t
No alignment between crolnadibot and k;r;ɔ;l;n;ɛ;d;æ;b;ɑ;t
No alignment between crolnadibot and k;r;ɔ;l;n;æ;d;ʌ;b;ɑ;t
No alignment between crolnadibot and k;r;ɑ;l;n;æ;d;æ;b;ɑ;t
No alignment between crolnadibot and k;r;ɑ;l;n;ɛ;d;ʌ;b;ɑ;t
No alignment between crolnadibot and k;r;ɔ;l;n;æ;d;æ;b;ɑ;t
No alignment between crolnadibot and k;r;ɔ;l;n;ɛ;d;ʌ;b;ɑ;t
No alignment between crolnadibot and k;r;ɑ;l;n;ɛ;d;æ;b;ɑ;t
No alignment between crolnadibot and k;r;ɑ;l;n;æ;d;ʌ;b;ɑ;t
No alignment between happon and h;æ;p;ɛ;n
No alignment between morlingdon and m;ɝ;l;i;ŋ;d;ɛ;n
No alignment between revignuf and r;i;v;ə;g;n;ʌ;f
No alignment between revignuf and r;ɛ;v;ə;g;n;ʌ;f
No alignment between strilmolifant and s;t;r;ɪ;l;m;oʊ;l;ʌ;f;æ;n;t
No alignment between strilmolifant and s;t;r;ɪ;l;m;ɑ;l;ʌ;f;æ

In [91]:
# Inspect one generated entry; a "(i)" suffix on the label is the index of the pronunciation
# among several recorded for the same word.
towre_phonix[2]

('amalfitut(1)',
 [(('ʌ',), 'a'),
  (('m',), 'm'),
  (('ɑ',), 'a'),
  (('l',), 'l'),
  (('f',), 'f'),
  (('ɪ',), 'i'),
  (('t',), 't'),
  (('ʌ',), 'u'),
  (('t',), 't')])

In [94]:
def output_phonix(phonix, outfname):
    """Write phonix entries to a UTF-8 text file, one '<word> <mapping>' per line.

    Entries are written in ascending order of their word. The ordering is
    done on a sorted copy, so the caller's list is left untouched (any
    iterable of (word, mapping) pairs is accepted).

    Args:
        phonix: iterable of (word, mapping) tuples.
        outfname: path of the file to create or overwrite.
    """
    ordered = sorted(phonix, key = lambda entry: entry[0])
    # newline='\n' keeps the bytes identical on every platform, matching what
    # the previous codecs.open()-based writer produced.
    with open(outfname, 'w', encoding = 'utf-8', newline = '\n') as outfile:
        for word, mapping in ordered:
            print('%s %s' % (word, mapping_to_str(mapping)), file=outfile)

In [95]:
# Write the aligned TOWRE items to disk in phonix text format, ordered by word label.
output_phonix(towre_phonix, '../data/towre_phonix.txt')

In [77]:
# Probe whether this phoneme>grapheme pair has a frequency entry; the lookup raises KeyError when it does not.
pg_freqs['ɛ>o']

KeyError: 'ɛ>o'

In [None]:
# NOTE(review): unexecuted scratch cell holding only a bare IPA symbol; it looks like a leftover
# and would raise NameError if run unless a variable of that name exists -- consider deleting.
tʃ