In [3]:
from xpinyin import Pinyin
import eng_to_ipa
import Levenshtein
import numpy as np
import epitran

In [78]:
class MemeTextGenerator:
    """Replace two adjacent characters of a Chinese string with a similar-sounding English word.

    Each English vocabulary word is transliterated to IPA with epitran and then
    rewritten into one or more pinyin-like spellings.  ``generate`` converts
    every two-character window of its input to pinyin and looks for the
    vocabulary words whose spelling is closest (weighted Levenshtein distance).

    Attributes:
        words: numpy array of the non-blank vocabulary words read from the file.
        candidates: list of ``(word, spellings)`` tuples, aligned index-for-index
            with ``words``; ``spellings`` is the list of pinyin-like strings
            produced for that word.
    """

    # IPA vowel symbols. An 'l' directly followed by one of these is kept as
    # 'l' instead of being rewritten through the table below.
    _VOWELS = frozenset(['ə', 'æ', 'ɪ', 'i', 'e', 'ɔ', 'a', 'ʌ', 'u', 'ɛ', 'ɑ', 'o', 'ʊ'])

    # IPA sequence (1 to 4 code points) -> alternative pinyin-like spellings.
    # Symbols that are not listed are copied through unchanged.
    _IPA_TO_PINYIN = {
        '-': [''],
        'ə': ['e', 'a'], 'æ': ['e'], 'ɪ': ['i'], 'i': ['i'], 'e': ['e'], 'ɔ': ['o'], 'a': ['a'],
        'ʌ': ['a'], 'u': ['u'], 'ɛ': ['e'], 'ɑ': ['a'], 'o': ['o'], 'ʊ': ['u'],
        'ow': ['ou'], 'aw': ['ao'], 'iə': ['ia'], 'əwə': ['uo', 'wo'], 'əwəl': ['uo', 'wo'], 'əl': ['ou'],
        'j': ['i'], 'ɹ': ['r'], 'ʃ': ['xi', 'sh'], 'v': ['f'], 's': ['s', 'sh'], 'z': ['z'], 'ŋ': ['ng'],
        'ɹ̩': ['er'], 'ʤ': ['ju', 'zhe'], 'ʧ': ['qi', 'qu'], 'ʒ': ['ju', 'zhe'], 'ð': ['l'],
        'θ': ['s', 'sh'], 'l': ['o'],
        'tɹ': ['ch'], 'tɹ̩': ['ter'], 'dɹ': ['zh'], 'dɹ̩': ['der'], 'lɹ̩': ['ler'], 'kw': ['ku'],
        'dz': ['zi'], 'ts': ['ci'],
    }

    # Distance assigned to a spelling whose first letter differs from the target's.
    _MISMATCH_PENALTY = 10

    def __init__(self, word_file, delimiter='\n'):
        """Load the vocabulary and precompute pinyin-like spellings for every word.

        Args:
            word_file: path of a text file containing the English vocabulary.
            delimiter: separator between words in the file (default: newline).
        """
        with open(word_file, 'r', encoding='utf-8') as f:
            raw_tokens = f.read().split(delimiter)
        # Strip each token (handles '\r\n' files and padded delimiters) and drop
        # blanks, which would otherwise become candidates with an empty spelling.
        stripped = (token.strip() for token in raw_tokens)
        self.words = np.array([token for token in stripped if token])

        # Built once here rather than on every generate() call.
        self._pinyin = Pinyin()

        epi = epitran.Epitran('eng-Latn', ligatures=True)
        self.candidates = [
            (word, self._ipa_to_pinyin(epi.transliterate(word)))
            for word in self.words
        ]

    @classmethod
    def _ipa_to_pinyin(cls, ipa):
        """Return every pinyin-like spelling of an IPA string.

        The string is scanned left to right.  At each position the longest
        table key (4, then 3, then 2 code points) is tried first, then the
        "'l' before a vowel stays 'l'" rule, then the single-symbol table entry;
        any other symbol is copied unchanged.  A key with several alternatives
        multiplies the number of spellings.

        Args:
            ipa: IPA transcription of one word.

        Returns:
            A non-empty list of strings (``['']`` for empty input).
        """
        table = cls._IPA_TO_PINYIN
        spellings = ['']
        pos, length = 0, len(ipa)
        while pos < length:
            for size in (4, 3, 2):
                chunk = ipa[pos:pos + size]
                # The length check rejects truncated slices at the end of the string.
                if len(chunk) == size and chunk in table:
                    alternatives = table[chunk]
                    break
            else:
                size = 1
                symbol = ipa[pos]
                if symbol == 'l' and pos + 1 < length and ipa[pos + 1] in cls._VOWELS:
                    alternatives = [symbol]
                else:
                    alternatives = table.get(symbol, [symbol])
            # Alternative-major order: all prefixes with the first alternative,
            # then all prefixes with the second, and so on.
            spellings = [prefix + alt for alt in alternatives for prefix in spellings]
            pos += size
        return spellings

    @classmethod
    def _spelling_distance(cls, spellings, target):
        """Return the smallest distance between ``target`` and any of ``spellings``.

        A spelling whose first letter differs from the target's (including an
        empty spelling) scores ``_MISMATCH_PENALTY``; otherwise the score is the
        Levenshtein distance with a substitution cost of 2.
        """
        return min(
            cls._MISMATCH_PENALTY
            if spelling[:1] != target[:1]  # slices stay safe on empty strings
            else Levenshtein.distance(spelling, target, weights=(1, 1, 2))
            for spelling in spellings
        )

    def generate(self, s, max_distance=2):
        """Produce variants of ``s`` with one two-character window replaced by an English word.

        Args:
            s: input string; every pair of adjacent characters is converted to
                pinyin and compared against the vocabulary.
            max_distance: largest best-match distance that is still accepted.
                Values of ``_MISMATCH_PENALTY`` or more would also admit words
                whose first letter does not match.

        Returns:
            A list with one string per best-matching (window, word) pair, or
            ``None`` when ``s`` has fewer than two characters, the vocabulary is
            empty, or the best distance exceeds ``max_distance``.
        """
        # Without a window or a candidate there is nothing to compare; the
        # min() calls below would raise on empty sequences.
        if len(s) < 2 or not self.candidates:
            return None

        distances = []
        cand_words = []
        for i in range(len(s) - 1):
            syllables = self._pinyin.get_pinyin(s[i:i + 2]).split('-')
            # 'y' is rewritten to 'i' to line up with the table's 'j' -> 'i' mapping.
            target = ''.join(syllables).replace('y', 'i')
            ds = np.array([
                self._spelling_distance(spellings, target)
                for _, spellings in self.candidates
            ])
            best = ds.min()
            # self.words and self.candidates share indices, so the mask selects
            # exactly the words that achieve the best distance for this window.
            cand_words.append(self.words[ds == best])
            distances.append(best)

        min_distance = min(distances)
        if min_distance > max_distance:
            return None

        results = []
        for i, best in enumerate(distances):
            if best == min_distance:
                for word in cand_words[i]:
                    results.append(s[:i] + word + s[i + 2:])
        return results

In [79]:
# Build the generator once; the constructor reads the vocabulary file and
# precomputes the spellings of every word.
generator = MemeTextGenerator(word_file='vocabulary3000.txt')

In [81]:
# Demo input. The cell previously held an empty string, which has no
# two-character window to match and cannot produce the recorded output.
# NOTE(review): reconstructed from that output ('有barrier來', '有備airline', ...),
# which fixes characters 1, 2 and 4; the third character (而) is inferred — confirm.
s = '有備而來'
result = generator.generate(s)
print(result)

['有barrier來', '有備airline', '有備ally', '有備early']
