In [1]:
from collections import Counter
import numpy as np
import pandas as pd
from typing import List, Tuple, Dict, Union
from functools import reduce

Warmup statistics for most common words

In [2]:
def read_file(filepath: str) -> Tuple[List[str], List[str], List[List[str]]]:
    words = []
    sentences = []
    with open(filepath, "r") as f:
        lines = f.readlines()
        lines = [l.strip() for l in lines]  # strip whitespace and newline character

        for line in lines:
            remove = [".", ",", "!", "?", ":", ";"]
            new_words = list(filter(lambda x: x not in remove, line.split(" ")))

            # extend to one dimensional list to anaylze overall frequency of words
            words.extend(new_words)

            # append as separate sentence to keep sentence structure of words
            sentences.append(new_words)

        # filter unique words
        unique = list(set(words))
        unique.append("NULL")
    return words, unique, sentences


In [3]:
# german text
data_de_path = "./data/europarl-v7.de-en.lc.de"
de_words, de_unique, de_sentences = read_file(data_de_path)

# english text
data_en_path = "./data/europarl-v7.de-en.lc.en"
en_words, en_unique, en_sentences = read_file(data_en_path)

In [4]:
de_cnt = Counter(de_words)
en_cnt = Counter(en_words)

n = 10
de_most_cmn = de_cnt.most_common(n)
en_most_cmn = en_cnt.most_common(n)

print(f"Most common {n} words in german text:")
for word, cnt in de_most_cmn:
    print(f"Word: '{word}'; Count: {cnt}")

print(f"Most common {n} words in english text:")
for word, cnt in en_most_cmn:
    print(f"Word: '{word}'; Count: {cnt}")


Most common 10 words in german text:
Word: 'die'; Count: 10521
Word: 'der'; Count: 9374
Word: 'und'; Count: 7028
Word: 'in'; Count: 4175
Word: 'zu'; Count: 3168
Word: 'den'; Count: 2976
Word: 'wir'; Count: 2863
Word: 'daß'; Count: 2738
Word: 'ich'; Count: 2670
Word: 'das'; Count: 2669
Most common 10 words in english text:
Word: 'the'; Count: 19847
Word: 'of'; Count: 9597
Word: 'to'; Count: 9059
Word: 'and'; Count: 7303
Word: 'in'; Count: 6237
Word: 'is'; Count: 4478
Word: 'that'; Count: 4441
Word: 'a'; Count: 4435
Word: 'we'; Count: 3372
Word: 'this'; Count: 3362


In [5]:
p_zebra = en_cnt["zebra"] / len(en_words)
print(f"Probability for 'zebra': {p_zebra}")

p_speaker = en_cnt["speaker"] / len(en_words)
print(f"Probability for 'speaker': {p_speaker}")


Probability for 'zebra': 0.0
Probability for 'speaker': 4.150567495773968e-05


Bigram Language modeling

In [6]:
def get_pair_cnt(words: List[str], pair: Tuple[str, str]) -> int:
    cnt = 0
    # stop at second last word
    for i in range(len(words) - 1):
        w1, w2 = words[i], words[i + 1]
        if w1 == pair[0] and w2 == pair[1]:
            cnt += 1
    return cnt


def calc_prob_word(w_cur: str, w_prev: str, words_cnt: Counter) -> float:
    pair_count = get_pair_cnt(words_cnt, (w_prev, w_cur))
    solo_count = en_cnt[w_prev]

    # is a word does not occur at all, retrurn 0 probability
    if solo_count == 0:
        return 0

    # maximum likelihood estimation
    return pair_count / solo_count


In [16]:
def calc_phrase_probs(sentence: List[str], words_cnt: Counter) -> float:
    probs = []
    for i in range(len(sentence)):
        if i == 0:
            probs.append(words_cnt[sentence[i]])
        else:
            w1, w2 = sentence[i], sentence[i - 1]
            probs.append(calc_prob_word(w1, w2, words_cnt))

    return reduce(lambda a, b: a * b, probs)


Translation modeling

In [8]:
def reestimate_t(t, c):
    combo_keys = list(filter(lambda x: isinstance(x, tuple), c.keys()))
    for key_pair in combo_keys:
        orig_key, trans_key = key_pair[0], key_pair[1]
        t.loc[orig_key][trans_key] = c[key_pair] / c[orig_key]
    return t


def em_iterations(
    orig_uniques: List[str],
    trans_uniques: List[str],
    orig_corpus: List[List[str]],
    trans_corpus: List[List[str]],
    iterations: int = 1,
    t_cache=None,
) -> pd.DataFrame:
    # direct compare with None not possible due to pandas
    if not type(t_cache) == type(None):
        # random init T with dimension of unique words in training corpus
        t_data = np.random.rand(len(orig_uniques), len(trans_uniques))
        t = pd.DataFrame(t_data, columns=trans_uniques, index=orig_uniques)

    # for each EM iteration (each unique word in a corpus)
    for _ in range(iterations):
        # init pseudocounts
        c = {}

        # for each sentence
        for k in range(len(orig_corpus)):
            print(f"{k+1} / {len(orig_corpus)}", flush=True, end="\r")
            orig_sen = orig_corpus[k]
            trans_sen = trans_corpus[k]

            # for each orig word in sentence
            for i in range(len(orig_sen)):
                # for each trans word in sentence
                for j in range(len(trans_sen)):
                    # calc alignment prob and update pseudocount
                    orig_word = orig_sen[i]
                    trans_word = trans_sen[j]
                    delta = t.loc[orig_word][trans_word] / t.loc[orig_word].sum()

                    orig_word = orig_corpus[k][i]
                    trans_word = trans_corpus[k][j]

                    # update softcounts
                    if (orig_word, trans_word) not in c.keys():
                        c[(orig_word, trans_word)] = 0
                    if orig_word not in c.keys():
                        c[orig_word] = 0
                    c[(orig_word, trans_word)] += delta
                    c[orig_word] += delta

        # reestimate t probs
        t = reestimate_t(t, c)

    return t


In [23]:
t_data = np.full((len(de_unique), len(en_unique)), 0.5)
t_cache = pd.DataFrame(t_data, columns=en_unique, index=de_unique)


In [24]:
t = em_iterations(de_unique, en_unique, de_sentences, en_sentences, t_cache=t_cache)


9999 / 10000

In [25]:
# save t
t.to_csv("t.csv")


In [26]:
test_word = "european"
translations = t[test_word].nlargest(10)
translations

anzurichten                    0.999931
möglicher                      0.999837
meinungsverschiedenheit        0.999636
personalsystem                 0.999572
gerutscht                      0.999309
militärs                       0.999271
bot                            0.999259
sinken                         0.999124
beschäftigungsmöglichkeiten    0.999070
elite                          0.998867
Name: european, dtype: float64

In [27]:
t

Unnamed: 0,lucas,celebrating,helped,contentious,laid,uprooted,distortions,principles,disregard,apology,...,gains,2012,densities,unnoticed,grip,time-related,original,fight,consciences,NULL
vereinfachung,0.883859,0.602670,0.929004,0.753079,0.044508,0.930827,0.431534,0.954493,0.764328,0.350410,...,0.744715,0.522115,0.784169,0.914207,0.353058,0.577345,0.393063,0.571678,0.075509,0.469871
konzipiert,0.799199,0.793710,0.372171,0.664735,0.909308,0.695382,0.924483,0.940851,0.993392,0.161267,...,0.055470,0.748123,0.337444,0.017594,0.096244,0.641658,0.322365,0.004489,0.343519,0.978729
sharm-al-sheikh,0.219580,0.796246,0.314933,0.491926,0.039228,0.619836,0.861082,0.722475,0.438984,0.792226,...,0.282281,0.220763,0.441879,0.273690,0.316245,0.083012,0.170565,0.117326,0.320922,0.539911
erstmals,0.382469,0.721860,0.931573,0.473319,0.725038,0.319490,0.735937,0.109251,0.368670,0.729800,...,0.286833,0.924002,0.769104,0.357487,0.372872,0.786945,0.298774,0.968581,0.295490,0.626809
mannesmann,0.299785,0.144564,0.789227,0.986529,0.250881,0.375542,0.969084,0.619666,0.039272,0.535423,...,0.501253,0.091555,0.695517,0.467964,0.263146,0.184874,0.742344,0.517080,0.492388,0.300591
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2012,0.256274,0.760120,0.587340,0.593305,0.459136,0.387961,0.305173,0.880537,0.382769,0.844393,...,0.694836,0.031801,0.734585,0.303454,0.259256,0.348572,0.762261,0.329843,0.586084,0.382216
sehe,0.945919,0.511979,0.884957,0.648082,0.846897,0.266635,0.860160,0.112157,0.999301,0.798087,...,0.535491,0.158890,0.410145,0.598682,0.544511,0.400300,0.642439,0.548028,0.810526,0.210995
zerstreuen,0.835962,0.040454,0.792540,0.522234,0.701142,0.429292,0.610958,0.976210,0.018812,0.966344,...,0.118466,0.408310,0.488639,0.068017,0.506826,0.523780,0.589586,0.271672,0.566979,0.588105
betreiben,0.036693,0.584328,0.153738,0.687203,0.323485,0.046888,0.530371,0.790028,0.004499,0.873841,...,0.907910,0.286495,0.112809,0.076268,0.264941,0.149551,0.302055,0.024911,0.794571,0.347945


Decoding

Find $E^* = argmax_E P(E|F) = argmax_E P(E)P(F|E)$

In [21]:
de_test = "die schwarze katze".split(" ") # black cat in german
en_test = "the black cat".split(" ")

for w in de_test:
    idx_translation = t.columns[t.loc[w].argmax()]
    P_d_e = t.loc[w][idx_translation]
    print(P_d_e)

    p_trans = calc_phrase_probs(idx_translation.split(" "), en_cnt)
    print(p_trans)

    p_orig_trans = t


0.9997510696950254
1
0.9998628383185835
50
0.9999897723583657
5
