In [1]:
from collections import Counter
import numpy as np
import pandas as pd
from typing import List, Tuple, Dict, Union
from functools import reduce
from itertools import product

Warm-up: statistics for the most common words

In [2]:
def read_file(filepath: str) -> Tuple[List[str], List[str], List[List[str]]]:
    """Read a space-tokenised text file and return its words, vocabulary and sentences.

    Each line of the file is treated as one sentence. Tokens are separated by a
    single space; the stand-alone punctuation tokens ``. , ! ? : ;`` and empty
    tokens (from blank lines or repeated spaces) are dropped.

    Args:
        filepath: Path of the text file. It is decoded as UTF-8
            (assumption: the corpus is UTF-8 encoded -- confirm for other data).

    Returns:
        A tuple ``(words, unique, sentences)``:
        * ``words``: all kept tokens of the file in reading order,
        * ``unique``: every distinct token once, in order of first occurrence,
          followed by the extra entry ``"NULL"``,
        * ``sentences``: one token list per line (an empty list for a line
          without tokens, so line numbers stay aligned across files).
    """
    remove = {".", ",", "!", "?", ":", ";"}
    words: List[str] = []
    sentences: List[List[str]] = []

    # Explicit encoding: the platform default is not UTF-8 everywhere, and the
    # text contains non-ASCII characters.
    with open(filepath, "r", encoding="utf-8") as f:
        for line in f:
            # strip whitespace and the newline character, then tokenise;
            # `tok and ...` discards the empty strings split(" ") produces
            new_words = [
                tok for tok in line.strip().split(" ") if tok and tok not in remove
            ]

            # flat list to analyze the overall frequency of words
            words.extend(new_words)

            # separate list per line to keep the sentence structure
            sentences.append(new_words)

    # dict.fromkeys keeps first-occurrence order, so the vocabulary order is
    # reproducible between runs (set() ordering of strings is not).
    unique = list(dict.fromkeys(words))
    unique.append("NULL")
    return words, unique, sentences


In [3]:
# Locations of the German and English text files.
data_de_path = "./data/europarl-v7.de-en.lc.de"
data_en_path = "./data/europarl-v7.de-en.lc.en"

# Each call yields (all words, unique words + "NULL", per-line sentences).
de_words, de_unique, de_sentences = read_file(data_de_path)
en_words, en_unique, en_sentences = read_file(data_en_path)

In [4]:
# Word-frequency tables for both texts.
de_cnt, en_cnt = Counter(de_words), Counter(en_words)

# Number of top-ranked words to report per language.
n = 10
de_most_cmn, en_most_cmn = de_cnt.most_common(n), en_cnt.most_common(n)

# Same report for both languages: a header line followed by one line per word.
for header, ranking in (
    (f"Most common {n} words in german text:", de_most_cmn),
    (f"Most common {n} words in english text:", en_most_cmn),
):
    print(header)
    for word, cnt in ranking:
        print(f"Word: '{word}'; Count: {cnt}")


Most common 10 words in german text:
Word: 'die'; Count: 10521
Word: 'der'; Count: 9374
Word: 'und'; Count: 7028
Word: 'in'; Count: 4175
Word: 'zu'; Count: 3168
Word: 'den'; Count: 2976
Word: 'wir'; Count: 2863
Word: 'daß'; Count: 2738
Word: 'ich'; Count: 2670
Word: 'das'; Count: 2669
Most common 10 words in english text:
Word: 'the'; Count: 19847
Word: 'of'; Count: 9597
Word: 'to'; Count: 9059
Word: 'and'; Count: 7303
Word: 'in'; Count: 6237
Word: 'is'; Count: 4478
Word: 'that'; Count: 4441
Word: 'a'; Count: 4435
Word: 'we'; Count: 3372
Word: 'this'; Count: 3362


In [5]:
# Relative frequency of a word = its count divided by the number of tokens.
p_zebra, p_speaker = (
    en_cnt[word] / len(en_words) for word in ("zebra", "speaker")
)

print(f"Probability for 'zebra': {p_zebra}")
print(f"Probability for 'speaker': {p_speaker}")


Probability for 'zebra': 0.0
Probability for 'speaker': 4.150567495773968e-05


Bigram language modeling

In [6]:
def get_pair_cnt(words: List[str], pair: Tuple[str, str]) -> int:
    """Count how often ``pair`` occurs as two directly adjacent tokens.

    Args:
        words: Token sequence to scan.
        pair: ``(first, second)`` tokens that must appear next to each other,
            in this order.

    Returns:
        Number of positions ``i`` with ``words[i] == pair[0]`` and
        ``words[i + 1] == pair[1]`` (0 for sequences shorter than two tokens).
    """
    first, second = pair[0], pair[1]
    # zip stops at the second-to-last token, so every adjacent pair is seen once
    return sum(1 for w1, w2 in zip(words, words[1:]) if w1 == first and w2 == second)


def calc_prob_word(
    w_cur: str,
    w_prev: str,
    words_cnt: Counter,
    words: Union[List[str], None] = None,
) -> float:
    """Maximum-likelihood bigram probability of ``w_cur`` following ``w_prev``.

    Computed as ``count(w_prev, w_cur) / count(w_prev)``.

    Args:
        w_cur: The word whose probability is estimated.
        w_prev: The word directly preceding ``w_cur``.
        words_cnt: Unigram counts; supplies ``count(w_prev)``.
        words: Token sequence in which the bigram is counted. When omitted,
            the notebook-level ``en_words`` list is used (the previous
            implementation was tied to the English corpus through the global
            ``en_cnt``).

    Returns:
        The estimated probability, or ``0.0`` if ``w_prev`` has a zero count in
        ``words_cnt``.
    """
    solo_count = words_cnt[w_prev]

    # if the preceding word does not occur at all, return 0 probability
    # (checked first so the corpus is not scanned needlessly)
    if solo_count == 0:
        return 0.0

    if words is None:
        words = en_words

    # The bigram must be counted on the token sequence; a Counter has no
    # positional order, so it cannot be handed to get_pair_cnt.
    pair_count = get_pair_cnt(words, (w_prev, w_cur))

    # maximum likelihood estimation
    return pair_count / solo_count


In [7]:
def calc_phrase_probs(sentence: List[str], words_cnt: Counter) -> float:
    """Probability of a phrase as the product of its per-word probabilities.

    The first word contributes its unigram probability
    ``count(word) / total count``; every later word contributes the value
    returned by ``calc_prob_word(word, previous_word, words_cnt)``.

    Args:
        sentence: The phrase as a list of tokens.
        words_cnt: Unigram counts of the training text.

    Returns:
        The product of the per-word probabilities. An empty phrase yields
        ``1.0`` (the empty product); an empty ``words_cnt`` gives the first
        word probability ``0.0``.
    """
    if not sentence:
        return 1.0

    # The first word has no predecessor: use its relative frequency, not its
    # raw count, so that the result stays a probability.
    total = sum(words_cnt.values())
    probs = [words_cnt[sentence[0]] / total if total else 0.0]

    # Remaining words are conditioned on the word directly before them.
    for w_prev, w_cur in zip(sentence, sentence[1:]):
        probs.append(calc_prob_word(w_cur, w_prev, words_cnt))

    return reduce(lambda a, b: a * b, probs, 1.0)


Translation modeling

In [8]:
# Toy sentence pair, stored directly as token lists.
fr_sentence = "le chat noir".split(" ")
eng_sentence = "the black cat".split(" ")


def init_c(a1: List[str], a2: List[str]) -> Dict[Union[Tuple[str, str], str], float]:
    """Create the zero-initialised soft-count table for one EM iteration.

    Args:
        a1: Words that get a pair count with every word of ``a2`` and a
            per-word total.
        a2: Words paired with each word of ``a1``.

    Returns:
        A dict with one ``(w1, w2)`` key per combination of ``a1`` x ``a2`` and
        one plain ``w1`` key per word of ``a1``, all mapped to ``0.0``.
    """
    c: Dict[Union[Tuple[str, str], str], float] = {
        pair: 0.0 for pair in product(a1, a2)
    }
    c.update({word: 0.0 for word in a1})
    return c


def reestimate_t(t, c):
    """Overwrite ``t`` with the probabilities implied by the soft counts ``c``.

    For every pair key ``(orig, trans)`` in ``c`` the cell ``t[orig, trans]``
    becomes ``c[(orig, trans)] / c[orig]``. The denominator is the per-``orig``
    total, because that is the only single-word total ``init_c`` creates and
    ``em_iterations`` accumulates.

    Args:
        t: DataFrame indexed by orig words with trans words as columns;
            modified in place.
        c: Soft counts as produced by ``init_c`` and filled in ``em_iterations``.

    Returns:
        The same DataFrame ``t``. Rows of orig words whose total count is 0 are
        set to ``0.0`` instead of dividing by zero.
    """
    for key, pair_count in c.items():
        if not isinstance(key, tuple):
            continue  # plain word keys are totals, not table cells
        orig_key, trans_key = key
        total = c[orig_key]
        # .at writes the single cell directly; chained `t.loc[a][b] = ...`
        # may assign to a temporary copy and leave `t` unchanged.
        t.at[orig_key, trans_key] = pair_count / total if total else 0.0
    return t


def em_iterations(
    orig_uniques: List[str],
    trans_uniques: List[str],
    orig_corpus: List[List[str]],
    trans_corpus: List[List[str]],
    n_iterations: int = 50,
    seed: Union[int, None] = None,
) -> pd.DataFrame:
    """Estimate word-translation probabilities with EM on sentence pairs.

    ``t.loc[orig, trans]`` holds the probability of ``trans`` given ``orig``.
    In every iteration, each trans word of a sentence distributes one unit of
    soft count over the orig words of the paired sentence, proportionally to
    the current ``t``; afterwards ``t`` is re-estimated from these counts.

    Note: the count table and ``t`` are dense, i.e. they hold
    ``len(orig_uniques) * len(trans_uniques)`` entries, so large vocabularies
    need correspondingly large memory.

    Args:
        orig_uniques: Vocabulary of the orig side (row labels of ``t``).
        trans_uniques: Vocabulary of the trans side (column labels of ``t``).
        orig_corpus: Tokenised orig sentences.
        trans_corpus: Tokenised trans sentences; ``trans_corpus[k]`` is paired
            with ``orig_corpus[k]``.
        n_iterations: Number of EM iterations (default 50, the previously
            hard-coded value).
        seed: Seed for the random initialisation; ``None`` draws fresh entropy.

    Returns:
        The translation table as a DataFrame (index ``orig_uniques``, columns
        ``trans_uniques``). Orig words that never occur in ``orig_corpus`` end
        up with an all-zero row.

    Raises:
        ValueError: If the two corpora do not have the same number of sentences.
        KeyError: If a sentence contains a word missing from its vocabulary.
    """
    if len(orig_corpus) != len(trans_corpus):
        raise ValueError(
            "orig_corpus and trans_corpus must contain the same number of sentences"
        )

    # random init of t with the dimensions of the two vocabularies
    rng = np.random.default_rng(seed)
    t_data = rng.random((len(orig_uniques), len(trans_uniques)))
    t = pd.DataFrame(t_data, columns=trans_uniques, index=orig_uniques)

    for _ in range(n_iterations):
        # init pseudocounts
        c = init_c(orig_uniques, trans_uniques)  # soft counts

        # for each sentence pair: iterate over the words of the sentences
        # themselves, not over vocabulary-sized index ranges
        for orig_sentence, trans_sentence in zip(orig_corpus, trans_corpus):
            for trans_word in trans_sentence:
                # Normalise over the orig words of THIS sentence that could
                # have produced trans_word.
                norm = sum(t.at[orig_word, trans_word] for orig_word in orig_sentence)
                if norm == 0:
                    continue  # no orig word explains trans_word; nothing to share

                for orig_word in orig_sentence:
                    # alignment probability -> update pseudocounts
                    delta = t.at[orig_word, trans_word] / norm
                    c[(orig_word, trans_word)] += delta
                    c[orig_word] += delta

        # reestimate t probs
        t = reestimate_t(t, c)

    return t


In [9]:
# Fit the translation table on the German/English data and display it.
t = em_iterations(
    orig_uniques=de_unique,
    trans_uniques=en_unique,
    orig_corpus=de_sentences,
    trans_corpus=en_sentences,
)
t

In [84]:
# Scratch check: summing one labelled row of a DataFrame.
df = pd.DataFrame(
    {"trans1": [1, 3], "trans2": [2, 4]},
    index=["orig1", "orig2"],
)
df.loc["orig1"].sum()

3

In [90]:
# Scratch check: keep only the tuple-valued keys of a mixed-key dict.
c = {("a", "b"): 0.54, ("c", "d"): 0.34, "f": 0.23}
k = [key for key in c if isinstance(key, tuple)]
k

[('a', 'b'), ('c', 'd')]

In [None]:
# Toy sentence pair, stored directly as token lists.
fr_sentence = "le chat noir".split(" ")
eng_sentence = "the black cat".split(" ")


def init_c(a1: List[str], a2: List[str]) -> Dict[Union[Tuple[str, str], str], float]:
    """Build a zero-filled count table for word pairs and single words.

    The result has one ``(w1, w2)`` key for every combination of a word from
    ``a1`` with a word from ``a2``, followed by one plain key per word of
    ``a1``; every value is ``0``.
    """
    counts = dict.fromkeys(product(a1, a2), 0)
    counts.update(dict.fromkeys(a1, 0))
    return counts

def reestimate_t(t, c):
    # NOTE(review): unfinished draft -- the assignment below has no right-hand
    # side, so this cell does not parse as Python. Complete it or remove the
    # cell before running the notebook top to bottom.
    combo_keys = 


# NOTE(review): unfinished draft of the EM routine in a never-executed cell.
# It does not parse (`t.loc[]` below is an empty subscript), it returns the
# literal 1 although it is annotated `np.ndarray`, and the parameters
# `orig_prhase` / `trans_phrase` are not used anywhere in the body.
def em_iterations(
    orig_prhase: List[str],
    trans_phrase: List[str],
    orig_uniques: List[str],
    trans_uniques: List[str],
    orig_corpus: List[List[str]],
    trans_corpus: List[List[str]],
) -> np.ndarray:
    # random init of t: one row per entry of orig_uniques, one column per
    # entry of trans_uniques
    t_data = np.random.rand(len(orig_uniques), len(trans_uniques))
    t = pd.DataFrame(t_data, columns=trans_uniques, index=orig_uniques)

    # one pass per row of t (t_idx / t_row themselves are not used below)
    for t_idx, t_row in t.iterrows():
        # init pseudocounts
        c = init_c(orig_uniques, trans_uniques)  # soft counts

        # for each sentence
        for k in range(len(orig_corpus)):
            # index range is the vocabulary size, but it is used to index into
            # sentence k -- NOTE(review): presumably meant the sentence length
            for i in range(len(orig_uniques)):
                # same pattern for the trans side
                for j in range(len(trans_uniques)):
                    # calc alignment prob and update pseudocount
                    orig_word, trans_word = orig_corpus[k][i], trans_corpus[k][j]
                    # cell value divided by the sum of its whole row of t
                    delta = t.loc[orig_word][trans_word] / t.loc[orig_word].sum()

                    # (re-reads the same two words as above)
                    orig_word = orig_corpus[k][i]
                    trans_word = trans_corpus[k][j]
                    c[(orig_word, trans_word)] += delta
                    c[orig_word] += delta

        # reestimate t probs -- left unfinished: empty subscript is a syntax error
        t.loc[]

    # placeholder return value
    return 1


1800288311


1

In [59]:
# Scratch check: sum of the second column of a 2 x 4 array.
a = np.arange(1, 9).reshape(2, 4)
a[:, 1].sum()

8

Decoding