In [1]:
# File locations for the ES dataset, relative to the notebook's working directory.
# TRAIN / TEST / ACTUAL point at the training file, the dev input and the dev
# reference output; the PARTn files are presumably where each part of the
# notebook writes its predictions (the writing code is not visible here).
ES_TRAIN_DATA_FILE = "./ES/train"
ES_TEST_DATA_FILE = "./ES/dev.in"
ES_ACTUAL_DATA_FILE = "./ES/dev.out"
ES_PART1_PREDICTED_DATA_FILE = "./ES/dev.p1.out"
ES_PART2_PREDICTED_DATA_FILE = "./ES/dev.p2.out"
ES_PART3_PREDICTED_DATA_FILE = "./ES/dev.p3.out"
ES_PART4_PREDICTED_DATA_FILE = "./ES/dev.p4.out"

# Same layout for the RU dataset.
RU_TRAIN_DATA_FILE = "./RU/train"
RU_TEST_DATA_FILE = "./RU/dev.in"
RU_ACTUAL_DATA_FILE = "./RU/dev.out"
RU_PART1_PREDICTED_DATA_FILE = "./RU/dev.p1.out"
RU_PART2_PREDICTED_DATA_FILE = "./RU/dev.p2.out"
RU_PART3_PREDICTED_DATA_FILE = "./RU/dev.p3.out"
RU_PART4_PREDICTED_DATA_FILE = "./RU/dev.p4.out"

In [18]:
import itertools
def _preprocess_training_file(training_file):
    tags = []
    tags_with_start_stop = []
    words = []

    with open(training_file,"r",encoding="utf8") as f:
        document = f.read().rstrip()
        lines = document.split("\n\n")

        for line in lines:
            tags_list = []
            tags_with_start_stop_list = []
            words_list = []

            for word_tag in line.split("\n"):
                i = word_tag.split(" ")

                if len(i) > 2:
                    i = [" ".join(i[0:len(i)-1]), i[len(i)-1]]

                word, tag = i[0], i[1]
                words_list.append(word)
                tags_list.append(tag)

            tags.append(tags_list)
            tags_with_start_stop_list = ["START"] + tags_list + ["STOP"]
            tags_with_start_stop.append(tags_with_start_stop_list)
            words.append(words_list)
    
    return tags, tags_with_start_stop, words

def _preprocess_test_file(testing_file):
    test_words = []

    with open(testing_file,"r",encoding="utf8") as f:
        document = f.read().rstrip()
        lines = document.split("\n\n")

        for line in lines:
            word_list = []
            for word in line.split("\n"):
                word_list.append(word)
            test_words.append(word_list)

    return test_words

In [52]:
def get_unique(x):
    """Return the distinct items found across all inner iterables of ``x``.

    Items are returned in first-seen order.  Going through a ``set`` gave an
    order that changes between interpreter runs for strings (hash
    randomisation), which made everything built from this list
    non-reproducible; the set of returned items is unchanged.

    Args:
        x: An iterable of iterables of hashable items.

    Returns:
        A list containing each distinct item exactly once.
    """
    # dict keys keep insertion order, so this de-duplicates deterministically.
    return list(dict.fromkeys(itertools.chain.from_iterable(x)))

In [22]:
def get_transition_pairs(tags):
    """Count how often each ordered pair of adjacent tags occurs.

    Args:
        tags: A list of tag sequences.

    Returns:
        A dict mapping ``(previous_tag, next_tag)`` to the number of times
        that pair appears consecutively within a sequence.
    """
    pair_counts = {}

    for sequence in tags:
        # Pair every tag with its successor; zip stops at the shorter input.
        for pair in zip(sequence, sequence[1:]):
            pair_counts[pair] = pair_counts.get(pair, 0) + 1

    return pair_counts

In [23]:
def count_y(tag, tags):
    """Count the occurrences of ``tag`` across all sequences in ``tags``.

    Args:
        tag: The tag to look for.
        tags: An iterable of tag sequences.

    Returns:
        The total number of elements equal to ``tag``.
    """
    occurrences = 0
    for sequence in tags:
        occurrences += list(sequence).count(tag)
    return occurrences

In [98]:
def get_emission_using_MLE(unique_tags, unique_words, tags, words, k=1):
    """Estimate emission probabilities with an ``#UNK#`` bucket of weight k.

    For every tag ``y`` and word ``x``::

        e(x | y)     = count(y -> x) / (count(y) + k)
        e(#UNK# | y) = k             / (count(y) + k)

    "START" and "STOP" rows are always added to the table.  The caller's
    ``unique_tags`` list is NOT modified (it used to be extended in place,
    which leaked "START"/"STOP" into every later use of that list).

    Args:
        unique_tags: Distinct tags to create rows for.
        unique_words: Distinct words to create columns for.
        tags: Tag sequences, one per sentence.
        words: Word sequences aligned with ``tags``.
        k: Pseudo-count reserved for unknown words.

    Returns:
        A dict ``emission[tag][word] -> probability`` that also contains the
        "#UNK#" column for every tag.  When a denominator is zero (k == 0 and
        the tag never occurs) the probabilities of that row are 0.0.

    Raises:
        KeyError: If ``tags`` / ``words`` contain a tag or word that is not
            in ``unique_tags`` / ``unique_words``.
    """
    # Work on a copy so the argument stays untouched.
    all_tags = list(unique_tags) + ["START", "STOP"]

    emission = {}
    for tag in all_tags:
        row = dict.fromkeys(unique_words, 0.0)
        row["#UNK#"] = 0.0
        emission[tag] = row

    # Raw (tag, word) co-occurrence counts.
    for tag_sentence, word_sentence in zip(tags, words):
        for tag, word in zip(tag_sentence, word_sentence):
            emission[tag][word] += 1

    # Tally every tag once instead of re-scanning the corpus per tag.
    tag_totals = {}
    for tag_sentence in tags:
        for tag in tag_sentence:
            tag_totals[tag] = tag_totals.get(tag, 0) + 1

    for tag, row in emission.items():
        denominator = tag_totals.get(tag, 0) + k
        for word, count_y_x in row.items():
            row[word] = count_y_x / denominator if denominator else 0.0
        row["#UNK#"] = k / denominator if denominator else 0.0

    return emission

In [118]:
def get_transition_using_MLE(unique_tags, tags_with_start_stop, transition_pair_count):
    """Estimate transition probabilities ``q(v | u) = count(u, v) / count(u)``.

    Rows are "START" followed by ``unique_tags``; columns are
    ``unique_tags`` followed by "STOP".  Only the final element of the
    extended tag list is dropped from the rows and only the first from the
    columns, so if ``unique_tags`` itself already contains "START"/"STOP"
    the table additionally gets a "STOP" row and a "START" column.

    Args:
        unique_tags: List of distinct tags.
        tags_with_start_stop: Tag sequences used to obtain ``count(u)``.
        transition_pair_count: Dict mapping ``(u, v)`` to its count.

    Returns:
        A dict ``transition[u][v] -> probability``.  Rows whose tag never
        occurs in ``tags_with_start_stop`` are all 0.0.

    Raises:
        KeyError: If ``transition_pair_count`` contains a pair whose tags
            are not part of the table.
    """
    unique_tags = ["START"] + unique_tags + ["STOP"]
    sources = unique_tags[:-1]  # omit the trailing STOP
    targets = unique_tags[1:]   # omit the leading START
    transition = {u: {v: 0.0 for v in targets} for u in sources}

    # Fill in the raw pair counts.
    for (u, v), pair_count in transition_pair_count.items():
        transition[u][v] += pair_count

    # Tally every tag once instead of re-flattening the corpus for each row.
    tag_totals = {}
    for sentence in tags_with_start_stop:
        for tag in sentence:
            tag_totals[tag] = tag_totals.get(tag, 0) + 1

    # Normalise each row by count(u); unseen tags keep an all-zero row.
    for u, row in transition.items():
        count_yi = tag_totals.get(u, 0)
        for v, transition_count in row.items():
            row[v] = transition_count / count_yi if count_yi else 0.0

    return transition

In [60]:
def get_unknown_words(train, test):
    """Return the words that occur in ``test`` but never in ``train``.

    Args:
        train: Word sequences of the training data.
        test: Word sequences of the test data.

    Returns:
        A set of the test words that are absent from the training words.
    """
    candidates = get_unique(test)
    seen_in_training = set(get_unique(train))
    return {word for word in candidates if word not in seen_in_training}


In [119]:
# Load the ES training sentences (tags, START/STOP-wrapped tags, words) and
# the unlabelled ES dev sentences.
ES_tags, ES_tags_with_start_stop, ES_train_words = _preprocess_training_file(ES_TRAIN_DATA_FILE)
ES_test_words = _preprocess_test_file(ES_TEST_DATA_FILE)
# Distinct tags and distinct training words.
ES_unique_tags = get_unique(ES_tags)
ES_unique_words = get_unique(ES_train_words)
# Emission table, using the function's default k=1.
ES_emission_parameters = get_emission_using_MLE(ES_unique_tags, ES_unique_words, ES_tags, ES_train_words)

# Adjacent-tag pair counts over the START/STOP-wrapped sequences, then the
# transition table derived from them.
ES_transition_pair_count = get_transition_pairs(ES_tags_with_start_stop)
ES_transition_parameters = get_transition_using_MLE(ES_unique_tags, ES_tags_with_start_stop, ES_transition_pair_count)

# Dev words that never appear in the training words.
ES_unknown_words = get_unknown_words(ES_train_words, ES_test_words)
