In [38]:
# Dataset locations, relative to the notebook's working directory.
# ES corpus: labelled training file, dev input read as the test set,
# the file Part 1 predictions are written to, and the gold file they are scored against.
es_train_data_file = "./ES/train"
es_test_data_file = "./ES/dev.in"
es_predicted_data_file = "./ES/dev.p1.out"
es_actual_data_file = "./ES/dev.out"
# RU corpus: presumably the same four-file layout as ES (not exercised by the cells shown here).
ru_train_data_file = "./RU/train"
ru_test_data_file = "./RU/dev.in"
ru_predicted_data_file = "./RU/dev.p1.out"
ru_actual_data_file = "./RU/dev.out"

# Part 1 - Emission Parameters, UNK Token

In [39]:
def _preprocess_training_file(file_path):
    """Load a labelled training file as a list of ``[word, tag]`` lists.

    Blank lines are discarded. On every other line the text after the last
    space is the tag and everything before it is the word, so a word may
    itself contain spaces. A line without any space yields a one-element list.
    """
    with open(file_path, encoding="utf8") as handle:
        labelled_lines = [line for line in handle.read().splitlines() if line]

    # Splitting once from the right keeps multi-token words intact.
    return [line.rsplit(" ", 1) for line in labelled_lines]

def _preprocess_testing_file(path):
    """Return every line of ``path`` as a list of strings.

    Empty lines are kept as empty strings; ``get_predicted_file`` turns them
    back into blank separator lines in its output.
    """
    with open(path, encoding="utf8") as handle:
        return list(handle.read().splitlines())

In [40]:
def get_emission_using_MLE(training, k = 1):
    """Estimate emission probabilities with a reserved ``#UNK#`` word per tag.

    Args:
        training: iterable of rows where ``row[0]`` is the word and
            ``row[1]`` is its tag.
        k: pseudo-count given to the ``#UNK#`` word of every tag.

    Returns:
        Dict keyed by ``(tag, word)``:
        ``count(tag, word) / (count(tag) + k)`` for observed pairs and
        ``k / (count(tag) + k)`` for ``(tag, "#UNK#")``.
    """
    tag_counts = {}
    pair_counts = {}
    for row in training:
        word, tag = row[0], row[1]
        tag_counts[tag] = tag_counts.get(tag, 0) + 1
        pair = (tag, word)
        pair_counts[pair] = pair_counts.get(pair, 0) + 1

    # Observed pairs first, then one #UNK# entry per tag (same key order as the counts).
    emission = {
        pair: seen / (tag_counts[pair[0]] + k)
        for pair, seen in pair_counts.items()
    }
    for tag, total in tag_counts.items():
        emission[(tag, "#UNK#")] = k / (total + k)
    return emission

In [41]:
def get_most_probable_tag(emission):
    """Pick, for every word, the tag with the largest emission probability.

    Args:
        emission: dict keyed by ``(tag, word)`` with a probability as value.

    Returns:
        Dict mapping each word to its best tag. On a tie the tag that
        appears first in ``emission`` is kept.
    """
    best_prob = {}
    best_tag = {}
    for key, prob in emission.items():
        tag, word = key[0], key[1]
        # Strict comparison: an equal probability never displaces the current winner.
        if word not in best_prob or prob > best_prob[word]:
            best_prob[word] = prob
            best_tag[word] = tag
    return best_tag

In [42]:
def get_predicted_file(predicted_file, test_file, most_probable_tag):
    """Write ``word tag`` predictions, one per line, to ``predicted_file``.

    Args:
        predicted_file: path of the output file (overwritten).
        test_file: iterable of test-set lines; an empty string marks a
            sentence boundary and is written back as a blank line.
        most_probable_tag: dict mapping a word to its predicted tag; it must
            contain ``"#UNK#"``, which is used for words it does not know.

    Raises:
        KeyError: if a word is unknown and ``"#UNK#"`` is missing too.
    """
    # The context manager closes the file even when a lookup or write fails.
    with open(predicted_file, "w", encoding="utf8") as out:
        for word in test_file:
            if len(word) > 0:
                try:
                    tag = most_probable_tag[word]
                except KeyError:
                    # Only a missing word falls back to #UNK#; other errors surface.
                    tag = most_probable_tag["#UNK#"]
                out.write(f"{word} {tag}\n")
            else:
                out.write("\n")

In [43]:
# Part 1 pipeline on the ES corpus: load data, estimate emissions, pick the
# best tag per word, and write the predictions file.
# NOTE(review): this cell needs the Part 1 `_preprocess_training_file`; the
# Part 2i cell further down defines a function with the same name and a
# different return value, so re-running this cell after that one will break.
ES_train_data = _preprocess_training_file(es_train_data_file)
ES_test_data = _preprocess_testing_file(es_test_data_file)

emission_parameters = get_emission_using_MLE(ES_train_data)
most_probable_tag = get_most_probable_tag(emission_parameters)
get_predicted_file(es_predicted_data_file, ES_test_data, most_probable_tag)

In [44]:
def evaluate_precision(path_predicted, path_actual):
    """Return the fraction of predicted lines that exactly match the gold file.

    Blank lines are dropped from both files, then the i-th predicted line is
    compared with the i-th gold line as a whole string (word and tag
    together). The score is therefore a per-line match rate.

    Args:
        path_predicted: path of the predictions file.
        path_actual: path of the gold file.

    Returns:
        ``matches / number_of_predicted_lines`` as a float. ``0.0`` when the
        predictions file has no non-blank lines. Predicted lines that have no
        gold counterpart count as mismatches.
    """
    with open(path_predicted, encoding="utf8") as f:
        predicted_data = [line for line in f.read().splitlines() if line]

    with open(path_actual, encoding="utf8") as f:
        actual_data = [line for line in f.read().splitlines() if line]

    # Nothing predicted: avoid dividing by zero.
    if not predicted_data:
        return 0.0

    # zip stops at the shorter file, so surplus predictions add to the
    # denominator only instead of raising IndexError.
    correctly_predicted = sum(
        1 for predicted, actual in zip(predicted_data, actual_data)
        if predicted == actual
    )
    return correctly_predicted / len(predicted_data)

# Score the ES Part 1 predictions against the ES gold file; the cell displays the returned ratio.
evaluate_precision(es_predicted_data_file, es_actual_data_file)

0.6397645271553362

# Part 2i - Transition Parameters

In [45]:
def _preprocess_training_file(training_file):
    """Read a labelled training file sentence by sentence.

    Sentences are separated by blank lines; every other line is
    ``<word> <tag>``, where the tag is the text after the last space (so a
    word may itself contain spaces).

    NOTE: this reuses the name of the Part 1 loader defined earlier in the
    notebook and replaces it once this cell has run.

    Args:
        training_file: path of the training file.

    Returns:
        A tuple ``(tags, tags_with_start_stop, words)`` of three parallel
        lists with one entry per sentence: the tag sequence, the same
        sequence wrapped in ``"START"`` / ``"STOP"``, and the word sequence.

    Raises:
        ValueError: if a non-blank line contains no space.
    """
    tags = []
    tags_with_start_stop = []
    words = []

    with open(training_file, encoding="utf8") as f:
        document = f.read().rstrip()

    for sentence in document.split("\n\n"):
        word_ls = []
        tag_ls = []
        for word_tag in sentence.split("\n"):
            # Stray blank lines (e.g. more than one separator line) carry no token.
            if not word_tag:
                continue
            word, tag = word_tag.rsplit(" ", 1)
            word_ls.append(word)
            tag_ls.append(tag)

        # An empty file, or extra separator lines, yields empty sentences; skip them.
        if not word_ls:
            continue

        tags.append(tag_ls)
        tags_with_start_stop.append(["START"] + tag_ls + ["STOP"])
        words.append(word_ls)

    return tags, tags_with_start_stop, words

def _preprocess_test_file(training_file):
    """Read an unlabelled test file as a list of sentences.

    Sentences are separated by blank lines and hold one word per line.

    Args:
        training_file: path of the file to read. The parameter name is kept
            unchanged for existing callers; the file read here is the
            test input, not a training file.

    Returns:
        List of sentences, each a list of word strings. An empty file gives
        an empty list.
    """
    test_words = []

    # Read from the argument; the previous body referenced an undefined name.
    with open(training_file, encoding="utf8") as f:
        document = f.read().rstrip()

    for sentence in document.split("\n\n"):
        word_ls = [word for word in sentence.split("\n") if word]
        # Skip empty sentences produced by an empty file or extra separator lines.
        if word_ls:
            test_words.append(word_ls)

    return test_words

In [46]:
def get_unique_tags(tags):
    """Return the sorted unique tags, with and without START/STOP markers.

    Args:
        tags: passed straight to ``get_unique_elements`` (defined outside
            this cell; presumably it returns the distinct tags as a list —
            TODO confirm).

    Returns:
        ``(unique, unique_with_markers)`` where ``unique`` is the helper's
        result sorted in place and ``unique_with_markers`` is that list with
        ``"START"`` prepended and ``"STOP"`` appended.
    """
    unique = get_unique_elements(tags)
    unique.sort()
    return unique, ["START"] + unique + ["STOP"]


In [47]:
def get_transition_pairs(tags):
    """List every adjacent ``[previous_tag, next_tag]`` pair in the corpus.

    Args:
        tags: list of tag sequences, one per sentence (pass the sequences
            that include ``"START"`` / ``"STOP"`` to get boundary
            transitions too).

    Returns:
        Flat list of two-element lists, in corpus order. Sequences with
        fewer than two tags contribute nothing.
    """
    transition_pairs = []

    for tag_seq in tags:
        # Pair each tag with its successor inside the same sentence only.
        for tag1, tag2 in zip(tag_seq[:-1], tag_seq[1:]):
            transition_pairs.append([tag1, tag2])

    return transition_pairs

In [48]:
def count_y(tag, tags):
    """Count how many times ``tag`` occurs across all sequences in ``tags``.

    Args:
        tag: the tag to look for.
        tags: iterable of tag sequences.

    Returns:
        Total number of elements equal to ``tag``.
    """
    # Counted in one pass with built-ins: no flattened copy and no reliance
    # on an `itertools` import from another cell.
    return sum(1 for tag_seq in tags for candidate in tag_seq if candidate == tag)

In [49]:
def get_transition_using_MLE(tags_unique_with_start_stop, transition_pairs,
                          tags_with_start_stop):
    """Estimate transition probabilities ``count(y1 -> y2) / count(y1)``.

    Args:
        tags_unique_with_start_stop: ordered tag list whose first entry is
            the start marker and last entry is the stop marker. Rows use
            every tag but the last, columns every tag but the first.
        transition_pairs: iterable of ``(previous_tag, next_tag)`` pairs
            observed in the training data.
        tags_with_start_stop: list of tag sequences used to count how often
            each previous tag occurs.

    Returns:
        Nested dict ``transition[previous_tag][next_tag]`` of floats. A row
        whose tag never occurs in ``tags_with_start_stop`` is all ``0.0``.

    Raises:
        KeyError: if a pair refers to a tag missing from the matrix.
    """
    # Matrix skeleton: no row for the final marker, no column for the first one.
    transition = {
        tag1: {tag2: 0.0 for tag2 in tags_unique_with_start_stop[1:]}
        for tag1 in tags_unique_with_start_stop[:-1]
    }

    # Fill the cells with raw transition counts.
    for tag1, tag2 in transition_pairs:
        transition[tag1][tag2] += 1

    # Count every tag once up front rather than re-scanning the corpus per row.
    occurrences = {}
    for tag_seq in tags_with_start_stop:
        for tag in tag_seq:
            occurrences[tag] = occurrences.get(tag, 0) + 1

    # Normalise each row by the number of times its tag was seen.
    for tag1, transition_row in transition.items():
        count_yi = occurrences.get(tag1, 0)
        for tag2 in transition_row:
            # An unseen tag keeps a zero row instead of dividing by zero.
            transition_row[tag2] = transition_row[tag2] / count_yi if count_yi else 0.0

    return transition

In [None]:
# TODO: For Sath to include code to run the transition parameter functions

# Part 2ii - Viterbi Algorithm

In [None]:
# TODO: Song Gee to write algorithm for Viterbi