In [15]:
# Dataset file locations, given relative to the notebook's working directory.
# For each language folder (ES, RU) there are four paths:
#   *_TRAIN_DATA_FILE      - training file read by the preprocessing functions
#   *_TEST_DATA_FILE       - dev input file that predictions are made for
#   *_PREDICTED_DATA_FILE  - output file that the Part 1 predictions are written to
#   *_ACTUAL_DATA_FILE     - reference file the predictions are compared against
ES_TRAIN_DATA_FILE = "./ES/train"
ES_TEST_DATA_FILE = "./ES/dev.in"
ES_PREDICTED_DATA_FILE = "./ES/dev.p1.out"
ES_ACTUAL_DATA_FILE = "./ES/dev.out"
RU_TRAIN_DATA_FILE = "./RU/train"
RU_TEST_DATA_FILE = "./RU/dev.in"
RU_PREDICTED_DATA_FILE = "./RU/dev.p1.out"
RU_ACTUAL_DATA_FILE = "./RU/dev.out"

# Part 1 - Emission Parameters, UNK Token

In [16]:
def _preprocess_training_file(file_path):
    """Read a training file into a flat list of ``[word, tag]`` rows.

    Empty lines are discarded. Every remaining line is cut at its last
    space: everything before that space (earlier spaces included) becomes
    the word and the final field becomes the tag. A line that contains no
    space at all yields a single-element row.

    NOTE: a later cell in this notebook defines another function with this
    same name, so whichever definition was executed most recently is the
    one in effect.

    Args:
        file_path: Path of the UTF-8 encoded training file.

    Returns:
        list[list[str]]: One row per non-empty line, in file order.
    """
    with open(file_path, encoding="utf8") as handle:
        non_empty_lines = [line for line in handle.read().splitlines() if line]

    # maxsplit=1 from the right keeps any spaces inside the word part intact.
    return [line.rsplit(" ", 1) for line in non_empty_lines]

def _preprocess_testing_file(path):
    """Load a test file as a flat list of its lines.

    Blank lines are kept as empty strings rather than filtered out.

    Args:
        path: Path of the UTF-8 encoded test file.

    Returns:
        list[str]: Every line of the file, in order, without line endings.
    """
    with open(path, encoding="utf8") as handle:
        return handle.read().splitlines()

In [17]:
def get_emission_using_MLE(training, k = 1):
    """Estimate emission parameters with a reserved ``#UNK#`` word per tag.

    For every ``(tag, word)`` pair seen in ``training`` the estimate is
    ``count(tag, word) / (count(tag) + k)``. In addition, every tag gets an
    entry for the placeholder word ``"#UNK#"`` equal to
    ``k / (count(tag) + k)``.

    Args:
        training: Iterable of rows where ``row[0]`` is the word and
            ``row[1]`` is the tag.
        k: Pseudo-count assigned to the ``#UNK#`` word of each tag.

    Returns:
        dict: Maps ``(tag, word)`` tuples to the estimated value. Observed
        pairs come first (in first-seen order), followed by one
        ``(tag, "#UNK#")`` entry per tag.
    """
    tag_totals = {}
    pair_totals = {}
    for row in training:
        word, tag = row[0], row[1]
        tag_totals[tag] = tag_totals.get(tag, 0) + 1
        pair = (tag, word)
        pair_totals[pair] = pair_totals.get(pair, 0) + 1

    # Observed (tag, word) pairs, smoothed by k in the denominator.
    emission = {
        pair: seen / (tag_totals[pair[0]] + k)
        for pair, seen in pair_totals.items()
    }
    # One unknown-word entry per tag.
    for tag, total in tag_totals.items():
        emission[(tag, "#UNK#")] = k / (total + k)
    return emission

In [18]:
def get_most_probable_tag(emission):
    """Pick, for each word, the tag with the largest emission value.

    Ties are resolved in favour of the entry that appears first in
    ``emission`` because a later entry only replaces the current best when
    its value is strictly greater.

    Args:
        emission: Dict mapping ``(tag, word)`` tuples to a numeric value.

    Returns:
        dict: Maps each word to its selected tag.
    """
    best_score = {}
    best_tag = {}
    for (tag, word), score in emission.items():
        # Skip entries that do not strictly beat the best seen so far.
        if word in best_score and not score > best_score[word]:
            continue
        best_score[word] = score
        best_tag[word] = tag
    return best_tag

In [19]:
def write_to_predicted_file(predicted_file, test_file, most_probable_tag):
    """Write one ``"word tag"`` line per test word to ``predicted_file``.

    Words that are not keys of ``most_probable_tag`` receive the tag stored
    under ``"#UNK#"``. Empty entries in ``test_file`` are written back as
    blank lines.

    Args:
        predicted_file: Path of the output file (overwritten, UTF-8).
        test_file: Iterable of words; despite the name this is the list of
            lines already read from the test file, not a path.
        most_probable_tag: Dict mapping a word to its predicted tag. Must
            contain ``"#UNK#"`` if any test word is missing from it.

    Raises:
        KeyError: If a word is unknown and ``"#UNK#"`` is not in
            ``most_probable_tag``.
    """
    # The context manager guarantees the file is closed even if a lookup fails.
    with open(predicted_file, "w", encoding="utf8") as out:
        for word in test_file:
            if len(word) > 0:
                # Explicit membership test instead of a bare ``except`` so that
                # only a genuinely unseen word falls back to the #UNK# tag and
                # unrelated errors are not silently masked.
                if word in most_probable_tag:
                    tag = most_probable_tag[word]
                else:
                    tag = most_probable_tag["#UNK#"]
                out.write(f"{word} {tag}\n")
            else:
                out.write("\n")

In [20]:
# Part 1 code for ES Dataset
# Pipeline: read training rows and test lines -> estimate emission values ->
# pick the best tag per word -> write "word tag" predictions to the output file.
# NOTE: this cell expects _preprocess_training_file to return a flat list of
# [word, tag] rows. A later cell rebinds that name to a function that returns a
# 3-tuple instead, so run this cell before that one (fresh kernel, top to bottom).
ES_train_data = _preprocess_training_file(ES_TRAIN_DATA_FILE)
ES_test_data = _preprocess_testing_file(ES_TEST_DATA_FILE)

ES_emission_parameters = get_emission_using_MLE(ES_train_data)
ES_most_probable_tag = get_most_probable_tag(ES_emission_parameters)
write_to_predicted_file(ES_PREDICTED_DATA_FILE, ES_test_data, ES_most_probable_tag)

# Part 1 code for RU Dataset
# Same pipeline as above, applied to the RU files.
RU_train_data = _preprocess_training_file(RU_TRAIN_DATA_FILE)
RU_test_data = _preprocess_testing_file(RU_TEST_DATA_FILE)

RU_emission_parameters = get_emission_using_MLE(RU_train_data)
RU_most_probable_tag = get_most_probable_tag(RU_emission_parameters)
write_to_predicted_file(RU_PREDICTED_DATA_FILE, RU_test_data, RU_most_probable_tag)

In [21]:
def evaluate_precision(path_predicted, path_actual):
    """Return the fraction of predicted lines that exactly match the reference.

    Both files are read as UTF-8, blank lines are dropped, and the remaining
    lines are compared position by position as whole strings. Every
    non-empty predicted line counts once in the denominator, so the value is
    a per-line match rate.

    Args:
        path_predicted: Path of the file containing the predictions.
        path_actual: Path of the reference file.

    Returns:
        float: ``matching lines / predicted lines``; ``0.0`` when the
        prediction file has no non-empty lines.

    Raises:
        ValueError: If the two files do not have the same number of
            non-empty lines, since a positional comparison would then be
            misaligned.
    """
    def _read_non_empty_lines(path):
        # Load a file and drop its blank lines.
        with open(path, encoding="utf8") as f:
            return [line for line in f.read().splitlines() if line]

    predicted_data = _read_non_empty_lines(path_predicted)
    actual_data = _read_non_empty_lines(path_actual)

    # Fail loudly instead of hitting an IndexError (too many predictions) or
    # silently scoring only a prefix (too few predictions).
    if len(predicted_data) != len(actual_data):
        raise ValueError(
            f"line count mismatch: {len(predicted_data)} predicted vs "
            f"{len(actual_data)} actual non-empty lines"
        )

    # Nothing to score; avoid dividing by zero.
    if not predicted_data:
        return 0.0

    correctly_predicted_entities = sum(
        predicted == actual
        for predicted, actual in zip(predicted_data, actual_data)
    )
    return correctly_predicted_entities / len(predicted_data)

# Score the ES Part 1 predictions against the ES reference file. As the last
# expression of the cell, the returned fraction is shown as the cell output.
evaluate_precision(ES_PREDICTED_DATA_FILE, ES_ACTUAL_DATA_FILE)

0.6397645271553362

# Part 2i - Transition Parameters

In [22]:
import itertools
def _preprocess_training_file(training_file):
    """Read a training file into per-sentence tag and word lists.

    Sentences are separated by an empty line; each line inside a sentence
    is cut at its last space into a word and a tag, so words that contain
    spaces are kept whole. Stray blank lines and sentences without any
    tokens are skipped.

    NOTE: this definition replaces the earlier function of the same name in
    this notebook, which returns a flat list of ``[word, tag]`` rows.

    Args:
        training_file: Path of the UTF-8 encoded training file.

    Returns:
        tuple: ``(tags, tags_with_start_stop, words)`` where each element is
        a list with one inner list per sentence. ``tags_with_start_stop``
        holds the same tags wrapped in ``"START"`` and ``"STOP"``.

    Raises:
        ValueError: If a non-empty line contains no space (no tag field).
    """
    tags = []
    tags_with_start_stop = []
    words = []

    with open(training_file,"r",encoding="utf8") as f:
        document = f.read().rstrip()

    for sentence in document.split("\n\n"):
        tags_list = []
        words_list = []

        for word_tag in sentence.split("\n"):
            # Left over from an odd number of consecutive newlines, or from
            # an empty file; there is nothing to parse on such a line.
            if not word_tag:
                continue
            # Split from the right so a word containing spaces stays intact.
            word, tag = word_tag.rsplit(" ", 1)
            words_list.append(word)
            tags_list.append(tag)

        # Skip sentences that turned out to contain no tokens.
        if not tags_list:
            continue

        tags.append(tags_list)
        tags_with_start_stop.append(["START"] + tags_list + ["STOP"])
        words.append(words_list)

    print(f'tags: {tags[0:3]}')
    print(f'tags_with_start_stop: {tags_with_start_stop[0:3]}')
    print(f'words: {words[0:3]}\n')
    return tags, tags_with_start_stop, words

def _preprocess_test_file(testing_file):
    """Read a test file into a list of sentences, each a list of lines.

    Trailing whitespace of the whole file is stripped, the text is split
    into sentences on empty lines, and each sentence is split into its
    individual lines.

    Args:
        testing_file: Path of the UTF-8 encoded test file.

    Returns:
        list[list[str]]: One inner list of lines per sentence.
    """
    with open(testing_file,"r",encoding="utf8") as handle:
        text = handle.read().rstrip()

    return [sentence.split("\n") for sentence in text.split("\n\n")]

In [23]:
def get_unique_elements(lst):
    """Return the distinct items found across all sub-iterables of ``lst``.

    The result order is whatever iterating a ``set`` yields; it is not
    sorted.

    Args:
        lst: Iterable of iterables of hashable items.

    Returns:
        list: Each distinct item exactly once.
    """
    flattened = itertools.chain.from_iterable(lst)
    distinct = set(flattened)
    return list(distinct)

def get_unique_tags(tags):
    """Collect the distinct tags, sorted, with and without START/STOP.

    Prints the distinct tags before sorting, after sorting, and after the
    ``"START"`` / ``"STOP"`` markers have been added.

    Args:
        tags: List of per-sentence tag lists.

    Returns:
        tuple: ``(sorted_tags, ["START"] + sorted_tags + ["STOP"])``.
    """
    distinct_tags = get_unique_elements(tags)
    print(f'tags_unique: {distinct_tags}')

    ordered_tags = sorted(distinct_tags)
    print(f'tags_unique_sorted: {ordered_tags}')

    padded_tags = ["START", *ordered_tags, "STOP"]
    print(f'tags_unique_with_start_stop: {padded_tags}\n')

    return ordered_tags, padded_tags


In [24]:
def get_transition_pairs(tags):
    """Count how often each adjacent ``(previous, next)`` tag pair occurs.

    Prints the resulting counts before returning them.

    Args:
        tags: List of tag sequences; adjacent entries within each sequence
            form the pairs.

    Returns:
        dict: Maps ``(previous_tag, next_tag)`` to its number of occurrences.
    """
    transition_pair_count = {}

    for sequence in tags:
        # Walk each position together with its successor.
        for index in range(len(sequence) - 1):
            pair = (sequence[index], sequence[index + 1])
            transition_pair_count[pair] = transition_pair_count.get(pair, 0) + 1

    print(f'transition_pair_count: {transition_pair_count}\n')
    return transition_pair_count

In [25]:
def count_y(tag, tags):
    """Count how many times ``tag`` occurs across all sequences in ``tags``.

    Args:
        tag: The tag to look for.
        tags: Iterable of tag sequences.

    Returns:
        int: Total number of occurrences of ``tag``.
    """
    occurrences = 0
    for sequence in tags:
        occurrences += list(sequence).count(tag)
    return occurrences

In [26]:
def get_transition_using_MLE(tags_unique_with_start_stop, transition_pair_count,
                          tags_with_start_stop):
    """Build the nested transition table ``transition[previous][next]``.

    Rows are every entry of ``tags_unique_with_start_stop`` except the last
    one, columns are every entry except the first one. Each cell holds the
    pair count divided by the number of occurrences of the row tag in
    ``tags_with_start_stop`` (``0.0`` when the row tag never occurs). The
    table is printed after initialisation, after the counts are filled in,
    and after normalisation.

    Args:
        tags_unique_with_start_stop: Tag list whose first and last entries
            are left out of the columns and rows respectively.
        transition_pair_count: Dict mapping ``(previous, next)`` to a count.
        tags_with_start_stop: Tag sequences used for the per-tag totals.

    Returns:
        dict: ``{previous_tag: {next_tag: value}}``.
    """
    from_tags = tags_unique_with_start_stop[:-1]
    to_tags = tags_unique_with_start_stop[1:]

    # Zero-filled table with one row per source tag.
    transition = {
        prev_tag: {next_tag: 0.0 for next_tag in to_tags}
        for prev_tag in from_tags
    }
    print(f'transition_params_init: {transition}\n')

    # Drop the observed pair counts into the table.
    for (prev_tag, next_tag), pair_count in transition_pair_count.items():
        transition[prev_tag][next_tag] += pair_count
    print(f'transition_params_with_count: {transition}\n')

    # Normalise every row by how often its source tag occurs.
    for prev_tag, row in transition.items():
        occurrences = count_y(prev_tag, tags_with_start_stop)
        for next_tag in row:
            row[next_tag] = row[next_tag] / occurrences if occurrences != 0 else 0.0
    print(f'transition_params_final: {transition}\n')

    return transition

In [27]:
# TODO: For Sath to include code to run the transition parameter functions
# Build the transition table for the ES dataset.
tags, tags_with_start_stop, words = _preprocess_training_file(ES_TRAIN_DATA_FILE)
test_words = _preprocess_test_file(ES_TEST_DATA_FILE)
# get_unique_tags returns two lists: the sorted tags, and the same tags wrapped
# in START/STOP. Each needs its own name; unpacking both into the same variable
# would leave tags_unique undefined.
tags_unique, tags_unique_with_start_stop = get_unique_tags(tags)
transition_pair_count = get_transition_pairs(tags_with_start_stop)

transition_matrix = get_transition_using_MLE(tags_unique_with_start_stop, transition_pair_count, tags_with_start_stop)

tags: [['O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-positive', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], ['O', 'O', 'B-negative', 'O']]
tags_with_start_stop: [['START', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-positive', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'STOP'], ['START', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'STOP'], ['START', 'O', 'O', 'B-negative', 'O', 'STOP']]
words: [['disfrutemos', 'de', 'una', 'buenísima', 'calidad', 'en', 'el', 'producto', 'y', 'una', 'inmejorable', 'relación', 'calidad', 'precio', '.'], ['Hoy', 'he', 'ido', 'a', 'comer', 'con', 'mia', 'padres', 'y', 'he', 'salido', 'muy', 'defraudado', '.'], ['A', 'mejorar', 'baños', '.']]

tags_unique: ['B-positive', 'I-negative', 'B-negative', 'O', 'B-neutral', 'I-neutral', 'I-positive']
tags_unique_sorted: ['B-negative', 'B-neutral', 'B-positive', 'I-negative', 'I-neutral', 'I-positive', 'O']
tags_unique_with_start_stop: ['START', 'B

# Part 2ii - Viterbi Algorithm

In [28]:
# TODO: Song Gee to write algorithm for Viterbi