In [1]:
# File paths for the "ES" dataset, relative to the notebook's working directory.
# train     -> passed to _preprocess_training_file
# dev.in    -> passed to _preprocess_testing_file
# dev.p1.out-> written by get_predicted_file
# dev.out   -> reference file that evaluate_precision compares predictions against
es_train_data_file = "./ES/train"
es_test_data_file = "./ES/dev.in"
es_predicted_data_file = "./ES/dev.p1.out"
es_actual_data_file = "./ES/dev.out"
# Matching paths for the "RU" dataset.
# NOTE(review): presumably used the same way as the ES paths; no visible cell
# reads them — confirm against the rest of the notebook.
ru_train_data_file = "./RU/train"
ru_test_data_file = "./RU/dev.in"
ru_predicted_data_file = "./RU/dev.p1.out"
ru_actual_data_file = "./RU/dev.out"

In [3]:
def _preprocess_training_file(file_path):
    with open(file_path) as f_lines:
        data = f_lines.read().splitlines()
        data[:] = [x for x in data if x]

    output = []
    for i in data:
        i = i.split(" ")
        if(len(i) > 2):
            i = [" ".join(i[0:len(i)-1]), i[len(i)-1]]
            output.append(i)
        else:
            output.append(i)
        
    return output

def _preprocess_testing_file(path):
    with open(path) as f:
        data = f.read().splitlines()

    output = []
    for word in data:
        # if word: # leave the newlines or not?
        output.append(word)

    return output

In [4]:
def get_emission_using_MLE(training, k=1):
    """Estimate smoothed emission probabilities from labelled rows.

    Args:
        training: Iterable of rows where ``row[0]`` is the word and
            ``row[1]`` is the tag.
        k: Smoothing constant added to every tag count; it is also the
            numerator of the ``"#UNK#"`` entry for each tag.

    Returns:
        A dict keyed by ``(tag, word)``. Observed pairs map to
        ``count(tag, word) / (count(tag) + k)``; every tag additionally gets
        a ``(tag, "#UNK#")`` entry equal to ``k / (count(tag) + k)``.
        Observed pairs come first (in first-seen order), followed by the
        ``"#UNK#"`` entries (in first-seen tag order).
    """
    tag_counts = {}
    pair_counts = {}
    for row in training:
        word, tag = row[0], row[1]
        tag_counts[tag] = tag_counts.get(tag, 0) + 1
        pair = (tag, word)
        pair_counts[pair] = pair_counts.get(pair, 0) + 1

    # Observed (tag, word) pairs, normalised by the smoothed tag count.
    emission = {
        pair: seen / (tag_counts[pair[0]] + k)
        for pair, seen in pair_counts.items()
    }
    # One unknown-word entry per tag.
    for tag, total in tag_counts.items():
        emission[(tag, "#UNK#")] = k / (total + k)
    return emission

In [5]:
def get_most_probable_tag(emission):
    """Pick, for every word, the tag with the largest emission value.

    Args:
        emission: Mapping from ``(tag, word)`` keys to a comparable score.

    Returns:
        A dict mapping each word to the tag whose entry has the highest
        score. On ties the tag encountered first while iterating over
        ``emission`` wins. Words appear in first-seen order.
    """
    best = {}  # word -> (best score so far, tag that produced it)
    for pair, score in emission.items():
        label = pair[0]
        token = pair[1]
        # Keep the current winner unless the new score is strictly greater.
        if token in best and not score > best[token][0]:
            continue
        best[token] = (score, label)
    return {token: winner[1] for token, winner in best.items()}

In [6]:
def get_predicted_file(predicted_file, test_file, most_probable_tag):
    """Write ``"word tag"`` predictions for a sequence of test lines.

    Args:
        predicted_file: Path of the output file; it is overwritten and
            encoded as UTF-8.
        test_file: Iterable of strings, one per input line (despite the
            name this is the already-loaded content, not a path). Empty
            strings are written back as blank lines.
        most_probable_tag: Mapping from word to tag. Words not present fall
            back to the entry stored under ``"#UNK#"``.

    Raises:
        KeyError: If a word is unknown and ``most_probable_tag`` has no
            ``"#UNK#"`` entry.
    """
    # The context manager closes the file even if a lookup or write fails.
    with open(predicted_file, "w", encoding="utf-8") as f:
        for word in test_file:
            if len(word) == 0:
                # Preserve blank lines from the input.
                f.write("\n")
                continue
            try:
                y = most_probable_tag[word]
            except KeyError:
                # Only a missing word triggers the fallback; any other error
                # is a real problem and must propagate.
                y = most_probable_tag["#UNK#"]
            f.write(f"{word} {y}\n")

In [7]:
# Load the ES training rows and the ES test lines from disk.
ES_train_data = _preprocess_training_file(es_train_data_file)
ES_test_data = _preprocess_testing_file(es_test_data_file)

# Estimate emission values from the training rows (default k = 1), reduce them
# to one tag per word, then write the predictions to es_predicted_data_file.
emission_parameters = get_emission_using_MLE(ES_train_data)
most_probable_tag = get_most_probable_tag(emission_parameters)
get_predicted_file(es_predicted_data_file, ES_test_data, most_probable_tag)

In [34]:
def evaluate_precision(path_predicted, path_actual):
    """Return the fraction of predicted lines that equal the reference line.

    Both files are read as UTF-8, blank lines are dropped, and the remaining
    lines are compared position by position as whole strings. Note that this
    is a line-level match rate: a line counts as correct only when the
    complete line text is identical in both files.

    Args:
        path_predicted: Path of the file containing the predictions.
        path_actual: Path of the reference file.

    Returns:
        ``matching_lines / predicted_lines`` as a float, or ``0.0`` when the
        prediction file has no non-blank lines. Reference lines beyond the
        length of the prediction file are ignored.

    Raises:
        IndexError: If the reference file has fewer non-blank lines than the
            prediction file.
    """
    with open(path_predicted, encoding="utf-8") as f:
        predicted_data = [line for line in f.read().splitlines() if line]

    with open(path_actual, encoding="utf-8") as f:
        actual_data = [line for line in f.read().splitlines() if line]

    # Nothing predicted: avoid dividing by zero.
    if not predicted_data:
        return 0.0

    # Same exception type as plain indexing would raise, but with a message
    # that explains what is wrong.
    if len(actual_data) < len(predicted_data):
        raise IndexError(
            f"reference file has {len(actual_data)} non-blank lines but "
            f"prediction file has {len(predicted_data)}"
        )

    correctly_predicted = sum(
        1 for predicted, actual in zip(predicted_data, actual_data)
        if predicted == actual
    )
    return correctly_predicted / len(predicted_data)

# Score the ES predictions file against the ES reference file; as the last
# expression in the cell, the returned ratio is shown as the cell output.
evaluate_precision(es_predicted_data_file, es_actual_data_file)

0.6397645271553362