In [39]:
ES_TRAIN_DATA_FILE = "./ES/train"
ES_TEST_DATA_FILE = "./ES/dev.in"
ES_ACTUAL_DATA_FILE = "./ES/dev.out"
ES_PART1_PREDICTED_DATA_FILE = "./ES/dev.p1.out"
ES_PART2_PREDICTED_DATA_FILE = "./ES/dev.p2.out"
ES_PART3_PREDICTED_DATA_FILE = "./ES/dev.p3.out"
ES_PART4_PREDICTED_DATA_FILE = "./ES/dev.p4.out"

RU_TRAIN_DATA_FILE = "./RU/train"
RU_TEST_DATA_FILE = "./RU/dev.in"
RU_ACTUAL_DATA_FILE = "./RU/dev.out"
RU_PART1_PREDICTED_DATA_FILE = "./RU/dev.p1.out"
RU_PART2_PREDICTED_DATA_FILE = "./RU/dev.p2.out"
RU_PART3_PREDICTED_DATA_FILE = "./RU/dev.p3.out"
RU_PART4_PREDICTED_DATA_FILE = "./RU/dev.p4.out"

UNK_WORD = "#UNK#"

# Part 1 - Emission Parameters, UNK Token

In [40]:
def _preprocess_training_file(file_path):
    with open(file_path, encoding="utf8") as f:
        data = f.read().splitlines()
        data[:] = [x for x in data if x]

    output = []
    for i in data:
        i = i.split(" ")
        if(len(i) > 2):
            i = [" ".join(i[0:len(i)-1]), i[len(i)-1]]
            output.append(i)
        else:
            output.append(i)
        
    return output

def _preprocess_testing_file(path):
    with open(path, encoding="utf8") as f:
        data = f.read().splitlines()

    output = []
    for word in data:
        # if word: # leave the newlines or not?
        output.append(word)

    return output

In [41]:
def get_emission_using_MLE(training, k = 1):
    tags = {}
    tags_to_word = {}
    emission = {}
    for data in training:
        word, tag = data[0], data[1]
        if tag in tags:
            tags[tag] += 1
        else:
            tags[tag] = 1
        
        tag_to_word = tuple((tag, word))
        if tag_to_word in tags_to_word:
            tags_to_word[tag_to_word] += 1
        else:
            tags_to_word[tag_to_word] = 1

    for key in tags_to_word.keys():
        emission[key] = tags_to_word[key] / (tags[key[0]] + k)
    for key in tags.keys():
        transition = tuple((key, UNK_WORD))
        emission[transition] = k / (tags[key] + k)
    # print(emission)
    return emission

In [42]:
def get_most_probable_tag(emission):
    highest_prob = {}
    output = {}
    for key, prob in emission.items():
        tag, word = key[0], key[1]
        if word not in highest_prob:
            highest_prob[word] = prob
            output[word] = tag
        else:
            if prob > highest_prob[word]:
                highest_prob[word] = prob
                output[word] = tag
    # print(output)
    return output

In [43]:
def write_to_predicted_file_part1(predicted_file, test_file, most_probable_tag):
    with open(predicted_file, "w", encoding="utf8") as f:
        for word in test_file:
            if len(word) > 0:
                try:
                    y = most_probable_tag[word]
                except:
                    y = most_probable_tag[UNK_WORD]
                f.write(f"{word} {y}\n")
            else: # leave the newlines??
                f.write("\n")

In [44]:
# Part 1 code for ES Dataset
ES_train_data = _preprocess_training_file(ES_TRAIN_DATA_FILE)
ES_test_data = _preprocess_testing_file(ES_TEST_DATA_FILE)

ES_emission_parameters = get_emission_using_MLE(ES_train_data)
ES_most_probable_tag = get_most_probable_tag(ES_emission_parameters)
write_to_predicted_file_part1(ES_PART1_PREDICTED_DATA_FILE, ES_test_data, ES_most_probable_tag)

# Part 1 code for RU Dataset
RU_train_data = _preprocess_training_file(RU_TRAIN_DATA_FILE)
RU_test_data = _preprocess_testing_file(RU_TEST_DATA_FILE)

RU_emission_parameters = get_emission_using_MLE(RU_train_data)
RU_most_probable_tag = get_most_probable_tag(RU_emission_parameters)
write_to_predicted_file_part1(RU_PART1_PREDICTED_DATA_FILE, RU_test_data, RU_most_probable_tag)

In [45]:
from EvalScript.evalResult import get_observed, get_predicted,compare_observed_to_predicted

def evaluateScores(actual_file, predicted_file):
    with open(predicted_file, encoding="utf8") as f:
        predicted = f.read().splitlines()

    with open(actual_file, encoding="utf8") as f:
        actual = f.read().splitlines()
   
    compare_observed_to_predicted(get_observed(actual), get_predicted(predicted))

evaluateScores(ES_ACTUAL_DATA_FILE, ES_PART1_PREDICTED_DATA_FILE)


#Entity in gold data: 255
#Entity in prediction: 1733

#Correct Entity : 205
Entity  precision: 0.1183
Entity  recall: 0.8039
Entity  F: 0.2062

#Correct Sentiment : 113
Sentiment  precision: 0.0652
Sentiment  recall: 0.4431
Sentiment  F: 0.1137


# Part 2i - Transition Parameters

In [46]:
import itertools
def _preprocess_training_file(training_file):
    tags = []
    tags_with_start_stop = []
    words = []

    with open(training_file,"r",encoding="utf8") as f:
        document = f.read().rstrip()
        lines = document.split("\n\n")

        for line in lines:
            tags_list = []
            tags_with_start_stop_list = []
            words_list = []

            for word_tag in line.split("\n"):
                i = word_tag.split(" ")

                if len(i) > 2:
                    i = [" ".join(i[0:len(i)-1]), i[len(i)-1]]

                word, tag = i[0], i[1]
                words_list.append(word)
                tags_list.append(tag)

            tags.append(tags_list)
            tags_with_start_stop_list = ["START"] + tags_list + ["STOP"]
            tags_with_start_stop.append(tags_with_start_stop_list)
            words.append(words_list)
    
    # print(f'tags: {tags[0:3]}')
    # print(f'tags_with_start_stop: {tags_with_start_stop[0:3]}')
    # print(f'words: {words[0:3]}\n')
    return tags, tags_with_start_stop, words

def _preprocess_test_file(testing_file):
    test_words = []

    with open(testing_file,"r",encoding="utf8") as f:
        document = f.read().rstrip()
        lines = document.split("\n\n")

        for line in lines:
            word_list = []
            for word in line.split("\n"):
                word_list.append(word)
            test_words.append(word_list)

    return test_words

In [47]:
def get_unique_elements(lst):
    return list(set(list(itertools.chain.from_iterable(lst))))

def get_unique_tags(tags):
    tags_unique = get_unique_elements(tags)
    # print(f'tags_unique: {tags_unique}')
    tags_unique.sort()
    # print(f'tags_unique_sorted: {tags_unique}')
    tags_unique_with_start_stop = ["START"] + tags_unique + ["STOP"]
    # print(f'tags_unique_with_start_stop: {tags_unique_with_start_stop}\n')

    return tags_unique, tags_unique_with_start_stop


In [48]:
def get_transition_pairs(tags):
    transition_pair_count = {}

    for tag in tags:
        #yi-1 and yi tuples
        for tag1, tag2 in zip(tag[:-1], tag[1:]):
            transition_pair = (tag1, tag2)
            if transition_pair in transition_pair_count:
                transition_pair_count[transition_pair] += 1
            else:
                transition_pair_count[transition_pair] = 1

    # print(f'transition_pair_count: {transition_pair_count}\n')
    return transition_pair_count

In [49]:
def count_y(tag, tags):
    tags_flattened = list(itertools.chain.from_iterable(tags))
    return tags_flattened.count(tag)

In [50]:
def get_transition_using_MLE(tags_unique_with_start_stop, transition_pair_count,
                          tags_with_start_stop):

    transition = {}
    #create matrix dimensions
    for tag1 in tags_unique_with_start_stop[:-1]:
        transition_row = {}
        for tag2 in tags_unique_with_start_stop[1:]:
            transition_row[tag2] = 0.0
        transition[tag1] = transition_row

    # print(f'transition_params_init: {transition}\n')
    # populate transition parameters with counts
    for tag1, tag2 in transition_pair_count:
        transition[tag1][tag2] += transition_pair_count[(tag1, tag2)]
    
    # print(f'transition_params_with_count: {transition}\n')

    # divide transition_count by count_yi, to get probability
    for tag1, transition_row in transition.items():
        count_yi = count_y(tag1, tags_with_start_stop)

        # words in training set
        for tag2, transition_count in transition_row.items():
            if count_yi == 0:
                transition[tag1][tag2] = 0.0
            else:
                transition[tag1][tag2] = transition_count / count_yi

    # print(f'transition_params_final: {transition}\n')

    return transition

In [51]:
ES_tags, ES_tags_with_start_stop, ES_train_words = _preprocess_training_file(ES_TRAIN_DATA_FILE)
ES_test_words = _preprocess_test_file(ES_TEST_DATA_FILE)
ES_tags_unique, ES_tags_unique_with_start_stop = get_unique_tags(ES_tags)
ES_transition_pair_count = get_transition_pairs(ES_tags_with_start_stop)
ES_transition_parameters = get_transition_using_MLE(ES_tags_unique_with_start_stop, ES_transition_pair_count, ES_tags_with_start_stop)

RU_tags, RU_tags_with_start_stop, RU_train_words = _preprocess_training_file(RU_TRAIN_DATA_FILE)
RU_test_words = _preprocess_test_file(RU_TEST_DATA_FILE)
RU_tags_unique, RU_tags_unique_with_start_stop = get_unique_tags(RU_tags)
RU_transition_pair_count = get_transition_pairs(RU_tags_with_start_stop)
RU_transition_parameters = get_transition_using_MLE(RU_tags_unique_with_start_stop, RU_transition_pair_count, RU_tags_with_start_stop)

# Part 2ii - Viterbi Algorithm

In [59]:
# TODO: Song Gee to write algorithm for Viterbi
import math
import sys

word_output_list = []  # list of tuple(word, predicted_tag) for writing to file
viterbi_values = {}  # key: (n, tag)  value: float 

def generate_viterbi_values(n, current_tag, word_list, tags_unique, emission_params, transmission_params):
    global viterbi_values
    current_max_viterbi_value = -sys.float_info.max  # Smallest negative float
    current_viterbi_values = []
    if n == 0:
        return

    # Recursive call to generate viterbi_values for (n-1, tag)
    for tag in tags_unique:
        if (n-1, tag) not in viterbi_values:
            generate_viterbi_values(n-1, tag, word_list, tags_unique, emission_params, transmission_params)

    # Use viterbi values from n-1 to generate current viterbi value
    for tag in tags_unique:
        # Here, we use a try-except block because our emission parameters only contain emissions which appeared in our datasets
        # Thus, any unobserved emission will throw a KeyError, so we use the emission params with (tag, "#UNK#") instead
        # If transmission_params gives 0, math.log will throw a valueError, thus we catch it and skip the current tag since 0 means we should never consider it
        try:
            try:
                if n == 1:
                    value = math.log(emission_params[(current_tag, word_list[n-1])] * transmission_params[tag][current_tag])  # Log-likelihood for solving numerical underflow
                    # print(f'viterbi_value: {viterbi_values[(n-1, tag)]}, emission_params_value: {emission_params[(current_tag, word_list[n-1])]}, transmission_params_value: {transmission_params[tag][current_tag]}')
                else:
                    value = viterbi_values[(n-1, tag)] + math.log(emission_params[(current_tag, word_list[n-1])] * transmission_params[tag][current_tag])  # Add instead of multiplying using log rules
                    # current_viterbi_values.append(value)
                    # print(current_viterbi_values)
            except KeyError:
                if n == 1:
                    value = math.log(emission_params[(current_tag, UNK_WORD)] * transmission_params[tag][current_tag])
                else:
                    value = viterbi_values[(n-1, tag)] + math.log(emission_params[(current_tag, UNK_WORD)] * transmission_params[tag][current_tag])

        except ValueError:
            continue
        
        # print(f'n: {n}, current_tag: {current_tag}, tag: {tag}, value: {value}')
        current_max_viterbi_value = max(current_max_viterbi_value, value)
        # ? for Part 3, find the 5-th best output sequence 
        # current_viterbi_values.sort(reverse=True)
        # print(current_viterbi_values)
        # current_max_viterbi_value = current_viterbi_values[-1] # take the 5th index??
    
    viterbi_values[(n, current_tag)] = current_max_viterbi_value


# function to kickstart viterbi recursive chain, and add (n+1, STOP) to veterbi_values
def start_viterbi(word_list, tags_unique, emission_params, transmission_params):
    global viterbi_values
    max_final_viterbi_value = -sys.float_info.max

    n = len(word_list)

    # Recursive call to generate viterbi_values for (n, tag)
    for tag in tags_unique:
        generate_viterbi_values(n, tag, word_list, tags_unique, emission_params, transmission_params)

    # Use viterbi values from n to generate viterbi value for (n+1, STOP)
    for tag in tags_unique:
        try:
            value = viterbi_values[(n, tag)] + math.log(transmission_params[tag]['STOP'])
        except ValueError:
            continue
        max_final_viterbi_value = max(max_final_viterbi_value, value)

    viterbi_values[(n+1, 'STOP')] = max_final_viterbi_value


def generate_predictions_viterbi(word_list, tags_unique, emission_params, transmission_params):
    global viterbi_values

    n = len(word_list)

    generated_tag_list = ['' for _ in range(n)]

    # Compute tag for n
    current_best_tag = ''
    current_best_tag_value = -sys.float_info.max

    for tag in tags_unique:
        try:
            value = viterbi_values[(n, tag)] + math.log(transmission_params[tag]['STOP'])
        except ValueError:
            continue
        if value > current_best_tag_value:
            current_best_tag = tag
            current_best_tag_value = value

    # if current_best_tag == '':
    #     current_best_tag = 'O'
    generated_tag_list[n-1] = current_best_tag #! unable to generate best tag sometimes, stuck at ''

    # Generate predictions starting from n-1 going down to 1
    for i in range(n-1, 0, -1):
        current_best_tag = ''
        current_best_tag_value = -sys.float_info.max

        for tag in tags_unique:
            try:
                value = viterbi_values[(i, tag)] + math.log(transmission_params[tag][generated_tag_list[i]])
            except ValueError: #! produces key error: generated_tag_list ["", "B-positive", ...]
                continue
            if value > current_best_tag_value:
                current_best_tag = tag
                current_best_tag_value = value

        generated_tag_list[i-1] = current_best_tag
    
    return generated_tag_list


def write_to_predicted_file_part2(predicted_file, words_list, tags_list):
    assert len(words_list) == len(tags_list)

    with open(predicted_file, "w", encoding="utf8") as f:
        for words, tags in zip(words_list, tags_list):  # Unpack all sentences and list of tags
            assert len(words) == len(tags)
            for word, tag in zip (words, tags):  # Unpack all words and tags
                f.write(f"{word} {tag}\n")
            f.write("\n")


In [60]:
ES_test_words
ES_train_words

[['disfrutemos',
  'de',
  'una',
  'buenísima',
  'calidad',
  'en',
  'el',
  'producto',
  'y',
  'una',
  'inmejorable',
  'relación',
  'calidad',
  'precio',
  '.'],
 ['Hoy',
  'he',
  'ido',
  'a',
  'comer',
  'con',
  'mia',
  'padres',
  'y',
  'he',
  'salido',
  'muy',
  'defraudado',
  '.'],
 ['A', 'mejorar', 'baños', '.'],
 ['En',
  'el',
  'apartado',
  'de',
  'ambiente',
  'todo',
  'muy',
  'correcto',
  ',',
  'decoración',
  ',',
  'música',
  ',',
  'ambiente',
  ',',
  'etc',
  '.'],
 ['La',
  'excusa',
  'del',
  'restaurante',
  'ha',
  'sido',
  'que',
  'el',
  'dia',
  '1',
  'de',
  'enero',
  'siempre',
  'hay',
  'mucho',
  'trabajo',
  '.'],
 ['Excelente', '.'],
 ['Así', 'lo', 'hizo', 'y', 'el', 'resultado', 'fue', 'espectacular', '.'],
 ['Las',
  'escaleras',
  'y',
  'detalles',
  'del',
  'mobiliario',
  ',',
  'con',
  'polvo',
  'y',
  'desgastados',
  ',',
  'necesitan',
  'renovación',
  '.'],
 ['(',
  'pense',
  ')',
  '.',
  'El',
  'resultado',


In [61]:
# word = ['disfrutemos', 'de', 'una', 'buenísima', 'calidad', 'en', 'el', '.']
# word = ES_train_words[0]
# start_viterbi(word, ES_tags_unique, ES_emission_parameters, ES_transition_parameters)
# print(f'viterbi_values: {viterbi_values}')
# ES_generated_tag_list = generate_predictions_viterbi(word, ES_tags_unique, ES_emission_parameters, ES_transition_parameters)
# print(ES_generated_tag_list)


# Run and output Viterbi for ES
ES_predicted_tags_list = []
for word in ES_test_words:
    viterbi_values = {}
    start_viterbi(word, ES_tags_unique, ES_emission_parameters, ES_transition_parameters)
    ES_generated_tag_list = generate_predictions_viterbi(word, ES_tags_unique, ES_emission_parameters, ES_transition_parameters)
    ES_predicted_tags_list.append(ES_generated_tag_list)

write_to_predicted_file_part2(ES_PART3_PREDICTED_DATA_FILE, ES_test_words, ES_predicted_tags_list)


# Run and output Viterbi for RU
RU_predicted_tags_list = []
for word in RU_test_words:
    viterbi_values = {}
    start_viterbi(word, RU_tags_unique, RU_emission_parameters, RU_transition_parameters)
    RU_generated_tag_list = generate_predictions_viterbi(word, RU_tags_unique, RU_emission_parameters, RU_transition_parameters)
    RU_predicted_tags_list.append(RU_generated_tag_list)

write_to_predicted_file_part2(RU_PART3_PREDICTED_DATA_FILE, RU_test_words, RU_predicted_tags_list)


In [58]:
from EvalScript.evalResult import get_observed, get_predicted,compare_observed_to_predicted

def evaluateScores(actual_file, predicted_file):
    with open(predicted_file, encoding="utf8") as f:
        predicted = f.read().splitlines()

    with open(actual_file, encoding="utf8") as f:
        actual = f.read().splitlines()
   
    compare_observed_to_predicted(get_observed(actual), get_predicted(predicted))

evaluateScores(ES_ACTUAL_DATA_FILE, ES_PART3_PREDICTED_DATA_FILE)


#Entity in gold data: 255
#Entity in prediction: 292

#Correct Entity : 1
Entity  precision: 0.0034
Entity  recall: 0.0039
Entity  F: 0.0037

#Correct Sentiment : 0
Sentiment  precision: 0.0000
Sentiment  recall: 0.0000
Sentiment  F: 0.0000
