# HMM Section w/ Baseline Models

In [12]:
import csv
from collections import Counter, defaultdict
import numpy as np

#PREPROCESSING
categories = ["O", "B-MISC", "I-MISC", "B-ORG", "I-ORG", "B-LOC", "I-LOC", "B-PER", "I-PER"]
tsv_file = open('train.txt', encoding="utf8")
tsv_reader = csv.reader(tsv_file, delimiter='\t', quotechar=None)
vocabulary = []
vocabulary2 = []
train = []
validation = []
validation_tags = []
baseline_lines = []
baseline2_lines = []
train_count_tags = defaultdict(float)
train_count_words = defaultdict(float)
train_tag_bigram = defaultdict(lambda: defaultdict(float))
val_tag_bigram = defaultdict(lambda: defaultdict(float))
train_words = defaultdict(lambda: defaultdict(float))
val_words = defaultdict(lambda: defaultdict(float))
val = False
i = 0
line = []
for row in tsv_reader:
    if i % 3 == 0: # Only the 3rd line contains the actual sentences
        line = []
        if i % 15 == 0: # Every 15th word line is set aside for validation set
            val = True
        else:
            val = False
        if not val:
            for element in row:
                if element not in vocabulary2:
                    vocabulary2.append(element)
                    num2 = np.random.randint(0,100)
                    if num2 < 30:                      # 30% chance to replace unique words with <UNK>
                        line.append("<UNK>")
                        train_count_words["<UNK>"] += 1
                        continue
                train_count_words[element] += 1
                line.append(element)
            train.append(row)
        else:
            for element in row:
                line.append(element)
            validation.append(line)
    if i % 3 == 2: #dealing with the BIO tags in the training set
        line_index = 0
        previous_tag = "<s>" #only adding start tags to the very first entry
        if val:
            validation_tags.append(row)
        for element in row:
            if not val:
#                 print(line[line_index])
                train_words[line[line_index]][element] += 1
                train_tag_bigram[element][previous_tag] += 1
                previous_tag = element
                train_count_tags[element] += 1
            else:
                val_words[line[line_index]][element] += 1
                val_tag_bigram[element][previous_tag] += 1
                previous_tag = element
            line_index += 1
    i += 1
for i in vocabulary2: # Goes back through the vocabulary and removes words that only appeared once and were replaced with <UNK> tags
    if not train_count_words[i] == 0:
        vocabulary.append(i)

In [99]:
# Calculating + getting probability counts for the tag bigrams and words given tags
k = 2
    
train_tag_bigram_prob = defaultdict(lambda: defaultdict(float))
train_words_prob = defaultdict(lambda: defaultdict(float))
for first in train_tag_bigram:
    for tag in categories:
        count = train_tag_bigram[first][tag]
        train_tag_bigram_prob[first][tag] = (count+k)/(train_count_tags[first]+k*(len(categories)+1))
    train_tag_bigram_prob[first]["<s>"]= (count+k)/(train_count_tags[first]+k*(len(categories)+1))
k = 0.0001

for first in train_words:
    for tag in categories:
        count = train_words[first][tag]
        train_words_prob[first][tag] = (count+k)/(train_count_words[first]+k*len(vocabulary))


In [100]:
# Our first Baseline Model
def baseline(line):
    tags = []
    for element in line:
        if element not in vocabulary:
            element = "<UNK>"
        if element == "<UNK>":
            tags.append("O")
        else:
            vals = train_words[element]
            most_likely = max(vals, key=train_words[element].get)
            tags.append(most_likely)
    return tags

In [101]:
# Our 2nd Baseline Model
def baseline2(line):
    tags = []
    for element in line:
        if element not in vocabulary:
            element = "<UNK>"
        if element == "<UNK>":
            tags.append("O")
        else:
            tags.append(np.random.choice(list(train_words[element])))
    return tags

In [102]:
# Function used in Viterbi to find the maximum score
def max_score(score,index, tag):
    scores = [0,0,0,0,0,0,0,0,0]
    for i in range(9):
        scores[i] = score[i][index-1]*train_tag_bigram_prob[tag][categories[i]]
    return max(scores), np.argmax(scores)

In [103]:
#HMM Viterbi Algorithm
def viterbi(line):
    backpointer = np.zeros([len(categories), len(line)])
    score = np.zeros([len(categories), len(line)])
    for i in range(len(line)):
        if line[i] not in vocabulary:
            line[i] = "<UNK>"
    for i in range(len(categories)):
        score[i][0] = train_tag_bigram_prob[categories[i]]["<s>"]*train_words_prob[line[0]][categories[i]]
    if len(line) == 1:
        maxscore, maxpointer = max_score(score, 0, categories[i])
    for t in range(1, len(line)):
        for i in range(len(categories)):
            maxscore, maxpointer = max_score(score, t, categories[i])
            score[i][t] = maxscore*train_words_prob[line[t]][categories[i]]
            backpointer[i][t] = maxpointer
    sequence = np.zeros(len(line))
    max_s = 0
    max_index = 0
    for i in range(9):
        if max_s < score[i][len(line)-1]:
            max_s = score[i][len(line)-1]
            max_index = i
    sequence[len(line)-1] = max_index
    char_sequence = []
    for i in range(len(line)-2, -1, -1):
        sequence[i] = backpointer[int(sequence[i+1])][i+1]
    for i in range(len(line)):
        char_sequence.append(categories[int(sequence[i])])
    return char_sequence

In [104]:
# Calculated Precision, Recall, F-Score
def check_val(method,validation, validation_tags):
    correct = 0
    guessed = 0
    total = 0
    for i in range(len(validation)):
        tags = []
        if method == 'viterbi':
            guess_line = viterbi(validation[i])
        elif method == 'baseline1':
            guess_line = baseline(validation[i])
        else:
            guess_line = baseline2(validation[i])
        for tag in range(len(guess_line)):
            if "PER" in guess_line[tag]:
                tags.append("PER")
            elif "LOC" in guess_line[tag]:
                tags.append("LOC")
            elif "ORG" in guess_line[tag]:
                tags.append("ORG")
            elif "MISC" in guess_line[tag]:
                tags.append("MISC")
            else:
                tags.append("O")
        for tag in range(len(validation_tags[i])):
            if "PER" in validation_tags[i][tag]:
                validation_tags[i][tag] = "PER"
            elif "LOC" in validation_tags[i][tag]:
                validation_tags[i][tag] = "LOC"
            elif "ORG" in validation_tags[i][tag]:
                validation_tags[i][tag] = "ORG"
            elif "MISC" in validation_tags[i][tag]:
                validation_tags[i][tag] = "MISC"
            else:
                validation_tags[i][tag] = "O"
        for tag in range(len(validation_tags[i])):
            if tags[tag] == "O":
                continue
            correct += (tags[tag] == validation_tags[i][tag])
        for j in guess_line:
            if not j == 'O':
                guessed += 1
        for k in validation_tags[i]:
            if not k == 'O':
                total += 1
    precision = correct/guessed
    recall = correct/total
    fscore = 2*precision*recall/(precision+recall)
    return precision, recall, fscore

In [105]:
precision, recall, fscore = check_val('viterbi', validation, validation_tags)
print(precision, recall, fscore)

0.9064828614008942 0.7161147902869757 0.8001315464934639


In [106]:
precision, recall, fscore = check_val('baseline1', validation, validation_tags)
print(precision, recall, fscore)

0.866642958748222 0.7172921265636497 0.7849263225702552


In [107]:
precision, recall, fscore = check_val('baseline2', validation, validation_tags)
print(precision, recall, fscore)

0.03860002441108263 0.1861662987490802 0.06394217403391715


In [113]:
#Executing HMM on test set
tsv_file = open('test.txt', encoding="utf8")
tsv_reader = csv.reader(tsv_file, delimiter='\t', quotechar=None)
PER = []
LOC = []
ORG = []
MISC = []
indices = []
display = []
tags = []
i = 0
#Obtaining tag predictions for each line using Viterbi
for row in tsv_reader:
    if i%3 == 0:
        line = []
        for element in row:
            if element not in vocabulary:
                line.append("<UNK>")
            else:
                line.append(element)
        line_tags = viterbi(line)
        display.append(line_tags)
        for tag in line_tags:
            tags.append(tag)
    i+=1
tsv_file = open('test.txt', encoding="utf8")
i = 0
tsv_reader = csv.reader(tsv_file, delimiter=' ', quotechar=None)
for row in tsv_reader: #obtaining indices from test.txt
    if i%3 == 2:
        for element in row:
            indices.append(element)
    i+= 1
for i in range(len(tags)):
    if "PER" in tags[i]:
         tags[i] = "PER"
    elif "LOC" in  tags[i]:
         tags[i] = "LOC"
    elif "ORG" in  tags[i]:
         tags[i] = "ORG"
    elif "MISC" in  tags[i]:
         tags[i] = "MISC"
    else:
         tags[i] = "O"
beginning = False
log_tag = ""
end = ""
begin = ""
intermediate = False
#Converting tags for submission
for i in range(len(tags)):
    if beginning and log_tag == tags[i]:
        end = str(indices[i])
    if beginning and not log_tag == tags[i]:
        beginning = False
        if "PER" in log_tag:
            PER.append(begin+end)
        elif "LOC" in log_tag:
            LOC.append(begin+end)
        elif "ORG" in log_tag:
            ORG.append(begin+end)
        elif "MISC" in log_tag:
            MISC.append(begin+end)
    if beginning == False and not tags[i] == 'O':
        beginning = True
        log_tag = tags[i]
        begin = str(indices[i]) + "-"
        end = str(indices[i])

In [111]:
# Creating CSV
csv_per = ""
csv_loc = ""
csv_org = ""
csv_misc = ""
for i in PER:
    csv_per += i + " "
for i in LOC:
    csv_loc += i + " "
for i in ORG:
    csv_org += i + " "
for i in MISC:
    csv_misc += i + " "
with open('preds_idk.csv', 'w', newline='') as csvfile:
    spamwriter = csv.writer(csvfile)
    spamwriter.writerow(["Type", "Prediction"])
    spamwriter.writerow(["PER", csv_per])
    spamwriter.writerow(["LOC", csv_loc])
    spamwriter.writerow(["ORG", csv_org])
    spamwriter.writerow(["MISC", csv_misc])

In [127]:
print(display[45])

['B-ORG', 'I-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
