# CSCI 544 HW2

In [1]:
import json
import numpy as np
from collections import Counter, defaultdict

## Task 1: Vocabulary Creation (20 points)

In [128]:
# Constants
TRAIN_DATA_PATH = 'data/train.json'
DEV_DATA_PATH = 'data/dev.json'
UNKNOWN_KEY = '<unk>'
THRESHOLD = 3
OUTPUT_FOLDER = 'verification/out'
OUTPUT_PATH_VOCAB = OUTPUT_FOLDER + '/vocab.txt'
OUTPUT_PATH_HMM = OUTPUT_FOLDER + '/hmm.json'
OUTPUT_PATH_GREEDY = OUTPUT_FOLDER + '/greedy.json'
OUTPUT_PATH_VITERBI = OUTPUT_FOLDER + '/viterbi.json'

In [129]:
initial_counts = {}
transition_counts = defaultdict(lambda: defaultdict(int))
emission_counts = defaultdict(lambda: defaultdict(int))
with open(TRAIN_DATA_PATH) as f:
    train_data = json.load(f)
    train_data_words = []
    for train_entry in train_data:
        train_data_words.extend(train_entry['sentence'])

temp_dict = Counter(train_data_words)

freq_dict = {}

freq_dict[UNKNOWN_KEY] = 0
for word in temp_dict:
    if temp_dict[word] < THRESHOLD:
        freq_dict[UNKNOWN_KEY] += temp_dict[word]
    else:
        freq_dict[word] = temp_dict[word]

unk_value = freq_dict[UNKNOWN_KEY]
del freq_dict[UNKNOWN_KEY]
freq_dict = dict([(UNKNOWN_KEY, unk_value)] + sorted(freq_dict.items(), key=lambda item: item[1], reverse=True))

with open(OUTPUT_PATH_VOCAB, 'w') as f:
    for o, word in enumerate(freq_dict):
        freq_dict[word] = {
            'index': o,
            'frequency': freq_dict[word]
        }
        f.write(f'{word}\t{o}\t{freq_dict[word]["frequency"]}\n')

What threshold value did you choose for identifying unknown words for replacement?
3

What is the overall size of your vocabulary, and how many times does the special token ”< unk >” occur following the replacement process?
Vocabulary size:  16920
< unk > count:  32357



## Task 2: Model Learning

In [130]:
tags = {}
for train_entry in train_data:
    labels = train_entry['labels']
    label_len = len(labels)
    for s in range(label_len):
        tag = labels[s]
        if tag not in tags:
            tags[tag] = {
                'index': len(tags),
                'frequency': 1
            }
        else:
            tags[tag]['frequency'] += 1
        if s == 0:
            initial_counts[tag] = initial_counts.get(tag, 0) + 1
        emitted_word = train_entry['sentence'][s] if train_entry['sentence'][s] in freq_dict else UNKNOWN_KEY
        emission_counts[tag][emitted_word] += 1
        if s < label_len - 1:
            next_tag = labels[s + 1]
            transition_counts[tag][next_tag] += 1

NUM_TAGS = len(tags)
NUM_WORDS = len(freq_dict)

In [131]:
transition = {}
emission = {}

for tag in transition_counts:
    for next_tag in transition_counts[tag]:
        transition[f'({tag},{next_tag})'] = transition_counts[tag][next_tag] / tags[tag]['frequency']

for tag in emission_counts:
    for next_tag in emission_counts[tag]:
        emission[f'({tag},{next_tag})'] = emission_counts[tag][next_tag] / tags[tag]['frequency']

hmm_json = {
    'transition': transition,
    'emission': emission,
}

with open(OUTPUT_PATH_HMM, 'w') as json_file:
    json.dump(hmm_json, json_file)

In [132]:
len(transition), len(emission)

(1351, 23373)

How many transition and emission parameters in your HMM?

Transition parameters:  1351
Emission parameters:  23373

## Task 3: Greedy Decoding with HMM

In [133]:
initial_prob = np.zeros(NUM_TAGS)
for o, tag in enumerate(tags):
    initial_prob[o] = initial_counts.get(tag, 0) / len(train_data)

transition_prob = np.zeros((NUM_TAGS, NUM_TAGS))
for tag in transition_counts:
    for next_tag in transition_counts[tag]:
        transition_prob[tags[tag]['index']][tags[next_tag]['index']] = transition_counts[tag][next_tag] / tags[tag]['frequency']

emission_prob = np.zeros((NUM_WORDS, NUM_TAGS))
for tag in emission_counts:
    for word in emission_counts[tag]:
        emission_prob[freq_dict[word]['index']][tags[tag]['index']] = emission_counts[tag][word] / tags[tag]['frequency']

In [134]:
with open(DEV_DATA_PATH) as f:
    dev_data = json.load(f)

greedy = []
tag_list = list(tags.keys())
res = np.array([], dtype=bool)

for data_idx, dev_entry in enumerate(dev_data):
    sentence = dev_entry['sentence']
    pred = []
    for o, word in enumerate(sentence):
        init_prob = initial_prob if o == 0 else transition_prob[tags[pred[-1]]['index']]
        mul_value = init_prob * emission_prob[freq_dict.get(word, freq_dict[UNKNOWN_KEY])['index']]
        pred.append(tag_list[np.argmax(mul_value)])
    greedy.append({
        'index': data_idx,
        'sentence': sentence,
        'labels': pred
        })
    res = np.append(res, np.array(pred) == np.array(dev_entry['labels']))
print(res.mean())
with open(OUTPUT_PATH_GREEDY, 'w') as json_file:
    json.dump(greedy, json_file)

0.9298615748891992


What is the accuracy on the dev data? 0.9298615748891992

## Task 4: Viterbi Decoding with HMM

In [164]:
viterbi = []
res = np.array([], dtype=bool)

for data_idx, dev_entry in enumerate(dev_data):
    sentence = dev_entry['sentence']
    T = len(sentence)
    trellis = np.zeros((NUM_TAGS, T))
    pointers = np.zeros((NUM_TAGS, T))
    trellis[: , 0] = initial_prob * emission_prob[freq_dict.get(sentence[0], freq_dict[UNKNOWN_KEY])['index']]
    for o in range(1, T):
        for s in range(NUM_TAGS):
            k = np.argmax(trellis[k, o - 1] * transition_prob[k, s] * emission_prob[o, s] for k in range(NUM_TAGS))
            trellis[s, o] = trellis[k, o - 1] * transition_prob[k, s] * emission_prob[o, s]
            pointers[s, o] = k
    pred = []
    k = np.argmax(trellis[:, T - 1])
    for o in range(T - 1, -1, -1):
        pred.insert(0, tag_list[int(k)])
        k = pointers[k, o]
    viterbi.append({
        'index': data_idx,
        'sentence': sentence,
        'labels': pred
        })
    print("Prediction:", pred)
    print("Actual:", dev_entry['labels'])
    res = np.append(res, np.array(pred) == np.array(dev_entry['labels']))
print(res.mean())
with open(OUTPUT_PATH_VITERBI, 'w') as json_file:
    json.dump(viterbi, json_file)

TypeError: list indices must be integers or slices, not numpy.float64

In [158]:
trellis[:, 0].sum()

0.01827564821560826