# Part 2
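In this section, we write the helper functions for the HMM: learning emission counts from the training data, estimating emission parameters, and labelling a sentence with the most likely tag for each word.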

In [69]:
import numpy as np
from tqdm import tqdm_notebook

In [52]:
def learn_emissions(train_filename):
    ''' Learns emission counts from data and returns them as a nested dictionary.

    A line is used only if, after stripping surrounding whitespace, splitting it on
    a single space yields exactly two fields: the observation followed by its state.
    Every other line (blank lines, lines with more or fewer fields) is skipped.

    Args:
        train_filename: Path of the training file to read.

    Returns:
        A dict whose key is a state y and whose value is a dict mapping each
        observation x emitted from y to the number of times that emission occurred.
    '''
    # Where key is y, and value is a dictionary of emissions x from y with their frequency
    emissions = {}

    # Iterate over the file object directly so the whole file is never held in memory
    with open(train_filename, "r") as f:
        for line in f:
            data_split = line.strip().split(" ")

            # Only process valid lines
            if len(data_split) != 2:
                continue

            obs, state = data_split

            # Create the per-state counter on first sight, then bump this observation
            state_counts = emissions.setdefault(state, {})
            state_counts[obs] = state_counts.get(obs, 0) + 1

    return emissions

In [24]:
# NOTE(review): in the visible cells, `states` and `observations` are only assigned
# as locals inside learn_emissions, so this cell appears to rely on leftover kernel
# state and would raise NameError after Restart & Run All -- confirm and fix/remove.
len(states), len(observations)

(7, 48925)

## Estimating Emission Parameters
We use maximum likelihood estimation (MLE) to estimate the emission parameters from the training data: $e(x \mid y) = \dfrac{\mathrm{Count}(y \to x)}{\mathrm{Count}(y)}$.

In [38]:
def get_emission_parameters(emissions, x, y):
    ''' Returns the MLE of the emission parameter e(x|y) = Count(y -> x) / Count(y).

    NOTE: a later cell redefines `get_emission_parameters` with a smoothed variant,
    which replaces this definition once that cell has run.

    Args:
        emissions: Nested dict {state: {observation: count}}.
        x: The observation.
        y: The state; must be a key of `emissions`.

    Returns:
        The fraction of emissions from y that produced x. This is 0.0 when x was
        never emitted from y, or when y has no recorded emissions at all.

    Raises:
        KeyError: If y is not a key of `emissions`.
    '''
    state_data = emissions[y]
    count_y = sum(state_data.values()) # Denominator

    # A state without any recorded emission cannot have emitted x
    if count_y == 0:
        return 0.0

    # An observation never seen with this state has a count (and so an MLE) of zero
    count_y_x = state_data.get(x, 0) # Numerator

    return count_y_x / count_y

## Estimating with Smoothing

In [48]:
def get_emission_parameters(emissions, x, y, k=1):
    ''' Returns the smoothed estimate of the emission parameter e(x|y).

    Every state is treated as having emitted the special token "#UNK#" k extra
    times, so the denominator is Count(y) + k and the estimates for a state still
    sum to one:

        e(x|y) = Count(y -> x) / (Count(y) + k)   if x was seen in training
        e(x|y) = k / (Count(y) + k)               if x is "#UNK#" or was never seen

    A word that was seen in training, but never with state y, has Count(y -> x) = 0
    and therefore gets probability 0 -- it must not receive the unknown-word mass.

    Args:
        emissions: Nested dict {state: {observation: count}}.
        x: The observation, or "#UNK#" for an unknown word.
        y: The state; must be a key of `emissions`.
        k: Pseudo-count given to the unknown-word token for every state.

    Returns:
        The smoothed emission probability as described above.

    Raises:
        KeyError: If y is not a key of `emissions`.
    '''
    state_data = emissions[y]
    count_y = sum(state_data.values()) + k # Denominator, including the #UNK# pseudo-count

    # Only possible when k == 0 and the state has no recorded emissions
    if count_y == 0:
        return 0.0

    if x == "#UNK#":
        # The explicit unknown token always gets the reserved mass
        count_y_x = k
    elif x in state_data:
        # Seen with this state: use its real count
        count_y_x = state_data[x]
    elif any(x in observed for observed in emissions.values()):
        # Seen in training, but only with other states
        count_y_x = 0
    else:
        # Never seen in training at all: treat it as #UNK#
        count_y_x = k

    return count_y_x / count_y

In [65]:
def label_sequence(sentence, emissions):
    ''' Tags every word of `sentence` (a list of strings) with its best-scoring state.

    For each word, the state y with the strictly largest value of
    get_emission_parameters(emissions, word, y) is chosen; on ties the state that
    comes first in `emissions` wins. If no state scores above 0, the tag is "".
    Returns the list of tags, one per word, in order.
    '''
    def best_tag(token):
        # Running winner; the threshold starts at 0 so only positive scores can win
        winner, top_score = "", 0

        for candidate in emissions:
            score = get_emission_parameters(emissions, token, candidate)

            # Skip anything that does not strictly beat the current best
            if not score > top_score:
                continue

            winner, top_score = candidate, score

        return winner

    return [best_tag(token) for token in sentence]
    
# label_sequence(["I'm", "a", "kumquat"], emissions)

# Training and Validation

In [66]:
# Dataset to work on -- one of {SG, CN, EN, FR}
dataset = "SG"

# Training data and validation input both live under data/<dataset>/
train_filename, validation_filename = (
    f"data/{dataset}/train",
    f"data/{dataset}/dev.in",
)

# Train: learn the emission counts from the training file
emissions = learn_emissions(train_filename)

In [87]:
# Test on validation set
with open(validation_filename, "r") as f:
    lines = f.readlines()

# `result` holds exactly one entry per input line: the predicted tag for a word
# line, or "\n" for a blank (sentence-separating) line. The cell that writes the
# predictions indexes `result` by line number, so this alignment must hold.
result = []
sentence = []

for word in tqdm_notebook(lines):
    # A blank line marks the end of a sentence. Predict!
    # (Use strip() so "\r\n" or whitespace-only lines count as blank too, matching
    # the blank-line test used when the predictions are written out.)
    if not word.strip():
        preds = label_sequence(sentence, emissions)

        # Add predictions to overall results
        result += preds
        result += ["\n"]

        # Reset sentence list
        sentence = []
    else: # Sentence has not ended
        # Add word to sentence
        sentence.append(word.strip())

# The file may not end with a blank line; label the words still pending so that
# the final sentence is not silently dropped
if sentence:
    preds = label_sequence(sentence, emissions)
    result += preds
    sentence = []

assert len(result) == len(lines), "expected exactly one prediction entry per input line"

HBox(children=(IntProgress(value=0, max=44106), HTML(value='')))




In [88]:
# Write predictions to file: "<word> <tag>" per word, an empty line between sentences
with open(validation_filename.replace(".in", ".p2.out"), "w") as f:
    for i, word in enumerate(map(str.strip, lines)):
        if word:
            # result[i] is the tag predicted for the word on input line i
            pred = result[i]
            f.write(word + " " + pred + "\n")
        else:
            # Blank input line: keep the sentence separator
            f.write("\n")

# Evaluation Metrics
Calculate precision, recall, and F score.

Use `evalResult.py` to evaluate.

In [90]:
# # Load gold standard for dev set
# with open(validation_filename.replace(".in", ".out"), "r") as f:
#     lines = f.readlines()

# # Iterate through each line and calculate scores
# for i in range(len(lines)):
#     line = lines[i]
#     data_split = line.strip().split(" ")
    
#     # If this is not a newline, process it
#     if len(data_split) == 2:
#         word, label = data_split[0], data_split[1]
#         pred = result[i]