In [1]:
from pathlib import Path
from collections import defaultdict
import numpy as np

data_dir = Path("data/")
dataset = "EN" # EN or ES

# Part 1(i): Emission scores
Compute $e(x|y) = \frac{\text{Count}(y \rightarrow x)}{\text{Count}(y)}$ and store it into a dictionary $f$.

In [2]:
def calculate_emission_scores(path):
    '''
    Inputs:
        path (Path object or str): path on the local directory to the dataset to load from.
    Outputs:
        f (dict): Key-value mapping of str(x, y) to log(e(x|y)) values.
    '''

    count_emission = defaultdict(int) # Stores Count(y -> x), where key is tuple (x, y), and value is Count(y -> x)
    count_labels = defaultdict(int) # Stores Count(y), where key is y
    
    # Read from dataset path
    with open(path) as f:
        lines = f.readlines()
        
        # Process lines
        for line in lines:
            # Strip newline
            formatted_line = line.strip()
            
            # Only process lines that are not newlines
            if len(formatted_line) > 0:
                # Split into (x, y) pair
                split_data = formatted_line.split(" ")
                x, y = split_data[0].lower(), split_data[1]

                count_emission[(x, y)] += 1
                count_labels[y] += 1
    
    # Result dictionary that maps str(x, y) to log(e(x|y)) values
    f = {}
    
    # Estimate e(x|y) based on counts
    for x, y in count_emission:
        # Create str(x, y)
        feature_str = f"emission:{y}+{x}"
        e = count_emission[(x, y)] / count_labels[y]
        
        # Store score
        f[feature_str] = np.log(e)
    
    return f

# Part 1(ii): Transition scores
Compute $q(y_i|y_{i-1}) = \frac{\text{Count}(y_{i-1}, y_i)}{\text{Count}(y_{i-1})}$ and store it into a dictionary $f$.

In [3]:
def calculate_transition_scores(path):
    '''
    Inputs:
        path (Path object or str): path on the local directory to the dataset to load from.
    Outputs:
        f (dict): Key-value mapping of str(y_{i-1}, y_i) to log(q(y_i|y_{i-1})) values.
    '''
    
    count_transition = defaultdict(int) # Key is tuple (y_i, y_{i-1}), and value is Count(y_{i-1}, y_i)
    count_labels = defaultdict(int) # Stores Count(y_i), where key is y_i
    
    # Read from dataset path
    with open(path) as f:
        lines = f.readlines()
        # Initialize prev_y as START
        prev_y = "START"
        
        # Process lines
        for line in lines:
            # Strip newline
            formatted_line = line.strip()
            
            # Only process lines that are not newlines
            if len(formatted_line) > 0:
                # Split into (x, y) pair
                split_data = formatted_line.split(" ")
                x, y = split_data[0].lower(), split_data[1]
                
                transition_key = (prev_y, y)
                count_transition[transition_key] += 1
                count_labels[y] += 1
                
                # Update for next word
                prev_y = y
            else:
                # End of sentence
                # Store Count(STOP|y_n)
                transition_key = (prev_y, "STOP")
                count_transition[transition_key] += 1
                
                # Start of next sentence: initialize prev_y to "START" and store Count(START)
                prev_y = "START"
                count_labels[prev_y] += 1
    
    # Result dictionary that maps str(x, y) to log(e(x|y)) values
    f = {}
    
    # Estimate e(x|y) based on counts
    for prev_y, y in count_transition:
        # Create str(y_{i-1}, y_i)
        feature_str = f"transition:{prev_y}+{y}"
        q = count_transition[(prev_y, y)] / count_labels[prev_y]
        
        # Store score
        f[feature_str] = np.log(q)
    
    return f

In [4]:
# Compute emission and transition parameters
f_emission_train = calculate_emission_scores(data_dir / dataset / "train")
f_transition_train = calculate_transition_scores(data_dir / dataset / "train")

# Combine the transition and emission dictionaries together
f = {**f_emission_train, **f_transition_train}
# Ensure the number of elements is correct
assert(len(f) == len(f_emission_train) + len(f_transition_train))

# Part 2(i)
Compute CRF scores for a given input and output sequence pair.

In [5]:
def compute_crf_score(x, y, feature_dict):
    ''' 
    Inputs:
        x (list[str]): Complete input word sentence (without START or STOP tags)
        y (list[str]): Complete output label sequence
        feature_dict (dict[str] -> float): Dictionary that maps a given feature to its score.
    Outputs:
        p (float): Score given by p(y | x)
    '''
    
    # Input and output sequences must be of the same length
    assert(len(x) == len(y))
    n = len(x) # Sequence length
    
    # Stores the number of times each feature appears in (x, y)
    feature_count = defaultdict(int)
    
    # Compute emission features
    for i in range(n):
        formatted_word = x[i].lower()
        emission_key = f"emission:{y[i]}+{formatted_word}"
        feature_count[emission_key] += 1
    
    # Compute transition features
    # Add START and STOP tags to y
    updated_y = ["START"] + y + ["STOP"]
    for i in range(1, n+2):
        prev_y = updated_y[i-1]
        y_i = updated_y[i]
        transition_key = f"transition:{prev_y}+{y_i}"
        feature_count[transition_key] += 1
    
    # Compute score
    score = 0
    for feature_key, count in feature_count.items():
        weight = feature_dict[feature_key]
        score += weight * count
    
    return score

# Test function
x = "Great food with an awesome atmosphere !".split()
y = "O B-positive O O O B-positive O".split()
compute_crf_score(x, y, f)

-44.57667948595218

# Part 2(ii)
Viterbi algorithm for decoding.

In [6]:
def get_states(path):
    '''
    Inputs:
        path (Path object or str): path on the local directory to the dataset to load from.
    Outputs:
        states (list[str]): Unique states in the dataset.
    '''
    states = set()
    
    # Read from dataset path
    with open(path) as f:
        lines = f.readlines()
        
        # Process lines
        for line in lines:
            # Strip newline
            formatted_line = line.strip()
            
            # Only process lines that are not newlines
            if len(formatted_line) > 0:
                # Split into (x, y) pair
                split_data = formatted_line.split(" ")
                x, y = split_data[0], split_data[1]
                states.add(y)
    
    return list(states)

def viterbi_decode(x, states, feature_dict, default_index=0):
    '''
    Inputs:
        x (list[str]): Input sequence.
        states (list[str]): Possible output states.
        feature_dict (dict[str] -> float): Dictionary that maps a given feature to its score.
        default_index (int, optional): Index to default for backpointer if scores are nan.
    Outputs:
        y (list[str]): Most probable output sequence.
    '''
    
    n = len(x) # Number of words
    d = len(states) # Number of states
    scores = np.full((n, d), np.nan)
    bp = np.full((n, d), default_index, dtype=np.int) # TODO: Default to 'O', or something else?

    # Convert to lowercase
    x = [x[i].lower() for i in range(n)]
    
    # Compute START transition scores
    for i, current_y in enumerate(states):
        transition_key = f"transition:START+{current_y}"
        emission_key = f"emission:{current_y}+{x[0]}"
        if transition_key in feature_dict and emission_key in feature_dict:
            transmission_score = feature_dict[transition_key]
            emission_score = feature_dict[emission_key]
            scores[0, i] = transmission_score + emission_score
    
    # Recursively compute best scores based on transmission and emission scores at each node
    for i in range(1, n):
        for k, prev_y in enumerate(states):
            for j, current_y in enumerate(states):
                transition_key = f"transition:{prev_y}+{current_y}"
                emission_key = f"emission:{current_y}+{x[i]}"
                
                # Only consider if the feature exists
                if transition_key in feature_dict and emission_key in feature_dict:
                    transition_score = feature_dict[transition_key]
                    emission_score = feature_dict[emission_key]
                    overall_score = emission_score + transition_score + scores[i-1, k]
                    
                    # Better score is found: Update backpointer and score arrays
                    if (np.isnan(scores[i, j]) and not np.isnan(overall_score)) or overall_score > scores[i, j]:
                        scores[i, j] = overall_score
                        bp[i,j] = k
    
    # Compute for STOP
    highest_score = None
    highest_bp = default_index
    
    for j, prev_y in enumerate(states):
        if not np.isnan(scores[n-1, j]):
            transition_key = f"transition:{prev_y}+STOP"

            if transition_key in feature_dict:
                transition_score = feature_dict[transition_key]
                overall_score = transition_score + scores[n-1, j]
                if highest_score == None or overall_score > highest_score:
                    highest_score = overall_score
                    highest_bp = j
    
    # Follow backpointers to get output sequence
    result = [states[highest_bp]]
    prev_bp = highest_bp
    for i in range(n-1, 0, -1):
        prev_bp = bp[i, prev_bp]
        output = states[prev_bp]
        # Prepend result to output list
        result = [output] + result
    
    return result

states = get_states(data_dir / dataset / "train")
viterbi_decode(x, states, f) # Should be similar or equal to y

['O', 'B-positive', 'O', 'O', 'O', 'B-positive', 'O']

In [7]:
# Perform decoding on dev sets
def inference(path, states, feature_dict):
    '''
    Given a path, perform inference on sentences in the dataset and writes it to disk.
    
    Inputs:
        path (Path object or str): path on the local directory to the dataset to perform inference on.
        states (list[str]): Unique states that can be predicted.
        feature_dict (dict[str] -> float): Dictionary that maps a given feature to its score.
    Outputs:
        None
    '''
    
    default_index = states.index('O')
    sentences = []

    # Write predictions to file
    output_filename = str(path).replace(".in", ".p2.out")

    # Read from dataset path
    with open(path) as f:
        lines = f.readlines()
        sentence = []
        
        for line in lines:
            formatted_line = line.strip()
            
            # Not the end of sentence, add it to the list
            if len(formatted_line) > 0:
                sentence.append(formatted_line)
            else:
                # End of sentence
                sentences.append(sentence)
                sentence = []
    
    # Write output file
    with open(output_filename, "w") as wf:
        for sentence in sentences:
            # Run predictions
            pred_sentence = viterbi_decode(sentence, states, feature_dict, default_index)
            
            # Write original word and predicted tags
            for i in range(len(sentence)):
                wf.write(sentence[i] + " " + pred_sentence[i] + "\n")
            
            # End of sentence, write newline
            wf.write("\n")

inference(data_dir / dataset / "dev.in", states, f)

# Part 3(i)
Loss calculation using forward algorithm. We first define the score calculation functions for a single sample.

In [8]:
def compute_forward_score(x, feature_dict, states):
    '''
    Uses the forward algorithm to compute the score for a given sequence.
    
    Inputs:
        x (list[str]): Input sequence.
        feature_dict (dict[str] -> float): Dictionary that maps a given feature to its score.
        states (list[str]): List of unique states.
    Outputs:
        forward_score (float): Forward score for this sequence.
    '''
    
    n = len(x) # Number of words
    d = len(states) # Number of states
    scores = np.full((n, d), np.nan)

    # Convert to lowercase
    x = [x[i].lower() for i in range(n)]
    
    # Compute START transition scores
    for i, current_y in enumerate(states):
        transition_key = f"transition:START+{current_y}"
        emission_key = f"emission:{current_y}+{x[0]}"
        if transition_key in feature_dict and emission_key in feature_dict:
            transmission_score = feature_dict[transition_key]
            emission_score = feature_dict[emission_key]
            # Sum exponentials
            scores[0, i] = np.exp(transmission_score + emission_score)
    
    # Recursively compute best scores based on transmission and emission scores at each node
    for i in range(1, n):
        for k, prev_y in enumerate(states):
            for j, current_y in enumerate(states):
                transition_key = f"transition:{prev_y}+{current_y}"
                emission_key = f"emission:{current_y}+{x[i]}"
                
                # Only consider if the feature exists
                if transition_key in feature_dict and emission_key in feature_dict:
                    transition_score = feature_dict[transition_key]
                    emission_score = feature_dict[emission_key]
                    
                    # Sum exponentials
                    overall_score = np.exp(emission_score + transition_score) * scores[i-1, k]
                    
                    # Better score is found: Add to score array
                    if not np.isnan(overall_score):
                        if np.isnan(scores[i, j]):
                            scores[i, j] = 0
                        
                        scores[i, j] += overall_score
    
    # Compute for STOP
    forward_score = 0
    
    for j, prev_y in enumerate(states):
        if not np.isnan(scores[n-1, j]):
            transition_key = f"transition:{prev_y}+STOP"

            if transition_key in feature_dict:
                transition_score = feature_dict[transition_key]
                
                # Sum exponentials
                overall_score = np.exp(transition_score) * scores[n-1, j]
                forward_score += overall_score
    
    # Take log over entire sum
    forward_score = np.log(forward_score)
    
    return forward_score


def compute_crf_loss_sample(x, y, feature_dict, states):
    '''
    Inputs:
        x (list[str]): Input sequence.
        y (list[str]): Groundtruth output sequence.
        feature_dict (dict[str] -> float): Dictionary that maps a given feature to its score.
        states (list[str]): List of unique states.
    Outputs:
        loss (float): Loss value for a single sample.
    '''
    first_term = compute_crf_score(x, y, feature_dict)
    forward_score = compute_forward_score(x, feature_dict, states)
    
    loss = -(first_term - forward_score)
    return loss

We use the methods defined early to compute the loss across the entire dataset.

In [68]:
def get_dataset(path):
    '''
    Given a path, load the data from file.
    
    Inputs:
        path (Path object or str): path on the local directory to the dataset to perform inference on.
    Outputs:
        input_sequences (list[list[str]]): List of input sequences
        input_labels (list[list[str]]): List of input labels
    '''
    input_sequences = []
    input_labels = []
    
    # Read from dataset path
    with open(path) as f:
        lines = f.readlines()
        sentence = []
        labels = []
        
        for line in lines:
            formatted_line = line.strip()
            
            # Not the end of sentence, add it to the list
            if len(formatted_line) > 0:
                split_data = formatted_line.split(" ")
                x, y = split_data[0].lower(), split_data[1]

                sentence.append(x)
                labels.append(y)
            else:
                # End of sentence
                input_sequences.append(sentence)
                input_labels.append(labels)
                sentence = []
                labels = []
    
    return input_sequences, input_labels

def compute_crf_loss(input_sequences, input_labels, feature_dict, states):
    '''
    Given a path, perform inference on sentences in the dataset and writes it to disk.
    
    Inputs:
        input_sequences (list[list[str]]): List of input sequences
        input_labels (list[list[str]]): List of input labels
        feature_dict (dict[str] -> float): Dictionary that maps a given feature to its score.
        states (list[str]): List of unique states.
    Outputs:
        loss (float): Loss value for the dataset.
    '''
    loss = 0
    for i in range(len(input_sequences)):
        sample_loss = compute_crf_loss_sample(input_sequences[i], input_labels[i], feature_dict, states)
        loss += sample_loss
    
    return loss

f_zero = {k: -3 for k in f}
train_inputs, train_labels = get_dataset(data_dir / dataset / "train")
compute_crf_loss(train_inputs, train_labels, f, states)

2178.0848344696287

In [125]:
def forward_backward(x, feature_dict, states):
    '''
    Uses the backward algorithm to compute the score for a given sequence.
    
    Inputs:
        x (list[str]): Input sequence.
        feature_dict (dict[str] -> float): Dictionary that maps a given feature to its score.
        states (list[str]): List of unique states.
    Outputs:
        backward_score (float): Backward score for this sequence.
    '''
    
    n = len(x) # Number of words
    d = len(states) # Number of states
    forward_scores = np.zeros((n, d))
    backward_scores = np.zeros((n, d))
    features = [[[] for j in range(d)] for i in range(n)]

    # Convert to lowercase
    x = [x[i].lower() for i in range(n)]
    
    # Start forward pass
    # Compute START transition scores
    for i, current_y in enumerate(states):
        transition_key = f"transition:START+{current_y}"
        emission_key = f"emission:{current_y}+{x[0]}"
        if transition_key in feature_dict and emission_key in feature_dict:
            transmission_score = feature_dict[transition_key]
            emission_score = feature_dict[emission_key]
            # Sum exponentials
            forward_scores[0, i] = np.exp(transmission_score + emission_score)
        
        features[0][i].extend([transition_key, emission_key])
    
    # Recursively compute best scores based on transmission and emission scores at each node
    for i in range(1, n):
        for k, prev_y in enumerate(states):
            for j, current_y in enumerate(states):
                transition_key = f"transition:{prev_y}+{current_y}"
                emission_key = f"emission:{current_y}+{x[i]}"
                
                # Only consider if the feature exists
                if transition_key in feature_dict and emission_key in feature_dict:
                    transition_score = feature_dict[transition_key]
                    emission_score = feature_dict[emission_key]
                    # Sum exponentials
                    overall_score = np.exp(emission_score + transition_score) * forward_scores[i-1, k]

                    # Better score is found: Add to score array
                    forward_scores[i, j] += overall_score
                    features[i][j].extend([transition_key, emission_key])

    # Compute for STOP
    forward_prob = 0
    for j, prev_y in enumerate(states):
        transition_key = f"transition:{prev_y}+STOP"
        
        # TODO: Take care of this
#         features[n][j].extend([transition_key])

        if transition_key in feature_dict:
            transition_score = feature_dict[transition_key]
            # Sum exponentials
            overall_score = np.exp(transition_score) * forward_scores[n-1, j]
            forward_prob += overall_score
    # End forward pass

    # Start backward pass
    # Compute STOP transition scores
    for i, current_y in enumerate(states):
        transition_key = f"transition:{current_y}+STOP"
        if transition_key in feature_dict: # and emission_key in feature_dict:
            transition_score = feature_dict[transition_key]
            # Sum exponentials
            backward_scores[n-1, i] = np.exp(transition_score)
    
    # Recursively compute best scores based on transmission and emission scores at each node
    for i in range(n-1, 0, -1):
        for k, next_y in enumerate(states):
            for j, current_y in enumerate(states):
                transition_key = f"transition:{current_y}+{next_y}"
                emission_key = f"emission:{next_y}+{x[i]}"
                
                # Only consider if the feature exists
                if transition_key in feature_dict and emission_key in feature_dict:
                    transition_score = feature_dict[transition_key]
                    emission_score = feature_dict[emission_key]
                    
                    # Sum exponentials
                    overall_score = np.exp(emission_score + transition_score) * backward_scores[i, k]
                    
                    # Better score is found: Add to score array
                    backward_scores[i-1, j] += overall_score
    
    # Compute for STOP
    backward_prob = 0
    
    for j, next_y in enumerate(states):
        transition_key = f"transition:START+{next_y}"
        emission_key = f"emission:{next_y}+{x[0]}" # Emission of last word

        if transition_key in feature_dict and emission_key in feature_dict:
            transition_score = feature_dict[transition_key]
            emission_score = feature_dict[emission_key]
            # Sum exponentials
            overall_score = np.exp(emission_score + transition_score) * backward_scores[0, j]
            backward_prob += overall_score
    # End backward pass
    
    # Ensure forward and backward prob are the same up to floating point errors
    assert(abs(forward_prob - backward_prob) < 1e-8)

    expected_counts = np.zeros((n, d))
    for i in range(0, n):
        for j, y in enumerate(states):
            expected_counts[i, j] = forward_scores[i, j] * backward_scores[i, j] / forward_prob
    
    # Compute expected count of each feature
    feature_expected_counts = defaultdict(float)
    for i in range(n):
        for j in range(d):
            for feature in features[i][j]:
                feature_expected_counts[feature] += expected_counts[i, j]
    
    return feature_expected_counts

In [121]:
def get_feature_count(x, y, feature_dict):
    ''' 
    Inputs:
        x (list[str]): Complete input word sentence (without START or STOP tags)
        y (list[str]): Complete output label sequence
        feature_dict (dict[str] -> float): Dictionary that maps a given feature to its score.
    Outputs:
        p (float): Score given by p(y | x)
    '''
    
    # Input and output sequences must be of the same length
    assert(len(x) == len(y))
    n = len(x) # Sequence length
    
    # Stores the number of times each feature appears in (x, y)
    feature_count = defaultdict(int)
    
    # Compute emission features
    for i in range(n):
        formatted_word = x[i].lower()
        emission_key = f"emission:{y[i]}+{formatted_word}"
        feature_count[emission_key] += 1
    
    # Compute transition features
    # Add START and STOP tags to y
    updated_y = ["START"] + y + ["STOP"]
    for i in range(1, n+2):
        prev_y = updated_y[i-1]
        y_i = updated_y[i]
        transition_key = f"transition:{prev_y}+{y_i}"
        feature_count[transition_key] += 1
    
    return feature_count

In [126]:
feature_gradients = defaultdict(float)

for i in range(len(train_labels)):
    x = train_inputs[i]
    y = train_labels[i]
    
    feature_expected_counts = forward_backward(x, f, states)
    actual_counts = get_feature_count(x, y, f)

    for k, v in feature_expected_counts.items():
        feature_gradients[k] += v

    for k, v in actual_counts.items():
        feature_gradients[k] -= v

defaultdict(<class 'float'>, {'transition:START+O': 0.9799132852457739, 'emission:O+all': 7.8164209852318205, 'transition:START+B-positive': 0.009724526694190787, 'emission:B-positive+all': 0.031957494132006756, 'transition:START+B-neutral': 0.0, 'emission:B-neutral+all': 0.0, 'transition:START+I-neutral': 0.0, 'emission:I-neutral+all': 0.0, 'transition:START+B-negative': 0.010362188060035062, 'emission:B-negative+all': 0.02260174720026268, 'transition:START+I-negative': 0.0, 'emission:I-negative+all': 0.0, 'transition:START+I-positive': 0.0, 'emission:I-positive+all': 0.0, 'transition:O+O': 12.212154139816437, 'emission:O+in': 6.983145625705155, 'transition:B-positive+O': 12.212154139816437, 'transition:B-neutral+O': 12.212154139816437, 'transition:I-neutral+O': 12.212154139816437, 'transition:B-negative+O': 12.212154139816437, 'transition:I-negative+O': 12.212154139816437, 'transition:I-positive+O': 12.212154139816437, 'transition:B-negative+I-negative': 0.0021233408439760553, 'emiss

defaultdict(<class 'float'>, {'transition:START+O': 0.9313621915410655, 'emission:O+outside': 0.9313621915410655, 'transition:START+B-positive': 0.06863780845893415, 'emission:B-positive+outside': 0.06863780845893415, 'transition:START+B-neutral': 0.0, 'emission:B-neutral+outside': 0.0, 'transition:START+I-neutral': 0.0, 'emission:I-neutral+outside': 0.0, 'transition:START+B-negative': 0.0, 'emission:B-negative+outside': 0.0, 'transition:START+I-negative': 0.0, 'emission:I-negative+outside': 0.0, 'transition:START+I-positive': 0.0, 'emission:I-positive+outside': 0.0, 'transition:O+O': 18.639589731905904, 'emission:O+in': 6.971274621638217, 'transition:B-positive+O': 18.639589731905904, 'transition:B-neutral+O': 18.639589731905904, 'transition:I-neutral+O': 18.639589731905904, 'transition:B-negative+O': 18.639589731905904, 'transition:I-negative+O': 18.639589731905904, 'transition:I-positive+O': 18.639589731905904, 'transition:B-negative+I-negative': 0.16752738207428128, 'emission:I-neg

defaultdict(<class 'float'>, {'transition:START+O': 1.0, 'emission:O+try': 1.0, 'transition:START+B-positive': 0.0, 'emission:B-positive+try': 0.0, 'transition:START+B-neutral': 0.0, 'emission:B-neutral+try': 0.0, 'transition:START+I-neutral': 0.0, 'emission:I-neutral+try': 0.0, 'transition:START+B-negative': 0.0, 'emission:B-negative+try': 0.0, 'transition:START+I-negative': 0.0, 'emission:I-negative+try': 0.0, 'transition:START+I-positive': 0.0, 'emission:I-positive+try': 0.0, 'transition:O+O': 12.438068854958384, 'emission:O+the': 20.978966474016108, 'transition:B-positive+O': 12.438068854958384, 'transition:B-neutral+O': 12.438068854958384, 'transition:I-neutral+O': 12.438068854958384, 'transition:B-negative+O': 12.438068854958384, 'transition:I-negative+O': 12.438068854958384, 'transition:I-positive+O': 12.438068854958384, 'transition:O+B-positive': 3.0704126304944483, 'emission:B-positive+the': 0.005527665729060794, 'transition:B-positive+B-positive': 3.0704126304944483, 'transit

defaultdict(<class 'float'>, {'transition:START+O': 1.0, 'emission:O+great': 1.0, 'transition:START+B-positive': 0.0, 'emission:B-positive+great': 0.0, 'transition:START+B-neutral': 0.0, 'emission:B-neutral+great': 0.0, 'transition:START+I-neutral': 0.0, 'emission:I-neutral+great': 0.0, 'transition:START+B-negative': 0.0, 'emission:B-negative+great': 0.0, 'transition:START+I-negative': 0.0, 'emission:I-negative+great': 0.0, 'transition:START+I-positive': 0.0, 'emission:I-positive+great': 0.0, 'transition:O+O': 9.993151187561795, 'emission:O+bagels': 1.878185732515605, 'transition:B-positive+O': 9.993151187561795, 'transition:B-neutral+O': 9.993151187561795, 'transition:I-neutral+O': 9.993151187561795, 'transition:B-negative+O': 9.993151187561795, 'transition:I-negative+O': 9.993151187561795, 'transition:I-positive+O': 9.993151187561795, 'transition:O+B-positive': 1.518597888664357, 'emission:B-positive+bagels': 1.193705898620042, 'transition:B-positive+B-positive': 1.518597888664357, '

defaultdict(<class 'float'>, {'transition:START+O': 1.0000000000000002, 'emission:O+visited': 1.0000000000000002, 'transition:START+B-positive': 0.0, 'emission:B-positive+visited': 0.0, 'transition:START+B-neutral': 0.0, 'emission:B-neutral+visited': 0.0, 'transition:START+I-neutral': 0.0, 'emission:I-neutral+visited': 0.0, 'transition:START+B-negative': 0.0, 'emission:B-negative+visited': 0.0, 'transition:START+I-negative': 0.0, 'emission:I-negative+visited': 0.0, 'transition:START+I-positive': 0.0, 'emission:I-positive+visited': 0.0, 'transition:O+O': 8.99808043579167, 'emission:O+there': 7.0, 'transition:B-positive+O': 8.99808043579167, 'transition:B-neutral+O': 8.99808043579167, 'transition:I-neutral+O': 8.99808043579167, 'transition:B-negative+O': 8.99808043579167, 'transition:I-negative+O': 8.99808043579167, 'transition:I-positive+O': 8.99808043579167, 'emission:O+while': 7.0, 'emission:O+on': 7.0, 'transition:B-neutral+I-neutral': 0.0, 'emission:I-neutral+on': 0.0, 'transition:I

defaultdict(<class 'float'>, {'transition:START+O': 1.0000000000000002, 'emission:O+so': 1.0000000000000002, 'transition:START+B-positive': 0.0, 'emission:B-positive+so': 0.0, 'transition:START+B-neutral': 0.0, 'emission:B-neutral+so': 0.0, 'transition:START+I-neutral': 0.0, 'emission:I-neutral+so': 0.0, 'transition:START+B-negative': 0.0, 'emission:B-negative+so': 0.0, 'transition:START+I-negative': 0.0, 'emission:I-negative+so': 0.0, 'transition:START+I-positive': 0.0, 'emission:I-positive+so': 0.0, 'transition:O+O': 28.87162469063925, 'emission:O+about': 14.0, 'transition:B-positive+O': 28.87162469063925, 'transition:B-neutral+O': 28.87162469063925, 'transition:I-neutral+O': 28.87162469063925, 'transition:B-negative+O': 28.87162469063925, 'transition:I-negative+O': 28.87162469063925, 'transition:I-positive+O': 28.87162469063925, 'emission:O+the': 20.84523727022657, 'transition:O+B-positive': 2.5929753442874857, 'emission:B-positive+the': 0.016935195339199103, 'transition:B-positive+

defaultdict(<class 'float'>, {'transition:START+O': 0.9955521477495023, 'emission:O+the': 7.982115198291185, 'transition:START+B-positive': 0.002593007786039545, 'emission:B-positive+the': 0.0038025050538439257, 'transition:START+B-neutral': 0.0013852555367424493, 'emission:B-neutral+the': 0.0020053830206214537, 'transition:START+I-neutral': 0.0, 'emission:I-neutral+the': 0.0, 'transition:START+B-negative': 0.0004695889277156089, 'emission:B-negative+the': 0.001164277018265683, 'transition:START+I-negative': 0.0, 'emission:I-negative+the': 0.0, 'transition:START+I-positive': 0.0, 'emission:I-positive+the': 0.0, 'transition:O+O': 24.416547797963883, 'emission:O+whole': 6.985527343836875, 'transition:B-positive+O': 24.416547797963883, 'transition:B-neutral+O': 24.416547797963883, 'transition:I-neutral+O': 24.416547797963883, 'transition:B-negative+O': 24.416547797963883, 'transition:I-negative+O': 24.416547797963883, 'transition:I-positive+O': 24.416547797963883, 'transition:B-positive+I

defaultdict(<class 'float'>, {'transition:START+O': 0.9999999999999999, 'emission:O+we': 0.9999999999999999, 'transition:START+B-positive': 0.0, 'emission:B-positive+we': 0.0, 'transition:START+B-neutral': 0.0, 'emission:B-neutral+we': 0.0, 'transition:START+I-neutral': 0.0, 'emission:I-neutral+we': 0.0, 'transition:START+B-negative': 0.0, 'emission:B-negative+we': 0.0, 'transition:START+I-negative': 0.0, 'emission:I-negative+we': 0.0, 'transition:START+I-positive': 0.0, 'emission:I-positive+we': 0.0, 'transition:O+O': 12.90163161701877, 'emission:O+stood': 6.999999999999999, 'transition:B-positive+O': 12.90163161701877, 'transition:B-neutral+O': 12.90163161701877, 'transition:I-neutral+O': 12.90163161701877, 'transition:B-negative+O': 12.90163161701877, 'transition:I-negative+O': 12.90163161701877, 'transition:I-positive+O': 12.90163161701877, 'emission:O+there': 6.999999999999999, 'emission:O+for': 7.0, 'transition:B-negative+I-negative': 0.0, 'emission:I-negative+for': 0.0, 'transit

defaultdict(<class 'float'>, {'transition:START+O': 0.9999999999999992, 'emission:O+i': 7.9999999999999964, 'transition:START+B-positive': 0.0, 'emission:B-positive+i': 0.0, 'transition:START+B-neutral': 0.0, 'emission:B-neutral+i': 0.0, 'transition:START+I-neutral': 0.0, 'emission:I-neutral+i': 0.0, 'transition:START+B-negative': 0.0, 'emission:B-negative+i': 0.0, 'transition:START+I-negative': 0.0, 'emission:I-negative+i': 0.0, 'transition:START+I-positive': 0.0, 'emission:I-positive+i': 0.0, 'transition:O+O': 12.723720229180833, 'emission:O+came': 6.999999999999996, 'transition:B-positive+O': 12.723720229180833, 'transition:B-neutral+O': 12.723720229180833, 'transition:I-neutral+O': 12.723720229180833, 'transition:B-negative+O': 12.723720229180833, 'transition:I-negative+O': 12.723720229180833, 'transition:I-positive+O': 12.723720229180833, 'emission:O+across': 6.999999999999996, 'emission:O+village': 0.0, 'transition:O+B-positive': 1.0116773555336667, 'emission:B-positive+village':

defaultdict(<class 'float'>, {'transition:START+O': 0.9999999999999996, 'emission:O+same': 0.9999999999999996, 'transition:START+B-positive': 0.0, 'emission:B-positive+same': 0.0, 'transition:START+B-neutral': 0.0, 'emission:B-neutral+same': 0.0, 'transition:START+I-neutral': 0.0, 'emission:I-neutral+same': 0.0, 'transition:START+B-negative': 0.0, 'emission:B-negative+same': 0.0, 'transition:START+I-negative': 0.0, 'emission:I-negative+same': 0.0, 'transition:START+I-positive': 0.0, 'emission:I-positive+same': 0.0, 'transition:O+O': 21.250636705993287, 'emission:O+owner': 1.8642148879408909, 'transition:B-positive+O': 21.250636705993287, 'transition:B-neutral+O': 21.250636705993287, 'transition:I-neutral+O': 21.250636705993287, 'transition:B-negative+O': 21.250636705993287, 'transition:I-negative+O': 21.250636705993287, 'transition:I-positive+O': 21.250636705993287, 'transition:O+B-positive': 0.46722582622731024, 'emission:B-positive+owner': 0.9320398253716672, 'transition:B-positive+B

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [123]:
feature_gradients[feature_key]

460.95522132801574

In [114]:
# Check against numerical gradient is equal
feature_key = 'emission:O+all'
new_f = f.copy()
delta = 1e-5

loss1 = compute_crf_loss(train_inputs, train_labels, f, states)
new_f[feature_key] += delta
loss2 = compute_crf_loss(train_inputs, train_labels, new_f, states)

numerical_gradient = (loss2 - loss) / delta
analytical_gradient = feature_gradients[feature_key]
print(numerical_gradient, analytical_gradient)

-0.19886547306668942 -4.112784632852414
