# Homework: Phrasal Chunking

This is your documentation for the chunker homework.

In [1]:
import perc
import default
import sys
from collections import defaultdict

feat_vec = {}
tagset = []
train_data = []

def perc_train(train_data, tagset, numepochs):
    feat_vec = defaultdict(int)
    # insert your code here
    # please limit the number of iterations of training to n iterations
    
    # Main loop
    for t in range(numepochs):
        mistakes = 0        
        for j in train_data:
            # Get output chunk tags from Viterbi
            labeled_list = j[0]
            feat_list = j[1]
            
            gold_tags = [i.split()[2] for i in labeled_list]
            output_tags = perc.perc_test(feat_vec, labeled_list, feat_list, tagset, default_tag=tagset[0])

            # Update weight vector if the output is incorrect
            if output_tags != gold_tags:
                # Get feature IDs: from training data and from Viterbi output
                feat_ids_gold, feat_ids_output = get_feature_ids(feat_list, gold_tags, output_tags)
                
                for i in range(len(gold_tags)):
                    if output_tags[i] != gold_tags[i]:
                        for f in feat_ids_gold[i]:
                            feat_vec[f] = feat_vec[f] + 1
                        for f in feat_ids_output[i]:
                            feat_vec[f] = feat_vec[f] - 1
                
                mistakes += 1

        print('Mistakes in epoch {0}: {1} out of {2} sentences'.format(t, mistakes, len(train_data)))
    return feat_vec


# Helper function for perc_train. Get feature IDs from a piece of training data.
def get_feature_ids(feat_list, gold_tags, output_tags):
    feat_ids_gold = []
    feat_ids_output = []

    # Get list of list of features - each sublist corresponds to a word
    feat_index = 0
    feat_list_by_words = []
    for i in range(len(gold_tags)):
        (feat_index, feats) = perc.feats_for_word(feat_index, feat_list)
        feat_list_by_words.append(feats)

    # For each feature sublist, create a list of feature IDs ((feature, tag)).
    # One set based on gold tags, another based on argmax tags
    j = 0
    for sublist in feat_list_by_words:
        gold_sublist = []
        out_sublist = []
        for f in sublist:
            # Feature based on bigrams of output tags
            if f == 'B' and j > 0:
                curr_feat_g = '{0}:{1}'.format(f, gold_tags[j - 1])
                curr_feat_o = '{0}:{1}'.format(f, output_tags[j - 1])
            else:
                curr_feat_g = f
                curr_feat_o = f

            gold_sublist.append((curr_feat_g, gold_tags[j]))
            out_sublist.append((curr_feat_o, output_tags[j]))

        feat_ids_gold.append(gold_sublist)
        feat_ids_output.append(out_sublist) 
        j += 1
    return feat_ids_gold, feat_ids_output

In [2]:
tagset = perc.read_tagset("data/tagset.txt")
print("reading data ...", file=sys.stderr)
train_data = perc.read_labeled_data("data/train.txt.gz", "data/train.feats.gz", verbose=False)
#train_data = perc.read_labeled_data("data/train.dev", "data/train.feats.dev", verbose=False)
print("done.", file=sys.stderr)
feat_vec = perc_train(train_data, tagset, 10)
perc.perc_write_to_file(feat_vec, "default.model")
print("wrote model to disk")

reading data ...
done.


Mistakes in epoch 0: 5654 out of 8936 sentences
Mistakes in epoch 1: 3878 out of 8936 sentences
Mistakes in epoch 2: 2920 out of 8936 sentences
Mistakes in epoch 3: 2265 out of 8936 sentences
Mistakes in epoch 4: 1755 out of 8936 sentences
Mistakes in epoch 5: 1294 out of 8936 sentences
Mistakes in epoch 6: 1098 out of 8936 sentences
Mistakes in epoch 7: 971 out of 8936 sentences
Mistakes in epoch 8: 806 out of 8936 sentences
Mistakes in epoch 9: 663 out of 8936 sentences
wrote model to disk


In [3]:
%%capture --no-stderr output
print("reading test data ...", file=sys.stderr)
test_data = perc.read_labeled_data("data/dev.txt", "data/dev.feats", verbose=False)
print("done.", file=sys.stderr)
feat_vec = perc.perc_read_from_file("default.model")
perc.perc_testall(feat_vec, test_data, tagset)

reading test data ...
done.


In [4]:
import score_chunks
boundary = "-X-" # something to use as boundary between sentences
outside = "O" # tag used to mark the outside of any chunk
conlleval = False # use conlleval (should be False for most use cases)
numfeats = 2 # number of columns to consider as features, typically "word POStag"
(test, _) = score_chunks.readTestFile(str(output), boundary, outside, conlleval, numfeats)
with open("data/reference500.txt") as f:
    (reference, _) = score_chunks.readTestFile(f.read(), boundary, outside, conlleval, numfeats)
print("Score: %.2f" % score_chunks.corpus_fmeasure(reference, test, False))

processed 500 sentences with 10375 tokens and 5783 phrases; found phrases: 5747; correct phrases: 5338
             ADJP: precision:  71.43%; recall:  75.76%; F1:  73.53; found:    105; correct:     99
             ADVP: precision:  75.63%; recall:  73.76%; F1:  74.69; found:    197; correct:    202
            CONJP: precision: 100.00%; recall:  60.00%; F1:  75.00; found:      3; correct:      5
             INTJ: precision:   0.00%; recall:   0.00%; F1:   0.00; found:      0; correct:      1
               NP: precision:  93.43%; recall:  92.60%; F1:  93.01; found:   2999; correct:   3026
               PP: precision:  96.98%; recall:  97.46%; F1:  97.22; found:   1227; correct:   1221
              PRT: precision:  68.42%; recall:  59.09%; F1:  63.41; found:     19; correct:     22
             SBAR: precision:  78.45%; recall:  85.05%; F1:  81.61; found:    116; correct:    107
               VP: precision:  93.89%; recall:  92.27%; F1:  93.08; found:   1081; correct:   1100
accura