# Homework: Phrasal Chunking

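This notebook documents the chunker homework: it trains a perceptron model for phrasal chunking, tags the dev set with the trained model, and reports the resulting chunking score.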

In [9]:
import perc
import default
import sys
from collections import defaultdict

# Empty module-level placeholders; the cells below rebind all three names.
feat_vec = dict()
tagset = list()
train_data = list()

def perc_train(train_data, tagset, numepochs):
    """Train perceptron feature weights for the chunker.

    Sentences whose gold tags contain any of the chunk tags listed in
    ``target_tags`` are visited twice per epoch (simple oversampling).
    The caller's ``train_data`` is never modified, so calling this function
    repeatedly (e.g. re-running a notebook cell) does not grow the data.

    Args:
        train_data: iterable of items where ``item[0]`` is the list of labeled
            lines of one sentence (the gold tag is the third
            whitespace-separated field of each line) and ``item[1]`` is the
            feature list handed to ``perc.perc_test`` and ``get_feature_ids``.
        tagset: sequence of chunk tags; ``tagset[0]`` is passed to
            ``perc.perc_test`` as the default tag.
        numepochs: number of passes over the oversampled training set.

    Returns:
        A ``defaultdict(int)`` mapping feature IDs (as produced by
        ``get_feature_ids``) to integer weights.
    """
    feat_vec = defaultdict(int)

    target_tags = ['B-SBAR', 'B-ADJP', 'I-ADJP', 'B-ADVP', 'I-ADVP', 'I-PP', 'B-PRT', 'I-SBAR', 'B-CONJP', 'I-CONJP']

    # Snapshot the input once so it is only iterated a single time here.
    sentences = list(train_data)

    # Collect the sentences that contain at least one target tag.
    to_recycle = []
    for sentence in sentences:
        sentence_gold = [line.split()[2] for line in sentence[0]]
        if any(tag in sentence_gold for tag in target_tags):
            to_recycle.append(sentence)

    # Build a NEW list instead of `train_data += to_recycle`: the in-place
    # form extended the caller's list as a side effect of training.
    training_set = sentences + to_recycle

    # Main loop
    for t in range(numepochs):
        mistakes = 0
        for sentence in training_set:
            labeled_list = sentence[0]
            feat_list = sentence[1]

            gold_tags = [line.split()[2] for line in labeled_list]
            # Decode the sentence with the current weights.
            output_tags = perc.perc_test(feat_vec, labeled_list, feat_list, tagset, default_tag=tagset[0])

            # Update the weight vector only if the output is incorrect.
            if output_tags != gold_tags:
                # Feature IDs under the gold tags and under the decoded tags.
                feat_ids_gold, feat_ids_output = get_feature_ids(feat_list, gold_tags, output_tags)

                # Only positions whose tag was wrong contribute to the update:
                # reward the gold features, penalise the predicted ones.
                for i in range(len(gold_tags)):
                    if output_tags[i] != gold_tags[i]:
                        for f in feat_ids_gold[i]:
                            feat_vec[f] += 1
                        for f in feat_ids_output[i]:
                            feat_vec[f] -= 1

                # Counted per sentence, not per token.
                mistakes += 1

        print('Mistakes in epoch {0}: {1} out of {2} sentences'.format(t, mistakes, len(training_set)))
    return feat_vec


# Helper function for perc_train. Get feature IDs from a piece of training data.
def get_feature_ids(feat_list, gold_tags, output_tags):
    """Build per-word feature IDs under the gold tags and the decoded tags.

    A feature ID is a ``(feature, tag)`` tuple.  For every word after the
    first, the feature ``'B'`` is expanded to ``'B:<previous tag>'``, using
    the previous tag of the respective sequence (gold or output).

    Args:
        feat_list: feature list for one sentence, consumed word by word via
            ``perc.feats_for_word``.
        gold_tags: reference tags, one per word.
        output_tags: decoded tags, one per word.

    Returns:
        ``(feat_ids_gold, feat_ids_output)``: two lists with one sub-list of
        feature IDs per word.
    """
    # Split the sentence's features into one sub-list per word.
    per_word_feats = []
    cursor = 0
    for _ in gold_tags:
        cursor, word_feats = perc.feats_for_word(cursor, feat_list)
        per_word_feats.append(word_feats)

    def ids_for(tags, pos, word_feats):
        # Pair every feature of the word at `pos` with that word's tag,
        # attaching the previous tag to the bigram feature when there is one.
        return [
            (
                '{0}:{1}'.format(feat, tags[pos - 1]) if (feat == 'B' and pos > 0) else feat,
                tags[pos],
            )
            for feat in word_feats
        ]

    feat_ids_gold = []
    feat_ids_output = []
    for pos, word_feats in enumerate(per_word_feats):
        feat_ids_gold.append(ids_for(gold_tags, pos, word_feats))
        feat_ids_output.append(ids_for(output_tags, pos, word_feats))
    return feat_ids_gold, feat_ids_output

In [10]:
# Train the chunker on the training files and save the learned weights.
# Progress messages go to stderr; the final confirmation goes to stdout.
tagset = perc.read_tagset("data/tagset.txt")
print("reading data ...", file=sys.stderr)
train_data = perc.read_labeled_data("data/train.txt.gz", "data/train.feats.gz", verbose=False)
# Alternate input files, kept disabled (presumably a smaller set for quick runs -- TODO confirm):
#train_data = perc.read_labeled_data("data/train.dev", "data/train.feats.dev", verbose=False)
print("done.", file=sys.stderr)
# The third argument is perc_train's `numepochs`: 10 training passes.
feat_vec = perc_train(train_data, tagset, 10)
perc.perc_write_to_file(feat_vec, "default.model")
print("wrote model to disk")

reading data ...
done.


Mistakes in epoch 0: 8510 out of 14275 sentences
Mistakes in epoch 1: 4842 out of 14275 sentences
Mistakes in epoch 2: 3059 out of 14275 sentences
Mistakes in epoch 3: 2061 out of 14275 sentences
Mistakes in epoch 4: 1575 out of 14275 sentences
Mistakes in epoch 5: 1255 out of 14275 sentences
Mistakes in epoch 6: 939 out of 14275 sentences
Mistakes in epoch 7: 796 out of 14275 sentences
Mistakes in epoch 8: 688 out of 14275 sentences
Mistakes in epoch 9: 578 out of 14275 sentences
wrote model to disk


In [11]:
%%capture --no-stderr output
# The cell magic above stores this cell's stdout in `output` (read by the
# scoring cell) and leaves stderr uncaptured, so the progress messages below
# remain visible.
print("reading test data ...", file=sys.stderr)
test_data = perc.read_labeled_data("data/dev.txt", "data/dev.feats", verbose=False)
print("done.", file=sys.stderr)
# Reload the weights from disk rather than reusing the in-memory `feat_vec`.
feat_vec = perc.perc_read_from_file("default.model")
# NOTE(review): presumably prints the tagged dev sentences to stdout, which is
# what ends up in `output` -- confirm in perc.py.
perc.perc_testall(feat_vec, test_data, tagset)

reading test data ...
done.


In [12]:
# Score the captured tagger output against the reference file.
# Relies on `output`, which is bound by the `%%capture ... output` cell above.
import score_chunks
boundary = "-X-" # something to use as boundary between sentences
outside = "O" # tag used to mark the outside of any chunk
conlleval = False # use conlleval (should be False for most use cases)
numfeats = 2 # number of columns to consider as features, typically "word POStag"
# str(output) turns the captured cell output into text for the reader;
# the second element of the returned pair is not used here.
(test, _) = score_chunks.readTestFile(str(output), boundary, outside, conlleval, numfeats)
with open("data/reference500.txt") as f:
    (reference, _) = score_chunks.readTestFile(f.read(), boundary, outside, conlleval, numfeats)
print("Score: %.2f" % score_chunks.corpus_fmeasure(reference, test, False))

processed 500 sentences with 10375 tokens and 5783 phrases; found phrases: 5792; correct phrases: 5333
             ADJP: precision:  66.04%; recall:  70.71%; F1:  68.29; found:    106; correct:     99
             ADVP: precision:  73.89%; recall:  74.26%; F1:  74.07; found:    203; correct:    202
            CONJP: precision:  75.00%; recall:  60.00%; F1:  66.67; found:      4; correct:      5
             INTJ: precision: 100.00%; recall: 100.00%; F1: 100.00; found:      1; correct:      1
               NP: precision:  93.43%; recall:  92.53%; F1:  92.98; found:   2997; correct:   3026
               PP: precision:  96.46%; recall:  98.12%; F1:  97.28; found:   1242; correct:   1221
              PRT: precision:  66.67%; recall:  63.64%; F1:  65.12; found:     21; correct:     22
             SBAR: precision:  81.31%; recall:  81.31%; F1:  81.31; found:    107; correct:    107
               VP: precision:  90.91%; recall:  91.82%; F1:  91.36; found:   1111; correct:   1100
accura