# Homework: Phrasal Chunking

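This notebook documents the chunker homework: it trains a perceptron phrasal chunker with hand-tuned per-feature update sizes, writes the model to `default.model`, and scores the result on the dev set.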

In [2]:
import perc
import default
import sys
from collections import defaultdict

# Module-level placeholders. All three names are rebound by the training cell
# further down (tagset and train_data from files, feat_vec from perc_train).
# Note that perc_train builds its own local feat_vec and does not use this one.
feat_vec = {}
tagset = []
train_data = []

def perc_train(train_data, tagset, numepochs):
    """Train perceptron feature weights for the chunker.

    For each epoch, every training sentence is decoded with
    ``perc.perc_test`` using the current weights. When the decoded tag
    sequence differs from the gold one, the weights are adjusted at each
    mismatched position: every gold feature ID is increased and the
    matching output feature ID is decreased, by the amounts returned from
    ``get_update``.

    Args:
        train_data: iterable of sentences; element ``[0]`` is the list of
            labeled lines (the gold tag is the third whitespace-separated
            field of each line) and element ``[1]`` is the feature list
            passed on to the decoder.
        tagset: sequence of tags; ``tagset[0]`` is used as the decoder's
            default tag.
        numepochs: number of passes over ``train_data``.

    Returns:
        A ``defaultdict(int)`` mapping ``(feature, tag)`` IDs to weights.
    """
    weights = defaultdict(int)

    for epoch in range(numepochs):
        wrong_sentences = 0

        for sentence in train_data:
            labeled_list = sentence[0]
            feat_list = sentence[1]

            gold_tags = [line.split()[2] for line in labeled_list]
            # Decode with the current weights.
            output_tags = perc.perc_test(weights, labeled_list, feat_list, tagset, default_tag=tagset[0])

            # A perfectly tagged sentence needs no update.
            if output_tags == gold_tags:
                continue

            # Feature IDs under the gold tagging and under the decoder's output.
            gold_ids, output_ids = get_feature_ids(feat_list, gold_tags, output_tags)

            for pos, gold_tag in enumerate(gold_tags):
                predicted_tag = output_tags[pos]
                # Only positions where the tags disagree are updated.
                if predicted_tag == gold_tag:
                    continue

                for k, gold_id in enumerate(gold_ids[pos]):
                    output_id = output_ids[pos][k]
                    delta = get_update(gold_tag, predicted_tag, gold_id, output_id)

                    # Reward the gold feature, penalise the predicted one.
                    weights[gold_id] += delta[0]
                    weights[output_id] -= delta[1]

            wrong_sentences += 1

        print('Mistakes in epoch {0}: {1} out of {2} sentences'.format(epoch, wrong_sentences, len(train_data)))

    return weights


def get_update(gold_tag, output_tag, feat_id_gold, feat_id_output):
    """Return the (reward, penalty) step sizes for one perceptron update.

    The first element is added to the gold feature's weight and the second
    is subtracted from the output feature's weight by the caller. The
    decision depends only on ``gold_tag`` and on the feature string of
    ``feat_id_gold``; ``output_tag`` and ``feat_id_output`` are accepted
    for a stable call signature but are not consulted.

    Rules, in order of precedence (first match wins):
        1. feature starts with ``'U02'`` or ``'B'``              -> (4, 4)
        2. gold tag contains ``'CONJP'`` and the feature prefix
           is ``'U11'`` or ``'U12'``                            -> (6, 6)
        3. feature prefix is ``'U10'``, ``'U12'`` or ``'U13'``   -> (4, 4)
        4. anything else                                        -> (1, 1)

    Args:
        gold_tag: gold chunk tag at the mismatched position.
        output_tag: decoder's tag at that position (unused).
        feat_id_gold: ``(feature, tag)`` pair under the gold tagging.
        feat_id_output: ``(feature, tag)`` pair under the output tagging
            (unused).

    Returns:
        A 2-tuple of ints ``(reward, penalty)``.
    """
    feature = feat_id_gold[0]
    prefix = feature[:3]

    # The original author's comments label U02 as the current-word feature and
    # B as the chunk-tag bigram feature. startswith() also tolerates an empty
    # feature string, which would previously have raised IndexError on
    # feature[0].
    if prefix == 'U02' or feature.startswith('B'):
        return (4, 4)

    # Larger step for features around conjunction chunks. 'U02' used to be
    # listed here too but could never reach this point (rule 1 catches it).
    if 'CONJP' in gold_tag and prefix in ('U11', 'U12'):
        return (6, 6)

    # Surrounding-POS features, per the original author's comment.
    if prefix in ('U10', 'U12', 'U13'):
        return (4, 4)

    # NOTE(review): two ADJP-specific rules (ADJP + 'B' -> (6, 6) and
    # ADJP + 'U13' -> (4, 4)) used to sit below the generic 'B' and
    # surrounding-POS checks, so they were unreachable and have been removed
    # without changing any result. If they were meant to take effect, they
    # must be placed *above* rules 1 and 3.
    return (1, 1)

def get_feature_ids(feat_list, gold_tags, output_tags):
    """Helper for perc_train: build feature IDs for one training sentence.

    The flat ``feat_list`` is split into one feature sublist per word by
    calling ``perc.feats_for_word`` once for every gold tag, threading the
    returned index into the next call. Each feature is then paired with
    the word's tag to form a ``(feature, tag)`` ID, once using the gold
    tags and once using the decoder's output tags.

    The feature ``'B'`` is expanded to ``'B:<previous tag>'`` for every
    word except the first, using the previous gold tag on the gold side
    and the previous output tag on the output side.
    NOTE(review): for the first word the bare ``'B'`` key is kept; confirm
    that the decoder looks up the same key for sentence-initial words.

    Args:
        feat_list: feature list for the sentence, as consumed by
            ``perc.feats_for_word``.
        gold_tags: gold tag for each word.
        output_tags: decoder's tag for each word.

    Returns:
        ``(feat_ids_gold, feat_ids_output)``: two lists with one sublist
        of ``(feature, tag)`` tuples per word.
    """
    # One feature sublist per word, in sentence order.
    per_word_feats = []
    cursor = 0
    for _ in gold_tags:
        cursor, word_feats = perc.feats_for_word(cursor, feat_list)
        per_word_feats.append(word_feats)

    feat_ids_gold = []
    feat_ids_output = []
    for pos, word_feats in enumerate(per_word_feats):
        gold_ids = []
        output_ids = []
        for feat in word_feats:
            if feat == 'B' and pos > 0:
                # Tag-bigram feature: qualify it with the preceding tag.
                gold_feat = '{0}:{1}'.format(feat, gold_tags[pos - 1])
                output_feat = '{0}:{1}'.format(feat, output_tags[pos - 1])
            else:
                gold_feat = output_feat = feat

            gold_ids.append((gold_feat, gold_tags[pos]))
            output_ids.append((output_feat, output_tags[pos]))

        feat_ids_gold.append(gold_ids)
        feat_ids_output.append(output_ids)

    return feat_ids_gold, feat_ids_output

In [3]:
# Read the tagset and the labeled training data with its feature file.
tagset = perc.read_tagset("data/tagset.txt")
print("reading data ...", file=sys.stderr)
train_data = perc.read_labeled_data("data/train.txt.gz", "data/train.feats.gz", verbose=False)
# Disabled alternative input files (train.dev / train.feats.dev):
#train_data = perc.read_labeled_data("data/train.dev", "data/train.feats.dev", verbose=False)
print("done.", file=sys.stderr)
# Train for 10 epochs, then persist the weights to default.model; the
# evaluation cell below reads the model back from that file.
feat_vec = perc_train(train_data, tagset, 10)
perc.perc_write_to_file(feat_vec, "default.model")
print("wrote model to disk")

reading data ...
done.


Mistakes in epoch 0: 5672 out of 8936 sentences
Mistakes in epoch 1: 4289 out of 8936 sentences
Mistakes in epoch 2: 3545 out of 8936 sentences
Mistakes in epoch 3: 3004 out of 8936 sentences
Mistakes in epoch 4: 2565 out of 8936 sentences
Mistakes in epoch 5: 2174 out of 8936 sentences
Mistakes in epoch 6: 1845 out of 8936 sentences
Mistakes in epoch 7: 1571 out of 8936 sentences
Mistakes in epoch 8: 1455 out of 8936 sentences
Mistakes in epoch 9: 1242 out of 8936 sentences
wrote model to disk


In [4]:
%%capture --no-stderr output
# Progress messages go to stderr so they stay visible: the %%capture magic
# at the top of this cell is invoked with --no-stderr and stores the rest of
# the cell's output in `output`.
print("reading test data ...", file=sys.stderr)
test_data = perc.read_labeled_data("data/dev.txt", "data/dev.feats", verbose=False)
print("done.", file=sys.stderr)
# Reload the weights from disk rather than reusing the in-memory feat_vec,
# so this cell evaluates exactly what was written to default.model.
feat_vec = perc.perc_read_from_file("default.model")
# Presumably prints the tagged dev sentences to stdout (TODO confirm in perc);
# the captured text is what the scoring cell parses via str(output).
perc.perc_testall(feat_vec, test_data, tagset)

reading test data ...
done.


In [5]:
# Score the captured chunker output against the reference file.
import score_chunks
boundary = "-X-" # something to use as boundary between sentences
outside = "O" # tag used to mark the outside of any chunk
conlleval = False # use conlleval (should be False for most use cases)
numfeats = 2 # number of columns to consider as features, typically "word POStag"
# `output` is the object filled by the %%capture magic in the decoding cell;
# str(output) yields the captured text. The second element returned by
# readTestFile is not needed here and is discarded.
(test, _) = score_chunks.readTestFile(str(output), boundary, outside, conlleval, numfeats)
with open("data/reference500.txt") as f:
    (reference, _) = score_chunks.readTestFile(f.read(), boundary, outside, conlleval, numfeats)
# NOTE(review): the meaning of the third argument (False) is not visible in
# this notebook; check score_chunks.corpus_fmeasure before changing it.
print("Score: %.2f" % score_chunks.corpus_fmeasure(reference, test, False))

processed 500 sentences with 10375 tokens and 5783 phrases; found phrases: 5619; correct phrases: 5174
             ADJP: precision:  71.57%; recall:  73.74%; F1:  72.64; found:    102; correct:     99
             ADVP: precision:  78.24%; recall:  74.75%; F1:  76.46; found:    193; correct:    202
            CONJP: precision:   0.00%; recall:   0.00%; F1:   0.00; found:      2; correct:      5
             INTJ: precision:   0.00%; recall:   0.00%; F1:   0.00; found:      0; correct:      1
               NP: precision:  92.51%; recall:  89.33%; F1:  90.89; found:   2922; correct:   3026
               PP: precision:  96.59%; recall:  95.25%; F1:  95.92; found:   1204; correct:   1221
              PRT: precision:  80.00%; recall:  36.36%; F1:  50.00; found:     10; correct:     22
             SBAR: precision:  66.67%; recall:  80.37%; F1:  72.88; found:    129; correct:    107
               VP: precision:  93.66%; recall:  90.00%; F1:  91.79; found:   1057; correct:   1100
accura