# Homework: Phrasal Chunking

This is your documentation for the chunker homework.

In [1]:
import perc
import default
import sys
from collections import defaultdict

feat_vec = {}
tagset = []
train_data = []

In [12]:
tagset = perc.read_tagset("data/tagset.txt")
print("reading data ...", file=sys.stderr)
train_data = perc.read_labeled_data("data/train.txt.gz", "data/train.feats.gz", verbose=False)
# train_data = perc.read_labeled_data("data/train.dev", "data/train.feats.dev", verbose=False)
print("done.", file=sys.stderr)

reading data ...
done.


In [13]:
# Helper function for perc_train. Get feature IDs from a piece of training data.
def get_feature_ids(feat_list, gold_tags, output_tags):
    feat_ids_gold = []
    feat_ids_output = []

    # Get list of list of features - each sublist corresponds to a word
    feat_index = 0
    feat_list_by_words = []
    for i in range(len(gold_tags)):
        (feat_index, feats) = perc.feats_for_word(feat_index, feat_list)
        feat_list_by_words.append(feats)

    # For each feature sublist, create a list of feature IDs ((feature, tag)).
    # One set based on gold tags, another based on argmax tags
    j = 0
    for sublist in feat_list_by_words:
        gold_sublist = []
        out_sublist = []
        for f in sublist:
            # Feature based on bigrams of output tags
            if f == 'B' and j > 0:
                curr_feat_g = '{0}:{1}'.format(f, gold_tags[j - 1])
                curr_feat_o = '{0}:{1}'.format(f, output_tags[j - 1])
            else:
                curr_feat_g = f
                curr_feat_o = f

            gold_sublist.append((curr_feat_g, gold_tags[j]))
            out_sublist.append((curr_feat_o, output_tags[j]))

        feat_ids_gold.append(gold_sublist)
        feat_ids_output.append(out_sublist)

        j += 1
    return feat_ids_gold, feat_ids_output

In [27]:
def perc_train(train_data, tagset, numepochs):
    feat_vec = defaultdict(int)
    sigma_vec = defaultdict(int)
    gamma_vec = defaultdict(int)
    tau_vec = defaultdict(int)
    T = numepochs
    m = len(train_data)

    # Main loop
    for t in range(T):
        print('EPOCH:',t+1)
        mistakes = 0
        print('train_data:',end=' ')
        
        for i in range(m):
            if i%1000 == 0:
                print(i,end=' ')

            # Get output chunk tags from Viterbi
            labeled_list = train_data[i][0]
            feat_list = train_data[i][1]

            gold_tags = [ll.split()[2] for ll in labeled_list]
            output_tags = perc.perc_test(feat_vec, labeled_list, feat_list, tagset, default_tag=tagset[0])
            
            if t != T-1 & i != m-1:
                
                # Update weight vector if the output is incorrect
                if output_tags != gold_tags:

                    # Get feature IDs: from training data and from Viterbi output
                    feat_ids_gold, feat_ids_output = get_feature_ids(feat_list, gold_tags, output_tags)

                    for k in range(len(gold_tags)):
                        for f in feat_ids_gold[k]:
                            if f in tau_vec.keys():
                                sigma_vec[f] = sigma_vec[f] + feat_vec[f] * (t * m + i - tau_vec[f][1] * m - tau_vec[f][0])
                            
                        for f in feat_ids_output[k]:
                            if f in tau_vec.keys():
                                sigma_vec[f] = sigma_vec[f] + feat_vec[f] * (t * m + i - tau_vec[f][1] * m - tau_vec[f][0])

                        if output_tags[k] != gold_tags[k]:
                            for f in feat_ids_gold[k]:
                                feat_vec[f] = feat_vec[f] + 1
                                sigma_vec[f] = sigma_vec[f] + 1
                            for f in feat_ids_output[k]:
                                feat_vec[f] = feat_vec[f] - 1
                                sigma_vec[f] = sigma_vec[f] - 1
                        
                        # record the location where the dimension tag is updated
                        for f in feat_ids_gold[k]:
                            tau_vec[f] = [i, t]
                        for f in feat_ids_output[k]:
                            tau_vec[f] = [i, t]

                    mistakes += 1
            else:
               # to deal with the last sentence in the last iteration
               # Get feature IDs: from training data and from Viterbi output
                feat_ids_gold, feat_ids_output = get_feature_ids(feat_list, gold_tags, output_tags)

                for f in tau_vec.keys():
                    sigma_vec[f] = sigma_vec[f] + feat_vec[f] * (T * m + m - tau_vec[f][1] * m - tau_vec[f][0])

                for k in range(len(gold_tags)):
                    if output_tags[k] != gold_tags[k]:
                        for g in feat_ids_gold[k]:
                            feat_vec[g] = feat_vec[g] + 1
                            sigma_vec[g] = sigma_vec[g] + 1
                        for g in feat_ids_output[k]:
                            feat_vec[g] = feat_vec[g] - 1
                            sigma_vec[g] = sigma_vec[g] - 1

        print('\nMistakes in epoch {0}: {1} out of {2} sentences'.format(t+1, mistakes, len(train_data)))
    
    for key in sigma_vec.keys():
        sigma_vec[key] = sigma_vec[key]/(m*T)
        
    return sigma_vec

In [28]:
%%time
feat_vec = perc_train(train_data, tagset, 10)

EPOCH: 0
train_data: 0 1000 2000 3000 4000 5000 6000 7000 8000 
Mistakes in epoch 0: 4256 out of 8936 sentences
EPOCH: 1
train_data: 0 1000 2000 3000 4000 5000 6000 7000 8000 
Mistakes in epoch 1: 2906 out of 8936 sentences
EPOCH: 2
train_data: 0 1000 2000 3000 4000 5000 6000 7000 8000 
Mistakes in epoch 2: 2920 out of 8936 sentences
EPOCH: 3
train_data: 0 1000 2000 3000 4000 5000 6000 7000 8000 
Mistakes in epoch 3: 2265 out of 8936 sentences
EPOCH: 4
train_data: 0 1000 2000 3000 4000 5000 6000 7000 8000 
Mistakes in epoch 4: 1755 out of 8936 sentences
EPOCH: 5
train_data: 0 1000 2000 3000 4000 5000 6000 7000 8000 
Mistakes in epoch 5: 1294 out of 8936 sentences
EPOCH: 6
train_data: 0 1000 2000 3000 4000 5000 6000 7000 8000 
Mistakes in epoch 6: 1098 out of 8936 sentences
EPOCH: 7
train_data: 0 1000 2000 3000 4000 5000 6000 7000 8000 
Mistakes in epoch 7: 971 out of 8936 sentences
EPOCH: 8
train_data: 0 1000 2000 3000 4000 5000 6000 7000 8000 
Mistakes in epoch 8: 627 out of 8936 sent

In [29]:
perc.perc_write_to_file(feat_vec, "default.model")
print("wrote model to disk")

wrote model to disk


In [30]:
%%capture --no-stderr output
print("reading test data ...", file=sys.stderr)
test_data = perc.read_labeled_data("data/dev.txt", "data/dev.feats", verbose=False)
print("done.", file=sys.stderr)
feat_vec = perc.perc_read_from_file("default.model")
perc.perc_testall(feat_vec, test_data, tagset)

reading test data ...
done.


In [31]:
import score_chunks
boundary = "-X-" # something to use as boundary between sentences
outside = "O" # tag used to mark the outside of any chunk
conlleval = False # use conlleval (should be False for most use cases)
numfeats = 2 # number of columns to consider as features, typically "word POStag"
(test, _) = score_chunks.readTestFile(str(output), boundary, outside, conlleval, numfeats)
with open("data/reference500.txt") as f:
    (reference, _) = score_chunks.readTestFile(f.read(), boundary, outside, conlleval, numfeats)
print("Score: %.2f" % score_chunks.corpus_fmeasure(reference, test, False))

processed 500 sentences with 10375 tokens and 5783 phrases; found phrases: 5812; correct phrases: 5422
             ADJP: precision:  75.00%; recall:  75.76%; F1:  75.38; found:    100; correct:     99
             ADVP: precision:  77.18%; recall:  78.71%; F1:  77.94; found:    206; correct:    202
            CONJP: precision:  75.00%; recall:  60.00%; F1:  66.67; found:      4; correct:      5
             INTJ: precision:   0.00%; recall:   0.00%; F1:   0.00; found:      0; correct:      1
               NP: precision:  94.10%; recall:  94.35%; F1:  94.22; found:   3034; correct:   3026
               PP: precision:  96.85%; recall:  98.20%; F1:  97.52; found:   1238; correct:   1221
              PRT: precision:  80.00%; recall:  72.73%; F1:  76.19; found:     20; correct:     22
             SBAR: precision:  85.29%; recall:  81.31%; F1:  83.25; found:    102; correct:    107
               VP: precision:  92.78%; recall:  93.45%; F1:  93.12; found:   1108; correct:   1100
accura