# Homework: Phrasal Chunking

This is your documentation for the chunker homework.

In [1]:
import perc
import default
import sys
from collections import defaultdict

feat_vec = {}
tagset = []
train_data = []

def perc_train(train_data, tagset, numepochs):
    feat_vec = defaultdict(int)
    # insert your code here
    # please limit the number of iterations of training to n iterations

    # Main loop
    for t in range(numepochs):
        mistakes = 0

        for j in train_data:
            # Get output chunk tags from Viterbi
            labeled_list = j[0]
            feat_list = j[1]

            gold_tags = [i.split()[2] for i in labeled_list]
            output_tags = perc.perc_test(feat_vec, labeled_list, feat_list, tagset, default_tag=tagset[0])

            # Update weight vector if the output is incorrect
            if output_tags != gold_tags:
                # Get feature IDs: from training data and from Viterbi output
                feat_ids_gold, feat_ids_output = get_feature_ids(feat_list, gold_tags, output_tags)

                for i in range(len(gold_tags)):
                    if output_tags[i] != gold_tags[i]:
                        for f in feat_ids_gold[i]:
                            feat_vec[f] = feat_vec[f] + 1
                        for f in feat_ids_output[i]:
                            feat_vec[f] = feat_vec[f] - 1

                mistakes += 1

        print('Mistakes in epoch {0}: {1} out of {2} sentences'.format(t, mistakes, len(train_data)))
    return feat_vec


# Helper function for perc_train. Get feature IDs from a piece of training data.
def get_feature_ids(feat_list, gold_tags, output_tags):
    feat_ids_gold = []
    feat_ids_output = []

    # Get list of list of features - each sublist corresponds to a word
    feat_index = 0
    feat_list_by_words = []
    for i in range(len(gold_tags)):
        (feat_index, feats) = perc.feats_for_word(feat_index, feat_list)
        feat_list_by_words.append(feats)

    # For each feature sublist, create a list of feature IDs ((feature, tag)).
    # One set based on gold tags, another based on argmax tags
    j = 0
    for sublist in feat_list_by_words:
        gold_sublist = []
        out_sublist = []
        for f in sublist:
            # Feature based on bigrams of output tags
            if f == 'B' and j > 0:
                curr_feat_g = '{0}:{1}'.format(f, gold_tags[j - 1])
                curr_feat_o = '{0}:{1}'.format(f, output_tags[j - 1])
            else:
                curr_feat_g = f
                curr_feat_o = f

            gold_sublist.append((curr_feat_g, gold_tags[j]))
            out_sublist.append((curr_feat_o, output_tags[j]))

        feat_ids_gold.append(gold_sublist)
        feat_ids_output.append(out_sublist)

        j += 1
    return feat_ids_gold, feat_ids_output


def to_ioe2(train_data):
    labeled_list = [t[0] for t in train_data]
    feat_list = [t[1] for t in train_data]

    new_labeled_list = []

    for sent in labeled_list:
        new_sent = []
        for i in range(len(sent)):
            row = sent[i].split()
            tag = row[2]
            new_row = sent[i]

            if i < len(sent) - 1:
                next_tag = sent[i + 1].split()[2]

            if tag[0] == 'B':
                new_row = ' '.join([row[0], row[1], 'I' + tag[1:]])
            if 0 < i < len(sent) - 1 and next_tag[1:] != tag[1:]:
                new_row = ' '.join([row[0], row[1], 'E' + tag[1:]])

            new_sent.append(new_row)

        new_labeled_list.append(new_sent)

    return [(new_labeled_list[i], feat_list[i]) for i in range(len(train_data))]


def to_ioe1(train_data):
    labeled_list = [t[0] for t in train_data]
    feat_list = [t[1] for t in train_data]
    new_labeled_list = []

    for sent in labeled_list:
        new_sent = []
        for i in range(len(sent)):
            row = sent[i].split()
            tag = row[2]
            new_row = sent[i]

            if i < len(sent) - 1:
                next_tag = sent[i + 1].split()[2]
            if i > 0:
                prev_tag = sent[i - 1].split()[2]

            if tag[0] == 'B':
                new_row = ' '.join([row[0], row[1], 'I' + tag[1:]])
            if 0 < i < len(sent) - 1 and next_tag[1:] != tag[1:] and prev_tag[1:] == tag[1:]:
                new_row = ' '.join([row[0], row[1], 'E' + tag[1:]])

            new_sent.append(new_row)

        new_labeled_list.append(new_sent)

    return [(new_labeled_list[i], feat_list[i]) for i in range(len(train_data))]


def to_oc(train_data):
    labeled_list = [t[0] for t in train_data]
    feat_list = [t[1] for t in train_data]
    new_labeled_list = []

    for sent in labeled_list:
        new_sent = []
        for i in range(len(sent)):
            row = sent[i].split()
            tag = row[2]
            new_row = sent[i]

            if i < len(sent) - 1:
                next_tag = sent[i + 1].split()[2]
            if i > 0:
                prev_tag = sent[i - 1].split()[2]

            if tag[0] == 'B' and next_tag[1:] != tag[1:]:
                new_row = ' '.join([row[0], row[1], 'S' + tag[1:]])
            if 0 < i < len(sent) - 1 and tag[0] == 'I' and prev_tag[1:] == tag[1:] and next_tag[1:] != tag[1:]:
                new_row = ' '.join([row[0], row[1], 'E' + tag[1:]])

            new_sent.append(new_row)
        new_labeled_list.append(new_sent)
    return [(new_labeled_list[i], feat_list[i]) for i in range(len(train_data))]

def get_tagset(tagset, suffixes):
    return [s+t[1:] for s in suffixes for t in tagset if t != 'O'] + ['O']

In [16]:
feat_vec = {}
tagset = []
train_data = []

tagset = perc.read_tagset("data/tagset.txt")
tagset_ioe = get_tagset(tagset, ['I', 'E'])
tagset_oc = get_tagset(tagset, ['B', 'E', 'S'])

print("reading data ...", file=sys.stderr)
#train_data = perc.read_labeled_data("data/train.txt.gz", "data/train.feats.gz", verbose=False)
#train_data = perc.read_labeled_data("data/train.dev", "data/train.feats.dev", verbose=False)
train_data = perc.read_labeled_data("data/dev250_fixed.txt", "data/dev250.feats", verbose=False)
train_data_ioe1 = to_ioe1(train_data)
train_data_ioe2 = to_ioe2(train_data)
train_data_oc = to_oc(train_data)

print("done.", file=sys.stderr)
feat_vec = perc_train(train_data, tagset, 10)
perc.perc_write_to_file(feat_vec, "default.model")
print("wrote model to disk")

reading data ...
done.


Mistakes in epoch 0: 243 out of 250 sentences
Mistakes in epoch 1: 237 out of 250 sentences
Mistakes in epoch 2: 229 out of 250 sentences
Mistakes in epoch 3: 226 out of 250 sentences
Mistakes in epoch 4: 231 out of 250 sentences
Mistakes in epoch 5: 229 out of 250 sentences
Mistakes in epoch 6: 229 out of 250 sentences
Mistakes in epoch 7: 230 out of 250 sentences
Mistakes in epoch 8: 228 out of 250 sentences
Mistakes in epoch 9: 227 out of 250 sentences
wrote model to disk


In [13]:
%%capture --no-stderr output
print("reading test data ...", file=sys.stderr)
test_data = perc.read_labeled_data("data/dev.txt", "data/dev.feats", verbose=False)
print("done.", file=sys.stderr)
feat_vec = perc.perc_read_from_file("default.model")
perc.perc_testall(feat_vec, test_data, tagset)

reading test data ...
done.


In [14]:
import score_chunks
boundary = "-X-" # something to use as boundary between sentences
outside = "O" # tag used to mark the outside of any chunk
conlleval = False # use conlleval (should be False for most use cases)
numfeats = 2 # number of columns to consider as features, typically "word POStag"
(test, _) = score_chunks.readTestFile(str(output), boundary, outside, conlleval, numfeats)
with open("data/reference500.txt") as f:
    (reference, _) = score_chunks.readTestFile(f.read(), boundary, outside, conlleval, numfeats)
print("Score: %.2f" % score_chunks.corpus_fmeasure(reference, test, False))

processed 500 sentences with 10375 tokens and 5783 phrases; found phrases: 5727; correct phrases: 5412
             ADJP: precision:  87.21%; recall:  75.76%; F1:  81.08; found:     86; correct:     99
             ADVP: precision:  85.33%; recall:  77.72%; F1:  81.35; found:    184; correct:    202
            CONJP: precision: 100.00%; recall:  40.00%; F1:  57.14; found:      2; correct:      5
             INTJ: precision:   0.00%; recall:   0.00%; F1:   0.00; found:      0; correct:      1
               NP: precision:  94.80%; recall:  94.68%; F1:  94.74; found:   3022; correct:   3026
               PP: precision:  95.58%; recall:  97.38%; F1:  96.47; found:   1244; correct:   1221
              PRT: precision:  75.00%; recall:  68.18%; F1:  71.43; found:     20; correct:     22
             SBAR: precision:  91.36%; recall:  69.16%; F1:  78.72; found:     81; correct:    107
               VP: precision:  95.13%; recall:  94.09%; F1:  94.61; found:   1088; correct:   1100
accura