## Verb-Particle Constructions
### Distinguishing VPCs from verb-preposition combinations

We use the dataset in [Tu and Roth (2012)](http://www.aclweb.org/anthology/S12-1010), which contains 1,348 sentences from the BNC. 65% of the sentences contain a VPC and 35% contain a verb-preposition combination. The dataset focuses on 23 different phrasal verbs that the authors define as the most confusing. They are derived from six of the most frequently used verbs: _take, make, have, get, do_ and _give_, and their combination with common prepositions or particles. 

In [None]:
import random
random.seed(133)

import os
import csv
import json
import codecs
import random

import numpy as np

from collections import Counter, defaultdict
from statsmodels.stats.inter_rater import fleiss_kappa

First, download the dataset:

In [None]:
# Fetch the dataset only when the data file is not already on disk.
# The lines starting with `!` are IPython shell escapes, not plain Python.
if not os.path.exists('vpc/pvcData.txt'):
    # Create the target directory, download and unpack the archive
    !mkdir -p vpc
    !wget http://cogcomp.org/software/tools/pvcDataSubmission.tar.gz
    !tar -zxvf pvcDataSubmission.tar.gz
    # Keep only the data file, then remove the extracted directory and the archive
    !mv dataSubmission/pvcData.txt vpc
    !rm -r dataSubmission
    !rm -r pvcDataSubmission.tar.gz

In [None]:
# Each line of the raw file is one tab-separated record with the fields:
# bnc_id, annotation confidence, label (true/false),
# stem of the phrasal verb, pvcIndex
with codecs.open('vpc/pvcData.txt', 'r', 'utf-8') as f_in:
    dataset = [raw_line.strip().split('\t') for raw_line in f_in]

# Keep only bnc_id, label, and stem. The confidence is not used, and
# pvcIndex can be dropped because we re-tokenize.
dataset = [
    (record_id, record_label, record_stem)
    for record_id, _confidence, record_label, record_stem, _pvc_index in dataset
]

print('Dataset size: {}'.format(len(dataset)))

Split the dataset into train/validation/test sets. The split is lexical by verb (no verb appears in more than one set), which makes the task more difficult.

In [None]:
def split_lexically(dataset, word_index=0):
    """
    Split the dataset to train, test, and validation, such that
    the word in word_index (0 = verb, 1 = preposition) doesn't
    repeat across sets.

    Words are assigned greedily from the least to the most frequent one:
    the test set is filled first, then the validation set (each until it
    holds at least a tenth of the examples), and the examples of all the
    remaining words form the train set.

    :param dataset: list of (bnc_id, label, span_text) tuples, where
        span_text holds the words of the span joined by '_' and label is
        'true' or 'false'
    :param word_index: position in span_text of the word to split by
    :return: the train, test, and validation lists of examples
    :raises AssertionError: if one label is more than 4 times as frequent
        as the other in the test set, or if a word appears in two sets
    :raises IndexError: if the words run out before the test and
        validation sets reach their required size
    """
    instances_per_w = defaultdict(list)
    for bnc_id, label, span_text in dataset:
        instances_per_w[span_text.split('_')[word_index]].append(
            (bnc_id, label, span_text))

    test, val = [], []
    train_size = 8 * len(dataset) // 10
    val_size = test_size = len(dataset) // 10

    # Least frequent words first. sorted() is stable, so words with the same
    # number of examples keep the order in which they were first seen.
    words = sorted(instances_per_w, key=lambda w: len(instances_per_w[w]))
    w_index = 0

    while len(test) < test_size:
        test += instances_per_w[words[w_index]]
        w_index += 1

    print('Test set size: {} (needed: {})'.format(len(test), test_size))

    while len(val) < val_size:
        val += instances_per_w[words[w_index]]
        w_index += 1

    print('Validation set size: {} (needed: {})'.format(len(val), val_size))

    train = [example for w in words[w_index:]
             for example in instances_per_w[w]]
    print('Train set size: {} (needed: {})'.format(len(train), train_size))

    # Check the label distribution in the test set. The ratios are compared
    # by multiplication so that a label with zero examples fails the check
    # (or passes it for an empty test set) instead of dividing by zero.
    ctr = Counter(label for (_, label, _) in test)
    assert ctr['false'] <= 4 * ctr['true'] and ctr['true'] <= 4 * ctr['false']

    # Make sure the split is lexical among the words in word_index
    test_words = {span_text.split('_')[word_index] for _, _, span_text in test}
    train_words = {span_text.split('_')[word_index] for _, _, span_text in train}
    val_words = {span_text.split('_')[word_index] for _, _, span_text in val}
    assert not (train_words & val_words)
    assert not (train_words & test_words)
    assert not (test_words & val_words)

    print(f'Sizes: train = {len(train)}, test = {len(test)}, validation = {len(val)}')
    return train, test, val
    

data_dir = '../diagnostic_classifiers/data/vpc_classification'
# makedirs (unlike mkdir) also creates missing parent directories of this
# nested path, and exist_ok makes the call a no-op when it already exists.
os.makedirs(data_dir, exist_ok=True)

train, test, val = split_lexically(dataset)

# Write one JSON-lines file per split, holding the sentence ID, the span
# (with '_' replaced by a space) and the label of each example.
for s, filename in zip([train, test, val], ['train', 'test', 'val']):
    with codecs.open(os.path.join(data_dir, 'ids_{}.jsonl'.format(filename)), 'w', 'utf-8') as f_out:
        for bnc_id, label, span_text in s:
            example = {'bnc_id': bnc_id, 'span_text': span_text.replace('_', ' '), 'label': label}
            f_out.write(json.dumps(example) + '\n')

Sanity check: majority baseline is not too strong.

In [None]:
def get_majority_label_per_word(train_set, word_index=0):
    """
    Map every word at position word_index of the training spans to the
    label it most frequently appears with. Ties go to the label that was
    seen first for that word.
    :param train_set: (id, label, span_text) triples; span_text holds the
        words of the span joined by '_'
    :word_index: 0 for verb, 1 for preposition
    :return: dictionary from word to its majority label
    """
    # word -> counts of the labels seen with it
    label_counts = defaultdict(Counter)
    for _, tag, span in train_set:
        label_counts[span.split('_')[word_index]][tag] += 1

    majority = {}
    for word, counts in label_counts.items():
        (top_label, _), = counts.most_common(1)
        majority[word] = top_label

    return majority


def accuracy_score(y_true, y_pred):
    """
    Return the fraction of positions at which y_pred equals y_true.
    Defined here because nothing visible in this notebook imports or
    defines `accuracy_score`, so the calls below would raise a NameError.
    """
    return sum(t == p for t, p in zip(y_true, y_pred)) / len(y_true)


# Baseline 1: always predict the most frequent label of the train set
test_labels = [label for _, label, _ in test]
overall_majority_label = Counter([label for _, label, _ in train]).most_common(1)[0][0]
test_predictions_all = [overall_majority_label] * len(test)
print('Majority overall: {:.2f}%'.format(
    100.0 * accuracy_score(test_labels, test_predictions_all)))

per_verb_majority_label = get_majority_label_per_word(train, word_index=0)
per_prep_majority_label = get_majority_label_per_word(train, word_index=1)

test_verbs = [span_text.split('_')[0] for _, _, span_text in test]
test_preps = [span_text.split('_')[1] for _, _, span_text in test]

# Baseline 2: predict the majority label of the verb in the train set,
# backing off to the overall majority label for verbs not seen in training
test_predictions_verb = [per_verb_majority_label.get(v, overall_majority_label) 
                         for v in test_verbs]
print('Majority by verb: {:.2f}%'.format(
    100.0 * accuracy_score(test_labels, test_predictions_verb)))

# Baseline 3: the same, by preposition
test_predictions_prep = [per_prep_majority_label.get(p, overall_majority_label) 
                         for p in test_preps]
print('Majority by preposition: {:.2f}%'.format(
    100.0 * accuracy_score(test_labels, test_predictions_prep)))

Notice that the dataset is given with the sentence IDs and without the sentences themselves, to comply with the BNC corpus license. To get the sentences, follow the instructions in the repository README file.

We re-annotated a sample from the test set to compute human performance. 
We assume the annotation results are found under `preprocessing/annotation/vpc/batch_results`.

In [None]:
def load_batch_results(result_file, remove_bad_workers=False):
    """
    Load the batch results from the CSV
    :param result_file: the batch results CSV file from MTurk
    :param remove_bad_workers: if True, discard the answers of workers whose
        percent of answers that differ from the original label is above 35
    :return: a tuple of
        workers: IDs of the workers with at least one recorded answer
            (workers removed as bad are still included),
        answer_by_worker: worker ID -> {sentence ID -> answer},
        answer_by_hit: sentence ID -> {worker ID -> answer},
        incorrect: IDs of the sentences marked as incorrect by at least
            one worker (their answers are excluded from both dictionaries),
        span_texts: sentence ID -> the first and last span words joined
            by a space,
        sent_id_orig_label: sentence ID -> original label, for every
            sentence in the file
    """
    answer_by_worker, answer_by_hit = defaultdict(dict), defaultdict(dict)
    workers = set()
    incorrect = set()
    span_texts = {}
    workers_wrong_answers = defaultdict(int)
    sent_id_orig_label = {}

    # newline='' is how the csv module expects its input files to be opened,
    # so that line breaks are left for the csv reader to interpret.
    with open(result_file, 'r', encoding='utf-8', newline='') as f_in:
        for row in csv.DictReader(f_in):
            worker_id = row['WorkerId']

            # Input fields
            orig_label = row['Input.original_label']
            sent_id = row['Input.sent_id']
            sent_id_orig_label[sent_id] = orig_label
            v, p = row['Input.w_first'], row['Input.w_last']

            # Answer fields
            answer = row['Answer.answer.vpc'].lower()

            # A row that marks the sentence as incorrect carries no answer
            if row['Answer.answer.incorrect'].lower() == 'true':
                incorrect.add(sent_id)
                continue

            if orig_label != answer:
                workers_wrong_answers[worker_id] += 1

            span_texts[sent_id] = ' '.join((v, p))
            workers.add(worker_id)
            answer_by_worker[worker_id][sent_id] = answer
            answer_by_hit[sent_id][worker_id] = answer

    # Remove HITs that were annotated as incorrect by at least one worker
    answer_by_hit = {sent_id: answers_by_sent_id
                     for sent_id, answers_by_sent_id in answer_by_hit.items()
                     if sent_id not in incorrect}

    answer_by_worker = {
        worker_id: {sent_id: answer
                    for sent_id, answer in curr_answers.items()
                    if sent_id not in incorrect}
        for worker_id, curr_answers in answer_by_worker.items()}

    num_answers = sum(len(answers_by_worker_id)
                      for answers_by_worker_id in answer_by_worker.values())

    if remove_bad_workers:
        # Percent of disagreements with the original label per worker.
        # Workers left without any answer (all their sentences were marked
        # as incorrect) are skipped: they would cause a division by zero and
        # have nothing left to remove.
        # NOTE(review): the disagreement count also covers sentences that
        # were later dropped as incorrect, while the denominator counts only
        # the retained ones - confirm this is intended.
        wrong_answers_percent = {
            worker_id: n * 100.0 / len(answer_by_worker[worker_id])
            for worker_id, n in workers_wrong_answers.items()
            if answer_by_worker[worker_id]}

        # Remove bad workers: workers that disagreed with many of the previous annotation
        bad_workers = {worker_id
                       for worker_id, per in wrong_answers_percent.items() if per > 35}
        print(f'Removing {len(bad_workers)} bad workers:\n{bad_workers}')

        answer_by_worker = {worker_id: answers_by_worker_id
                            for worker_id, answers_by_worker_id in answer_by_worker.items()
                            if worker_id not in bad_workers}

        answer_by_hit = {
            sent_id: {worker_id: answer
                      for worker_id, answer in answers_by_sent_id.items()
                      if worker_id not in bad_workers}
            for sent_id, answers_by_sent_id in answer_by_hit.items()}

        num_answers_after_filtering = sum(
            len(answers_by_worker_id)
            for answers_by_worker_id in answer_by_worker.values())
        print('Final: {} answers, removed {}.'.format(
            num_answers_after_filtering,
            num_answers - num_answers_after_filtering))

    return workers, answer_by_worker, answer_by_hit, incorrect, span_texts, sent_id_orig_label


results_file = 'vpc/batch_results.csv'
workers, answer_by_worker, answer_by_hit, incorrect, span_texts, sent_id_orig_label = load_batch_results(
    results_file, remove_bad_workers=True)
print(f'Loaded results from {results_file}')
# Report the removed sentences relative to the sentences in the batch file
# (sent_id_orig_label has one entry per annotated sentence), not relative to
# the full dataset, most of which was never sent for annotation.
print(f'Removed {len(incorrect)}/{len(sent_id_orig_label)} incorrect instances.')

Compute Fleiss' Kappa and the percentage of agreement between the workers.

In [None]:
def compute_agreement(answer_by_hit, num_annotators=3):
    """
    Compute workers' agreement (Fleiss Kappa and percent)
    :param answer_by_hit: sentence ID -> {worker ID -> answer}; an answer
        counts as positive only if it equals 'true'
    :param num_annotators: only sentences with exactly this many answers
        are taken into account
    :return: the Fleiss Kappa and the percent of agreement, where 100 means
        that all the annotators of every sentence gave the same answer
    :raises ValueError: if num_annotators is smaller than 2, or if no
        sentence has exactly num_annotators answers
    """
    if num_annotators < 2:
        raise ValueError('Agreement requires at least 2 annotators')

    data = []
    percent = 0

    for worker_answers in answer_by_hit.values():
        # [number of answers other than 'true', number of 'true' answers]
        curr = [0, 0]

        for answer in worker_answers.values():
            curr[1 if answer == 'true' else 0] += 1

        if sum(curr) == num_annotators:
            data.append(curr)
            # Every annotator beyond the first one that chose a label adds one
            # point: num_annotators - 1 points for a unanimous sentence.
            percent += sum(max(0, a - 1) for a in curr)

    if not data:
        raise ValueError(
            'No sentence has exactly {} answers'.format(num_annotators))

    # Each row of data holds the per-category counts of one sentence
    kappa = fleiss_kappa(data)
    percent = percent * 100.0 / (len(data) * (num_annotators - 1))
    return kappa, percent


# Report the agreement between the workers over the loaded answers
kappa, percent = compute_agreement(answer_by_hit=answer_by_hit)
print(
    'Fleiss Kappa={:.3f}, Percent={:.2f}%'.format(kappa, percent)
)

Compute the workers' majority label, which we will use to estimate human performance.

In [None]:
def compute_majority(results):
    """
    Build one example per sentence whose most frequent answer was given by
    at least two workers. Sentences without answers, or without such an
    answer, are left out. The span text is looked up in the module-level
    `span_texts` dictionary.
    :param results: sentence ID to worker answers dictionary
    :return: list of dictionaries with the keys 'sent_id', 'span_text'
        and 'label' (the most frequent answer)
    """
    majority_examples = []

    for sent_id, sent_results in results.items():
        votes = Counter(sent_results.values())
        if not votes:
            continue

        top_label, top_count = votes.most_common(1)[0]
        if top_count < 2:
            continue

        majority_examples.append({'sent_id': sent_id,
                                  'span_text': span_texts[sent_id],
                                  'label': top_label})

    return majority_examples

# Majority answer per sentence, computed from the loaded worker answers
human_annotations = compute_majority(results=answer_by_hit)

Compute the human performance on the test set.

In [None]:
# Majority label of the workers, keyed by sentence ID
gold_by_sent_id = dict((entry['sent_id'], entry['label'])
                       for entry in human_annotations)

# Test examples (ID and dataset label) that have a workers' majority label
test_annotations = [(example_id, example_label)
                    for example_id, example_label, _ in test
                    if example_id in gold_by_sent_id]

# Percent of those examples on which the majority label equals the dataset label
human_accuracy = sum(
    int(example_label == gold_by_sent_id[example_id])
    for example_id, example_label in test_annotations
) * 100.0 / len(test_annotations)

print('Number of examples: {}, accuracy: {:.3f}'.format(len(test_annotations), human_accuracy))