In [3]:
import numpy as np
import os
import json
import collections as col
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction import DictVectorizer
from scipy.sparse import csr_matrix
from sklearn.linear_model import Perceptron
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [4]:
TRAINING_PATH = '/home/jack/Downloads/snli_1.0/snli_1.0_train.jsonl'
DEV_PATH = '/home/jack/Downloads/snli_1.0/snli_1.0_dev.jsonl'
TEST_PATH = '/home/jack/Downloads/removable/USB_DISK/snli_1.0/snli_1.0_test.jsonl'

TRAINING_VECTORS_PATH = '/home/jack/NLP/POS_csr.npz'
DEV_VECTORS_PATH = '/home/jack/NLP/POS_dev_csr.npz'
TEST_VECTORS_PATH = ''

SAVE_TO_FILE = False
LOAD_FROM_FILE = False

STOP = stopwords.words('english')

In [5]:
def save_sparse_vectors(file_name, array, labels):
    np.savez(file_name, data = array.data, indices = array.indices, indptr = array.indptr, shape = array.shape, label_data = labels)

def load_sparse_vectors(file_name):
    loader = np.load(file_name)
    return (csr_matrix((loader['data'], loader['indices'], loader['indptr']), shape = loader['shape']), loader['label_data'])

In [6]:
def pos_stoplist(pos):
    return pos == 'NN' or pos == 'JJ' or 'VB' in pos

def pos_strip(sentence_parse, stoplist_function):
    m = re.finditer('\((\w+) (\w+)\)', sentence_parse)
    tags = [(match.group(2), match.group(1)) for match in m if stoplist_function(match.group(1))]
    return tags

def extract_pair(line):
    json_pair = json.loads(line)
    pair = {'sentence1': json_pair['sentence1'],
            'sentence1_pos': pos_strip(json_pair['sentence1_parse'], pos_stoplist),
            'sentence2': json_pair['sentence2'],
            'sentence2_pos': pos_strip(json_pair['sentence2_parse'], pos_stoplist),
            'label': json_pair['gold_label']}
    return pair

In [7]:
def cross_unigram_counter(pair):
    cross_unigrams = []
    for word1 in pair['sentence1']:
        for word2 in pair['sentence2']:
            cross_unigrams.append((word1, word2))

    return col.Counter(cross_unigrams)

def cross_unigram_pos_counter(pair):
    cross_unigrams = []
    for word1 in pair['sentence1_pos']:
        for word2 in pair['sentence2_pos']:
            cross_unigrams.append((word1, word2))

    return col.Counter(cross_unigrams)

def calculate_vectors(training_path, testing_path, feature_extractor):
    print 'Calculating Training Vectors'
    pairs = []
    with open(training_path, 'rb') as f:
        count = 50000
        for line in f:
            pairs.append(extract_pair(line))
            if count == 0:
                break
            else:
                count -= 1

    print 'Calculating Testing Vectors'
    dev_pairs = []
    with open(testing_path, 'rb') as f:
        count = 50000
        for line in f:
            dev_pairs.append(extract_pair(line))
            if count == 0:
                break
            else:
                count -= 1


    print 'Calculating Cross Unigrams'
    labels = [pair['label'] for pair in pairs]
    dev_labels = [pair['label'] for pair in dev_pairs]
    
    counters = [feature_extractor(pair) for pair in pairs]
    dev_counters = [feature_extractor(pair) for pair in dev_pairs]
 
    print 'Training Vectorizer'
    vectorizer = DictVectorizer()
    vectorizer.fit(counters)

    print 'Vectorizing unigrams'
    vectors = vectorizer.transform(counters)
    dev_vectors = vectorizer.transform(dev_counters)

    if SAVE_TO_FILE:
        print 'Saving to File'
        save_sparse_vectors(TRAINING_PATH, vectors, labels)
        save_sparse_vectors(DEV_PATH, dev_vectors, dev_labels)

    return (vectors, labels, dev_vectors, dev_labels)

In [98]:
if LOAD_FROM_FILE:
    try:
        vectors, labels = load_sparse_vectors(TRAINING_VECTORS_PATH)
        dev_vectors, dev_labels = load_sparse_vectors(DEV_VECTORS_PATH)
        print 'Loaded Vectors from File'
    except Exception as e:
        print 'Failed to load from File calculating feature vectors'
        vectors, labels, dev_vectors, dev_labels = calculate_vectors(TRAINING_PATH, DEV_PATH)
else:
    print 'Computing feature vectors'
    vectors, labels, dev_vectors, dev_labels = calculate_vectors(TRAINING_PATH, DEV_PATH)

Computing feature vectors
Calculating Training Vectors
Calculating Testing Vectors
Calculating Cross Unigrams
Training Vectorizer
Vectorizing unigrams


In [8]:
%time vectors, labels, dev_vectors, dev_labels = calculate_vectors(TRAINING_PATH, DEV_PATH, cross_unigram_counter)
print vectors.shape

Calculating Training Vectors
Calculating Testing Vectors
Calculating Cross Unigrams
Training Vectorizer
Vectorizing unigrams
CPU times: user 6min 31s, sys: 47.1 s, total: 7min 18s
Wall time: 7min 49s
(50001, 4667)


In [10]:
%time vectors, labels, dev_vectors, dev_labels = calculate_vectors(TRAINING_PATH, DEV_PATH, cross_unigram_pos_counter)
print vectors.shape

Calculating Training Vectors
Calculating Testing Vectors
Calculating Cross Unigrams
Training Vectorizer
Vectorizing unigrams
CPU times: user 37.2 s, sys: 2 s, total: 39.2 s
Wall time: 39.8 s
(50001, 398977)


In [11]:
#random_state gives the seeed, none seams to always give the same result
perceptron = Perceptron(shuffle=True, n_iter=5, random_state=1000)
print 'Training Perceptron'
perceptron = perceptron.fit(vectors, labels)
print 'Testing perceptron'
predictions = perceptron.predict(dev_vectors)
score = perceptron.score(dev_vectors, dev_labels)
print score

Training Perceptron
Testing perceptron
0.5167


In [8]:
print confusion_matrix(dev_labels, predictions, labels=['entailment', 'contradiction', 'neutral'])
print classification_report(dev_labels, predictions,labels=['entailment', 'contradiction', 'neutral'])

[[2383  386  552]
 [ 440 2365  466]
 [ 717  573 1931]]
             precision    recall  f1-score   support

 entailment       0.66      0.72      0.69      3329
contradiction       0.70      0.72      0.71      3278
    neutral       0.64      0.60      0.62      3235

avg / total       0.67      0.68      0.67      9842

