In [76]:
import numpy as np
import os
import json
import collections as col
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction import DictVectorizer
from scipy.sparse import csr_matrix
from sklearn.linear_model import Perceptron
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [50]:
TRAINING_PATH = '/media/removable/USB DISK/snli_1.0/snli_1.0_train.jsonl'
DEV_PATH = '/media/removable/USB DISK/snli_1.0/snli_1.0_dev.jsonl'
TEST_PATH = '/media/removable/USB DISK/snli_1.0/snli_1.0_test.jsonl'

STOP = stopwords.words('english')

In [51]:
pairs = []
with open(TRAINING_PATH, 'rb') as f:
    for line in f:
        json_pair = json.loads(line)
        pair = {'sentence1':json_pair['sentence1'],
                'sentence2':json_pair['sentence2'],
                'label':json_pair['gold_label']}
        pairs.append(pair)

In [52]:
dev_pairs = []
with open(DEV_PATH, 'rb') as f:
    for line in f:
        json_pair = json.loads(line)
        pair = {'sentence1':json_pair['sentence1'],
                'sentence2':json_pair['sentence2'],
                'label':json_pair['gold_label']}
        dev_pairs.append(pair)

In [53]:
labels = [pair['label'] for pair in pairs]
dev_labels = [pair['label'] for pair in dev_pairs]

In [57]:
print len(pairs)
print pairs[0]['label']

print len(dev_pairs)
print dev_pairs[0]

550152
neutral
10000
{'sentence1': u'Two women are embracing while holding to go packages.', 'sentence2': u'The sisters are hugging goodbye while holding to go packages after just eating lunch.', 'label': u'neutral'}


In [54]:
def create_bag_of_words(sentence):
    lower = sentence.lower()
    split = re.findall("\w+", lower)
    bag = col.Counter(split)
    return bag

In [55]:
def create_bag_of_common_words(sentence1, sentence2):
    bag1 = create_bag_of_words(sentence1)
    bag2 = create_bag_of_words(sentence2)
    intersection = bag1 & bag2
    return intersection

In [56]:
def remove_stop_words_from_counter(counter, stop_list):
    new_keys = set(counter.keys()) - set(stop_list)
    new_counter = {x: counter[x] for x in new_keys}
    return new_counter

In [None]:
for i in range(0,2):
    bag = create_bag_of_common_words(pairs[i]['sentence1'], pairs[i]['sentence2'])
    bag = remove_stop_words_from_counter(bag, STOP)
    print bag

In [57]:
def compute_bags_of_words(path_to_dir, key_set, positive_review):
    bags = []

    for path in os.listdir(FOLDER_PATH + path_to_dir):
        complete_path = path_to_dir + path
        bag = create_bag_of_words(complete_path)
        bags.append((bag, positive_review))
        key_set.update(bag.keys())

    return bags

In [61]:
counters = [cross_unigram_counter(pair['sentence1'],pair['sentence2']) for pair in pairs]
dev_counters = [cross_unigram_counter(pair['sentence1'],pair['sentence2']) for pair in dev_pairs]

In [62]:
vectorizer = DictVectorizer()

vectorizer.fit(counters)

vectors = vectorizer.transform(counters)
dev_vectors = vectorizer.transform(dev_counters)

In [67]:
save_sparse_vectors('/home/jack/NLP/csr.npz', vectors, labels)
save_sparse_vectors('/home/jack/NLP/dev_csr.npz', dev_vectors, dev_labels)

In [68]:
loaded_vectors, loaded_labels = load_sparse_vectors('/home/jack/NLP/csr.npz')
loaded_dev_vectors, loaded_dev_labels = load_sparse_vectors('/home/jack/NLP/dev_csr.npz')

In [64]:
print loaded_vectors.shape
print vectors.shape

(550152, 3149630)
(550152, 3104451)


In [80]:
#random_state gives the seeed, none seams to always give the same result
perceptron = Perceptron(shuffle=True, n_iter=5, random_state=1000)
perceptron = perceptron.fit(loaded_vectors, loaded_labels)
predictions = perceptron.predict(loaded_dev_vectors)
score = perceptron.score(loaded_dev_vectors, loaded_dev_labels)
print score

0.6679


In [81]:
confusion_matrix(loaded_dev_labels, predictions, labels=['entailment', 'contradiction', 'neutral'])

array([[2383,  386,  552],
       [ 440, 2365,  466],
       [ 717,  573, 1931]])

In [86]:
print classification_report(loaded_dev_labels, predictions,labels=['entailment', 'contradiction', 'neutral'])

             precision    recall  f1-score   support

 entailment       0.66      0.72      0.69      3329
contradiction       0.70      0.72      0.71      3278
    neutral       0.64      0.60      0.62      3235

avg / total       0.67      0.68      0.67      9842



In [70]:
correct = 0
entailment = 0
contradiction = 0
neutral = 0
correct_entailment = 0
wrong_entailment = 0
correct_contradiction = 0
wrong_contradiction = 0
correct_neutral = 0
wrong_neutral = 0
for i in range(0,10000):
    if predictions[i] == 'entailment':
        entailment += 1
        if predictions[i] == loaded_dev_labels[i]:
            correct_entailment += 1
        else:
            wrong_entailment += 1
    elif predictions[i] == 'contradiction':
        contradiction += 1
        if predictions[i] == loaded_dev_labels[i]:
            correct_contradiction += 1
        else:
            wrong_contradiction += 1
    elif predictions[i] == 'neutral':
        neutral += 1
        if predictions[i] == loaded_dev_labels[i]:
            correct_neutral += 1
        else:
            wrong_neutral += 1
    
    if predictions[i] == loaded_dev_labels[i]:
        correct += 1

print correct / 10000.0
print 'Entailment'
print 'Correct: ' + str(correct_entailment*100/entailment) + ' Wrong: ' + str(wrong_entailment*100/entailment)
print 'Contradiction'
print 'Correct: ' + str(correct_contradiction*100/contradiction) + ' Wrong: ' + str(wrong_contradiction*100/contradiction)
print 'Neutral'
print 'Correct: ' + str(correct_neutral*100/neutral) + ' Wrong: ' + str(wrong_neutral*100/neutral)

0.6685
Entailment
Correct: 65 Wrong: 34
Contradiction
Correct: 72 Wrong: 27
Neutral
Correct: 64 Wrong: 35


In [58]:
def save_sparse_vectors(file_name, array, labels):
    np.savez(file_name, data = array.data, indices = array.indices, indptr = array.indptr, shape = array.shape, label_data = labels)

def load_sparse_vectors(file_name):
    loader = np.load(file_name)
    return (csr_matrix((loader['data'], loader['indices'], loader['indptr']), shape = loader['shape']), loader['label_data'])

In [59]:
def cross_unigram(sentence1, sentence2, stop_list):
    split1 = [i for i in re.findall("\w+", sentence1.lower()) if i not in stop_list]
    split2 = [i for i in re.findall("\w+", sentence2.lower()) if i not in stop_list]
    
    cross_unigrams = []
    for word1 in split1:
        for word2 in split2:
            cross_unigrams.append((word1,word2))
    
    return cross_unigrams

In [60]:
def cross_unigram_counter(sentence1, sentence2):
    unigrams = [hash(i) for i in cross_unigram(sentence1, sentence2, STOP)]
    counter = col.Counter(unigrams)
    return counter