In [55]:
import numpy as np
import os
import json
import collections as col
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction import DictVectorizer
from scipy.sparse import csr_matrix
from sklearn.linear_model import Perceptron

In [56]:
# Absolute locations of the SNLI 1.0 JSON-lines files (train / dev / test).
# NOTE(review): these point at a removable drive on one machine - adjust
# them before re-running the notebook anywhere else.
TRAINING_PATH = '/media/removable/USB DISK/snli_1.0/snli_1.0_train.jsonl'
DEV_PATH = '/media/removable/USB DISK/snli_1.0/snli_1.0_dev.jsonl'
# TEST_PATH is defined alongside the others; no visible cell reads it.
TEST_PATH = '/media/removable/USB DISK/snli_1.0/snli_1.0_test.jsonl'

# NLTK's English stop-word list; handed as the stop list to the token filters
# used by the feature-extraction cells.
STOP = stopwords.words('english')

In [4]:
# Read the training split: one JSON object per line, reduced to the two
# sentences and the gold label.
# NOTE(review): gold_label is copied as-is; if the corpus uses a placeholder
# label for pairs without annotator consensus, those pairs are kept here -
# confirm whether they should be filtered before training.
pairs = []
with open(TRAINING_PATH, 'rb') as f:
    for record in (json.loads(raw_line) for raw_line in f):
        pairs.append({
            'sentence1': record['sentence1'],
            'sentence2': record['sentence2'],
            'label': record['gold_label'],
        })

In [5]:
# Read the dev split in the same reduced form as the training examples:
# the two sentences plus the gold label of every JSON line.
dev_pairs = []
with open(DEV_PATH, 'rb') as f:
    for record in (json.loads(raw_line) for raw_line in f):
        dev_pairs.append({
            'sentence1': record['sentence1'],
            'sentence2': record['sentence2'],
            'label': record['gold_label'],
        })

In [6]:
# Gold labels, in the same order as the corresponding example lists.
labels = [example['label'] for example in pairs]
dev_labels = [example['label'] for example in dev_pairs]

In [57]:
# Sanity-check the loaded data: sizes of both splits plus one sample each.
# print(...) with a single argument prints the same text on Python 2 and 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print(len(pairs))
print(pairs[0]['label'])

print(len(dev_pairs))
print(dev_pairs[0])

550152
neutral
10000
{'sentence1': u'Two women are embracing while holding to go packages.', 'sentence2': u'The sisters are hugging goodbye while holding to go packages after just eating lunch.', 'label': u'neutral'}


In [58]:
def create_bag_of_words(sentence):
    """Count the word tokens of a sentence, ignoring case.

    Args:
        sentence: Text to tokenize.

    Returns:
        A ``collections.Counter`` mapping every lower-cased token (a maximal
        run of regex word characters) to its number of occurrences.
    """
    lower = sentence.lower()
    # Raw string: a backslash-w inside a plain literal is an invalid escape
    # sequence that newer Python versions warn about.
    split = re.findall(r"\w+", lower)
    return col.Counter(split)

In [59]:
def create_bag_of_common_words(sentence1, sentence2):
    """Return the multiset of word tokens shared by two sentences.

    Both sentences are turned into bags with ``create_bag_of_words`` and the
    bags are combined with ``&``; for ``Counter`` objects that keeps each
    common token with the smaller of its two counts.
    """
    return create_bag_of_words(sentence1) & create_bag_of_words(sentence2)

In [None]:
def remove_stop_words_from_counter(counter, stop_list):
    """Drop every key that appears in ``stop_list`` from a token-count mapping.

    Args:
        counter: Mapping from token to count (e.g. a ``Counter``).
        stop_list: Iterable of tokens to remove.

    Returns:
        A plain ``dict`` holding the surviving tokens with their counts.
    """
    stop_words = set(stop_list)
    surviving = set(counter.keys()) - stop_words
    return dict((word, counter[word]) for word in surviving)

In [None]:
# Preview the shared, non-stop-word tokens of the first two training pairs.
# print(...) with a single argument works on both Python 2 and Python 3.
for i in range(2):
    first, second = pairs[i]['sentence1'], pairs[i]['sentence2']
    bag = create_bag_of_common_words(first, second)
    bag = remove_stop_words_from_counter(bag, STOP)
    print(bag)

In [60]:
def compute_bags_of_words(path_to_dir, key_set, positive_review):
    """Build ``(bag, positive_review)`` tuples for every entry of a directory.

    NOTE(review): no visible cell calls this function, and ``FOLDER_PATH`` is
    not defined in any visible cell, so calling it as-is would raise
    ``NameError`` - it looks like a leftover from another notebook; confirm
    before relying on it.

    Args:
        path_to_dir: Path fragment appended to ``FOLDER_PATH`` to obtain the
            directory that is listed.
        key_set: Collection with an ``update`` method (e.g. a set); it is
            updated in place with the keys of every bag produced.
        positive_review: Value stored next to each bag in the returned tuples.

    Returns:
        List of ``(bag, positive_review)`` tuples, one per directory entry.
    """
    bags = []

    for path in os.listdir(FOLDER_PATH + path_to_dir):
        # The path string itself (path_to_dir + entry name, without
        # FOLDER_PATH) is passed to create_bag_of_words, which tokenizes its
        # argument as text; the file is never opened here.
        # NOTE(review): presumably the file contents were intended - confirm.
        complete_path = path_to_dir + path
        bag = create_bag_of_words(complete_path)
        bags.append((bag, positive_review))
        # Accumulate the vocabulary seen so far in the caller-supplied collection.
        key_set.update(bag.keys())

    return bags

In [65]:
# Cross-unigram count features for every training and dev sentence pair.
counters = [
    cross_unigram_counter(example['sentence1'], example['sentence2'])
    for example in pairs
]
dev_counters = [
    cross_unigram_counter(example['sentence1'], example['sentence2'])
    for example in dev_pairs
]

In [66]:
# Learn the feature vocabulary from the training examples only and reuse it
# for the dev set. Features that occur only in dev are ignored by transform();
# they are all-zero in the training matrix and could never receive a weight,
# so fitting on dev as well only adds dead columns to both matrices.
vectorizer = DictVectorizer()

vectors = vectorizer.fit_transform(counters)
dev_vectors = vectorizer.transform(dev_counters)

In [69]:
# Persist both feature matrices together with their labels so that the
# expensive feature extraction does not have to be repeated.
for out_path, matrix, targets in (
        ('/home/jack/NLP/csr.npz', vectors, labels),
        ('/home/jack/NLP/dev_csr.npz', dev_vectors, dev_labels)):
    save_sparse_vectors(out_path, matrix, targets)

In [70]:
# Reload the cached matrices and labels. The loader defined in this notebook
# is load_sparse_vectors; the previously used name load_sparse_csr is not
# defined in any cell and fails with NameError on a fresh kernel.
loaded_vectors, loaded_labels = load_sparse_vectors('/home/jack/NLP/csr.npz')
loaded_dev_vectors, loaded_dev_labels = load_sparse_vectors('/home/jack/NLP/dev_csr.npz')

In [86]:
# random_state seeds the shuffling of the training data; a fixed integer makes
# the fitted model, and therefore the dev score, reproducible across runs.
# NOTE(review): n_iter is the epoch-count argument of the scikit-learn release
# this notebook was written against - confirm the name if scikit-learn is upgraded.
perceptron = Perceptron(shuffle=True, n_iter=5, random_state=0)
perceptron = perceptron.fit(loaded_vectors, loaded_labels)
score = perceptron.score(loaded_dev_vectors, loaded_dev_labels)
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(score)

In [25]:
# Manual dev accuracy: fraction of dev examples whose predicted label matches
# the gold label.
# `predictions` was not assigned in any cell of the notebook, so it is derived
# here from the fitted `perceptron` and the dev feature matrix.
predictions = perceptron.predict(dev_vectors)

# Iterate over the actual dev examples instead of a hard-coded count of 10000.
correct = sum(1 for predicted, gold in zip(predictions, dev_labels) if predicted == gold)

print(correct / float(len(dev_labels)))

0.6652


In [68]:
def save_sparse_vectors(file_name, array, labels):
    """Store a sparse matrix and its labels in one ``.npz`` archive.

    Args:
        file_name: Destination passed to ``np.savez``.
        array: Sparse matrix exposing ``data``, ``indices``, ``indptr`` and
            ``shape``; each is written under the key of the same name.
        labels: Labels belonging to the rows, written under ``label_data``.
    """
    archive_fields = dict(
        data=array.data,
        indices=array.indices,
        indptr=array.indptr,
        shape=array.shape,
        label_data=labels,
    )
    np.savez(file_name, **archive_fields)

def load_sparse_vectors(file_name):
    """Load a CSR matrix and its labels from an archive written by ``save_sparse_vectors``.

    Args:
        file_name: Path of the ``.npz`` archive holding the ``data``,
            ``indices``, ``indptr``, ``shape`` and ``label_data`` arrays.

    Returns:
        Tuple ``(matrix, labels)``: the rebuilt ``csr_matrix`` and the
        ``label_data`` array stored next to it.
    """
    # np.load keeps the archive file open; the context manager closes it once
    # every array has been read into memory.
    with np.load(file_name) as loader:
        # The shape is stored as an array; pass it on as a tuple of plain ints.
        shape = tuple(int(dim) for dim in loader['shape'])
        matrix = csr_matrix(
            (loader['data'], loader['indices'], loader['indptr']),
            shape=shape)
        label_data = loader['label_data']
    return (matrix, label_data)

In [64]:
def cross_unigram(sentence1, sentence2, stop_list):
    """Pair every content word of one sentence with every content word of another.

    Both sentences are lower-cased and split into word tokens; tokens found
    in ``stop_list`` are discarded.

    Args:
        sentence1: First sentence.
        sentence2: Second sentence.
        stop_list: Iterable of lower-case tokens to ignore.

    Returns:
        List of ``(word1, word2)`` tuples, ordered by the position of
        ``word1`` in ``sentence1`` and then of ``word2`` in ``sentence2``.
        Empty when either sentence has no remaining tokens.
    """
    # Build the lookup once: set membership is constant time, whereas scanning
    # the stop list for every single token is linear in its length.
    stop_words = set(stop_list)
    # Raw string: a backslash-w inside a plain literal is an invalid escape
    # sequence that newer Python versions warn about.
    word_pattern = re.compile(r"\w+")

    def content_words(sentence):
        # Lower-cased tokens of the sentence with stop words removed.
        return [token for token in word_pattern.findall(sentence.lower())
                if token not in stop_words]

    split1 = content_words(sentence1)
    split2 = content_words(sentence2)

    return [(word1, word2) for word1 in split1 for word2 in split2]

In [62]:
def cross_unigram_counter(sentence1, sentence2):
    """Count the hashed cross-unigrams of a sentence pair.

    Every ``(word1, word2)`` tuple produced by ``cross_unigram`` (with the
    module-level ``STOP`` list as stop words) is replaced by its ``hash()``
    value, and the occurrences of each hash are counted.

    NOTE(review): hash values of string tuples may differ between interpreter
    processes when string hash randomization is active - confirm that
    features are only compared within a single session.

    Returns:
        A ``collections.Counter`` keyed by the integer hashes.
    """
    return col.Counter(
        hash(word_pair) for word_pair in cross_unigram(sentence1, sentence2, STOP)
    )