In [64]:
import pandas as pd
import numpy as np
import json
import pickle
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.pipeline import Pipeline
import dynet as dy
import string
import random


PAD_TOKEN = '_PAD_'
UNK_TOKEN = '_UNK_'

In [25]:
def extract_data():
    data = []
    with open('dataset/review.json') as f:
        for line in f:
            data.append(json.loads(line))

    dataframe = pd.DataFrame(data)

    df = dataframe[['text','stars']]
    train, validate, test = np.split(df.sample(frac=.20), [int(.15*len(df)), int(.175*len(df))])

    pickle.dump(test, open('data/test.dat', 'wb'))
    pickle.dump(validate, open('data/validate.dat', 'wb'))
    pickle.dump(train, open('data/train.dat', 'wb'))

In [49]:
train = pickle.load(open('data/train.dat', 'rb'))
validate = pickle.load(open('data/validate.dat', 'rb'))
test = pickle.load(open('data/test.dat', 'rb'))

In [3]:
def binarize(data):
    if 'stars' not in data.columns: return
    data.loc[:, 'positive'] = data.stars.apply(lambda x: 1 if x >=4 else 0)
    del data['stars']

In [50]:
binarize(train)
binarize(validate)
binarize(test)

In [51]:
def df2list(data):
    examples=[]
    for _, example in data.iterrows():
        d=dict()
        table = str.maketrans({key: None for key in string.punctuation})
        d['tokens'] = example['text'].lower().translate(table).split()
        d['label'] = example['positive']
        examples.append(d)
    return examples
train = df2list(train)
validate = df2list(validate)
test = df2list(test)

In [53]:
def build_vocab(datasets):
    vocab = dict()
    vocab[PAD_TOKEN] = len(vocab)
    vocab[UNK_TOKEN] = len(vocab)
    for data in datasets:
        for example in data:
            for word in example['tokens']:
                if word not in vocab:
                    vocab[word] = len(vocab)

    print('Vocab size: {}'.format(len(vocab)))
    return vocab

vocab = build_vocab([train, validate, test])

Vocab size: 692708


In [54]:
class TokenConverter(object):
    def __init__(self, vocab):
        self.vocab = vocab
        self.unknown = 0

    def convert(self, token):
        if token in self.vocab:
            id = self.vocab.get(token)
        else:
            id = self.vocab.get(UNK_TOKEN)
            self.unknown += 1
        return id


def convert2ids(data, vocab):
    converter = TokenConverter(vocab)
    for example in data:
        example['tokens'] = list(map(converter.convert, example['tokens']))
    print('Found {} unknown tokens.'.format(converter.unknown))
    return data

In [55]:
train = convert2ids(train, vocab)
validate = convert2ids(validate, vocab)
test = convert2ids(test, vocab)

Found 0 unknown tokens.
Found 0 unknown tokens.
Found 0 unknown tokens.


In [57]:
def load_embeddings(path, vocab, cache=False, cache_path=None):
    if not path: return None

    rows = []

    if cache_path is None:
        cache_path = path + '.cache'

    # Use cache file if it exists.
    if os.path.exists(cache_path):
        path = cache_path

    print("Reading embeddings from {}".format(path))

    # first pass over the embeddings to vocab and relevant rows
    with open(path) as f:
        for line in f:
            word, row = line.split(' ', 1)
            if word in vocab:
                rows.append(line)

    # optionally save relevant rows to cache file.
    if cache and not os.path.exists(cache_path):
        with open(cache_path, 'w') as f:
            for line in rows:
                f.write(line)
            print("Cached embeddings to {}".format(cache_path))

    # create embeddings matrix
    embeddings = np.zeros((len(vocab), 300), dtype=np.float32)
    for line in rows:
        word = line.split(' ', 1)[0]
        embeddings[vocab[word]] = list(map(float, line.strip().split(' ')[1:]))

    return embeddings

embeddings = load_embeddings('glove.42B.300d.txt', vocab, cache=True)

Reading embeddings from glove.42B.300d.txt
Cached embeddings to glove.42B.300d.txt.cache


In [59]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable

class  CNNClassifier(nn.Module):
        
    def __init__(self, vocab_size, embedding_dim, output_size, kernel_dim=100, kernel_sizes=(3, 4, 5), dropout=0.5):
        super(CNNClassifier,self).__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.convs = nn.ModuleList([nn.Conv2d(1, kernel_dim, (K, embedding_dim)) for K in kernel_sizes])

        # kernal_size = (K,D) 
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(len(kernel_sizes) * kernel_dim, output_size)
    
    
    def init_weights(self, pretrained_word_vectors, is_static=False):
        self.embedding.weight = nn.Parameter(torch.from_numpy(pretrained_word_vectors).float())
        if is_static:
            self.embedding.weight.requires_grad = False


    def forward(self, inputs, is_training=False):
        inputs = self.embedding(inputs).unsqueeze(1) # (B,1,T,D)
        inputs = [F.relu(conv(inputs)).squeeze(3) for conv in self.convs] #[(N,Co,W), ...]*len(Ks)
        inputs = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in inputs] #[(N,Co), ...]*len(Ks)

        concated = torch.cat(inputs, 1)

        if is_training:
            concated = self.dropout(concated) # (N,len(Ks)*Co)
        out = self.fc(concated) 
        return F.log_softmax(out,1)

In [72]:

def prepare_data(data):
    # pad data
    maxlen = max(max(map(len, data)), 5)
    data = [ex + [0] * (maxlen-len(ex)) for ex in data]

    # wrap in tensor
    return torch.LongTensor(data)


def prepare_labels(labels):
    try:
        return torch.LongTensor(labels)
    except:
        return labels


def batch_iterator(dataset, batch_size, epoch=1):
    dataset_size = len(dataset)
    order = None
    nbatches = dataset_size // batch_size
    print("number of batches:", nbatches)
    curr_epoch=0

    def init_order():
        return random.sample(range(dataset_size), dataset_size)

    def get_batch(start, end):
        batch = [dataset[ii] for ii in order[start:end]]
        data = prepare_data([ex['tokens'] for ex in batch])
        labels = prepare_labels([ex['label'] for ex in batch])
        example_ids = [1 for ex in batch]
        return data, labels, example_ids

    while True:
        order = init_order()
        curr_epoch += 1

        for i in range(nbatches):
            start = i*batch_size
            end = (i+1)*batch_size
            yield get_batch(start, end)

        if nbatches*batch_size < dataset_size:
            yield get_batch(nbatches*batch_size, dataset_size)

        if curr_epoch >= epoch:
            break

In [79]:
def run_validation(model, dataset):
    err = 0
    count = 0
    for data, labels, _ in batch_iterator(dataset, 64, epoch=1):
        if use_gpu:
            data,labels = data.cuda(),labels.cuda()
            
        outp = model(Variable(data))
        loss = nn.NLLLoss()(outp, Variable(labels))
        acc = (outp.data.max(1)[1] == labels).sum() / data.shape[0]
        err += (1-acc) * data.shape[0]
        count += data.shape[0]
    err = err / count
    print('Ev-Err={}'.format(err))
    return err

def checkpoint_model(step, val_err, model, opt, save_path):
    save_dict = dict(
        step=step,
        val_err=val_err,
        model_state_dict=model.state_dict(),
        opt_state_dict=opt.state_dict())
    torch.save(save_dict, save_path)


In [81]:
use_gpu = torch.cuda.is_available()
train_data, validation_data, test_data = train, validate, test

model = CNNClassifier(*embeddings.shape, 2)
model.init_weights(embeddings, True)

if use_gpu: model = model.cuda()

opt = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=3e-4)
step = 0
best_val_err = 1

for data, labels, example_ids in batch_iterator(train_data, 64, epoch=10):
    if use_gpu:
        data, labels = data.cuda(),labels.cuda()

    outp = model(Variable(data),True)
    loss = nn.NLLLoss()(outp,Variable(labels))
    acc = (outp.data.max(1)[1]== labels).sum()/data.shape[0]

    opt.zero_grad()
    loss.backward()
    opt.step()

    if step % 100 == 0:
        print('Step={} Tr-Loss={} Tr-Acc={}'.format(step, loss.data[0], acc))

    if step % 1000 == 0:
        val_err = run_validation(model, validation_data[:1000])
#         save_path(step, val_err)

        # early stopping
        if val_err < best_val_err:
            best_val_err = val_err
            print('Checkpointing model step={} best_val_err={}.'.format(step, best_val_err))
            checkpoint_model(step, val_err, model, opt, 'model.ckpt')
#             save_validation(model, validation_data, options)

    step += 1


number of batches: 12332
Step=0 Tr-Loss=0.7198463082313538 Tr-Acc=0.46875
number of batches: 15
Ev-Err=0.348
Checkpointing model step=0 best_val_err=0.348.
Step=100 Tr-Loss=0.49533411860466003 Tr-Acc=0.8125
Step=200 Tr-Loss=0.5217064619064331 Tr-Acc=0.75
Step=300 Tr-Loss=0.24462227523326874 Tr-Acc=0.9375
Step=400 Tr-Loss=0.38554006814956665 Tr-Acc=0.859375
Step=500 Tr-Loss=0.33596205711364746 Tr-Acc=0.859375
Step=600 Tr-Loss=0.28589946031570435 Tr-Acc=0.90625
Step=700 Tr-Loss=0.47102034091949463 Tr-Acc=0.8125
Step=800 Tr-Loss=0.37264174222946167 Tr-Acc=0.828125
Step=900 Tr-Loss=0.2897534668445587 Tr-Acc=0.859375
Step=1000 Tr-Loss=0.3996545672416687 Tr-Acc=0.828125
number of batches: 15
Ev-Err=0.137
Checkpointing model step=1000 best_val_err=0.137.
Step=1100 Tr-Loss=0.2985065281391144 Tr-Acc=0.875
Step=1200 Tr-Loss=0.3230428993701935 Tr-Acc=0.8125
Step=1300 Tr-Loss=0.2329605668783188 Tr-Acc=0.921875
Step=1400 Tr-Loss=0.3697943389415741 Tr-Acc=0.8125
Step=1500 Tr-Loss=0.45884546637535095

Ev-Err=0.101
Checkpointing model step=14000 best_val_err=0.101.
Step=14100 Tr-Loss=0.19340704381465912 Tr-Acc=0.921875
Step=14200 Tr-Loss=0.17206670343875885 Tr-Acc=0.9375
Step=14300 Tr-Loss=0.15311965346336365 Tr-Acc=0.953125
Step=14400 Tr-Loss=0.19227932393550873 Tr-Acc=0.921875
Step=14500 Tr-Loss=0.16396480798721313 Tr-Acc=0.9375
Step=14600 Tr-Loss=0.2885359227657318 Tr-Acc=0.875
Step=14700 Tr-Loss=0.2151484191417694 Tr-Acc=0.890625
Step=14800 Tr-Loss=0.21198344230651855 Tr-Acc=0.9375
Step=14900 Tr-Loss=0.2969956696033478 Tr-Acc=0.890625
Step=15000 Tr-Loss=0.25774499773979187 Tr-Acc=0.890625
number of batches: 15
Ev-Err=0.105
Step=15100 Tr-Loss=0.20557576417922974 Tr-Acc=0.90625
Step=15200 Tr-Loss=0.42879781126976013 Tr-Acc=0.859375
Step=15300 Tr-Loss=0.21409951150417328 Tr-Acc=0.90625
Step=15400 Tr-Loss=0.13900305330753326 Tr-Acc=0.96875
Step=15500 Tr-Loss=0.2906314730644226 Tr-Acc=0.90625
Step=15600 Tr-Loss=0.37191593647003174 Tr-Acc=0.90625
Step=15700 Tr-Loss=0.1431935429573059 T

KeyboardInterrupt: 

In [83]:
def load_model(model, opt, load_path):
    load_dict = torch.load(load_path)
    step = load_dict['step']
    val_err = load_dict['val_err']
    model.load_state_dict(load_dict['model_state_dict'])
    opt.load_state_dict(load_dict['opt_state_dict'])
    return step, val_err

step, best_val_err = load_model(model, opt, 'model.ckpt')
print('Model loaded from {}\nstep={} best_val_err={}'.format('model.ckpt', step, best_val_err))
val_err = run_validation(model, validation_data)
val_err

Model loaded from model.ckpt
step=26000 best_val_err=0.093
number of batches: 2055
Ev-Err=0.09007009168174424


0.09007009168174424

In [89]:
a=np.append(np.array([1]),np.array([1,2]))
a.shape

(3,)

In [16]:
train_data, train_labels = train['text'],train['positive']
test_data, test_labels = validate['text'],validate['positive']

# LogisticRegression, MultinomialNB, MLPClassifier
# , ngram_range=(1,2), max_features=20000
# SVC(gamma=2, C=1),
# QuadraticDiscriminantAnalysis()
# Perceptron(n_jobs=-1,tol=1e-4,max_iter=1000)
text_clf = Pipeline([('vec', CountVectorizer(stop_words='english'),
                    ('tfidf', TfidfTransformer()),
                    ('model', LogisticRegression()),
])

text_clf = text_clf.fit(train_data, train_labels)

predicted = text_clf.predict(test_data)

print("accuracy:", np.mean(predicted == test_labels))

accuracy: 0.8713338705508507


In [86]:
for i in range(1,6):
    print(i, mean(predicted[test_labels==i]==i))

1 0.828605586349
2 0.234343247737
3 0.3082620355
4 0.473060202872
5 0.875746162592
