# 1. Import, clean and split data

In [1]:
import os
import json
import re
import time
from bs4 import BeautifulSoup 

# Should have 82.83 million reviews
filename = "raw/aggressive_dedup.json"
print(filename)
# Size of the raw dump on disk, converted from bytes to GB (1024**3 bytes)
size_in_gb = os.path.getsize(filename) / (1024 ** 3)
print(size_in_gb, " GB")

raw/aggressive_dedup.json
54.28077760338783  GB


In [2]:
def line_feeder(fname):
    """Lazily yield one parsed JSON record per line of `fname`.

    The file is read as UTF-8 text, one JSON document per line, so the whole
    dump never has to fit in memory.

    Blank or whitespace-only lines are skipped; any other line that is not
    valid JSON still raises the error from `json.loads`.
    """
    with open(fname, 'r', encoding='utf-8') as f:
        for line in f:
            # A stray empty line (e.g. at the end of the file) carries no
            # record and would otherwise abort a multi-hour run.
            if not line.strip():
                continue
            yield json.loads(line)
        
def clean_review(review):
    """Normalise one raw review into a single lower-case, space-separated line.

    Markup is stripped with BeautifulSoup (lxml parser), each of the
    punctuation marks  ' . , ? ! : ; ( ) { } [ ]  is surrounded by spaces so it
    becomes its own token, the text is lower-cased and all runs of whitespace
    are collapsed to one space. The result always ends with a newline.
    """
    text = BeautifulSoup(review, "lxml").get_text()
    # One translation table pads every punctuation mark in a single pass
    padded_marks = {ord(mark): ' ' + mark + ' ' for mark in """'.,?!:;(){}[]"""}
    tokens = text.translate(padded_marks).lower().split()
    return " ".join(tokens) + "\n"

def example(cut=5):
    """Print the first `cut` reviews of `filename`, raw and cleaned, with ratings.

    For every shown record the raw `reviewText` and its `overall` rating are
    printed, followed by the output of `clean_review` and the same rating.
    """
    for c, x in enumerate(line_feeder(filename)):
        # Stop before processing record number `cut`, so exactly `cut` reviews are shown
        if c >= cut:
            return
        rev, rating = clean_review(x["reviewText"]), x["overall"]
        print("Raw:")
        # Rating goes inside the print call (it used to be silently discarded)
        print(x["reviewText"], x["overall"])
        print("Clean:")
        print(rev, rating)
        
# Sanity check: show a handful of reviews before and after cleaning
example()

Raw:
It is and does exactly what the description said it would be and would do. Couldn't be happier with it.
Clean:
it is and does exactly what the description said it would be and would do . couldn ' t be happier with it .
 5.0
Raw:
I was sketchy at first about these but once you wear them for a couple hours they break in they fit good on my board an have little wear from skating in them. They are a little heavy but won't get eaten up as bad by your grip tape like poser dc shoes.
Clean:
i was sketchy at first about these but once you wear them for a couple hours they break in they fit good on my board an have little wear from skating in them . they are a little heavy but won ' t get eaten up as bad by your grip tape like poser dc shoes .
 5.0
Raw:
Very mobile product. Efficient. Easy to use; however product needs a varmint guard. Critters are able to gorge themselves without a guard.
Clean:
very mobile product . efficient . easy to use ; however product needs a varmint guard . critter

In [None]:
# Stream the raw dump once and route every cleaned review into one of four files:
#   good_reviews.txt    - rating 4 or 5
#   bad_reviews.txt     - rating 1 or 2
#   neutral_reviews.txt - any other rating
#   error_reviews.txt   - cleaned text too short to be meaningful
#
# The files are opened in a single `with` block so they are flushed and closed
# even if the loop fails part-way through. Reviews are written as they are
# produced; Python's file buffering batches the disk writes.
chunks = 0
stime = time.time()
with open('good_reviews.txt', 'w', encoding='utf-8') as gr, \
     open('bad_reviews.txt', 'w', encoding='utf-8') as br, \
     open('neutral_reviews.txt', 'w', encoding='utf-8') as nt, \
     open('error_reviews.txt', 'w', encoding='utf-8') as er:

    for x in line_feeder(filename):

        chunks += 1
        rev, rating = clean_review(x["reviewText"]), x["overall"]

        if not len(rev) > 2:
            # Too short to be meaningful.
            # NOTE(review): `rev` always ends with "\n", so this only rejects
            # cleaned texts of at most one visible character - confirm the
            # intended threshold.
            er.write(rev)
        elif rating in [4, 5]:
            gr.write(rev)
        elif rating in [1, 2]:
            br.write(rev)
        else:
            nt.write(rev)

        # Progress report every N=1000*1000 reviews
        # Limited by IO, disk = 96%
        # Takes 305 seconds for 1mill, so around 420 minutes = 7 hours
        if chunks % (1000*1000) == 0:
            print("Processed: %d records" % chunks)
            print("Elapsed: %.2f" % (time.time() - stime))

In [None]:
# Check sizes
def file_len(fname):
    """Return the number of lines in `fname` (0 for an empty file)."""
    # Binary mode: counting lines needs no decoding, so the result does not
    # depend on the platform's default text encoding.
    with open(fname, 'rb') as f:
        return sum(1 for _ in f)

# Should add up:
# print("Raw contains %d lines" % file_len(filename))

# We have 64,439,865 good reviews
# print("Good contains %d lines" % file_len('good_reviews.txt'))
# We have 10,961,504 bad reviews
# print("Bad contains %d lines" % file_len('bad_reviews.txt'))


# print("Neutral contains %d lines" % file_len('neutral_reviews.txt'))
# print("Short contains %d lines" % file_len('error_reviews.txt'))

# 2. Train/Test split

In [3]:
# Number of lines taken from each review file for the train/test split (1 million)
_SAMPLE_SIZE = 1000000

In [4]:
# Split data into train and test (also use subsample):
import random

def train_test_split(train_ratio=0.5):
    """Randomly assign one sample to train (True) or test (False).

    A number is drawn from `random.uniform(0, 1)`; the sample goes to the
    training set when the draw does not exceed `train_ratio`.
    """
    draw = random.uniform(0, 1)
    goes_to_train = draw <= train_ratio
    return goes_to_train

def line_feeder(fname, cutoff):
    """Yield the raw text lines of `fname`, stopping after `cutoff` lines.

    Lines are returned unparsed, including their trailing newline. If the file
    has fewer than `cutoff` lines, or `cutoff` is never reached by the counter
    (e.g. 0 or a negative number), every line is yielded.

    Note: this redefines the JSON-parsing `line_feeder(fname)` from section 1.
    """
    with open(fname, 'r', encoding='utf-8') as f:
        for count, line in enumerate(f, start=1):
            yield line
            if count == cutoff:
                return
            
def split_data(dataname, sample_size, train_ratio):
    """Split the first `sample_size` lines of `dataname` into train and test files.

    Each line is written to 'train_' + dataname when `train_test_split` returns
    True for `train_ratio`, and to 'test_' + dataname otherwise. Both output
    files are created (or overwritten) as UTF-8 text.

    dataname    -- path of the input file, one review per line
    sample_size -- maximum number of lines read from the input
    train_ratio -- passed to `train_test_split` for every line
    """
    with open('train_' + dataname, 'w', encoding='utf-8') as tr, \
         open('test_' + dataname, 'w', encoding='utf-8') as te:
        for line in line_feeder(dataname, sample_size):
            # Honour the caller's ratio (it used to be hard-coded to 0.5)
            if train_test_split(train_ratio):
                tr.write(line)
            else:
                te.write(line)

In [None]:
# Same sample size and 50/50 split for the good and the bad reviews
for review_file in ('good_reviews.txt', 'bad_reviews.txt'):
    split_data(dataname=review_file, sample_size=_SAMPLE_SIZE, train_ratio=0.5)

In [5]:
# Maps each split file to the prefix used to tag its documents
# (TE/TR = test/train, B/G = bad/good); the line number is appended to the prefix.
sources = {'test_bad_reviews.txt':'TE_B',
           'test_good_reviews.txt':'TE_G',
           'train_bad_reviews.txt':'TR_B',
           'train_good_reviews.txt':'TR_G'}

# 3. TFIDF Approach

In [None]:
from gensim import utils

def file_to_list(fname):
    """Lazily yield the lines of `fname`, opened through gensim's `smart_open`.

    Despite the name this is a generator, not a list; lines are yielded exactly
    as the opened file produces them.
    """
    with utils.smart_open(fname) as handle:
        yield from handle

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier

# Training data: every line of each file is one review; label 1 = good, 0 = bad
clean_train_reviews = []
train_labels = []
for train_file, label in [('train_good_reviews.txt', 1),
                          ('train_bad_reviews.txt', 0)]:
    for review in file_to_list(train_file):
        clean_train_reviews.append(utils.to_unicode(review))
        train_labels.append(label)

print("Sample review: %s" % clean_train_reviews[0])
print("Vectorising ... %d reviews" % len(clean_train_reviews))

## Defaults:
#TfidfVectorizer(input='content', encoding='utf-8', decode_error='strict',
#                strip_accents=None, lowercase=True, preprocessor=None,
#                tokenizer=None, analyzer='word', stop_words=None,
#                token_pattern='(?u)\b\w\w+\b', ngram_range=(1, 1),
#                max_df=1.0, min_df=1, max_features=None, vocabulary=None,
#                binary=False, dtype=<class 'numpy.int64'>, norm='l2',
#                use_idf=True, smooth_idf=True, sublinear_tf=False

# 40k features, uni- to tri-grams, sublinear term frequency
vectorizer = TfidfVectorizer(max_features=40000,
                             ngram_range=(1, 3),
                             sublinear_tf=True)

# Learn vocabulary and idf, return term-document matrix
train_data_features = vectorizer.fit_transform(clean_train_reviews)

# The raw text is no longer needed once it has been vectorised
del clean_train_reviews

# Training Model

## Defaults used
#LogisticRegression(penalty='l2', dual=False, tol=0.0001, C=1.0, fit_intercept=True,
#                   intercept_scaling=1, class_weight=None, random_state=None,
#                   solver='liblinear', max_iter=100, multi_class='ovr',
#                   verbose=0, warm_start=False, n_jobs=1)

#GradientBoostingClassifier(loss='deviance', learning_rate=0.1, n_estimators=100,
#                           subsample=1.0, min_samples_split=2, min_samples_leaf=1,
#                           min_weight_fraction_leaf=0.0, max_depth=3, init=None,
#                           random_state=None, max_features=None, verbose=0, 
#                           max_leaf_nodes=None, warm_start=False, presort='auto')

# l2, max_iter = 100 (fit() returns the fitted estimator itself)
classifier_tfidf = LogisticRegression().fit(train_data_features, train_labels)

# Free the training matrix and labels; only the vectorizer and classifier are kept
del train_data_features, train_labels
print("done")

In [None]:
# Display the fitted classifier and its hyper-parameters
classifier_tfidf

In [None]:
# Testing data: every line of each file is one review; label 1 = good, 0 = bad
clean_test_reviews = []
test_labels = []
for test_file, label in [('test_good_reviews.txt', 1),
                         ('test_bad_reviews.txt', 0)]:
    for review in file_to_list(test_file):
        clean_test_reviews.append(utils.to_unicode(review))
        test_labels.append(label)

print("Sample review: %s" % clean_test_reviews[0])
print("Vectorising ... %d reviews" % len(clean_test_reviews))

# Transform documents to document-term matrix, reusing the vocabulary and idf
# weights learnt on the training set
test_data_features = vectorizer.transform(clean_test_reviews)

# The raw text is no longer needed once it has been vectorised
del clean_test_reviews
print("done")

In [None]:
# Run classifier
# Score on the sparse matrix directly: scikit-learn's linear models accept scipy
# sparse input, whereas .toarray() would materialise a dense
# (n_reviews x up-to-40000) array and can exhaust memory.
classifier_tfidf.score(test_data_features, test_labels)

# 50k gives 0.915
# 500k gives 0.928
# 1mill gives 0.931

In [None]:
# Hand-written probes for the classifiers: sentences that mix positive and
# negative words, plus two unambiguous one-liners as a baseline.
tricky_sentences = [
    "Most movies are rubbish, however this one was good",
    "This is a product you would love to hate",
    "Rubbish from start to finish",
    "I was so happy when this ended",
    "Very good",
    "Horrible"
]

In [None]:
# Some tests ...
def tfidf_sample_sentiment(mystr):
    """Predict the sentiment label of one string with the TF-IDF model.

    `mystr` must be a `str`. It is vectorised with the fitted `vectorizer` and
    the prediction of `classifier_tfidf` is returned (an array with a single
    label; the training cell uses 1 for good and 0 for bad reviews).
    """
    assert isinstance(mystr, str)
    return classifier_tfidf.predict(vectorizer.transform([mystr]))

# Clean each probe sentence the same way as the training data, then classify it
for sentence in tricky_sentences:
    cleaned = clean_review(sentence)
    print(cleaned, tfidf_sample_sentiment(cleaned))

In [None]:
# Release the TF-IDF model and test data before moving on to Doc2Vec
del vectorizer, classifier_tfidf, test_data_features, test_labels

# 4.  Doc2Vec Approach

## 1. Doc2Vec Model

In [6]:
from gensim import utils
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec
from random import shuffle
import multiprocessing

import gensim.models.doc2vec
# Fail early unless gensim reports its fast (compiled) training routines are available
assert gensim.models.doc2vec.FAST_VERSION > -1

# Number of CPU cores; passed to Doc2Vec as the worker count below
cores = multiprocessing.cpu_count()
print(cores)

# Width of the feature arrays built in the classification section.
# NOTE(review): the models in this notebook are created with size=100 - confirm
# this value matches the model that is actually loaded for classification.
model_feat_size = 400

class LabeledLineSentence(object):
    """In-memory corpus of tagged documents built from line-per-review files.

    `sources` maps a file path to a tag prefix. Every line of every file
    becomes one `LabeledSentence` whose words are the whitespace-split line and
    whose single tag is '<prefix>_<line number>' (line numbers start at 0).
    """

    def __init__(self, sources):
        self.sources = sources
        # Start with an empty corpus so `sentences_perm` is safe to call even
        # before `extract_sentences` has run.
        self.sentences = []

    def extract_sentences(self):
        """(Re)load all files in `self.sources` into `self.sentences`."""
        self.sentences = []
        for sr, pr in self.sources.items():
            with utils.smart_open(sr) as fin:
                for item_no, line in enumerate(fin):
                    self.sentences.append(LabeledSentence(utils.to_unicode(line).split(),
                                                          [pr + '_%s' % item_no]))

    def sentences_perm(self):
        """Shuffle `self.sentences` in place and return that same list."""
        shuffle(self.sentences)
        return self.sentences

24


In [7]:
# Corpus object holding all tagged sentences from the four files in `sources`;
# extract_sentences() reads them into memory (sentences.sentences).
sentences = LabeledLineSentence(sources)
sentences.extract_sentences()

![Mikolov's params for IMDB](http://nbviewer.jupyter.org/github/fbkarsdorp/doc2vec/blob/master/quoc-response.png)

In [16]:
from collections import OrderedDict

# Creating models using params from Mikolov above and here:
# https://github.com/RaRe-Technologies/gensim/blob/develop/docs/notebooks/doc2vec-IMDB.ipynb

# Note window = 5 is both sides and approximates the above window = 10 (diff to Mikolov's scripts)

# Hyper-parameters shared by all three models
shared_params = dict(size=100, negative=5, hs=0, min_count=2, workers=cores)

single_models = [
    # PV-DM with concatenation
    Doc2Vec(dm=1, dm_concat=1, window=5, **shared_params),
    # PV-DBOW
    Doc2Vec(dm=0, **shared_params),
    # PV-DM with averaging
    Doc2Vec(dm=1, dm_mean=1, window=10, **shared_params),
]

# Scan the corpus once, with the first model ...
single_models[0].build_vocab(sentences.sentences)
print(single_models[0])

# ... and use the learnt vocab for the other models
for mod in single_models[1:]:
    mod.reset_from(single_models[0])
    print(mod)

# Names in the same order as `single_models`
nms = ['dm_concat', 'dbow', 'dm_averaging']
models = OrderedDict(zip(nms, single_models))

Doc2Vec(dm/c,d100,n5,w5,mc2,t24)
Doc2Vec(dbow,d100,n5,mc2,t24)
Doc2Vec(dm/m,d100,n5,w10,mc2,t24)


In [17]:
# Also use concatenated vectors from each model
from gensim.test.test_doc2vec import ConcatenatedDoc2Vec

# `single_models` is ordered [PV-DM concat, PV-DBOW, PV-DM averaging]
dm_concat, dbow, dm_averaging = single_models

# Pair PV-DBOW with each of the two PV-DM variants
models['dbow_dmm'] = ConcatenatedDoc2Vec([dbow, dm_averaging])
models['dbow_dmc'] = ConcatenatedDoc2Vec([dbow, dm_concat])

print(models)

OrderedDict([('dm_concat', <gensim.models.doc2vec.Doc2Vec object at 0x0000019B4BF5B240>), ('dbow', <gensim.models.doc2vec.Doc2Vec object at 0x0000019B4BF5B278>), ('dm_averaging', <gensim.models.doc2vec.Doc2Vec object at 0x0000019B4BF5B2E8>), ('dbow_dmm', <gensim.test.test_doc2vec.ConcatenatedDoc2Vec object at 0x0000019B442C76A0>), ('dbow_dmc', <gensim.test.test_doc2vec.ConcatenatedDoc2Vec object at 0x0000019B442C75F8>)])


In [None]:
# Train model:
# Each model gets `epochs` passes over a freshly shuffled corpus. The learning
# rate is held constant within a pass (alpha == min_alpha) and lowered by hand
# between passes, from 0.025 towards 0.001.
for nme, model in models.items():

    alpha, min_alpha, epochs = (0.025, 0.001, 20)
    alpha_delta = (alpha - min_alpha) / epochs

    print("Training model: bigreviews_subsample_1mill_400d_%s.d2v" % nme)

    for epoch in range(epochs):

        print("Training epoch: %d" % epoch)

        model.alpha, model.min_alpha = alpha, alpha
        model.train(sentences.sentences_perm())
        alpha -= alpha_delta

    print("Finished training model: %s" % nme)

    # NOTE(review): the ConcatenatedDoc2Vec wrappers come from gensim's test
    # helpers and appear to provide no save() - confirm against the installed
    # gensim version. Guarding here keeps a missing save() from aborting the
    # loop after training; the wrapped single models are saved under their own
    # names earlier in this same loop.
    if hasattr(model, 'save'):
        model.save("bigreviews_subsample_1mill_400d_%s.d2v" % nme)
    else:
        print("Not saving %s: object has no save()" % nme)

Training model: bigreviews_subsample_1mill_400d_dm_concat.d2v
Training epoch: 0


In [None]:
# Training is finished; release the in-memory corpus
del sentences

## 2. Play around with the features of the model

In [None]:
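# Load a model
# NOTE: the exploration cells below use `model`. With the load commented out,
# `model` is whatever the training loop above left behind (its last loop value).
#model = Doc2Vec.load('bigreviews_subsample_1mill_1200d.d2v')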

In [None]:
# Number of entries in the model's vocabulary
print(len(model.vocab))

In [None]:
# Words the model places closest to 'movie'
model.most_similar('movie')

In [None]:
# Words the model places closest to 'toy'
model.most_similar('toy')

In [None]:
# Words the model places closest to 'great'
model.most_similar('great')

In [None]:
# Words the model places closest to 'spielberg' (reviews are lower-cased during cleaning)
model.most_similar('spielberg')

In [None]:
# Words the model places closest to 'refund'
model.most_similar('refund')

In [None]:
def line_from_file(fname, line_no):
    """Return line number `line_no` (0-based) of `fname`, newline included.

    Returns None when the file has no such line.
    """
    # The review files are written as UTF-8 by this notebook; read them back
    # the same way instead of relying on the platform's default encoding.
    with open(fname, encoding='utf-8') as f:
        for i, l in enumerate(f):
            if i == line_no:
                return l
    return None
            
# First training review; its document tag is 'TR_G_0'
print(line_from_file('train_good_reviews.txt',0))

In [None]:
# Documents whose vectors are closest to that of the review tagged 'TR_G_0'
model.docvecs.most_similar('TR_G_0')

In [None]:
# Look up the text of three reviews by (file, line number)
for review_file, line_no in [('test_good_reviews.txt', 347295),
                             ('test_good_reviews.txt', 39105),
                             ('train_good_reviews.txt', 10921)]:
    print(line_from_file(review_file, line_no))

In [None]:
# Infer a vector ...

## 3. Classification

In [None]:
# Load best model
# NOTE(review): this file name does not match the
# "bigreviews_subsample_1mill_400d_<name>.d2v" files written by the training
# loop in this notebook - confirm the file exists and which model it holds.
model = Doc2Vec.load('bigreviews_subsample_1mill_1200d.d2v')

In [None]:
# Check sizes
def file_len(fname):
    """Return the number of lines in `fname` (0 for an empty file)."""
    # Binary mode: counting lines needs no decoding, so the result does not
    # depend on the platform's default text encoding.
    with open(fname, 'rb') as f:
        return sum(1 for _ in f)

# Dictionary of file-lengths, keyed by the tag prefix of each file (e.g. 'TR_G')
file_lengths = {prefix: file_len(path) for path, prefix in sources.items()}

file_lengths

In [None]:
import numpy

# Train vectors
no_training = file_lengths['TR_G'] + file_lengths['TR_B']
print("%d for training" % no_training)

# Take the feature width from the vectors actually stored in the loaded model.
# A fixed `model_feat_size` that differs from the model's vector length makes
# the row assignments below fail; it is only used as a fallback when there are
# no training documents at all.
if no_training:
    first_train_tag = 'TR_G_0' if file_lengths['TR_G'] else 'TR_B_0'
    train_feat_size = len(model.docvecs[first_train_tag])
else:
    train_feat_size = model_feat_size

# Rows: all good reviews first (label 1), then all bad reviews (label 0)
train_arrays = numpy.zeros((no_training, train_feat_size))
train_labels = numpy.concatenate((numpy.ones(file_lengths['TR_G']),
                                  numpy.zeros(file_lengths['TR_B'])))

for i in range(file_lengths['TR_G']):
    train_arrays[i] = model.docvecs['TR_G_' + str(i)]

for i in range(file_lengths['TR_B']):
    train_arrays[file_lengths['TR_G'] + i] = model.docvecs['TR_B_' + str(i)]

assert len(train_arrays) == len(train_labels)

In [None]:
from sklearn.linear_model import LogisticRegression
#from sklearn.ensemble import RandomForestClassifier
#from sklearn.linear_model import SGDClassifier
#from sklearn.svm import LinearSVC
#from sklearn.neural_network import MLPClassifier

# Classifier on top of the document vectors. The trailing numbers are the
# scores noted for each alternative that was tried.
# fit() returns the fitted estimator itself, so construction and fitting chain.
classifier = LogisticRegression().fit(train_arrays, train_labels)  # 0.84834202356611155
#classifier = RandomForestClassifier
#classifier = SGDClassifier(loss='log', penalty='l1')  # 0.84822804237300842
#classifier = LinearSVC() # 0.84843300855358861
#classifier = MLPClassifier(hidden_layer_sizes = (50,))

# The training matrix is not needed once the classifier is fitted
del train_arrays, train_labels

In [None]:
# Test vectors
no_testing = file_lengths['TE_G'] + file_lengths['TE_B']
print("%d for testing" % no_testing)

# Take the feature width from the vectors actually stored in the loaded model.
# A fixed `model_feat_size` that differs from the model's vector length makes
# the row assignments below fail; it is only used as a fallback when there are
# no test documents at all.
if no_testing:
    first_test_tag = 'TE_G_0' if file_lengths['TE_G'] else 'TE_B_0'
    test_feat_size = len(model.docvecs[first_test_tag])
else:
    test_feat_size = model_feat_size

# Rows: all good reviews first (label 1), then all bad reviews (label 0)
test_arrays = numpy.zeros((no_testing, test_feat_size))
test_labels = numpy.concatenate((numpy.ones(file_lengths['TE_G']),
                                 numpy.zeros(file_lengths['TE_B'])))

for i in range(file_lengths['TE_G']):
    test_arrays[i] = model.docvecs['TE_G_' + str(i)]

for i in range(file_lengths['TE_B']):
    test_arrays[file_lengths['TE_G'] + i] = model.docvecs['TE_B_' + str(i)]

assert len(test_arrays) == len(test_labels)

In [None]:
# Accuracy of the Doc2Vec-based classifier on the held-out test vectors
classifier.score(test_arrays, test_labels)  # 0.85499592567226412