# Sentiment Classification Using a Convolutional Neural Network

Based on the paper "Convolutional Neural Networks for Sentence Classification" by Yoon Kim (2014).

# Let's grab a Dataset<a id='lesson_1'></a>
The dataset comes from the "Sentiment Classification" lesson of Udacity (taught by Andrew Trask).

In [121]:
# Directory holding reviews.txt, labels.txt and the word2vec binary.
data_dir = "./data"


def pretty_print_review_and_label(i):
    """Print the label of review ``i`` followed by its first 80 characters.

    Reads the module-level ``labels`` and ``reviews`` lists, which are
    index-aligned (``labels[i]`` belongs to ``reviews[i]``).
    """
    print(labels[i] + "\t:\t" + reviews[i][:80] + "...")


# What we know: one review per line.
# `with` closes the file even if reading fails, and rstrip('\n') removes only
# the line terminator, so a final line without a trailing newline keeps its
# last character (the previous x[:-1] would have chopped it off).
with open('{}/reviews.txt'.format(data_dir), 'r') as g:
    reviews = [line.rstrip('\n') for line in g]

# What we WANT to know: one label per line, normalised to upper case.
with open('{}/labels.txt'.format(data_dir), 'r') as g:
    labels = [line.rstrip('\n').upper() for line in g]

**Note:** The data in `reviews.txt` we're using has already been preprocessed a bit and contains only lower case characters. If we were working from raw data, where we didn't know it was all lower case, we would want to add a step here to convert it. That's so we treat different variations of the same word, like `The`, `the`, and `THE`, all the same way.

In [122]:
# Number of reviews read from reviews.txt.
len(reviews)

25000

In [123]:
# Peek at the raw text of one review (index 10).
reviews[10]

'this isn  t the comedic robin williams  nor is it the quirky  insane robin williams of recent thriller fame . this is a hybrid of the classic drama without over  dramatization  mixed with robin  s new love of the thriller . but this isn  t a thriller  per se . this is more a mystery  suspense vehicle through which williams attempts to locate a sick boy and his keeper .  br    br   also starring sandra oh and rory culkin  this suspense drama plays pretty much like a news report  until william  s character gets close to achieving his goal .  br    br   i must say that i was highly entertained  though this movie fails to teach  guide  inspect  or amuse . it felt more like i was watching a guy  williams   as he was actually performing the actions  from a third person perspective . in other words  it felt real  and i was able to subscribe to the premise of the story .  br    br   all in all  it  s worth a watch  though it  s definitely not friday  saturday night fare .  br    br   it rates

In [124]:
# Label stored at the same index (10) in the labels list.
labels[10]

'POSITIVE'

# Load Word2Vec model trained with 100B words


from https://code.google.com/archive/p/word2vec/ 
## and wrap it up in a ready-to-use class 


In [117]:
import gensim 
import bisect 
import logging
import numpy as np

# Record layout for every log line: timestamp, level, module:line,
# function(thread) and finally the message itself.
_LOG_FORMAT = (
    '%(asctime)s : %(levelname)s : %(module)s:%(lineno)d : '
    '%(funcName)s(%(threadName)s) : %(message)s'
)

# Configure the root logger at DEBUG verbosity, then grab this module's logger.
logging.basicConfig(format=_LOG_FORMAT, level=logging.DEBUG)
logger = logging.getLogger(__name__)


class ModelWrapper:
    """Thin convenience layer over a gensim word2vec model.

    Attributes:
        model: the wrapped gensim model.
        all_words: every vocabulary word lowercased, in sorted order.
        orig_words: the same words in their original casing, kept in the
            same order as ``all_words`` so both lists can be indexed together.
    """

    def __init__(self, m):
        """Wrap ``m``, or load the GoogleNews vectors when ``m`` is None.

        Args:
            m: an already-loaded model to reuse, or ``None`` to load
               ``GoogleNews-vectors-negative300.bin.gz`` from ``data_dir``.
        """
        if m is None:
            print("Loading model...")
            self.model = gensim.models.word2vec.KeyedVectors.load_word2vec_format(
                '{}/GoogleNews-vectors-negative300.bin.gz'.format(data_dir),
                binary=True)
            print("Cleaning up un-needed details from model...")
            # Best-effort memory trim: the attributes may not exist (or may
            # not be deletable) depending on the gensim version.
            # NOTE(review): if `syn0` holds the raw vectors that `word_vec`
            # reads, deleting it would break `vec_repr` -- confirm against
            # the gensim version in use.
            try:
                del self.model.syn0  # not needed => free up mem
                del self.model.syn1
            except AttributeError:
                pass
        else:
            print("[init] Model provided. If you want me to FORCE re-load it, call ModelWrapper's constructor with 'None'")
            self.model = m
        # sort all the words in the model, so that we can auto-complete queries quickly
        print("Sort all the words in the model, so that we can auto-complete queries quickly...")
        words = [gensim.utils.to_unicode(word) for word in self.model.index2word]
        # list.sort is stable, so words that only differ in casing keep their
        # original relative order.
        words.sort(key=lambda w: w.lower())
        self.orig_words = words  # original letter casing, but sorted as if lowercased
        self.all_words = [w.lower() for w in words]  # lowercased, sorted as lowercased

    def suggest(self, term):
        """Return up to 10 vocabulary words that start with the given prefix.

        The match is case-insensitive and ignores surrounding whitespace in
        ``term``; the words are returned in their original casing.

        Args:
            term: the prefix to complete.

        Returns:
            A list of at most 10 words; empty when nothing matches.
        """
        prefix = gensim.utils.to_unicode(term).strip().lower()
        count = 10
        # In the sorted list every word sharing the prefix forms one
        # contiguous run that begins at the bisect position.
        pos = bisect.bisect_left(self.all_words, prefix)
        result = []
        for lowered, original in zip(self.all_words[pos: pos + count],
                                     self.orig_words[pos: pos + count]):
            if not lowered.startswith(prefix):
                break  # left the run of matching words
            result.append(original)
        logger.info("suggested %r: %s", prefix, result)
        return result

    def most_similar(self, positive, negative):
        """Query the model for the 5 words most similar to the given words.

        Args:
            positive: an array of positive words
            negative: an array of negative words

        Returns:
            ``{'similars': result}`` where ``result`` is whatever the model's
            ``most_similar`` returns, or ``[]`` when the query fails
            (e.g. a word is not in the vocabulary).
        """
        try:
            result = self.model.most_similar(
                positive=[word.strip() for word in positive if word],
                negative=[word.strip() for word in negative if word],
                topn=5)
        except Exception as err:
            # Deliberately best-effort: a failed query yields no results, but
            # the reason is logged and Ctrl-C is no longer swallowed.
            logger.warning("most_similar failed for %s vs. %s: %s",
                           positive, negative, err)
            result = []
        logger.info("similars for %s vs. %s: %s", positive, negative, result)
        return {'similars': result}

    def vec_repr(self, word):
        """Return the word2vec representation of ``word``.

        If ``word`` is not in the vocabulary, a vector of 0's with the
        model's ``vector_size`` is returned instead.
        """
        # Use the wrapped model, not a module-level `model` global that may
        # be undefined or may point at a different model.
        try:
            return self.model.word_vec(word)
        except KeyError:
            logger.info("'%s' not in Model. Returning [0]'s vector.", word)
            return np.zeros(self.model.vector_size)
            

In [118]:
# Reuse a model left in the kernel by an earlier run of this cell. On a fresh
# kernel `model` does not exist yet, so look it up defensively and pass None,
# which makes ModelWrapper load the vectors from disk.
mw = ModelWrapper(globals().get("model"))
model = mw.model # just cache in case I re-call this cell

[init] Model provided. If you want me to FORCE re-load it, call ModelWrapper's constructor with 'None'
Sort all the words in the model, so that we can auto-complete queries quickly...


In [119]:
# Example query: words close to 'soccer' and away from 'messi' (top 5).
mw.most_similar(positive = ['soccer'], negative = ['messi'])

2017-05-23 14:47:02,607 : INFO : <ipython-input-117-d285f72acaad>:57 : most_similar(MainThread) : similars for ['soccer'] vs. ['messi']: [('Soccer', 0.48688480257987976), ('lacrosse', 0.4622202515602112), ('softball', 0.4572678506374359), ('Lacrosse', 0.4419728219509125), ('basketball', 0.4305872321128845)]


{'similars': [('Soccer', 0.48688480257987976),
  ('lacrosse', 0.4622202515602112),
  ('softball', 0.4572678506374359),
  ('Lacrosse', 0.4419728219509125),
  ('basketball', 0.4305872321128845)]}

In [125]:
# A word that is not in the vocabulary: falls back to the all-zeros vector.
mw.vec_repr('piripiri')

2017-05-23 14:58:56,476 : INFO : <ipython-input-117-d285f72acaad>:69 : vec_repr(MainThread) : 'piripiri' not in Model. Returning [0]'s vector.


array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0

In [126]:
# A word that is in the vocabulary: returns its stored vector.
mw.vec_repr('dog')

array([  5.12695312e-02,  -2.23388672e-02,  -1.72851562e-01,
         1.61132812e-01,  -8.44726562e-02,   5.73730469e-02,
         5.85937500e-02,  -8.25195312e-02,  -1.53808594e-02,
        -6.34765625e-02,   1.79687500e-01,  -4.23828125e-01,
        -2.25830078e-02,  -1.66015625e-01,  -2.51464844e-02,
         1.07421875e-01,  -1.99218750e-01,   1.59179688e-01,
        -1.87500000e-01,  -1.20117188e-01,   1.55273438e-01,
        -9.91210938e-02,   1.42578125e-01,  -1.64062500e-01,
        -8.93554688e-02,   2.00195312e-01,  -1.49414062e-01,
         3.20312500e-01,   3.28125000e-01,   2.44140625e-02,
        -9.71679688e-02,  -8.20312500e-02,  -3.63769531e-02,
        -8.59375000e-02,  -9.86328125e-02,   7.78198242e-03,
        -1.34277344e-02,   5.27343750e-02,   1.48437500e-01,
         3.33984375e-01,   1.66015625e-02,  -2.12890625e-01,
        -1.50756836e-02,   5.24902344e-02,  -1.07421875e-01,
        -8.88671875e-02,   2.49023438e-01,  -7.03125000e-02,
        -1.59912109e-02,