# Sentiment Classification Using a Convolutional Neural Network

Based on paper by Yoon Kim (2014) 

# Let's grab a Dataset<a id='lesson_1'></a>
The dataset comes from the "Sentiment Classification" lesson of Udacity (taught by Andrew Trask).

In [329]:
data_dir = "./data"

def pretty_print_review_and_label(i):
    """Print the label of review ``i`` followed by the first 80 characters of its text.

    Reads the module-level ``labels`` and ``reviews`` lists.
    """
    label = labels[i]
    preview = reviews[i][:80]
    print(label + "\t:\t" + preview + "...")

# What we know: one review per line.
# `with` guarantees the file is closed even if reading fails.
with open('{}/reviews.txt'.format(data_dir), 'r') as g:
    # Strip only the line terminator; slicing with [:-1] would eat a real
    # character from a final line that has no trailing newline.
    reviews = [line.rstrip('\n') for line in g]

# What we WANT to know: one label per line, normalised to upper case.
with open('{}/labels.txt'.format(data_dir), 'r') as g:
    labels = [line.rstrip('\n').upper() for line in g]

**Note:** The data in `reviews.txt` we're using has already been preprocessed a bit and contains only lower case characters. If we were working from raw data, where we didn't know it was all lower case, we would want to add a step here to convert it. That's so we treat different variations of the same word, like `The`, `the`, and `THE`, all the same way.

In [2]:
len(reviews)

25000

In [3]:
reviews[10]

'this isn  t the comedic robin williams  nor is it the quirky  insane robin williams of recent thriller fame . this is a hybrid of the classic drama without over  dramatization  mixed with robin  s new love of the thriller . but this isn  t a thriller  per se . this is more a mystery  suspense vehicle through which williams attempts to locate a sick boy and his keeper .  br    br   also starring sandra oh and rory culkin  this suspense drama plays pretty much like a news report  until william  s character gets close to achieving his goal .  br    br   i must say that i was highly entertained  though this movie fails to teach  guide  inspect  or amuse . it felt more like i was watching a guy  williams   as he was actually performing the actions  from a third person perspective . in other words  it felt real  and i was able to subscribe to the premise of the story .  br    br   all in all  it  s worth a watch  though it  s definitely not friday  saturday night fare .  br    br   it rates

In [4]:
labels[10]

'POSITIVE'

# Load Word2Vec model trained with 100B words


Pre-trained vectors from https://code.google.com/archive/p/word2vec/
## ...and wrap it up in a ready-to-use class


In [5]:
model = None # cache of Model 

In [308]:
import logging

# One logger shared by every cell of this notebook.
logger = logging.getLogger(__name__)


def set_logging_as(a_level):
    """Set the threshold of the notebook-wide ``logger``.

    a_level: a logging level (e.g. ``logging.DEBUG``, ``logging.INFO``);
    it is passed straight to ``logger.setLevel``.
    """
    logger.setLevel(a_level)


# NOTE: only the level is configured here; an earlier attempt to also set the
# output format through logging.basicConfig(...) was left disabled.

# initialization:
set_logging_as(logging.DEBUG)

In [305]:
logging.getLevelName(logger.getEffectiveLevel())

'DEBUG'

In [306]:
set_logging_as(logging.DEBUG)
logger.info("lalala")

2017-05-26 06:54:57,330 - __main__ - INFO - lalala
2017-05-26 06:54:57,330 : INFO : <ipython-input-306-c0a20823c41c>:2 : <module>(MainThread) : lalala


In [307]:
set_logging_as(logging.CRITICAL)
logger.info("lalala")

In [275]:
# create logger (logging.getLogger returns the same object for the same name,
# so re-running this cell keeps working on the logger configured earlier)
alogger = logging.getLogger(__name__)
alogger.setLevel(logging.DEBUG)

# create formatter
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# Reuse the console handler installed by a previous run of this cell instead
# of stacking a new one each time: every extra handler would print each log
# record once more.
_console_handler_name = 'notebook-console'
ch = next((h for h in alogger.handlers if h.get_name() == _console_handler_name), None)
if ch is None:
    # first run: create the console handler and attach it to the logger
    ch = logging.StreamHandler()
    ch.set_name(_console_handler_name)
    alogger.addHandler(ch)

# (re)apply level and formatter so edits to this cell take effect on re-run
ch.setLevel(logging.DEBUG)
ch.setFormatter(formatter)


In [278]:
logging.getLevelName(alogger.getEffectiveLevel())

'CRITICAL'

In [281]:
alogger.setLevel(logging.DEBUG)

In [282]:
alogger.info("lala")

2017-05-26 06:51:25,759 - __main__ - INFO - lala
2017-05-26 06:51:25,759 : INFO : <ipython-input-282-ebb7babb726a>:1 : <module>(MainThread) : lala


In [316]:
import gensim 
import bisect 
import numpy as np

class ModelWrapper:
    """Convenience wrapper around a gensim word2vec ``KeyedVectors`` model.

    Adds prefix auto-completion (``suggest``), a forgiving ``most_similar``
    and a lookup that maps out-of-vocabulary words to a zero vector
    (``vec_repr``). Relies on the module-level ``data_dir`` and ``logger``.
    """

    def __init__(self, m):
        """
            m: an already loaded model to wrap, or None to (re)load the
               GoogleNews vectors from ``data_dir`` (slow).
        """
        if m is None:
            print("Loading model...")
            self.model = gensim.models.word2vec.KeyedVectors.load_word2vec_format(
                '{}/GoogleNews-vectors-negative300.bin.gz'.format(data_dir), binary=True)
            print("Model succesfully loaded")
        else:
            print("[init] Model provided. If you want me to FORCE re-load it, call ModelWrapper's constructor with 'None'")
            self.model = m
        # sort all the words in the model, so that we can auto-complete queries quickly
        print("Sort all the words in the model, so that we can auto-complete queries quickly...")
        # sorted() is stable, so words that only differ by case keep the
        # model's original relative order.
        self.orig_words = sorted(
            (gensim.utils.to_unicode(word) for word in self.model.index2word),
            key=lambda word: word.lower())  # original letter casing, but sorted as if lowercased
        self.all_words = [word.lower() for word in self.orig_words]  # lowercased, sorted as lowercased

    def suggest(self, term, count=10):
        """
        For a given prefix, return up to `count` (default 10) words of the
        model that start with that prefix (case-insensitive), in their
        original letter casing.
        """
        prefix = gensim.utils.to_unicode(term).strip().lower()
        # all_words is sorted, so every word starting with `prefix` sits in one
        # contiguous run beginning at the bisect position.
        pos = bisect.bisect_left(self.all_words, prefix)
        result = []
        for i in range(pos, min(pos + count, len(self.all_words))):
            if not self.all_words[i].startswith(prefix):
                break  # left the run of matching words
            result.append(self.orig_words[i])
        logger.info("suggested %r: %s", prefix, result)
        return result

    def most_similar(self, positive, negative):
        """
            positive: an array of positive words
            negative: an array of negative words

            Returns {'similars': [(word, similarity), ...]} with at most 5
            entries; the list is empty when the query cannot be answered.
        """
        try:
            result = self.model.most_similar(
                positive=[word.strip() for word in positive if word],
                negative=[word.strip() for word in negative if word],
                topn=5)
        except Exception as err:
            # Best effort: a failed query (e.g. a word missing from the
            # vocabulary) yields no similars, but the reason is logged and
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            logger.warning("most_similar failed for %s vs. %s: %r", positive, negative, err)
            result = []
        logger.info("similars for %s vs. %s: %s", positive, negative, result)
        return {'similars': result}

    def vec_repr(self, word):
        """
            If 'word' belongs in the vocabulary, returns its
            word2vec representation. Otherwise returns a vector of 0's
            of the same length of the other words.
        """
        try:
            return self.model.word_vec(word)
        except KeyError:
            logger.debug("'%s' not in Model. Returning [0]'s vector.", word)
            return np.zeros(self.model.vector_size)
            

In [317]:
mw = ModelWrapper(model)
model = mw.model # just cache in case I re-call this cell

[init] Model provided. If you want me to FORCE re-load it, call ModelWrapper's constructor with 'None'
Sort all the words in the model, so that we can auto-complete queries quickly...


In [335]:
mw.model.syn0.shape


(3000000, 300)

In [343]:
type(mw.model.vocab.items())

dict_items

In [350]:
# my_dictionary = {k: f(v) for k, v in my_dictionary.items()}
mw.model.index2word[:10]

['</s>', 'in', 'for', 'that', 'is', 'on', '##', 'The', 'with', 'said']

In [359]:
random.uniform(0, 1)

0.0245896079533392

In [360]:
{random.uniform(0, 1): w for w in mw.model.index2word[:10]}

{0.12894453711363352: 'on',
 0.1398216756517866: 'said',
 0.26174952224180825: 'that',
 0.2791587442891682: 'in',
 0.38916649884456034: 'The',
 0.42978237804896535: 'is',
 0.5504951118531066: '##',
 0.6961363439704658: 'with',
 0.6964632561858928: '</s>',
 0.8162632196567988: 'for'}

In [355]:
mw.model['w']

array([-0.23339844,  0.06152344, -0.3046875 ,  0.22460938,  0.06591797,
        0.26171875, -0.17089844, -0.13378906, -0.05664062, -0.06884766,
       -0.359375  , -0.16503906, -0.20898438, -0.30273438, -0.08740234,
        0.0546875 ,  0.16503906,  0.296875  , -0.11425781, -0.08300781,
       -0.31835938, -0.15722656,  0.22363281,  0.04150391, -0.22265625,
        0.0279541 , -0.36914062,  0.19628906, -0.06787109,  0.0559082 ,
        0.10839844,  0.02478027, -0.12011719, -0.17480469, -0.23535156,
       -0.00191498, -0.07373047,  0.27734375, -0.22460938, -0.04272461,
       -0.01141357, -0.05444336,  0.29296875, -0.03015137,  0.18945312,
       -0.17285156, -0.17382812, -0.38671875,  0.01940918,  0.06396484,
       -0.31640625,  0.31640625, -0.10742188,  0.140625  ,  0.04956055,
        0.25195312, -0.06298828,  0.10009766,  0.0123291 , -0.33203125,
       -0.10791016,  0.0246582 , -0.32617188, -0.25585938, -0.10791016,
       -0.34960938,  0.11279297, -0.03112793, -0.17578125,  0.30

### Graphemes 2 Phonemes 

In [362]:
from subprocess import check_output
def graphs2phones(s):
    """
        Takes a sentence, returns a list of phoneme strings: the
        whitespace-separated tokens printed by the external `speak` command
        (typically one per word of the original sentence).

        Raises whatever `check_output` raises if `speak` is missing or fails.
    """
    phs = check_output(["speak", "-q", "-x", '-v', 'en-us', s]).decode('utf-8')
    # split() without an argument splits on any run of whitespace and never
    # yields empty strings; split(" ") produced '' for repeated spaces, which
    # the old `w != ' '` filter could not remove.
    return phs.split()

# example: 
graphs2phones('hello world bla and ble')

["h@l'oU", "w'3:ld", "bl'A:", '_:_:and', "bl'i:"]

In [363]:
graphs2phones('fuckk fuck fuc fuk')

["f'Vkk", "f'Vk", "f'Vk", "f'Vk"]

In [8]:
mw.most_similar(positive = ['soccer'], negative = ['messi'])

2017-05-25 12:00:54,132 : INFO : keyedvectors:807 : init_sims(MainThread) : precomputing L2-norms of word weight vectors
2017-05-25 12:01:05,476 : INFO : <ipython-input-6-22cae4a71586>:57 : most_similar(MainThread) : similars for ['soccer'] vs. ['messi']: [('Soccer', 0.48688480257987976), ('lacrosse', 0.4622202515602112), ('softball', 0.4572678506374359), ('Lacrosse', 0.4419728219509125), ('basketball', 0.4305872321128845)]


{'similars': [('Soccer', 0.48688480257987976),
  ('lacrosse', 0.4622202515602112),
  ('softball', 0.4572678506374359),
  ('Lacrosse', 0.4419728219509125),
  ('basketball', 0.4305872321128845)]}

## Let's sanity check the Word2Vec model we just wrapped up

In [9]:
assert np.count_nonzero(mw.vec_repr('piripiri')) == 0, "'piripiri' is present in this model???"
print("Sanity check: all good, 'piripiri' was assigned the empty vector as representation")

2017-05-25 12:01:05,483 : INFO : <ipython-input-6-22cae4a71586>:69 : vec_repr(MainThread) : 'piripiri' not in Model. Returning [0]'s vector.


Sanity check: all good, 'piripiri' was assigned the empty vector as representation


In [10]:
assert np.count_nonzero(mw.vec_repr('dog')) > 0, "'dog' is not present in this model???"
print("Sanity check: all good, 'dog' has a meaningful representation")

Sanity check: all good, 'dog' has a meaningful representation


## Build representation of data 

In [11]:
import spacy # nl processing
nlp = spacy.load('en')


In [163]:
n_words_in_review = 20 # maximum number of words I will take from the review 
empty_string = '<word_not_in_model>' # need a fill-up word for too-short sentences 
assert np.count_nonzero(mw.vec_repr(empty_string)) == 0, "'{}' is present in this model. Choose another empty string".format(empty_string)
print("All good: will use '{}' as empty string.".format(empty_string))


2017-05-25 15:39:53,459 : INFO : <ipython-input-6-22cae4a71586>:69 : vec_repr(MainThread) : '<word_not_in_model>' not in Model. Returning [0]'s vector.


All good: will use '<word_not_in_model>' as empty string.


In [164]:
rev = 10 # tmp variable, just to explore what's going on
reviews[rev]

'this isn  t the comedic robin williams  nor is it the quirky  insane robin williams of recent thriller fame . this is a hybrid of the classic drama without over  dramatization  mixed with robin  s new love of the thriller . but this isn  t a thriller  per se . this is more a mystery  suspense vehicle through which williams attempts to locate a sick boy and his keeper .  br    br   also starring sandra oh and rory culkin  this suspense drama plays pretty much like a news report  until william  s character gets close to achieving his goal .  br    br   i must say that i was highly entertained  though this movie fails to teach  guide  inspect  or amuse . it felt more like i was watching a guy  williams   as he was actually performing the actions  from a third person perspective . in other words  it felt real  and i was able to subscribe to the premise of the story .  br    br   all in all  it  s worth a watch  though it  s definitely not friday  saturday night fare .  br    br   it rates

### Option: Filter words that are "too common" 

In [165]:
probs = [lex.prob for lex in nlp.vocab]

In [166]:
probs.sort()

In [167]:
probs[-1000] # log probability of the 1000th most common word (probs is sorted in ascending order)

-9.41363525390625

In [168]:
# rev_as_list = [w.text for w in nlp(reviews[rev])]

In [169]:
rev_as_list = [w.text for w in nlp(reviews[rev]) if nlp.vocab[w.text].prob < probs[-1000]]

In [170]:
# rev_as_list = reviews[rev].split(".")

In [171]:
avail_words = len(rev_as_list[:n_words_in_review])

In [172]:
num_empty_words = n_words_in_review - avail_words
rev_right_size = rev_as_list[:n_words_in_review] + [empty_string]*num_empty_words

In [173]:
assert len(rev_right_size) == n_words_in_review, "{} != {}".format(len(rev_right_size), n_words_in_review)
print("review[{}]: got {} full words => padded with {} 'empty' words".format(rev, avail_words, num_empty_words))

review[10]: got 20 full words => padded with 0 'empty' words


In [174]:
revs_as_list = [mw.vec_repr(w) for w in rev_right_size]

In [175]:
# revs_as_list[21]

In [177]:
from tqdm import tqdm

In [214]:
# Take the first `batch_size` reviews together with their labels.
batch_size = 100
reviews_in_batch = reviews[:batch_size]
labels_slice = labels[:batch_size]

# Tokenise every review with spaCy, keeping only the tokens whose vocabulary
# probability is below the probs[-1000] threshold (i.e. drop very common words).
batch_as_words = [
    [token.text for token in nlp(review) if nlp.vocab[token.text].prob < probs[-1000]]
    for review in tqdm(reviews_in_batch, total=len(reviews_in_batch))
]

# Pad/truncate each token list and look up the word vectors.
batch_repr = np.array([
    words2repr(tokens, n_words_in_review)
    for tokens in tqdm(batch_as_words, total=len(batch_as_words))
])

# One-hot labels: [1, 0] for 'POSITIVE', [0, 1] for anything else.
batch_labels = np.array(
    [[1, 0] if label == 'POSITIVE' else [0, 1] for label in labels_slice]
).reshape([len(labels_slice), 2])


  0%|          | 0/100 [00:00<?, ?it/s][A
  6%|▌         | 6/100 [00:00<00:01, 56.15it/s][A
 14%|█▍        | 14/100 [00:00<00:01, 61.59it/s][A
 20%|██        | 20/100 [00:00<00:01, 57.17it/s][A
 27%|██▋       | 27/100 [00:00<00:01, 60.48it/s][A
 37%|███▋      | 37/100 [00:00<00:00, 68.58it/s][A
 44%|████▍     | 44/100 [00:00<00:00, 67.04it/s][A
 52%|█████▏    | 52/100 [00:00<00:00, 65.43it/s][A
 60%|██████    | 60/100 [00:00<00:00, 66.36it/s][A
 67%|██████▋   | 67/100 [00:00<00:00, 66.66it/s][A
 74%|███████▍  | 74/100 [00:01<00:00, 58.50it/s][A
 81%|████████  | 81/100 [00:01<00:00, 58.96it/s][A
 88%|████████▊ | 88/100 [00:01<00:00, 61.65it/s][A
 95%|█████████▌| 95/100 [00:01<00:00, 49.52it/s][A
100%|██████████| 100/100 [00:01<00:00, 58.27it/s][A
  0%|          | 0/100 [00:00<?, ?it/s][A2017-05-26 05:56:21,801 : INFO : <ipython-input-6-22cae4a71586>:69 : vec_repr(MainThread) : 'bromwell' not in Model. Returning [0]'s vector.
2017-05-26 05:56:21,802 : INFO : <ipython-inp

In [216]:
batch_labels.shape

(100, 2)

In [205]:
everyone = [ [w.text for w in nlp(rev) if nlp.vocab[w.text].prob < probs[-1000]] for rev in tqdm(reviews_in_batch, total=len(reviews_in_batch))]


  0%|          | 0/100 [00:00<?, ?it/s][A
  1%|          | 1/100 [00:00<00:40,  2.43it/s][A
  2%|▏         | 2/100 [00:00<00:33,  2.97it/s][A
  3%|▎         | 3/100 [00:00<00:34,  2.79it/s][A
  4%|▍         | 4/100 [00:01<00:37,  2.54it/s][A
  6%|▌         | 6/100 [00:01<00:27,  3.42it/s][A
  9%|▉         | 9/100 [00:01<00:19,  4.62it/s][A
 11%|█         | 11/100 [00:01<00:15,  5.81it/s][A
 14%|█▍        | 14/100 [00:01<00:11,  7.51it/s][A
 16%|█▌        | 16/100 [00:02<00:09,  9.24it/s][A
 19%|█▉        | 19/100 [00:02<00:07, 11.02it/s][A
 22%|██▏       | 22/100 [00:02<00:06, 12.90it/s][A
 27%|██▋       | 27/100 [00:02<00:04, 16.25it/s][A
 33%|███▎      | 33/100 [00:02<00:03, 20.46it/s][A
 39%|███▉      | 39/100 [00:02<00:02, 25.01it/s][A
 43%|████▎     | 43/100 [00:02<00:02, 27.08it/s][A
 47%|████▋     | 47/100 [00:02<00:01, 28.48it/s][A
 52%|█████▏    | 52/100 [00:03<00:01, 29.70it/s][A
 58%|█████▊    | 58/100 [00:03<00:01, 32.17it/s][A
 64%|██████▍   | 64/100 [0

In [206]:
def padWords2size(rev_as_list, n_words_in_review, pad_word=None):
    """Return a copy of ``rev_as_list`` cut or padded to ``n_words_in_review`` items.

    rev_as_list: list of words (tokens) of one review.
    n_words_in_review: length of the returned list.
    pad_word: filler appended to too-short lists; when None (the default)
        the module-level ``empty_string`` is used.

    The input list is not modified.
    """
    if pad_word is None:
        pad_word = empty_string  # notebook-wide filler token
    kept = rev_as_list[:n_words_in_review]
    # a non-positive count yields no padding
    return kept + [pad_word] * (n_words_in_review - len(kept))

def words2repr(rev_as_list, n_words_in_review):
    """Turn the words of one review into a fixed-size array of word vectors.

    The word list is first cut/padded to ``n_words_in_review`` entries with
    ``padWords2size``; each entry is then looked up through ``mw.vec_repr``
    and the results are stacked with ``np.array``.
    """
    vectors = []
    for word in padWords2size(rev_as_list, n_words_in_review):
        vectors.append(mw.vec_repr(word))
    return np.array(vectors)

In [207]:
# let's take a slice of data
slice_size = 300

In [208]:
repr_slice = everyone # [:slice_size]

In [209]:
everyones_repr = np.array([words2repr(words_in_review, n_words_in_review) for words_in_review in tqdm(repr_slice, total=len(repr_slice))])


  0%|          | 0/100 [00:00<?, ?it/s][A2017-05-26 05:53:00,473 : INFO : <ipython-input-6-22cae4a71586>:69 : vec_repr(MainThread) : 'bromwell' not in Model. Returning [0]'s vector.
2017-05-26 05:53:00,478 : INFO : <ipython-input-6-22cae4a71586>:69 : vec_repr(MainThread) : 'bromwell' not in Model. Returning [0]'s vector.
2017-05-26 05:53:00,484 : INFO : <ipython-input-6-22cae4a71586>:69 : vec_repr(MainThread) : 'houselessness' not in Model. Returning [0]'s vector.
2017-05-26 05:53:00,486 : INFO : <ipython-input-6-22cae4a71586>:69 : vec_repr(MainThread) : 'carlin' not in Model. Returning [0]'s vector.
2017-05-26 05:53:00,488 : INFO : <ipython-input-6-22cae4a71586>:69 : vec_repr(MainThread) : '   ' not in Model. Returning [0]'s vector.
2017-05-26 05:53:00,490 : INFO : <ipython-input-6-22cae4a71586>:69 : vec_repr(MainThread) : '   ' not in Model. Returning [0]'s vector.
2017-05-26 05:53:00,491 : INFO : <ipython-input-6-22cae4a71586>:69 : vec_repr(MainThread) : '   ' not in Model. Return

In [210]:
everyones_repr.shape

(100, 20, 300)

In [158]:
rev = 10 # tmp variable, just to explore what's going on
# reviews[rev]
labels[222:234]

['POSITIVE',
 'NEGATIVE',
 'POSITIVE',
 'NEGATIVE',
 'POSITIVE',
 'NEGATIVE',
 'POSITIVE',
 'NEGATIVE',
 'POSITIVE',
 'NEGATIVE',
 'POSITIVE',
 'NEGATIVE']

In [159]:
labels_slice = labels[:slice_size]
everyones_labels = np.array([[1, 0] if (w == 'POSITIVE') else [0, 1] for w in labels_slice]).reshape([len(labels_slice),2])

In [160]:
everyones_labels.shape

(300, 2)

In [161]:
# Taken from 'Fake News Challenge' code 
def get_batches(x, y, batch_size=100):
    """Generate consecutive (features, targets) batches along axis 0.

    Only full batches are produced: trailing items that do not fill a whole
    batch are dropped.

    Args:
        x (ndarray): features
        y (ndarray): targets
        batch_size (int): size of the batches (Default 100)
    Yields:
        ndarray: slice of x holding batch_size items along axis 0
        ndarray: slice of y holding batch_size items along axis 0
    """
    n_batches = len(x) // batch_size
    logger.debug("n_batches = {} // {} = {}".format(len(x), batch_size, n_batches))

    # keep only what fits in whole batches
    usable = n_batches * batch_size
    features = x[:usable]
    targets = y[:usable]

    for start in range(0, len(features), batch_size):
        stop = start + batch_size
        yield features[start:stop], targets[start:stop]

# Let's build the CNN


In [240]:
import tensorflow as tf
from sklearn.model_selection import train_test_split
import random
import math

In [309]:
def get_x_and_y_from(idxs, a_size):
    """Build one random batch of word-vector inputs and one-hot labels.

    idxs: indexes into the module-level ``reviews``/``labels`` to draw from.
        The sequence is NOT modified.
    a_size: number of examples to draw (without replacement).

    Returns a tuple (batch_x, batch_y):
        batch_x: ``np.array`` of the ``words2repr`` output of each drawn
            review (words are spaCy tokens whose vocabulary probability is
            below ``probs[-1000]``, padded/cut to ``n_words_in_review``).
        batch_y: array of shape (a_size, 2); [1, 0] for 'POSITIVE',
            [0, 1] otherwise.

    Raises AssertionError if a_size exceeds len(idxs).
    """
    assert a_size <= len(idxs), "Can't choose {} elts from set of {}".format(a_size, len(idxs))
    # random.sample picks a_size distinct positions in random order without
    # shuffling the caller's list in place (random.shuffle(idxs) did).
    batch_idxs = random.sample(list(idxs), a_size)
    # process data: x
    too_common = probs[-1000]  # loop-invariant threshold, looked up once
    batch_x = np.array([
        words2repr(
            [w.text for w in nlp(reviews[i]) if nlp.vocab[w.text].prob < too_common],
            n_words_in_review)
        for i in batch_idxs
    ])
    # y
    batch_y = np.array(
        [[1, 0] if labels[i] == 'POSITIVE' else [0, 1] for i in batch_idxs]
    ).reshape([len(batch_idxs), 2])
    assert batch_x.shape[0] == batch_y.shape[0], "Sanity check failed: reviews and labels have different sizes"
    return (batch_x, batch_y)


In [None]:
unique(labels)

In [315]:
set_logging_as(logging.INFO) 
b_x, b_y = get_x_and_y_from(idxs = [1, 10, 20, 30, 40, 50], a_size = 3)

2017-05-26 10:20:25,194 - __main__ - INFO - 'collette' not in Model. Returning [0]'s vector.
2017-05-26 10:20:25,194 : INFO : <ipython-input-6-22cae4a71586>:69 : vec_repr(MainThread) : 'collette' not in Model. Returning [0]'s vector.
2017-05-26 10:20:25,196 - __main__ - INFO - 'armistead' not in Model. Returning [0]'s vector.
2017-05-26 10:20:25,196 : INFO : <ipython-input-6-22cae4a71586>:69 : vec_repr(MainThread) : 'armistead' not in Model. Returning [0]'s vector.
2017-05-26 10:20:25,197 - __main__ - INFO - 'maupins' not in Model. Returning [0]'s vector.
2017-05-26 10:20:25,197 : INFO : <ipython-input-6-22cae4a71586>:69 : vec_repr(MainThread) : 'maupins' not in Model. Returning [0]'s vector.
2017-05-26 10:20:25,199 - __main__ - INFO - '   ' not in Model. Returning [0]'s vector.
2017-05-26 10:20:25,199 : INFO : <ipython-input-6-22cae4a71586>:69 : vec_repr(MainThread) : '   ' not in Model. Returning [0]'s vector.
2017-05-26 10:20:25,200 - __main__ - INFO - 'armistead' not in Model. Retu

In [328]:
# Build the CNN graph (one convolution -> fully connected -> 2-way output),
# then train and evaluate it inside a single session, writing summaries for
# TensorBoard. Everything lives in a fresh tf.Graph so re-running the cell
# does not pile duplicate nodes onto the default graph.

# let's calm down the console:
set_logging_as(logging.INFO)    

# overall 'with' is to clean graph in tensorboard 
# for other options: https://stackoverflow.com/questions/42847155/tensorboard-scalars-and-graphs-duplicated
#             (eg, another thing we could do --> tf.reset_default_graph()) 
with tf.Graph().as_default():
    n_features_in_word = mw.model.vector_size
    input_channels = 1 # no "color" channels since this is not a picture
    
    # Input: one "image" per review, n_words_in_review rows by n_features_in_word columns.
    with tf.name_scope('x') as scope:
        x = tf.placeholder(tf.float32, [None, n_words_in_review, n_features_in_word, input_channels], name='embedded_sentences')
    
    # Convolution filter 
    filter_height = 5 # number of neighboring words I will take  <================================= TODO 
    filter_width = n_features_in_word # will take full words each time 
    filter_depth = 5 # number of output channels of the convolution; value not tuned <================================= TODO 
    # internal sanity check: 
    assert filter_height <= n_words_in_review  
    print('Taking {} words as neighborhood; generating {} features for filter {}x{}'.format(filter_height, filter_depth, filter_height, filter_width))
    
    
    # Convolutional Layer 
    with tf.name_scope('convolutions') as scope:
        # Weight and bias
        with tf.name_scope('filter_{}_words_to_{}_features'.format(filter_height, filter_depth)) as scope: # scope name encodes the filter size
            # NOTE(review): truncated_normal is called without stddev, so the library default is used — TODO confirm this is intended
            weight = tf.Variable(
                tf.truncated_normal(
                    [filter_height, filter_width, input_channels, filter_depth]), name = 'filter')
            bias = tf.Variable(tf.zeros(filter_depth), name = 'bias2conv')
        # stride 1 over words, stride filter_width over the embedding axis (the filter spans a full word vector)
        conv_layer = tf.nn.conv2d(x, weight, strides = [1,1,filter_width,1], padding='SAME') # , padding='VALID')
        conv_layer = tf.nn.bias_add(conv_layer, bias)
        conv_layer = tf.nn.relu(conv_layer)   
    
#     # Max Pooling 
#     k = filter_width # is this it?  <================================= TODO  
#     conv_layer = tf.nn.max_pool(conv_layer,ksize=[1, k, k, 1],strides=[1, k, k, 1],padding='SAME') # 'same' padding? <================================= TODO          
    
    # Fully connected layer: 
    #      n_words_in_review (because padding='SAME') * 1 (input_channels) * filter_depth to fc_num_neurons 
    fc_num_neurons = 1024  # <============ TODO: choose 
    with tf.name_scope('fully_connected') as scope:
        fc_weight = tf.Variable(
            tf.truncated_normal(
                [n_words_in_review * input_channels * filter_depth, fc_num_neurons]), name = 'conv_2_fully')
        fc_bias = tf.Variable(tf.zeros(fc_num_neurons), name = 'bias2fully')
        keep_prob = tf.placeholder(tf.float32, name = 'keep_prob')
        # flatten the convolution output to the number of rows of fc_weight
        fc1 = tf.reshape(conv_layer, [-1, fc_weight.get_shape().as_list()[0]])
        fc1 = tf.add(tf.matmul(fc1, fc_weight), fc_bias)
        fc1 = tf.nn.relu(fc1)
        fc1 = tf.nn.dropout(fc1, keep_prob)    

        
    with tf.name_scope('output') as scope:
        # Output Layer - class prediction - fc_num_neurons to 2
        output_weight = tf.Variable(
            tf.truncated_normal(
                [fc_num_neurons, 2]), name = 'fully_2_output')
        output_bias = tf.Variable(tf.zeros(2), name = 'bias2output')
        out = tf.add(tf.matmul(fc1, output_weight), output_bias, name = 'pred_sucks_or_not')    
        # objective 
        truth = tf.placeholder(tf.float32, [None, 2], name = 'sucks_or_not') # because output is either 'Good' or 'Bad' 
    
    # define loss and optimizer
#     cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=out, labels=truth))
#     lr = 0.001 # learning rate
#     optimizer = tf.train.GradientDescentOptimizer(learning_rate = lr).minimize(cost)

    with tf.name_scope('cross_entropy'):
        cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=out, labels=truth))
        tf.summary.scalar('cross_entropy', cost)

    lr = 0.0001 # learning rate
    with tf.name_scope('train'):
        optimizer = tf.train.GradientDescentOptimizer(learning_rate = lr).minimize(cost)
        
        
    # Accuracy
    with tf.name_scope('accuracy'):
        with tf.name_scope('correct_prediction'):
            correct_pred = tf.equal(tf.argmax(out, 1), tf.argmax(truth, 1))
            accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
    tf.summary.scalar('accuracy', accuracy)

    # Merge all the summaries; the FileWriters created below write them under log_dir
    merged = tf.summary.merge_all()
    
    # where to log stuff
    log_dir = "/tmp/tensorflow"
    
    # OK, let's run this: 
    with tf.Session() as sess:
        # initialize variables
        sess.run(tf.global_variables_initializer())
        # writer for logs (to be picked up by our instance of the tensorboard)
        train_writer = tf.summary.FileWriter(log_dir + '/train',sess.graph)
        test_writer = tf.summary.FileWriter(log_dir + '/test')
        
        # get all sets ready to go:
#         X_train, X_test, y_train, y_test = train_test_split(everyones_repr, everyones_labels, test_size=0.33, random_state=42)        
#         batch_size = min(75,X_train.shape[0])
#         if (batch_size == X_train.shape[0]):
#             logger.info("Batch size == number of examples ({})".format(batch_size))
#         batches_enum = get_batches(X_train, y_train, batch_size) 
        train_prop = 0.75
        slice_size = 2000 # to test <==================== GET RID OF THIS 
        logger.warning("GET RID OF SLICE SIZE ,LUIS <===========================")
        # split the indexes of the first slice_size reviews into train/test at random
        slice_of_reviews = reviews[:slice_size]
        all_indexes = list(range(len(slice_of_reviews)))
        random.shuffle(all_indexes)
        max_idx_train = math.floor(len(all_indexes) * train_prop)
        all_training_idxs = all_indexes[:max_idx_train]
        all_test_idxs = all_indexes[max_idx_train:]
        # batch size is capped by the number of training examples
        batch_size = min(200,len(all_training_idxs))
        if (batch_size == len(all_training_idxs)):
            logger.info("Batch size == number of training examples ({})".format(len(all_training_idxs)))
        n_batches = len(all_training_idxs) // batch_size
        logger.info("Will train on {} batches of size {} (as I have {} examples to train on)".format(n_batches, batch_size, len(all_training_idxs)))
        # charge! 
        epochs = 5
        logger.info('Starting training/validation cycles for {} epochs (training: {} batches of size {} per epoch)'.format(epochs, n_batches, batch_size))
        for epoch in range(epochs):
            for batch_no in range(n_batches):
                # NOTE(review): get_x_and_y_from reshuffles the index list and takes its first
                # batch_size entries on every call, so batches within an epoch are independent
                # random draws and may overlap rather than partition the training set.
                batch_x, batch_y = get_x_and_y_from(all_training_idxs, batch_size)


                # run one optimisation step (dropout active: keep_prob 0.6)
                _, c, summary = sess.run([optimizer, cost, merged],
                                         feed_dict={
                                             x: batch_x.reshape(batch_x.shape + (1,)),
                                             truth: batch_y,
                                             keep_prob: 0.6})
            # NOTE(review): only the summary of the LAST batch of the epoch is written;
            # `summary` would be undefined here if n_batches were 0.
            train_writer.add_summary(summary, epoch)
            # #### Calculate Accuracy ################################# 
            # data: a random sample of batch_size test examples, not the whole test set
            # NOTE(review): get_x_and_y_from asserts batch_size <= len(all_test_idxs)
            test_x, test_y = get_x_and_y_from(all_test_idxs, batch_size) # len(all_test_idxs))
            # evaluate with dropout disabled (keep_prob 1.0)
            summary, acc = sess.run([merged, accuracy], feed_dict={
                x: test_x.reshape(test_x.shape + (1,)),
                truth: test_y,
                keep_prob: 1.0})
            test_writer.add_summary(summary, epoch)
            logger.info('Accuracy at epoch {}/{}: {}'.format(epoch + 1, epochs, acc))
        # not sure I need this. But whatever. 
        train_writer.close() 
        test_writer.close() 
    logger.info("All done!")

Taking 5 words as neighborhood; generating 5 features for filter 5x300


2017-05-26 12:00:33,631 - __main__ - INFO - Will train on 7 batches of size 200 (as I have 1500 examples to train on)
2017-05-26 12:00:33,631 : INFO : <ipython-input-328-eb2d6611fe80>:120 : <module>(MainThread) : Will train on 7 batches of size 200 (as I have 1500 examples to train on)
2017-05-26 12:00:33,632 - __main__ - INFO - Starting training/validation cycles for 5 epochs (training: 7 batches of size 200 per epoch)
2017-05-26 12:00:33,632 : INFO : <ipython-input-328-eb2d6611fe80>:123 : <module>(MainThread) : Starting training/validation cycles for 5 epochs (training: 7 batches of size 200 per epoch)
2017-05-26 12:00:56,273 - __main__ - INFO - Accuracy at epoch 1/5: 0.4650000035762787
2017-05-26 12:00:56,273 : INFO : <ipython-input-328-eb2d6611fe80>:156 : <module>(MainThread) : Accuracy at epoch 1/5: 0.4650000035762787
2017-05-26 12:01:18,848 - __main__ - INFO - Accuracy at epoch 2/5: 0.49000000953674316
2017-05-26 12:01:18,848 : INFO : <ipython-input-328-eb2d6611fe80>:156 : <modul

In [203]:
len(reviews)

25000

In [74]:
batch_x.reshape(batch_x.shape + (1,)).shape

(33, 50, 300, 1)

In [148]:
35 // 5

7

In [151]:
l.reshape([1,2])

array([[1, 0]])

In [13]:
wd1 = tf.Variable(tf.random_normal([7*7*64, 1024]))

In [14]:
wd1.get_shape().as_list()[0] # weights['wd1'].get_shape().as_list()[0]

3136