In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tflearn
from tflearn.data_utils import to_categorical

  from ._conv import register_converters as _register_converters


curses is not supported on this machine (please install/reinstall curses for an optimal experience)


In [2]:
# Load the raw review texts and their sentiment labels; neither file has a
# header row, so each frame ends up with a single column named 0.
reviews, labels = (
    pd.read_csv(filename, header=None)
    for filename in ('reviews.txt', 'labels.txt')
)

In [18]:
from collections import Counter

# total_counts maps every token seen in the reviews to its number of
# occurrences. Tokens are produced by splitting each review on single spaces.
total_counts = Counter(
    word
    for review_text in reviews[0]
    for word in review_text.split(' ')
)

print("Total words in data set: ", len(total_counts))

Total words in data set:  74074


In [24]:
# Vocabulary: the 10000 most frequent tokens, ordered from most to least common.
vocab = sorted(
    total_counts,
    key=lambda word: total_counts[word],
    reverse=True,
)[:10000]
print(vocab[:5])
print(len(vocab))

['', 'the', '.', 'and', 'a']
10000


In [20]:
# Show the least frequent token that still made it into the vocabulary,
# together with how often it occurs.
rarest_word = vocab[-1]
print(rarest_word, ': ', total_counts[rarest_word])

fulfilled :  30


In [21]:
# Map each vocabulary word to its position in `vocab`; that position is the
# word's column index in the bag-of-words vectors.
word2idx = dict(zip(vocab, range(len(vocab))))
word2idx

{'': 0,
 'the': 1,
 '.': 2,
 'and': 3,
 'a': 4,
 'of': 5,
 'to': 6,
 'is': 7,
 'br': 8,
 'it': 9,
 'in': 10,
 'i': 11,
 'this': 12,
 'that': 13,
 's': 14,
 'was': 15,
 'as': 16,
 'for': 17,
 'with': 18,
 'movie': 19,
 'but': 20,
 'film': 21,
 'you': 22,
 'on': 23,
 't': 24,
 'not': 25,
 'he': 26,
 'are': 27,
 'his': 28,
 'have': 29,
 'be': 30,
 'one': 31,
 'all': 32,
 'at': 33,
 'they': 34,
 'by': 35,
 'an': 36,
 'who': 37,
 'so': 38,
 'from': 39,
 'like': 40,
 'there': 41,
 'her': 42,
 'or': 43,
 'just': 44,
 'about': 45,
 'out': 46,
 'if': 47,
 'has': 48,
 'what': 49,
 'some': 50,
 'good': 51,
 'can': 52,
 'more': 53,
 'she': 54,
 'when': 55,
 'very': 56,
 'up': 57,
 'time': 58,
 'no': 59,
 'even': 60,
 'my': 61,
 'would': 62,
 'which': 63,
 'story': 64,
 'only': 65,
 'really': 66,
 'see': 67,
 'their': 68,
 'had': 69,
 'we': 70,
 'were': 71,
 'me': 72,
 'well': 73,
 'than': 74,
 'much': 75,
 'get': 76,
 'bad': 77,
 'been': 78,
 'people': 79,
 'will': 80,
 'do': 81,
 'other': 82,
 'a

In [28]:
def text_to_vector(text):
    """Convert a text string into a bag-of-words count vector.

    The text is split on single spaces. Every token present in the
    module-level ``word2idx`` mapping increments the matching position of
    the result; tokens missing from the mapping are ignored. Matching is an
    exact dictionary lookup, so it is case-sensitive and callers must do any
    lower-casing themselves.

    Args:
        text: The string to encode.

    Returns:
        A new 1-D ``np.int_`` array of length ``len(vocab)`` holding the
        number of occurrences of each vocabulary word in ``text``.
    """
    word_vector = np.zeros(len(vocab), dtype=np.int_)
    for word in text.split(' '):
        idx = word2idx.get(word)
        if idx is not None:
            word_vector[idx] += 1
    # word_vector is already a freshly allocated ndarray, so it is returned
    # directly instead of being copied through np.array().
    return word_vector

In [34]:
# Sanity check: encode a sample sentence and look at the first 65 counts.
# The lookup is case-sensitive, so the capitalised 'The' is not counted.
sample_text = ('The tea is for a party to celebrate '
               'the movie so she has no time for a cake fulfilled')
text_to_vector(sample_text)[:65]

array([0, 1, 0, 0, 2, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0])

In [35]:
# Feature matrix: one row per review, one column per vocabulary word.
word_vectors = np.zeros((len(reviews), len(vocab)), dtype=np.int_)
for row_idx, review_text in enumerate(reviews[0]):
    word_vectors[row_idx] = text_to_vector(review_text)

In [36]:
# Peek at the counts of the 23 most frequent words for the first 5 reviews.
word_vectors[0:5, 0:23]

array([[ 18,   9,  27,   1,   4,   4,   6,   4,   0,   2,   2,   5,   0,
          4,   1,   0,   2,   0,   0,   0,   0,   0,   0],
       [  5,   4,   8,   1,   7,   3,   1,   2,   0,   4,   0,   0,   0,
          1,   2,   0,   0,   1,   3,   0,   0,   0,   1],
       [ 78,  24,  12,   4,  17,   5,  20,   2,   8,   8,   2,   1,   1,
          2,   8,   0,   5,   5,   4,   0,   2,   1,   4],
       [167,  53,  23,   0,  22,  23,  13,  14,   8,  10,   8,  12,   9,
          4,  11,   2,  11,   5,  11,   0,   5,   3,   0],
       [ 19,  10,  11,   4,   6,   2,   2,   5,   0,   1,   2,   3,   1,
          0,   0,   0,   3,   1,   0,   1,   0,   0,   0]])

In [37]:
# Encode the labels as integers: 1 where the label is 'positive', 0 otherwise.
Y = (labels == 'positive').astype(np.int_)
records = len(labels)

# Shuffle the record indices with a private, seeded generator so that the
# train/test split is the same on every run.
shuffle = np.arange(records)
np.random.RandomState(42).shuffle(shuffle)

# Fraction of the records used for training; the remainder is the test set.
# (This value was previously stored under the misleading name `test_fraction`.)
train_fraction = 0.9
split_idx = int(records * train_fraction)

train_split, test_split = shuffle[:split_idx], shuffle[split_idx:]
# Labels are one-hot encoded into two columns for the two-unit softmax output.
trainX, trainY = word_vectors[train_split, :], to_categorical(Y.values[train_split], 2)
testX, testY = word_vectors[test_split, :], to_categorical(Y.values[test_split], 2)

In [38]:
def build_model(input_size=10000, hidden_units=(200, 35), learning_rate=0.05):
    """Build a fully connected two-class classifier with TFLearn.

    Calling the function with no arguments builds exactly the network that
    was previously hard-coded: 10000 inputs, hidden layers of 200 and 35
    units, and SGD with a learning rate of 0.05.

    Args:
        input_size: Width of each input vector. It must match the length of
            the vectors fed to the model (the vocabulary size).
        hidden_units: Iterable with the number of units of each hidden
            layer, in order from input to output.
        learning_rate: Learning rate passed to the SGD optimizer.

    Returns:
        A ``tflearn.DNN`` wrapping the network.
    """
    # This resets all parameters and variables, leave this here
    tf.reset_default_graph()

    # Inputs
    net = tflearn.input_data([None, input_size])

    # Hidden layer(s)
    for n_units in hidden_units:
        net = tflearn.fully_connected(net, n_units, activation='ReLU')

    # Output layer: two softmax units, one per class.
    net = tflearn.fully_connected(net, 2, activation='softmax')
    net = tflearn.regression(net, optimizer='sgd',
                             learning_rate=learning_rate,
                             loss='categorical_crossentropy')

    return tflearn.DNN(net)

In [39]:
# Instantiate the network; build_model() also resets the default TF graph.
model = build_model()

In [40]:
# Train for 5 epochs in mini-batches of 64, reporting metrics as it goes.
# validation_set=0.1 asks TFLearn to hold out part of the training data
# for validation.
model.fit(
    trainX,
    trainY,
    n_epoch=5,
    batch_size=64,
    validation_set=0.1,
    show_metric=True,
)

Training Step: 1584  | total loss: 0.54689 | time: 9.496s
| SGD | epoch: 005 | loss: 0.54689 - acc: 0.7529 -- iter: 20224/20250
Training Step: 1585  | total loss: 0.54312 | time: 10.555s
| SGD | epoch: 005 | loss: 0.54312 - acc: 0.7605 | val_loss: 0.49378 - val_acc: 0.7747 -- iter: 20250/20250
--


In [41]:
# Compare column 0 of the predicted probabilities (thresholded at 0.5) with
# column 0 of the one-hot test labels; the share of matches is the accuracy.
predicted_probs = np.array(model.predict(testX))
predictions = (predicted_probs[:, 0] >= 0.5).astype(np.int_)
test_accuracy = np.mean(predictions == testY[:, 0], axis=0)
print("Test accuracy: ", test_accuracy)

Test accuracy:  0.7632


In [42]:
def test_sentence(sentence):
    """Print the model's positive-class probability and verdict for a sentence.

    The sentence is lower-cased, encoded with ``text_to_vector`` and passed
    to the module-level ``model``. Column 1 of the prediction is reported as
    P(positive); the verdict is 'Positive' when it exceeds 0.5 and
    'Negative' otherwise.
    """
    features = text_to_vector(sentence.lower())
    positive_prob = model.predict([features])[0][1]
    verdict = 'Positive' if positive_prob > 0.5 else 'Negative'
    print('Sentence: {}'.format(sentence))
    print('P(positive) = {:.3f} :'.format(positive_prob), verdict)

In [48]:
# Try the classifier on one clearly positive and one clearly negative review.
for sentence in (
    "Moonlight is by far the best movie of 2016.",
    "It's amazing anyone could be talented enough to make something this spectacularly awful",
):
    test_sentence(sentence)

Sentence: Moonlight is by far the best movie of 2016.
P(positive) = 0.609 : Positive
Sentence: It's amazing anyone could be talented enough to make something this spectacularly awful
P(positive) = 0.347 : Negative


In [49]:
# Try a less clear-cut sentence. Tokens that are not in the vocabulary are
# ignored by text_to_vector and contribute nothing to the prediction.
sentence = "MockingBot cannot make me satisfied, only fulfill my minor requirement"
test_sentence(sentence=sentence)

Sentence: MockingBot cannot make me satisfied, only fulfill my minor requirement
P(positive) = 0.501 : Positive
