# Sentiment Classification and Analysis

In [1]:
def pretty_print_review_and_label(i):
    print(labels[i] + "\t:\t" + reviews[i][:80] + "...")
#imdb movie reviews. Positive - 3+, negative - 2-
g = open('reviews.txt','r') # What we know!
reviews = list(map(lambda x:x[:-1],g.readlines()))
g.close()

g = open('labels.txt','r') # What we WANT to know!
labels = list(map(lambda x:x[:-1].upper(),g.readlines()))
g.close()

In [2]:
len(reviews)

25000

In [3]:
reviews[3655]

'this is a review of  freddy mercury the untold story   theatrical release  chicago int . film festival      one of the phoniest  uninspired and most tedious biographical documentaries i have seen . if the film i saw in a movie theater was originally released on tv  i would plead with its producers and distributors to not fool a paying audience with the false promise of a cinematically worthy documentary feature . even as a made  for  tv documentary  the sentimental piano solos accompanying interviewees sitting in front of flower arrangements in hotel rooms and the pompous  pseudo  literary narration rang more true of a sleepapedic bed infomercial . the only redeeming aspects of this  the untold story of freddy mercury    or  uhm  was it  the untold story of princess diana  are the original concert  video and tv footage   unabridged freddy mercury and queen . testimonial interviews with irrelevant eye witnesses with insights  such as  he was a free spirit    really . . i thought freddy

In [4]:
labels[3655]

'NEGATIVE'

# Reviewing Data, Developing a Predictive Theory

In [5]:
print("labels.txt \t : \t reviews.txt\n")
pretty_print_review_and_label(2137)
pretty_print_review_and_label(12816)
pretty_print_review_and_label(6267)
pretty_print_review_and_label(21934)
pretty_print_review_and_label(5297)
pretty_print_review_and_label(4998)

labels.txt 	 : 	 reviews.txt

NEGATIVE	:	this movie is terrible but it has some good effects .  ...
POSITIVE	:	adrian pasdar is excellent is this film . he makes a fascinating woman .  ...
NEGATIVE	:	comment this movie is impossible . is terrible  very improbable  bad interpretat...
POSITIVE	:	excellent episode movie ala pulp fiction .  days   suicides . it doesnt get more...
NEGATIVE	:	if you haven  t seen this  it  s terrible . it is pure trash . i saw this about ...
POSITIVE	:	this schiffer guy is a real genius  the movie is of excellent quality and both e...


In [7]:
from collections import Counter
import numpy as np

# Creating three Counters for positive counts, negative counts and total counts
positive_counts = Counter()
negative_counts = Counter()
total_counts = Counter()

In [8]:
for r in range(0,len(reviews)):
    if(labels[r]=='POSITIVE'):
        for word in reviews[r].split(" "):
            positive_counts[word]+=1
            total_counts[word] +=1
    if(labels[r]=='NEGATIVE'):
        for word in reviews[r].split(" "):
            negative_counts[word]+=1
            total_counts[word] +=1

In [9]:
# most common words in positive reviews
positive_counts.most_common()

[('', 550468),
 ('the', 173324),
 ('.', 159654),
 ('and', 89722),
 ('a', 83688),
 ('of', 76855),
 ('to', 66746),
 ('is', 57245),
 ('in', 50215),
 ('br', 49235),
 ('it', 48025),
 ('i', 40743),
 ('that', 35630),
 ('this', 35080),
 ('s', 33815),
 ('as', 26308),
 ('with', 23247),
 ('for', 22416),
 ('was', 21917),
 ('film', 20937),
 ('but', 20822),
 ('movie', 19074),
 ('his', 17227),
 ('on', 17008),
 ('you', 16681),
 ('he', 16282),
 ('are', 14807),
 ('not', 14272),
 ('t', 13720),
 ('one', 13655),
 ('have', 12587),
 ('be', 12416),
 ('by', 11997),
 ('all', 11942),
 ('who', 11464),
 ('an', 11294),
 ('at', 11234),
 ('from', 10767),
 ('her', 10474),
 ('they', 9895),
 ('has', 9186),
 ('so', 9154),
 ('like', 9038),
 ('about', 8313),
 ('very', 8305),
 ('out', 8134),
 ('there', 8057),
 ('she', 7779),
 ('what', 7737),
 ('or', 7732),
 ('good', 7720),
 ('more', 7521),
 ('when', 7456),
 ('some', 7441),
 ('if', 7285),
 ('just', 7152),
 ('can', 7001),
 ('story', 6780),
 ('time', 6515),
 ('my', 6488),
 ('g

In [10]:
# most common words in negative reviews
negative_counts.most_common()

[('', 561461),
 ('.', 167538),
 ('the', 163389),
 ('a', 79321),
 ('and', 74385),
 ('of', 69009),
 ('to', 68974),
 ('br', 52637),
 ('is', 50083),
 ('it', 48327),
 ('i', 46880),
 ('in', 43753),
 ('this', 40920),
 ('that', 37615),
 ('s', 31546),
 ('was', 26291),
 ('movie', 24965),
 ('for', 21927),
 ('but', 21781),
 ('with', 20878),
 ('as', 20625),
 ('t', 20361),
 ('film', 19218),
 ('you', 17549),
 ('on', 17192),
 ('not', 16354),
 ('have', 15144),
 ('are', 14623),
 ('be', 14541),
 ('he', 13856),
 ('one', 13134),
 ('they', 13011),
 ('at', 12279),
 ('his', 12147),
 ('all', 12036),
 ('so', 11463),
 ('like', 11238),
 ('there', 10775),
 ('just', 10619),
 ('by', 10549),
 ('or', 10272),
 ('an', 10266),
 ('who', 9969),
 ('from', 9731),
 ('if', 9518),
 ('about', 9061),
 ('out', 8979),
 ('what', 8422),
 ('some', 8306),
 ('no', 8143),
 ('her', 7947),
 ('even', 7687),
 ('can', 7653),
 ('has', 7604),
 ('good', 7423),
 ('bad', 7401),
 ('would', 7036),
 ('up', 6970),
 ('only', 6781),
 ('more', 6730),
 ('

In [10]:
pos_neg_ratios = Counter()
# We consider words to be "common" if they've been used at least 100 times
for term,cnt in list(total_counts.most_common()):
    if(cnt > 100):
        pos_neg_ratio = positive_counts[term] / float(negative_counts[term]+1)
        pos_neg_ratios[term] = pos_neg_ratio

In [11]:
print("Pos-to-neg ratio for 'the' = {}".format(pos_neg_ratios["the"]))
print("Pos-to-neg ratio for 'amazing' = {}".format(pos_neg_ratios["amazing"]))
print("Pos-to-neg ratio for 'terrible' = {}".format(pos_neg_ratios["terrible"]))

Pos-to-neg ratio for 'the' = 1.0607993145235326
Pos-to-neg ratio for 'amazing' = 4.022813688212928
Pos-to-neg ratio for 'terrible' = 0.17744252873563218


In [13]:
for term in pos_neg_ratios:
    
    pos_neg_ratios[term] = np.log(pos_neg_ratios[term])

In [12]:
print("Pos-to-neg ratio for 'the' = {}".format(pos_neg_ratios["the"]))
print("Pos-to-neg ratio for 'amazing' = {}".format(pos_neg_ratios["amazing"]))
print("Pos-to-neg ratio for 'terrible' = {}".format(pos_neg_ratios["terrible"]))

Pos-to-neg ratio for 'the' = 1.0607993145235326
Pos-to-neg ratio for 'amazing' = 4.022813688212928
Pos-to-neg ratio for 'terrible' = 0.17744252873563218


# Transforming Text into Numbers

In [19]:
vocab = set()
for r in reviews:
    for word in r.split(" "):
        vocab.add(word)

In [20]:
vocab_size = len(vocab)
print(vocab_size)

74074


In [22]:
layer_0 = np.zeros(shape=(1,vocab_size))
layer_0

array([[0., 0., 0., ..., 0., 0., 0.]])

In [23]:
layer_0.shape

(1, 74074)

In [25]:
# dictionary of words in the vocabulary mapped to index positions
word2index = {}
for i,word in enumerate(vocab):
    word2index[word] = i

word2index

{'': 0,
 'prerequisites': 1,
 'incandescent': 2,
 'locally': 3,
 'huckleberry': 4,
 'suleiman': 5,
 'puleeze': 6,
 'unfortunates': 7,
 'mistook': 8,
 'horn': 9,
 'geyser': 10,
 'disproven': 11,
 'burglaries': 12,
 'okanagan': 13,
 'monopoly': 14,
 'behl': 15,
 'maillard': 16,
 'plights': 17,
 'abstracted': 18,
 'halfwit': 19,
 'photon': 20,
 'unrushed': 21,
 'kaikini': 22,
 'panabaker': 23,
 'rubric': 24,
 'tropa': 25,
 'meara': 26,
 'rippingly': 27,
 'righteous': 28,
 'confederation': 29,
 'amplified': 30,
 'elly': 31,
 'sisson': 32,
 'icf': 33,
 'naked': 34,
 'unjustly': 35,
 'ode': 36,
 'unflattering': 37,
 'lucifer': 38,
 'gina': 39,
 'othello': 40,
 'grana': 41,
 'voting': 42,
 'imbalance': 43,
 'caroline': 44,
 'peewee': 45,
 'gens': 46,
 'roxann': 47,
 'logics': 48,
 'uninvolved': 49,
 'bret': 50,
 'remembered': 51,
 'tribesmen': 52,
 'arbus': 53,
 'smears': 54,
 'conferred': 55,
 'cocks': 56,
 'pilippinos': 57,
 'donned': 58,
 'timid': 59,
 'assured': 60,
 'roosa': 61,
 'summer

In [28]:
def get_target_for_label(label):
    if(label=='NEGATIVE'): return 0
    if(label=='POSITIVE'): return 1

In [29]:
labels[0]

'POSITIVE'

In [30]:
get_target_for_label(labels[0])

1

In [31]:
labels[1]

'NEGATIVE'

In [32]:
get_target_for_label(labels[1])

0

## Base Model

In [77]:
class SentimentNetwork:
    def __init__(self, reviews, labels, hidden_nodes = 10, learning_rate = 0.1):
        np.random.seed(1)
        self.pre_process_data(reviews, labels)
        self.init_network(len(self.review_vocab),hidden_nodes, 1, learning_rate)

    def pre_process_data(self, reviews, labels):
        
        review_vocab = set()
        for review in reviews:
            for word in review.split(" "):
                review_vocab.add(word)
        self.review_vocab = list(review_vocab)
        
        label_vocab = set()
        for l in labels:
            label_vocab.add(l)
        self.label_vocab = list(label_vocab)
        self.review_vocab_size = len(self.review_vocab)
        self.label_vocab_size = len(self.label_vocab)
        self.word2index = {}
        for j, wd in enumerate(self.review_vocab):
            self.word2index[wd] = j
        # Create a dictionary of labels mapped to index positions
        self.label2index = {}
        for i, lbl in enumerate(self.label_vocab):
            self.label2index[lbl] = i
        
    def init_network(self, input_nodes, hidden_nodes, output_nodes, learning_rate):
        self.input_nodes = input_nodes
        self.hidden_nodes = hidden_nodes
        self.output_nodes = output_nodes
        self.learning_rate = learning_rate
        self.weights_0_1 = np.zeros((self.input_nodes,self.hidden_nodes))
        self.weights_1_2 = self.weights_1_2 = np.random.normal(0.0, self.output_nodes**-0.5, 
                                                (self.hidden_nodes, self.output_nodes))
        self.layer_1 = np.zeros((1,hidden_nodes))
  
                
    def get_target_for_label(self,label):
        if(label=='NEGATIVE'): 
            return 0
        if(label=='POSITIVE'): 
            return 1
        
    def sigmoid(self,x):
        return (1 / (1+np.exp(-x)))
    
    def sigmoid_output_2_derivative(self,output):
        return output * (1-output)

    def trainold(self, training_reviews_raw, training_labels):
        
        train_reviews = []
        for review in training_reviews_raw:
            indices = set()
            for word in review.split(" "):
                if(word in self.word2index.keys()):
                    indices.add(self.word2index[word])
            train_reviews.append(indices)
        
        assert(len(training_reviews_raw) == len(training_labels))
        correct_so_far = 0
        start = time.time()
        for i in range(len(training_reviews_raw)):
            review = training_reviews_raw[i]
            label = training_labels[i]
            self.layer_1 *= 0
            for index in review:
                self.layer_1 += self.weights_0_1[index]
            layer_2 = self.sigmoid(layer_1.dot(self.weights_1_2))
            layer2_error = layer_2 - self.get_target_for_label(label)
            layer2_del = layer2_error * self.sigmoid_output_2_derivative(layer_2)
            layer1_error = layer2_del.dot(self.weights_1_2.T)
            layer1_del = layer1_error
            self.weights_1_2 -= self.layer_1.T.dot(layer2_del) * self.learning_rate # update hidden-to-output weights with gradient descent step
            
            for index in review:
                self.weights_0_1 -= layer1_del[0] * self.learning_rate

            if(layer_2>=0.5 and label=='POSITIVE'):
                correct_so_far+=1
            elif(layer_2<0.5 and label =='NEGATIVE'):
                correct_so_far+=1

            elapsed_time = float(time.time() - start)
            reviews_per_second = i / elapsed_time if elapsed_time > 0 else 0
            
            sys.stdout.write("\rProgress:" + str(100 * i/float(len(training_reviews)))[:4] \
                             + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] \
                             + " #Correct:" + str(correct_so_far) + " #Trained:" + str(i+1) \
                             + " Training Accuracy:" + str(correct_so_far * 100 / float(i+1))[:4] + "%")
            if(i % 2500 == 0):
                print("")
    
    def train(self, training_reviews_raw, training_labels):
        training_reviews = list()
        for review in training_reviews_raw:
            indices = set()
            for word in review.split(" "):
                if(word in self.word2index.keys()):
                    indices.add(self.word2index[word])
            training_reviews.append(list(indices))

        assert(len(training_reviews) == len(training_labels))
        correct_so_far = 0
        start = time.time()
        for i in range(len(training_reviews)):
            review = training_reviews[i]
            label = training_labels[i]
            self.layer_1 *= 0
            for index in review:
                self.layer_1 += self.weights_0_1[index]

            layer_2 = self.sigmoid(self.layer_1.dot(self.weights_1_2))            
            layer_2_error = layer_2 - self.get_target_for_label(label) # Output layer error is the difference between desired target and actual output.
            layer_2_delta = layer_2_error * self.sigmoid_output_2_derivative(layer_2)
            layer_1_error = layer_2_delta.dot(self.weights_1_2.T) # errors propagated to the hidden layer
            layer_1_delta = layer_1_error 
            self.weights_1_2 -= self.layer_1.T.dot(layer_2_delta) * self.learning_rate
            for index in review:
                self.weights_0_1[index] -= layer_1_delta[0] * self.learning_rate 
            if(layer_2 >= 0.5 and label == 'POSITIVE'):
                correct_so_far += 1
            elif(layer_2 < 0.5 and label == 'NEGATIVE'):
                correct_so_far += 1
            elapsed_time = float(time.time() - start)
            reviews_per_second = i / elapsed_time if elapsed_time > 0 else 0
            
            sys.stdout.write("\rProgress:" + str(100 * i/float(len(training_reviews)))[:4] \
                             + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] \
                             + " #Correct:" + str(correct_so_far) + " #Trained:" + str(i+1) \
                             + " Training Accuracy:" + str(correct_so_far * 100 / float(i+1))[:4] + "%")
            if(i % 2500 == 0):
                print("")
            
    def test(self, testing_reviews, testing_labels):
        
        correct = 0
        start = time.time()
        for i in range(len(testing_reviews)):
            pred = self.run(testing_reviews[i])
            if(pred == testing_labels[i]):
                correct += 1
            elapsed_time = float(time.time() - start)
            reviews_per_second = i / elapsed_time if elapsed_time > 0 else 0
            
            sys.stdout.write("\rProgress:" + str(100 * i/float(len(testing_reviews)))[:4] \
                             + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] \
                             + " #Correct:" + str(correct) + " #Tested:" + str(i+1) \
                             + " Testing Accuracy:" + str(correct * 100 / float(i+1))[:4] + "%")
    
    def run_old(self, review):
        self.layer_1 *= 0
        indices = set()
        for word in review.lower.split(" "):
            if(word in self.word2index.leys()):
                indices.add(self.word2index[word])
        for index in indices:
            self.layer_1 += self.weights_0_1[index]
        
        layer_2 = self.sigmoid(self.layer_1.dot(self.weights_1_2))
        if(self.layer_2[0]>=0.5):
            return 'POSITIVE'
        else:
            return 'NEGATIVE'
    
    def run(self, review):
        self.layer_1 *= 0
        unique_indices = set()
        for word in review.split(" "):
            word = word.lower()
            if word in self.word2index.keys():
                unique_indices.add(self.word2index[word])
        for index in unique_indices:
            self.layer_1 += self.weights_0_1[index]
        layer_2 = self.sigmoid(self.layer_1.dot(self.weights_1_2))
        if(layer_2[0] >= 0.5):
            return "POSITIVE"
        else:
            return "NEGATIVE"

In [81]:
mlp = SentimentNetwork(reviews[:-1000],labels[:-1000], learning_rate=0.1)
mlp.train(reviews[:-1000],labels[:-1000])
positive_counts = Counter()
negative_counts = Counter()
total_counts = Counter()
for r in range(0,len(reviews)):
    if(labels[r]=='POSITIVE'):
        for word in reviews[r].split(" "):
            positive_counts[word]+=1
            total_counts[word] +=1
    if(labels[r]=='NEGATIVE'):
        for word in reviews[r].split(" "):
            negative_counts[word]+=1
            total_counts[word] +=1

Progress:0.0% Speed(reviews/sec):0.0 #Correct:1 #Trained:1 Training Accuracy:100.%
Progress:10.4% Speed(reviews/sec):760.5 #Correct:1804 #Trained:2501 Training Accuracy:72.1%
Progress:20.8% Speed(reviews/sec):720.2 #Correct:3784 #Trained:5001 Training Accuracy:75.6%
Progress:31.2% Speed(reviews/sec):718.7 #Correct:5869 #Trained:7501 Training Accuracy:78.2%
Progress:41.6% Speed(reviews/sec):727.7 #Correct:8003 #Trained:10001 Training Accuracy:80.0%
Progress:52.0% Speed(reviews/sec):692.9 #Correct:10141 #Trained:12501 Training Accuracy:81.1%
Progress:62.5% Speed(reviews/sec):697.7 #Correct:12276 #Trained:15001 Training Accuracy:81.8%
Progress:72.9% Speed(reviews/sec):700.6 #Correct:14391 #Trained:17501 Training Accuracy:82.2%
Progress:83.3% Speed(reviews/sec):703.3 #Correct:16567 #Trained:20001 Training Accuracy:82.8%
Progress:93.7% Speed(reviews/sec):704.1 #Correct:18749 #Trained:22501 Training Accuracy:83.3%
Progress:99.9% Speed(reviews/sec):705.4 #Correct:20072 #Trained:24000 Training

In [82]:
mlp.test(reviews[-1000:],labels[-1000:])

Progress:0.0% Speed(reviews/sec):0.0 #Correct:1 #Tested:1 Testing Accuracy:100.%Progress:0.1% Speed(reviews/sec):501.2 #Correct:1 #Tested:2 Testing Accuracy:50.0%Progress:0.2% Speed(reviews/sec):1002. #Correct:2 #Tested:3 Testing Accuracy:66.6%Progress:0.3% Speed(reviews/sec):1002. #Correct:3 #Tested:4 Testing Accuracy:75.0%Progress:0.4% Speed(reviews/sec):1336. #Correct:4 #Tested:5 Testing Accuracy:80.0%Progress:0.5% Speed(reviews/sec):1671. #Correct:5 #Tested:6 Testing Accuracy:83.3%Progress:0.6% Speed(reviews/sec):1504. #Correct:6 #Tested:7 Testing Accuracy:85.7%Progress:0.7% Speed(reviews/sec):1403. #Correct:7 #Tested:8 Testing Accuracy:87.5%Progress:0.8% Speed(reviews/sec):1604. #Correct:8 #Tested:9 Testing Accuracy:88.8%Progress:0.9% Speed(reviews/sec):1289. #Correct:9 #Tested:10 Testing Accuracy:90.0%Progress:1.0% Speed(reviews/sec):1244. #Correct:10 #Tested:11 Testing Accuracy:90.9%Progress:1.1% Speed(reviews/sec):1223. #Correct:11 #Tested:12 Testing Accuracy:91.6%

Progress:23.7% Speed(reviews/sec):1019. #Correct:207 #Tested:238 Testing Accuracy:86.9%Progress:23.8% Speed(reviews/sec):1019. #Correct:208 #Tested:239 Testing Accuracy:87.0%Progress:23.9% Speed(reviews/sec):1019. #Correct:209 #Tested:240 Testing Accuracy:87.0%Progress:24.0% Speed(reviews/sec):1019. #Correct:210 #Tested:241 Testing Accuracy:87.1%Progress:24.1% Speed(reviews/sec):1019. #Correct:210 #Tested:242 Testing Accuracy:86.7%Progress:24.2% Speed(reviews/sec):1015. #Correct:211 #Tested:243 Testing Accuracy:86.8%Progress:24.3% Speed(reviews/sec):1015. #Correct:212 #Tested:244 Testing Accuracy:86.8%Progress:24.4% Speed(reviews/sec):1015. #Correct:213 #Tested:245 Testing Accuracy:86.9%Progress:24.5% Speed(reviews/sec):1015. #Correct:214 #Tested:246 Testing Accuracy:86.9%Progress:24.6% Speed(reviews/sec):1019. #Correct:215 #Tested:247 Testing Accuracy:87.0%Progress:24.7% Speed(reviews/sec):1019. #Correct:216 #Tested:248 Testing Accuracy:87.0%Progress:24.8% Speed(reviews/se

Progress:43.1% Speed(reviews/sec):1023. #Correct:375 #Tested:432 Testing Accuracy:86.8%Progress:43.2% Speed(reviews/sec):1024. #Correct:376 #Tested:433 Testing Accuracy:86.8%Progress:43.3% Speed(reviews/sec):1023. #Correct:377 #Tested:434 Testing Accuracy:86.8%Progress:43.4% Speed(reviews/sec):1023. #Correct:378 #Tested:435 Testing Accuracy:86.8%Progress:43.5% Speed(reviews/sec):1023. #Correct:379 #Tested:436 Testing Accuracy:86.9%Progress:43.6% Speed(reviews/sec):1026. #Correct:380 #Tested:437 Testing Accuracy:86.9%Progress:43.7% Speed(reviews/sec):1023. #Correct:381 #Tested:438 Testing Accuracy:86.9%Progress:43.8% Speed(reviews/sec):1023. #Correct:382 #Tested:439 Testing Accuracy:87.0%Progress:43.9% Speed(reviews/sec):1023. #Correct:383 #Tested:440 Testing Accuracy:87.0%Progress:44.0% Speed(reviews/sec):1021. #Correct:384 #Tested:441 Testing Accuracy:87.0%Progress:44.1% Speed(reviews/sec):1023. #Correct:385 #Tested:442 Testing Accuracy:87.1%Progress:44.2% Speed(reviews/se

Progress:62.4% Speed(reviews/sec):1002. #Correct:539 #Tested:625 Testing Accuracy:86.2%Progress:62.5% Speed(reviews/sec):999.4 #Correct:539 #Tested:626 Testing Accuracy:86.1%Progress:62.6% Speed(reviews/sec):999.3 #Correct:539 #Tested:627 Testing Accuracy:85.9%Progress:62.7% Speed(reviews/sec):1000. #Correct:540 #Tested:628 Testing Accuracy:85.9%Progress:62.8% Speed(reviews/sec):1001. #Correct:541 #Tested:629 Testing Accuracy:86.0%Progress:62.9% Speed(reviews/sec):1001. #Correct:542 #Tested:630 Testing Accuracy:86.0%Progress:63.0% Speed(reviews/sec):1000. #Correct:542 #Tested:631 Testing Accuracy:85.8%Progress:63.1% Speed(reviews/sec):997.9 #Correct:543 #Tested:632 Testing Accuracy:85.9%Progress:63.2% Speed(reviews/sec):999.5 #Correct:544 #Tested:633 Testing Accuracy:85.9%Progress:63.3% Speed(reviews/sec):997.9 #Correct:544 #Tested:634 Testing Accuracy:85.8%Progress:63.4% Speed(reviews/sec):996.3 #Correct:544 #Tested:635 Testing Accuracy:85.6%Progress:63.5% Speed(reviews/se

Progress:82.0% Speed(reviews/sec):995.3 #Correct:694 #Tested:821 Testing Accuracy:84.5%Progress:82.1% Speed(reviews/sec):995.3 #Correct:694 #Tested:822 Testing Accuracy:84.4%Progress:82.2% Speed(reviews/sec):995.2 #Correct:694 #Tested:823 Testing Accuracy:84.3%Progress:82.3% Speed(reviews/sec):995.4 #Correct:694 #Tested:824 Testing Accuracy:84.2%Progress:82.4% Speed(reviews/sec):996.6 #Correct:695 #Tested:825 Testing Accuracy:84.2%Progress:82.5% Speed(reviews/sec):996.6 #Correct:695 #Tested:826 Testing Accuracy:84.1%Progress:82.6% Speed(reviews/sec):993.0 #Correct:696 #Tested:827 Testing Accuracy:84.1%Progress:82.7% Speed(reviews/sec):994.2 #Correct:697 #Tested:828 Testing Accuracy:84.1%Progress:82.8% Speed(reviews/sec):993.0 #Correct:698 #Tested:829 Testing Accuracy:84.1%Progress:82.9% Speed(reviews/sec):993.0 #Correct:699 #Tested:830 Testing Accuracy:84.2%Progress:83.0% Speed(reviews/sec):994.2 #Correct:700 #Tested:831 Testing Accuracy:84.2%Progress:83.1% Speed(reviews/se

# Further Noise Reduction<a id='lesson_6'></a>

In [14]:
# words most frequently seen in a review with a "POSITIVE" label
pos_neg_ratios.most_common()

[('edie', 109.0),
 ('paulie', 59.0),
 ('felix', 23.4),
 ('polanski', 16.833333333333332),
 ('matthau', 16.555555555555557),
 ('victoria', 14.6),
 ('mildred', 13.5),
 ('gandhi', 12.666666666666666),
 ('flawless', 11.6),
 ('superbly', 9.583333333333334),
 ('perfection', 8.666666666666666),
 ('astaire', 8.5),
 ('captures', 7.68),
 ('voight', 7.615384615384615),
 ('wonderfully', 7.552631578947368),
 ('powell', 7.230769230769231),
 ('brosnan', 7.0625),
 ('lily', 6.823529411764706),
 ('bakshi', 6.705882352941177),
 ('lincoln', 6.695652173913044),
 ('refreshing', 6.392857142857143),
 ('breathtaking', 6.3478260869565215),
 ('bourne', 6.346153846153846),
 ('lemmon', 6.333333333333333),
 ('delightful', 6.051282051282051),
 ('flynn', 6.0476190476190474),
 ('andrews', 5.909090909090909),
 ('homer', 5.866666666666666),
 ('beautifully', 5.828125),
 ('soccer', 5.8),
 ('elvira', 5.695652173913044),
 ('underrated', 5.583333333333333),
 ('gripping', 5.565217391304348),
 ('superb', 5.524271844660194),
 (

In [15]:
# words most frequently seen in a review with a "NEGATIVE" label
list(reversed(pos_neg_ratios.most_common()))[0:30]

[('boll', 0.006944444444444444),
 ('uwe', 0.00980392156862745),
 ('seagal', 0.026143790849673203),
 ('unwatchable', 0.038461538461538464),
 ('stinker', 0.04040404040404041),
 ('mst', 0.05232558139534884),
 ('incoherent', 0.05303030303030303),
 ('unfunny', 0.06772908366533864),
 ('waste', 0.0728476821192053),
 ('blah', 0.07650273224043716),
 ('horrid', 0.08333333333333333),
 ('pointless', 0.08583690987124463),
 ('atrocious', 0.08839779005524862),
 ('redeeming', 0.09364548494983277),
 ('prom', 0.09433962264150944),
 ('drivel', 0.09565217391304348),
 ('lousy', 0.09950248756218906),
 ('worst', 0.10157194679564692),
 ('laughable', 0.1038961038961039),
 ('awful', 0.10783055198973042),
 ('poorly', 0.10852713178294573),
 ('wasting', 0.11029411764705882),
 ('remotely', 0.1111111111111111),
 ('existent', 0.125),
 ('boredom', 0.136),
 ('miserably', 0.13636363636363635),
 ('sucks', 0.13709677419354838),
 ('uninspired', 0.13761467889908258),
 ('lame', 0.13782542113323124),
 ('insult', 0.13829787234

In [16]:
from bokeh.models import ColumnDataSource, LabelSet
from bokeh.plotting import figure, show, output_file
from bokeh.io import output_notebook
output_notebook()

In [17]:
hist, edges = np.histogram(list(map(lambda x:x[1],pos_neg_ratios.most_common())), density=True, bins=100, normed=True)

p = figure(tools="pan,wheel_zoom,reset,save",
           toolbar_location="above",
           title="Word Positive/Negative Affinity Distribution")
p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:], line_color="#555555")
show(p)

  """Entry point for launching an IPython kernel.


In [18]:
frequency_frequency = Counter()

for word, cnt in total_counts.most_common():
    frequency_frequency[cnt] += 1

In [19]:
hist, edges = np.histogram(list(map(lambda x:x[1],frequency_frequency.most_common())), density=True, bins=100, normed=True)

p = figure(tools="pan,wheel_zoom,reset,save",
           toolbar_location="above",
           title="The frequency distribution of the words in our corpus")
p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:], line_color="#555555")
show(p)

  """Entry point for launching an IPython kernel.


# Reducing the Vocabulary for Noise Reduction<a id='project_6'></a>

In [27]:
import time
import sys
import numpy as np

class SentimentNetwork:
    def __init__(self, reviews, labels, min_count, polarity_cutoff, hidden_nodes = 10, learning_rate = 0.1):
        np.random.seed(1)
        self.min_count = min_count
        self.polarity_cutoff = polarity_cutoff
        self.pre_process_data(reviews, labels, min_count, polarity_cutoff)
        self.init_network(len(self.review_vocab),hidden_nodes, 1, learning_rate)

    def pre_process_data(self, reviews, labels, min_count, polarity_cutoff):
        negative_counts = Counter()
        total_counts = Counter()
        for r in range(0,len(reviews)):
            if(labels[r]=='POSITIVE'):
                for word in reviews[r].split(" "):
                    positive_counts[word]+=1
                    total_counts[word] +=1
            if(labels[r]=='NEGATIVE'):
                for word in reviews[r].split(" "):
                    negative_counts[word]+=1
                    total_counts[word] +=1
        
        pos_neg_ratios = Counter()
        for term,cnt in list(total_counts.most_common()):
            if((cnt >= 50) and (np.abs(positive_counts[term])>=polarity_cutoff)):
                pos_neg_ratio = positive_counts[term] / float(negative_counts[term]+1)
                pos_neg_ratios[term] = pos_neg_ratio
        for term in pos_neg_ratios:
            pos_neg_ratios[term] = np.log(pos_neg_ratios[term])
        
             
        
        review_vocab = set()
        for review in reviews:
            for word in review.split(" "):
                review_vocab.add(word)
        self.review_vocab = list(review_vocab)
        
        label_vocab = set()
        for l in labels:
            label_vocab.add(l)
        self.label_vocab = list(label_vocab)
        self.review_vocab_size = len(self.review_vocab)
        self.label_vocab_size = len(self.label_vocab)
        self.word2index = {}

        for j, wd in enumerate(self.review_vocab):
            self.word2index[wd] = j
        self.label2index = {}
        for i, lbl in enumerate(self.label_vocab):
            self.label2index[lbl] = i
        
    def init_network(self, input_nodes, hidden_nodes, output_nodes, learning_rate):
        self.input_nodes = input_nodes
        self.hidden_nodes = hidden_nodes
        self.output_nodes = output_nodes
        self.learning_rate = learning_rate
        self.weights_0_1 = np.zeros((self.input_nodes,self.hidden_nodes))
        self.weights_1_2 = self.weights_1_2 = np.random.normal(0.0, self.output_nodes**-0.5, 
                                                (self.hidden_nodes, self.output_nodes))
        self.layer_1 = np.zeros((1,hidden_nodes))
  
                
    def get_target_for_label(self,label):
        if(label=='NEGATIVE'): 
            return 0
        if(label=='POSITIVE'): 
            return 1
        
    def sigmoid(self,x):
        return (1 / (1+np.exp(-x)))
    
    def sigmoid_output_2_derivative(self,output):
        return output * (1-output)

    def train(self, training_reviews_raw, training_labels):
        training_reviews = list()
        for review in training_reviews_raw:
            indices = set()
            for word in review.split(" "):
                if(word in self.word2index.keys()):
                    indices.add(self.word2index[word])
            training_reviews.append(list(indices))
        assert(len(training_reviews) == len(training_labels))
        correct_so_far = 0
        start = time.time()
        for i in range(len(training_reviews)):
            review = training_reviews[i]
            label = training_labels[i]
            self.layer_1 *= 0
            for index in review:
                self.layer_1 += self.weights_0_1[index]
            layer_2 = self.sigmoid(self.layer_1.dot(self.weights_1_2))            
            layer_2_error = layer_2 - self.get_target_for_label(label)
            layer_2_delta = layer_2_error * self.sigmoid_output_2_derivative(layer_2)

            layer_1_error = layer_2_delta.dot(self.weights_1_2.T)
            layer_1_delta = layer_1_error
            self.weights_1_2 -= self.layer_1.T.dot(layer_2_delta) * self.learning_rate 
            for index in review:
                self.weights_0_1[index] -= layer_1_delta[0] * self.learning_rate
            if(layer_2 >= 0.5 and label == 'POSITIVE'):
                correct_so_far += 1
            elif(layer_2 < 0.5 and label == 'NEGATIVE'):
                correct_so_far += 1
             
            elapsed_time = float(time.time() - start)
            reviews_per_second = i / elapsed_time if elapsed_time > 0 else 0
            
            sys.stdout.write("\rProgress:" + str(100 * i/float(len(training_reviews)))[:4] \
                             + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] \
                             + " #Correct:" + str(correct_so_far) + " #Trained:" + str(i+1) \
                             + " Training Accuracy:" + str(correct_so_far * 100 / float(i+1))[:4] + "%")
            if(i % 2500 == 0):
                print("")
            
    def test(self, testing_reviews, testing_labels):
        correct = 0
        start = time.time()
        for i in range(len(testing_reviews)):
            pred = self.run(testing_reviews[i])
            if(pred == testing_labels[i]):
                correct += 1
            elapsed_time = float(time.time() - start)
            reviews_per_second = i / elapsed_time if elapsed_time > 0 else 0
            
            sys.stdout.write("\rProgress:" + str(100 * i/float(len(testing_reviews)))[:4] \
                             + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] \
                             + " #Correct:" + str(correct) + " #Tested:" + str(i+1) \
                             + " Testing Accuracy:" + str(correct * 100 / float(i+1))[:4] + "%")
    
    
    def run(self, review):
        self.layer_1 *= 0
        indices = set()
        for word in review.split(" "):
            word = word.lower()
            if word in self.word2index.keys():
                indices.add(self.word2index[word])
        for index in indices:
            self.layer_1 += self.weights_0_1[index]
        layer_2 = self.sigmoid(self.layer_1.dot(self.weights_1_2))
        if(layer_2[0] >= 0.5):
            return "POSITIVE"
        else:
            return "NEGATIVE"

In [25]:
mlp = SentimentNetwork(reviews[:-1000],labels[:-1000],min_count=20,polarity_cutoff=0.05,learning_rate=0.01)
mlp.train(reviews[:-1000],labels[:-1000])

Progress:0.0% Speed(reviews/sec):0.0 #Correct:1 #Trained:1 Training Accuracy:100.%
Progress:10.4% Speed(reviews/sec):692.3 #Correct:1962 #Trained:2501 Training Accuracy:78.4%
Progress:20.8% Speed(reviews/sec):677.2 #Correct:4002 #Trained:5001 Training Accuracy:80.0%
Progress:31.2% Speed(reviews/sec):677.9 #Correct:6120 #Trained:7501 Training Accuracy:81.5%
Progress:41.6% Speed(reviews/sec):682.7 #Correct:8271 #Trained:10001 Training Accuracy:82.7%
Progress:48.9% Speed(reviews/sec):682.5 #Correct:9780 #Trained:11758 Training Accuracy:83.1%Progress:52.0% Speed(reviews/sec):667.3 #Correct:10431 #Trained:12501 Training Accuracy:83.4%
Progress:62.5% Speed(reviews/sec):645.6 #Correct:12565 #Trained:15001 Training Accuracy:83.7%
Progress:72.9% Speed(reviews/sec):647.9 #Correct:14670 #Trained:17501 Training Accuracy:83.8%
Progress:83.3% Speed(reviews/sec):649.7 #Correct:16833 #Trained:20001 Training Accuracy:84.1%
Progress:93.7% Speed(reviews/sec):649.4 #Correct:19015 #Trained:22501 Training A

In [29]:
mlp = SentimentNetwork(reviews[:-1000],labels[:-1000],min_count=20,polarity_cutoff=0.05,learning_rate=0.01)
mlp.test(reviews[-1000:],labels[-1000:])

Progress:0.0% Speed(reviews/sec):0.0 #Correct:1 #Tested:1 Testing Accuracy:100.%Progress:0.1% Speed(reviews/sec):456.7 #Correct:1 #Tested:2 Testing Accuracy:50.0%Progress:0.2% Speed(reviews/sec):913.4 #Correct:2 #Tested:3 Testing Accuracy:66.6%Progress:0.3% Speed(reviews/sec):743.3 #Correct:2 #Tested:4 Testing Accuracy:50.0%Progress:0.4% Speed(reviews/sec):991.1 #Correct:3 #Tested:5 Testing Accuracy:60.0%Progress:0.5% Speed(reviews/sec):993.2 #Correct:3 #Tested:6 Testing Accuracy:50.0%Progress:0.6% Speed(reviews/sec):994.9 #Correct:4 #Tested:7 Testing Accuracy:57.1%Progress:0.7% Speed(reviews/sec):871.9 #Correct:4 #Tested:8 Testing Accuracy:50.0%Progress:0.8% Speed(reviews/sec):996.5 #Correct:5 #Tested:9 Testing Accuracy:55.5%Progress:0.9% Speed(reviews/sec):749.1 #Correct:5 #Tested:10 Testing Accuracy:50.0%Progress:1.0% Speed(reviews/sec):768.4 #Correct:6 #Tested:11 Testing Accuracy:54.5%Progress:1.1% Speed(reviews/sec):845.3 #Correct:6 #Tested:12 Testing Accuracy:50.0%Pr

Progress:19.4% Speed(reviews/sec):1007. #Correct:98 #Tested:195 Testing Accuracy:50.2%Progress:19.5% Speed(reviews/sec):1007. #Correct:98 #Tested:196 Testing Accuracy:50.0%Progress:19.6% Speed(reviews/sec):1007. #Correct:99 #Tested:197 Testing Accuracy:50.2%Progress:19.7% Speed(reviews/sec):1007. #Correct:99 #Tested:198 Testing Accuracy:50.0%Progress:19.8% Speed(reviews/sec):1012. #Correct:100 #Tested:199 Testing Accuracy:50.2%Progress:19.9% Speed(reviews/sec):1012. #Correct:100 #Tested:200 Testing Accuracy:50.0%Progress:20.0% Speed(reviews/sec):1017. #Correct:101 #Tested:201 Testing Accuracy:50.2%Progress:20.1% Speed(reviews/sec):1017. #Correct:101 #Tested:202 Testing Accuracy:50.0%Progress:20.2% Speed(reviews/sec):1017. #Correct:102 #Tested:203 Testing Accuracy:50.2%Progress:20.3% Speed(reviews/sec):1017. #Correct:102 #Tested:204 Testing Accuracy:50.0%Progress:20.4% Speed(reviews/sec):1016. #Correct:103 #Tested:205 Testing Accuracy:50.2%Progress:20.5% Speed(reviews/sec):1

Progress:38.5% Speed(reviews/sec):1005. #Correct:193 #Tested:386 Testing Accuracy:50.0%Progress:38.6% Speed(reviews/sec):1005. #Correct:194 #Tested:387 Testing Accuracy:50.1%Progress:38.7% Speed(reviews/sec):1007. #Correct:194 #Tested:388 Testing Accuracy:50.0%Progress:38.8% Speed(reviews/sec):1007. #Correct:195 #Tested:389 Testing Accuracy:50.1%Progress:38.9% Speed(reviews/sec):1007. #Correct:195 #Tested:390 Testing Accuracy:50.0%Progress:39.0% Speed(reviews/sec):1007. #Correct:196 #Tested:391 Testing Accuracy:50.1%Progress:39.1% Speed(reviews/sec):1010. #Correct:196 #Tested:392 Testing Accuracy:50.0%Progress:39.2% Speed(reviews/sec):1010. #Correct:197 #Tested:393 Testing Accuracy:50.1%Progress:39.3% Speed(reviews/sec):1010. #Correct:197 #Tested:394 Testing Accuracy:50.0%Progress:39.4% Speed(reviews/sec):1007. #Correct:198 #Tested:395 Testing Accuracy:50.1%Progress:39.5% Speed(reviews/sec):1008. #Correct:198 #Tested:396 Testing Accuracy:50.0%Progress:39.6% Speed(reviews/se

Progress:56.7% Speed(reviews/sec):993.8 #Correct:284 #Tested:568 Testing Accuracy:50.0%Progress:56.8% Speed(reviews/sec):993.8 #Correct:285 #Tested:569 Testing Accuracy:50.0%Progress:56.9% Speed(reviews/sec):993.8 #Correct:285 #Tested:570 Testing Accuracy:50.0%Progress:57.0% Speed(reviews/sec):990.5 #Correct:286 #Tested:571 Testing Accuracy:50.0%Progress:57.1% Speed(reviews/sec):990.5 #Correct:286 #Tested:572 Testing Accuracy:50.0%Progress:57.2% Speed(reviews/sec):990.5 #Correct:287 #Tested:573 Testing Accuracy:50.0%Progress:57.3% Speed(reviews/sec):990.5 #Correct:287 #Tested:574 Testing Accuracy:50.0%Progress:57.4% Speed(reviews/sec):989.9 #Correct:288 #Tested:575 Testing Accuracy:50.0%Progress:57.5% Speed(reviews/sec):990.6 #Correct:288 #Tested:576 Testing Accuracy:50.0%Progress:57.6% Speed(reviews/sec):990.4 #Correct:289 #Tested:577 Testing Accuracy:50.0%Progress:57.7% Speed(reviews/sec):992.1 #Correct:289 #Tested:578 Testing Accuracy:50.0%Progress:57.8% Speed(reviews/se

Progress:77.4% Speed(reviews/sec):1018. #Correct:388 #Tested:775 Testing Accuracy:50.0%Progress:77.5% Speed(reviews/sec):1018. #Correct:388 #Tested:776 Testing Accuracy:50.0%Progress:77.6% Speed(reviews/sec):1018. #Correct:389 #Tested:777 Testing Accuracy:50.0%Progress:77.7% Speed(reviews/sec):1018. #Correct:389 #Tested:778 Testing Accuracy:50.0%Progress:77.8% Speed(reviews/sec):1016. #Correct:390 #Tested:779 Testing Accuracy:50.0%Progress:77.9% Speed(reviews/sec):1016. #Correct:390 #Tested:780 Testing Accuracy:50.0%Progress:78.0% Speed(reviews/sec):1014. #Correct:391 #Tested:781 Testing Accuracy:50.0%Progress:78.1% Speed(reviews/sec):1015. #Correct:391 #Tested:782 Testing Accuracy:50.0%Progress:78.2% Speed(reviews/sec):1015. #Correct:392 #Tested:783 Testing Accuracy:50.0%Progress:78.3% Speed(reviews/sec):1015. #Correct:392 #Tested:784 Testing Accuracy:50.0%Progress:78.4% Speed(reviews/sec):1015. #Correct:393 #Tested:785 Testing Accuracy:50.0%Progress:78.5% Speed(reviews/se

Progress:97.7% Speed(reviews/sec):1014. #Correct:489 #Tested:978 Testing Accuracy:50.0%Progress:97.8% Speed(reviews/sec):1014. #Correct:490 #Tested:979 Testing Accuracy:50.0%Progress:97.9% Speed(reviews/sec):1013. #Correct:490 #Tested:980 Testing Accuracy:50.0%Progress:98.0% Speed(reviews/sec):1014. #Correct:491 #Tested:981 Testing Accuracy:50.0%Progress:98.1% Speed(reviews/sec):1012. #Correct:491 #Tested:982 Testing Accuracy:50.0%Progress:98.2% Speed(reviews/sec):1012. #Correct:492 #Tested:983 Testing Accuracy:50.0%Progress:98.3% Speed(reviews/sec):1011. #Correct:492 #Tested:984 Testing Accuracy:50.0%Progress:98.4% Speed(reviews/sec):1012. #Correct:493 #Tested:985 Testing Accuracy:50.0%Progress:98.5% Speed(reviews/sec):1012. #Correct:493 #Tested:986 Testing Accuracy:50.0%Progress:98.6% Speed(reviews/sec):1013. #Correct:494 #Tested:987 Testing Accuracy:50.0%Progress:98.7% Speed(reviews/sec):1013. #Correct:494 #Tested:988 Testing Accuracy:50.0%Progress:98.8% Speed(reviews/se

In [30]:
mlp = SentimentNetwork(reviews[:-1000],labels[:-1000],min_count=20,polarity_cutoff=0.8,learning_rate=0.01)
mlp.train(reviews[:-1000],labels[:-1000])

Progress:0.0% Speed(reviews/sec):0.0 #Correct:1 #Trained:1 Training Accuracy:100.%
Progress:10.4% Speed(reviews/sec):683.5 #Correct:1962 #Trained:2501 Training Accuracy:78.4%
Progress:20.8% Speed(reviews/sec):681.2 #Correct:4002 #Trained:5001 Training Accuracy:80.0%
Progress:31.2% Speed(reviews/sec):677.9 #Correct:6120 #Trained:7501 Training Accuracy:81.5%
Progress:41.6% Speed(reviews/sec):685.1 #Correct:8271 #Trained:10001 Training Accuracy:82.7%
Progress:52.0% Speed(reviews/sec):681.9 #Correct:10431 #Trained:12501 Training Accuracy:83.4%
Progress:62.5% Speed(reviews/sec):682.7 #Correct:12565 #Trained:15001 Training Accuracy:83.7%
Progress:64.2% Speed(reviews/sec):680.2 #Correct:12913 #Trained:15419 Training Accuracy:83.7%Progress:72.9% Speed(reviews/sec):673.2 #Correct:14670 #Trained:17501 Training Accuracy:83.8%
Progress:83.3% Speed(reviews/sec):666.8 #Correct:16833 #Trained:20001 Training Accuracy:84.1%
Progress:93.7% Speed(reviews/sec):664.2 #Correct:19015 #Trained:22501 Training 

In [31]:
mlp.test(reviews[-1000:],labels[-1000:])

Progress:0.0% Speed(reviews/sec):0 #Correct:1 #Tested:1 Testing Accuracy:100.%Progress:0.1% Speed(reviews/sec):1002. #Correct:1 #Tested:2 Testing Accuracy:50.0%Progress:0.2% Speed(reviews/sec):2005. #Correct:2 #Tested:3 Testing Accuracy:66.6%Progress:0.3% Speed(reviews/sec):1504. #Correct:3 #Tested:4 Testing Accuracy:75.0%Progress:0.4% Speed(reviews/sec):2006. #Correct:4 #Tested:5 Testing Accuracy:80.0%Progress:0.5% Speed(reviews/sec):2507. #Correct:5 #Tested:6 Testing Accuracy:83.3%Progress:0.6% Speed(reviews/sec):1957. #Correct:6 #Tested:7 Testing Accuracy:85.7%Progress:0.7% Speed(reviews/sec):1754. #Correct:7 #Tested:8 Testing Accuracy:87.5%Progress:0.8% Speed(reviews/sec):2005. #Correct:8 #Tested:9 Testing Accuracy:88.8%Progress:0.9% Speed(reviews/sec):1480. #Correct:9 #Tested:10 Testing Accuracy:90.0%Progress:1.0% Speed(reviews/sec):1432. #Correct:10 #Tested:11 Testing Accuracy:90.9%Progress:1.1% Speed(reviews/sec):1575. #Correct:11 #Tested:12 Testing Accuracy:91.6%Pr

Progress:32.6% Speed(reviews/sec):1126. #Correct:285 #Tested:327 Testing Accuracy:87.1%Progress:32.7% Speed(reviews/sec):1126. #Correct:286 #Tested:328 Testing Accuracy:87.1%Progress:32.8% Speed(reviews/sec):1126. #Correct:287 #Tested:329 Testing Accuracy:87.2%Progress:32.9% Speed(reviews/sec):1125. #Correct:288 #Tested:330 Testing Accuracy:87.2%Progress:33.0% Speed(reviews/sec):1125. #Correct:289 #Tested:331 Testing Accuracy:87.3%Progress:33.1% Speed(reviews/sec):1125. #Correct:290 #Tested:332 Testing Accuracy:87.3%Progress:33.2% Speed(reviews/sec):1124. #Correct:291 #Tested:333 Testing Accuracy:87.3%Progress:33.3% Speed(reviews/sec):1127. #Correct:292 #Tested:334 Testing Accuracy:87.4%Progress:33.4% Speed(reviews/sec):1123. #Correct:293 #Tested:335 Testing Accuracy:87.4%Progress:33.5% Speed(reviews/sec):1121. #Correct:294 #Tested:336 Testing Accuracy:87.5%Progress:33.6% Speed(reviews/sec):1122. #Correct:295 #Tested:337 Testing Accuracy:87.5%Progress:33.7% Speed(reviews/se

Progress:50.4% Speed(reviews/sec):1054. #Correct:445 #Tested:505 Testing Accuracy:88.1%Progress:50.5% Speed(reviews/sec):1057. #Correct:446 #Tested:506 Testing Accuracy:88.1%Progress:50.6% Speed(reviews/sec):1052. #Correct:447 #Tested:507 Testing Accuracy:88.1%Progress:50.7% Speed(reviews/sec):1048. #Correct:448 #Tested:508 Testing Accuracy:88.1%Progress:50.8% Speed(reviews/sec):1047. #Correct:449 #Tested:509 Testing Accuracy:88.2%Progress:50.9% Speed(reviews/sec):1050. #Correct:450 #Tested:510 Testing Accuracy:88.2%Progress:51.0% Speed(reviews/sec):1047. #Correct:451 #Tested:511 Testing Accuracy:88.2%Progress:51.1% Speed(reviews/sec):1047. #Correct:452 #Tested:512 Testing Accuracy:88.2%Progress:51.2% Speed(reviews/sec):1047. #Correct:453 #Tested:513 Testing Accuracy:88.3%Progress:51.3% Speed(reviews/sec):1049. #Correct:454 #Tested:514 Testing Accuracy:88.3%Progress:51.4% Speed(reviews/sec):1047. #Correct:455 #Tested:515 Testing Accuracy:88.3%Progress:51.5% Speed(reviews/se

Progress:71.7% Speed(reviews/sec):1055. #Correct:622 #Tested:718 Testing Accuracy:86.6%Progress:71.8% Speed(reviews/sec):1057. #Correct:622 #Tested:719 Testing Accuracy:86.5%Progress:71.9% Speed(reviews/sec):1057. #Correct:623 #Tested:720 Testing Accuracy:86.5%Progress:72.0% Speed(reviews/sec):1058. #Correct:624 #Tested:721 Testing Accuracy:86.5%Progress:72.1% Speed(reviews/sec):1058. #Correct:624 #Tested:722 Testing Accuracy:86.4%Progress:72.2% Speed(reviews/sec):1058. #Correct:625 #Tested:723 Testing Accuracy:86.4%Progress:72.3% Speed(reviews/sec):1058. #Correct:625 #Tested:724 Testing Accuracy:86.3%Progress:72.4% Speed(reviews/sec):1059. #Correct:625 #Tested:725 Testing Accuracy:86.2%Progress:72.5% Speed(reviews/sec):1059. #Correct:626 #Tested:726 Testing Accuracy:86.2%Progress:72.6% Speed(reviews/sec):1059. #Correct:627 #Tested:727 Testing Accuracy:86.2%Progress:72.7% Speed(reviews/sec):1057. #Correct:628 #Tested:728 Testing Accuracy:86.2%Progress:72.8% Speed(reviews/se

Progress:90.9% Speed(reviews/sec):1043. #Correct:781 #Tested:910 Testing Accuracy:85.8%Progress:91.0% Speed(reviews/sec):1043. #Correct:782 #Tested:911 Testing Accuracy:85.8%Progress:91.1% Speed(reviews/sec):1043. #Correct:782 #Tested:912 Testing Accuracy:85.7%Progress:91.2% Speed(reviews/sec):1045. #Correct:783 #Tested:913 Testing Accuracy:85.7%Progress:91.3% Speed(reviews/sec):1043. #Correct:783 #Tested:914 Testing Accuracy:85.6%Progress:91.4% Speed(reviews/sec):1043. #Correct:784 #Tested:915 Testing Accuracy:85.6%Progress:91.5% Speed(reviews/sec):1043. #Correct:784 #Tested:916 Testing Accuracy:85.5%Progress:91.6% Speed(reviews/sec):1043. #Correct:784 #Tested:917 Testing Accuracy:85.4%Progress:91.7% Speed(reviews/sec):1043. #Correct:785 #Tested:918 Testing Accuracy:85.5%Progress:91.8% Speed(reviews/sec):1043. #Correct:786 #Tested:919 Testing Accuracy:85.5%Progress:91.9% Speed(reviews/sec):1043. #Correct:787 #Tested:920 Testing Accuracy:85.5%Progress:92.0% Speed(reviews/se

# Analysis<a id='lesson_7'></a>

In [32]:
mlp_full = SentimentNetwork(reviews[:-1000],labels[:-1000],min_count=0,polarity_cutoff=0,learning_rate=0.01)

In [33]:
mlp_full.train(reviews[:-1000],labels[:-1000])

Progress:0.0% Speed(reviews/sec):0.0 #Correct:1 #Trained:1 Training Accuracy:100.%
Progress:10.4% Speed(reviews/sec):1678. #Correct:1962 #Trained:2501 Training Accuracy:78.4%
Progress:20.8% Speed(reviews/sec):1764. #Correct:4002 #Trained:5001 Training Accuracy:80.0%
Progress:31.2% Speed(reviews/sec):1786. #Correct:6120 #Trained:7501 Training Accuracy:81.5%
Progress:41.6% Speed(reviews/sec):1824. #Correct:8271 #Trained:10001 Training Accuracy:82.7%
Progress:52.0% Speed(reviews/sec):1839. #Correct:10431 #Trained:12501 Training Accuracy:83.4%
Progress:62.5% Speed(reviews/sec):1846. #Correct:12565 #Trained:15001 Training Accuracy:83.7%
Progress:72.9% Speed(reviews/sec):1852. #Correct:14670 #Trained:17501 Training Accuracy:83.8%
Progress:83.3% Speed(reviews/sec):1857. #Correct:16833 #Trained:20001 Training Accuracy:84.1%
Progress:93.7% Speed(reviews/sec):1855. #Correct:19015 #Trained:22501 Training Accuracy:84.5%
Progress:99.9% Speed(reviews/sec):1857. #Correct:20335 #Trained:24000 Training

In [35]:
def get_most_similar_words(focus = "horrible"):
    most_similar = Counter()

    for word in mlp_full.word2index.keys():
        most_similar[word] = np.dot(mlp_full.weights_0_1[mlp_full.word2index[word]],mlp_full.weights_0_1[mlp_full.word2index[focus]])
    
    return most_similar.most_common()

In [36]:
get_most_similar_words("excellent")

[('excellent', 0.13672950757352478),
 ('perfect', 0.12548286087225946),
 ('amazing', 0.09182763392599971),
 ('today', 0.0902236626944142),
 ('wonderful', 0.08935597696221462),
 ('fun', 0.0875044666742069),
 ('great', 0.08714175888229206),
 ('best', 0.08581088561788064),
 ('liked', 0.07769762912384344),
 ('definitely', 0.07662878140696602),
 ('brilliant', 0.07342385876927904),
 ('loved', 0.07328542892812216),
 ('favorite', 0.07278113603616079),
 ('superb', 0.0717362071785051),
 ('fantastic', 0.07092219191626623),
 ('job', 0.06916061720763408),
 ('incredible', 0.06642407795261446),
 ('enjoyable', 0.0656325605028888),
 ('rare', 0.0648192126626151),
 ('highly', 0.06388945335097052),
 ('enjoyed', 0.06212754610181296),
 ('wonderfully', 0.062055178604090176),
 ('perfectly', 0.06109320881188739),
 ('fascinating', 0.060663547937493886),
 ('bit', 0.059655427045653076),
 ('gem', 0.0595108592961568),
 ('outstanding', 0.05886080814708303),
 ('beautiful', 0.05861393470316208),
 ('surprised', 0.05827

In [37]:
get_most_similar_words("terrible")

[('worst', 0.16966107259049845),
 ('awful', 0.12026847019691242),
 ('waste', 0.11945367265311004),
 ('poor', 0.09275888757443548),
 ('terrible', 0.09142538719772794),
 ('dull', 0.0842092716782236),
 ('poorly', 0.08124154451604203),
 ('disappointment', 0.08006475962136872),
 ('fails', 0.07859977372333748),
 ('disappointing', 0.07733948548032335),
 ('boring', 0.07712785874801287),
 ('unfortunately', 0.07550244970585908),
 ('worse', 0.07060183536419466),
 ('mess', 0.0705642996235904),
 ('stupid', 0.06948482283254306),
 ('badly', 0.06688890366622856),
 ('annoying', 0.06568702190337414),
 ('bad', 0.06309381453757214),
 ('save', 0.06288059749586575),
 ('disappointed', 0.06269235381207287),
 ('wasted', 0.061387183028051275),
 ('supposed', 0.06098545295772516),
 ('horrible', 0.06012177233938012),
 ('laughable', 0.05869840628546763),
 ('crap', 0.05810452866788457),
 ('basically', 0.05721884036963617),
 ('nothing', 0.057158220043034204),
 ('ridiculous', 0.056905481068931424),
 ('lacks', 0.055766

In [38]:
import matplotlib.colors as colors

words_to_visualize = list()
for word, ratio in pos_neg_ratios.most_common(500):
    if(word in mlp_full.word2index.keys()):
        words_to_visualize.append(word)
    
for word, ratio in list(reversed(pos_neg_ratios.most_common()))[0:500]:
    if(word in mlp_full.word2index.keys()):
        words_to_visualize.append(word)

In [39]:
pos = 0
neg = 0

colors_list = list()
vectors_list = list()
for word in words_to_visualize:
    if word in pos_neg_ratios.keys():
        vectors_list.append(mlp_full.weights_0_1[mlp_full.word2index[word]])
        if(pos_neg_ratios[word] > 0):
            pos+=1
            colors_list.append("#00ff00")
        else:
            neg+=1
            colors_list.append("#000000")

In [40]:
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2, random_state=0)
words_top_ted_tsne = tsne.fit_transform(vectors_list)

In [41]:
p = figure(tools="pan,wheel_zoom,reset,save",
           toolbar_location="above",
           title="vector T-SNE for most polarized words")

source = ColumnDataSource(data=dict(x1=words_top_ted_tsne[:,0],
                                    x2=words_top_ted_tsne[:,1],
                                    names=words_to_visualize,
                                    color=colors_list))

p.scatter(x="x1", y="x2", size=8, source=source, fill_color="color")

word_labels = LabelSet(x="x1", y="x2", text="names", y_offset=6,
                  text_font_size="8pt", text_color="#555555",
                  source=source, text_align='center')
p.add_layout(word_labels)

show(p)

# green indicates positive words, black indicates negative words