# Extend Word2Vec Model with Phonemes 



In [1]:
import math
import random
import sys
import os
from util.Util import *

In [2]:
# sys.path.append(...) # in case we want to add something else

In [3]:
import phonemes_from_graphemes as pg

In [4]:
import importlib
importlib.reload(pg)
from phonemes_from_graphemes import *

### Where am I expecting to see the data

In [5]:
data_dir = "./data"

In [6]:
common_logger = get_logger(name = "common_logger", debug_log_file_name = "common_logger.log")

2017-06-09 14:25:02,609 - common_logger - INFO - 'common_logger': logging 'INFO'+ logs to Console, 'DEBUG'+ logs to '/Users/luisd/dev/cyberbullying-detection/common_logger.log'


Logger created
Creating debug handler at '/Users/luisd/dev/cyberbullying-detection/common_logger.log'
'common_logger': logging 'INFO'+ logs to Console, 'DEBUG'+ logs to '/Users/luisd/dev/cyberbullying-detection/common_logger.log'


In [7]:
common_logger.debug("hello, this is a debug msg")

In [8]:
common_logger.handlers[1].baseFilename

'/Users/luisd/dev/cyberbullying-detection/common_logger.log'

In [9]:
#  change 'None' to 'm' to reload 
mw = ModelWrapper(data_dir=data_dir, alogger = common_logger, m= None, sounds_dict = None)

2017-06-09 14:25:02,628 - common_logger - INFO - Loading model from ./data/GoogleNews-vectors-negative300.bin.gz...
2017-06-09 14:27:06,347 - common_logger - INFO - Model succesfully loaded
2017-06-09 14:27:06,348 - common_logger - INFO - Sort all the words in the model, so that we can auto-complete queries quickly...


In [10]:
m = mw.model # cache model definition

## Generation of phonemes from graphemes 

In [11]:
phonemesFactory = PhonemesFromGraphemes(alogger=mw.alogger)

### Helper function to generate sounds from Word2Vec model 

In [12]:
def generate_from_word2vec(pg, start, n, fn = None):
    # start = 2000000; n = 500000; shelf_filename = "shelf_from{}_for{}.shelf".format(start, n)
    if fn is None:
        shelf_filename = "shelf_from{}_for{}.shelf".format(start, n)
    else:
        shelf_filename = fn
    pg.alogger.setLevel(logging.INFO)
    pg.graphemes_to_phonemes_to_shelves(words_in_sent=mw.model.index2word[start:start + n], shelf_filename=shelf_filename)

## Creation of Sounds' Dictionary  

### Let's read the sounds' dictionary

In [13]:
shelves_names = list(map(
    lambda fn: '{}/'.format(data_dir) + fn, 
                     ['shelf_from0_for500000.shelf', 'shelf_from500001_for500000.shelf', 'shelf_from1000001_for500000.shelf', 'shelf_from1500001_for500000.shelf', 'shelf_from2000001_for500000.shelf', 'shelf_from2500000_for500000.shelf']
                        )
                    )

In [14]:
shelves_names

['./data/shelf_from0_for500000.shelf',
 './data/shelf_from500001_for500000.shelf',
 './data/shelf_from1000001_for500000.shelf',
 './data/shelf_from1500001_for500000.shelf',
 './data/shelf_from2000001_for500000.shelf',
 './data/shelf_from2500000_for500000.shelf']

In [15]:
sounds_dict = SoundsDict(file_names=shelves_names, alogger=phonemesFactory.alogger)

In [16]:
len(sounds_dict.all_shelves)

6

In [17]:
sounds_dict['ju:']

{'YOU', 'YOu', 'You', 'you'}

In [18]:
zero_keys = list(sounds_dict.all_shelves[0].keys())

In [19]:
zero_keys[0]

"a#k'adEmi; a#w'O@dz"

In [20]:
mw = ModelWrapper(data_dir=data_dir, alogger = common_logger, m= m, sounds_dict = sounds_dict)

2017-06-09 14:27:16,741 - common_logger - INFO - [init] Model provided. If you want me to FORCE re-load it, call ModelWrapper's constructor with 'None'
2017-06-09 14:27:16,743 - common_logger - INFO - Sort all the words in the model, so that we can auto-complete queries quickly...


In [21]:
mw.sounds_dict

<phonemes_from_graphemes.SoundsDict at 0x244c3a940>

In [22]:
mw.sound_to_word('ju:')

{'YOU', 'YOu', 'You', 'you'}

In [23]:
# mw.sound_to_word('bla:')

In [24]:
xml_file_name = '/Users/luisd/Downloads/FormspringLabeledForCyberbullying/XMLMergedFile.xml'

parser = Formspring_Data_Parser(xml_file_name, pg = phonemesFactory, mw = mw, alogger = mw.alogger)

In [25]:
doc2dicts = parser.doc2dict(an_id = 1)
# all_of_them

In [26]:
# doc2dicts

In [27]:
all_of_them = parser.questions_answers_labels(an_id = 1)
all_of_them

Unnamed: 0,answer,answer_raw,question,question_raw,threat,uuid
0,haha jk,</3 ? haha jk! <33,----------------------------------------------...,<3,False,d10dcd7f-f57a-4482-9e54-73e566add506
1,haha jk,</3 ? haha jk! <33,'3,<3,False,d10dcd7f-f57a-4482-9e54-73e566add506
2,Really Thanks haha,Really?!?! Thanks?! haha,angel you duh,"""hey angel you duh sexy""",False,a85c53bd-7fa4-477d-a64c-21f6bbf30817
3,,;(,,(:,False,b47de7fd-7814-4061-927b-9f3602f999d8
4,,*RAWR*?,,******************MEOWWW*************************,False,06efa862-9cd9-4c9d-b9be-d3d1bb5e1263
5,Sure Like tell me wht u know Like wht do you use,Sure! Like tell me wht u wnna know?! Like wht ...,any makeup tips i suck at doing my makeup laug...,any makeup tips? i suck at doing my makeup lol,False,650a9a3a-6720-4e99-98ce-8990f79cafaf
6,EMMA laughter I yue,EMMA hahahahah :D I MISSSSSeddd YUHHHHh 222222...,I miss It's Emma btw haha,Apriiiiiiiiiiiill!!! I miss uuuu! It's Emma bt...,False,ab757060-151a-4133-bfc9-291580a60b7f
7,EMMA laughter I u,EMMA hahahahah :D I MISSSSSeddd YUHHHHh 222222...,I miss It's Emma btw haha,Apriiiiiiiiiiiill!!! I miss uuuu! It's Emma bt...,False,ab757060-151a-4133-bfc9-291580a60b7f
8,EMMA laughter I ewe,EMMA hahahahah :D I MISSSSSeddd YUHHHHh 222222...,I miss It's Emma btw haha,Apriiiiiiiiiiiill!!! I miss uuuu! It's Emma bt...,False,ab757060-151a-4133-bfc9-291580a60b7f
9,EMMA laughter I yu,EMMA hahahahah :D I MISSSSSeddd YUHHHHh 222222...,I miss It's Emma btw haha,Apriiiiiiiiiiiill!!! I miss uuuu! It's Emma bt...,False,ab757060-151a-4133-bfc9-291580a60b7f


In [28]:
really_all = parser.all_questions_answers_labels()

KeyboardInterrupt: 

In [29]:
really_all = pd.read_csv("really_all_of_them.csv")

In [49]:
really_all.loc[really_all["threat"] == True]

Unnamed: 0.1,Unnamed: 0,answer,answer_raw,question,question_raw,threat,uuid
63,62,UM excuse yuh HE IS NOT Ur a fuckin idiot Kay ...,UM excuse yuh? HE IS NOT!! Ur a fuckin idiot! ...,Fuck Justin He's a fucking fag,Fuck Justin bieber. He's a fucking fag,True,867a3fdc-4d88-4617-8354-56f45688f2f5
110,109,and its also funny how u stalked my whole twit...,and its also funny how u stalked my whole twit...,Funny how u defend that beiber,Hahah. Funny how u defend that beiber kid.,True,3e4397f1-4680-4598-aa17-e3b22ace06b7
136,135,Why thank yuh,Why thank yuh!,Why you such a bitch,Hey. Why you such a bitch?,True,c2810c6a-eda8-4837-bc68-7c6adbe8781d
156,155,Who the hell are yuh And who tha fuck is michael,Who the hell are yuh!!! And who tha fuck is mi...,how old are u And how do you know michael,how old are u ? And how do you know michael?,True,04c679ba-185e-4ea6-b9d3-6659fb9a0580
992,991,Now how the fuck am i suppose to remember that...,Now how the fuck am i suppose to remember that...,What was your favorite toy to play with as a c...,What was your favorite toy to play with as a c...,True,54c49f7c-4cd4-4bb5-92eb-03c029b71498
1072,1071,im not quite sure ill tell ya when i find out I,im not quite sure! ill tell ya when i find out...,who decided to take a shit on your face,who decided to take a shit on your face?,True,aab0300c-b264-44d9-9e95-d418d94d51cd
1083,1082,Well im not following anything really BANDWAGO...,Well im not following anything and really BAND...,Why do u follow the bandwagon What's next gay ...,Why do u follow the bandwagon? Formspring. Twi...,True,f6a56421-5ef1-4d1b-b5ef-62148ac9d850
1774,41,reall funny ndd i want hatt guy too combee too...,ahhaha yuurr funnyy reall funny ndd i dntt wan...,reall funny skinny ass that isn't really much ...,"lmao.. ""yurr reall funny skinny ass bitchh"".. ...",True,f3dc8c1f-1830-4a0f-9bc2-767dbd57fecc
1775,42,reall funny ndd i want hatt guy too combee too...,ahhaha yuurr funnyy reall funny ndd i dntt wan...,reall funny skinny ass that isn't really much ...,"lmao.. ""yurr reall funny skinny ass bitchh"".. ...",True,f3dc8c1f-1830-4a0f-9bc2-767dbd57fecc
1776,43,reall funny ndd i want hatt guy too combee too...,ahhaha yuurr funnyy reall funny ndd i dntt wan...,reall funny skinny ass that isn't really much ...,"lmao.. ""yurr reall funny skinny ass bitchh"".. ...",True,f3dc8c1f-1830-4a0f-9bc2-767dbd57fecc


In [None]:
# really_all.to_csv("really_all_of_them.csv")

In [30]:
questions_answers = ["{}; {}".format(q, a) for q, a in list(zip(really_all['question'].tolist(), really_all['answer'].tolist()))] 

In [31]:
questions_answers[0:10] 

["what's your favorite song laughter; I like too many songs to have a favorite",
 '----------------------------------------------------------_3; haha jk',
 "'3; haha jk",
 'angel you duh; Really Thanks haha',
 'nan; nan',
 'nan; nan',
 'any makeup tips i suck at doing my makeup laughter; Sure Like tell me wht u know Like wht do you use',
 "I miss It's Emma btw haha; EMMA laughter I yu",
 "I miss It's Emma btw haha; EMMA laughter I ue",
 "I miss It's Emma btw haha; EMMA laughter I ew"]

In [32]:
labels = ["THREAT" if threat else "CLEAN" for threat in really_all['threat'].tolist()]

In [33]:
labels[0:10]

['CLEAN',
 'CLEAN',
 'CLEAN',
 'CLEAN',
 'CLEAN',
 'CLEAN',
 'CLEAN',
 'CLEAN',
 'CLEAN',
 'CLEAN']

In [34]:
from collections import Counter
import numpy as np

In [35]:
# Create three Counter objects to store positive, negative and total counts
positive_counts = Counter()
negative_counts = Counter()
total_counts = Counter()
for label, review in zip(labels, questions_answers):
    words = review.split(' ') 
    if label == 'THREAT':
        for word in words:
            negative_counts[word] += 1
    else:
        for word in words:
            positive_counts[word] += 1
    for word in words:
        total_counts[word] += 1

pos_neg_ratios = Counter()

unique_words = total_counts.keys()
for word in unique_words:
    if total_counts[word] >= 100:
        pos_neg_ratios[word] = positive_counts[word] / float(negative_counts[word]+1)
        
        
unique_words = pos_neg_ratios.keys()# set(pos_neg_ratios.elements())
for word in unique_words:
    if (pos_neg_ratios[word] > 1):
        pos_neg_ratios[word] = np.log(pos_neg_ratios[word])
    else:
        pos_neg_ratios[word] = -np.log(1/(pos_neg_ratios[word] + 0.01))        

In [36]:
negative_counts.most_common()

[('a', 6752),
 ('you', 6372),
 ('u', 6204),
 ('i', 5824),
 ('ur', 4956),
 ('at', 4873),
 ('threw', 3600),
 ('your', 3416),
 ('no', 3104),
 ('shit', 3101),
 ('hair', 3004),
 ('some', 2953),
 ('the', 2916),
 ('be', 2665),
 ('are', 2608),
 ('mah', 2600),
 ('was', 2577),
 ('then', 2572),
 ('would', 2534),
 ('if', 2532),
 ('say', 2519),
 ('head', 2501),
 ('girl', 2467),
 ('people', 2456),
 ('UR', 2453),
 ('stfu', 2451),
 ('make', 2443),
 ('good', 2435),
 ('saw', 2425),
 ('fat', 2424),
 ('mad', 2414),
 ('kno', 2413),
 ('sense', 2407),
 ('of;', 2402),
 ('named', 2401),
 ('corndog', 2400),
 ('CORNDOG', 2400),
 ('GOOR', 2400),
 ('twinkie', 2400),
 ('bitch', 1795),
 ('bich', 1456),
 ('her', 1364),
 ("didn't", 1224),
 ("did'nt", 1220),
 ('thru', 1200),
 ('off', 1047),
 ('fair', 961),
 ('phair', 960),
 ('phare', 960),
 ('fare', 960),
 ('faire', 960),
 ('ass', 853),
 ('lick', 807),
 ('lik', 804),
 ('lic', 803),
 ('on', 795),
 ('hoe', 771),
 ('so', 766),
 ('pussy', 727),
 ('love', 685),
 ('too', 638

In [37]:
# negative_counts.most_common()
print("Pos-to-neg ratio for 'the' = {}".format(pos_neg_ratios["the"]))
print("Pos-to-neg ratio for 'amazing' = {}".format(pos_neg_ratios["amazing"]))
print("Pos-to-neg ratio for 'shit' = {}".format(pos_neg_ratios["shit"]))

Pos-to-neg ratio for 'the' = 2.461319014418187
Pos-to-neg ratio for 'amazing' = 5.746203190540153
Pos-to-neg ratio for 'shit' = -2.5972980568759496


### What are the most common words for 'threats' and 'non-threats' 

In [38]:
pos_neg_ratios.most_common()

[('rock', 9.3969859002501916),
 ('badr', 8.7021778656296753),
 ('spend', 8.4314174143948328),
 ('Baby', 8.4213428657594029),
 ('glue', 8.4144957931778954),
 ('bed', 8.1028891346408685),
 ('Nicki', 8.0100275284817339),
 ('andr', 8.009363076630045),
 ('Young', 8.009363076630045),
 ('bakr', 8.009363076630045),
 ('rokr', 8.0090306850697299),
 ('love;', 7.640603826393634),
 ('money', 7.4073177104694174),
 ('gorgeous', 7.3895639536776354),
 ('walk', 7.3821243657375124),
 ('running', 7.3777589082278725),
 ('spoke', 7.3740018593501606),
 ('cold', 7.3607399030582776),
 ('hold', 7.3581937527330323),
 ('along', 7.3581937527330323),
 ('photo', 7.357556200910353),
 ('rest', 7.3562798765507482),
 ('room', 7.3485875309275928),
 ('race', 7.3466551631765391),
 ('lover', 7.3453648404168685),
 ('attractive', 7.3401868353201154),
 ('fly', 7.3401868353201154),
 ('sing', 7.333676395657684),
 ('lights', 7.3297496890415124),
 ('double', 7.329093736246592),
 ('Michael', 7.3251489579555749),
 ('annoys', 7.32514

In [39]:
list(reversed(pos_neg_ratios.most_common()))[0:30]

[('ol', -4.6051701859880918),
 ('matta', -4.6051701859880918),
 ('dawk', -4.6051701859880918),
 ('wiht', -4.6051701859880918),
 ('priscilla', -4.6051701859880918),
 ('prob', -4.6051701859880918),
 ('hare', -4.6051701859880918),
 ('twinkie', -4.6051701859880918),
 ('haire', -4.6051701859880918),
 ('GOOR', -4.6051701859880918),
 ('CORNDOG', -4.6051701859880918),
 ('corndog', -4.6051701859880918),
 ('hopp', -4.6051701859880918),
 ('alexis;', -4.6051701859880918),
 ('ghetto', -4.6051701859880918),
 ('threw', -4.2969425047274195),
 ('otha', -4.2388078038936694),
 ('stfu', -4.2067992413293345),
 ('bea', -4.1653125479201716),
 ('beah', -4.1653125479201716),
 ('yao', -4.1531850622450337),
 ('yow', -4.1531850622450337),
 ('yau', -4.1531850622450337),
 ('thru', -4.1459446655796457),
 ('mutha', -4.1047885213831838),
 ('kus', -3.9913597476646672),
 ('fade', -3.9753926193607354),
 ('UR', -3.9439031633388089),
 ('wdf', -3.9169857947702749),
 ('ah;', -3.9169857947702749)]

### Yeah... doesn't look like there is a pattern here... 
Maybe some more work on the pre-treatment is needed 

### Let's take a look 

In [40]:
from bokeh.models import ColumnDataSource, LabelSet
from bokeh.plotting import figure, show, output_file
from bokeh.io import output_notebook
output_notebook()

In [41]:
hist, edges = np.histogram(list(map(lambda x:x[1],pos_neg_ratios.most_common())), density=True, bins=100, normed=True)

p = figure(tools="pan,wheel_zoom,reset,save",
           toolbar_location="above",
           title="Word Positive/Negative Affinity Distribution")
p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:], line_color="#555555")
show(p)

In [42]:
frequency_frequency = Counter()

for word, cnt in total_counts.most_common():
    frequency_frequency[cnt] += 1

In [43]:
hist, edges = np.histogram(list(map(lambda x:x[1],frequency_frequency.most_common())), density=True, bins=100, normed=True)

p = figure(tools="pan,wheel_zoom,reset,save",
           toolbar_location="above",
           title="The frequency distribution of the words in our corpus")
p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:], line_color="#555555")
show(p)

## Let's build a fully connected network to do this job

In [44]:
vocab = set(total_counts.keys())
vocab_size = len(vocab)
print(vocab_size)

19794


In [45]:
import time
import sys
import numpy as np

class CyberbullyingFullyConnectedNetwork:
    def __init__(self, reviews, labels, hidden_nodes = 10, learning_rate = 0.1):
        """Create a CyberbullyingFullyConnectedNetwork with the given settings
        Args:
            reviews(list) - List of reviews used for training
            labels(list) - List of THREAT/CLEAN labels associated with the given reviews
            hidden_nodes(int) - Number of nodes to create in the hidden layer
            learning_rate(float) - Learning rate to use while training
        
        """
        # Assign a seed to our random number generator to ensure we get
        # reproducable results during development 
        np.random.seed(1)

        # process the reviews and their associated labels so that everything
        # is ready for training
        self.pre_process_data(reviews, labels)
        
        # Build the network to have the number of hidden nodes and the learning rate that
        # were passed into this initializer. Make the same number of input nodes as
        # there are vocabulary words and create a single output node.
        self.init_network(len(self.review_vocab),hidden_nodes, 1, learning_rate)

    def pre_process_data(self, reviews, labels):
        # reviews 
        review_vocab = set()
        for review in reviews:
            review_vocab = review_vocab.union(set(review.lower().split(' ')))
        self.review_vocab = list(review_vocab)
        # labels 
        label_vocab = set()
        for label in labels:
            label_vocab.add(label)
        self.label_vocab = list(label_vocab)        
        # 
        self.review_vocab_size = len(self.review_vocab)
        self.label_vocab_size = len(self.label_vocab)
        
        # Create a dictionary of words in the vocabulary mapped to index positions
        self.word2index = {}
        # TODO: populate self.word2index with indices for all the words in self.review_vocab
        #       like you saw earlier in the notebook
        for i,word in enumerate(self.review_vocab):
            self.word2index[word] = i

        # Create a dictionary of labels mapped to index positions
        self.label2index = {}
        for i,word in enumerate(self.label_vocab):
            self.label2index[word] = i
         
        
    def init_network(self, input_nodes, hidden_nodes, output_nodes, learning_rate):
        # Store the number of nodes in input, hidden, and output layers.
        self.input_nodes = input_nodes
        self.hidden_nodes = hidden_nodes
        self.output_nodes = output_nodes

        # Store the learning rate
        self.learning_rate = learning_rate

        # Initialize weights
        
        # initialize self.weights_0_1 as a matrix of zeros. These are the weights between
        #       the input layer and the hidden layer.
        self.weights_0_1 = np.zeros((self.input_nodes,self.hidden_nodes))
        
        # initialize self.weights_1_2 as a matrix of random values. 
        #       These are the weights between the hidden layer and the output layer.
        self.weights_1_2 = np.random.rand(self.hidden_nodes, self.output_nodes)
        
        # Create the hidden layer, a two-dimensional matrix with shape 
        #       1 x hidden_nodes, with all values initialized to zero
        self.layer_1 = np.zeros((1,self.hidden_nodes))
    
    def get_target_for_label(self,label):
        if label == 'CLEAN':
            return 1
        else:
            return 0
        
    def sigmoid(self,x):
        return (1 / (1 + np.exp(-x)))
    
    def sigmoid_output_2_derivative(self,output):
        return output * (1 - output)

    def train(self, training_reviews_raw, training_labels):
        def safe_word_2_index(word) -> int:
            try:
                return self.word2index[word]
            except KeyError:
                return -1
            
        training_reviews = []
        for review_raw_str in training_reviews_raw:
            review_raw = review_raw_str.split(' ')
            # l = [i for i in [safe_word_2_index(word) for word in review_raw] if i != -1] 
            reviews_on = list(set([i for i in [safe_word_2_index(word) for word in review_raw] if i != -1]))
            # reviews_on = list(set([self.word2index[word] for word in review_raw]))
            training_reviews.append(reviews_on)
        
        # make sure out we have a matching number of reviews and labels
        assert(len(training_reviews) == len(training_labels))

        # Keep track of correct predictions to display accuracy during training 
        correct_so_far = 0
        
        # Remember when we started for printing time statistics
        start = time.time()

        # loop through all the given reviews and run a forward and backward pass,
        # updating weights for every item
        for i in range(len(training_reviews)):
            
            # TODO: Get the next review and its correct label
            reviews_on = training_reviews[i]

            # print("len(reviews_on) = {}".format(len(reviews_on)))
            label = training_labels[i]
            
            # TODO: Implement the forward pass through the network. 
            #       That means use the given review to update the input layer, 
            #       then calculate values for the hidden layer,
            #       and finally calculate the output layer.
            # 
            #       Do not use an activation function for the hidden layer,
            #       but use the sigmoid activation function for the output layer.
            
            # calculate hidden layer data
            # hidden_in = np.dot(self.layer_0, self.weights_0_1)
            hidden_in = self.layer_1 * 0 # I want it to have the same shape
            # print("hidden_in.shape = {}".format(hidden_in.shape))
            for index in reviews_on:
                # print("self.weights_0_1.shape = {}".format(self.weights_0_1.shape))
                # print("self.weights_0_1[{}].shape = {}".format(index, self.weights_0_1[index].shape))
                hidden_in += (self.weights_0_1[index]) 
            
            self.layer_1 = hidden_in # because activation == Id.
            # calculate output layer data
            output_in = np.dot(self.layer_1, self.weights_1_2)
            output_out = self.sigmoid(output_in) 
        
            # TODO: Implement the back propagation pass here. 
            #       That means calculate the error for the forward pass's prediction
            #       and update the weights in the network according to their
            #       contributions toward the error, as calculated via the
            #       gradient descent and back propagation algorithms you 
            #       learned in class.
            
            # Let's calculate errors:
            # at the output layer
            error = self.get_target_for_label(label) - output_out
            # print("error = {}".format(error))
            error_term = error * self.sigmoid_output_2_derivative(self.sigmoid(output_in))
            # print("error_term = {}".format(error_term))
            # at the hidden layer
            # hidden_error = np.dot(error_term, self.weights_1_2.T) # * hidden_in 
            hidden_error_term = np.dot(error_term, self.weights_1_2.T) * 1 # because activate function == Id, so derivative == 1
            # print("hidden_error_term = {}".format(hidden_error_term))
            # and now let's update weights:
            self.weights_1_2 += self.learning_rate * np.dot(self.layer_1.T, error_term)
            # print("BEFORE ==> non-zeros = {}".format(np.count_nonzero(self.weights_0_1)))
            # print("self.layer_0 has {} non-zeros; self.learning_rate = {}".format(np.count_nonzero(self.layer_0), self.learning_rate))
#             layer_0 = np.zeros((1, self.input_nodes))
#             layer_0[0][reviews_on] = 1
#             self.weights_0_1 += self.learning_rate * np.dot(layer_0.T,hidden_error_term)
            
            for index in reviews_on:
                self.weights_0_1[index] += hidden_error_term[0] * self.learning_rate # update input-to-hidden weights with gradient descent step
            
            
            
            # print("AFTER ==> non-zeros = {}".format(np.count_nonzero(self.weights_0_1)))
            # TODO: Keep track of correct predictions. To determine if the prediction was
            #       correct, check that the absolute value of the output error 
            #       is less than 0.5. If so, add one to the correct_so_far count.
            only_neuron_error = np.abs(error[0,0])
            # print("error's shape is {}, only_neuron_error == {}".format(error.shape, only_neuron_error))
            if only_neuron_error < 0.5:
                # print("correct_so_far++")
                correct_so_far += 1
            #if only_neuron_error > 0.85:
            #    print("dafuq error is {}".format(only_neuron_error))
            
            # For debug purposes, print out our prediction accuracy and speed 
            # throughout the training process. 

            elapsed_time = float(time.time() - start)
            reviews_per_second = i / elapsed_time if elapsed_time > 0 else 0
            
            sys.stdout.write("\rProgress:" + str(100 * i/float(len(training_reviews_raw)))[:4] \
                             + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] \
                             + " #Correct:" + str(correct_so_far) + " #Trained:" + str(i+1) \
                             + " Training Accuracy:" + str(correct_so_far * 100 / float(i+1))[:4] + "%")
            if(i % 2500 == 0):
                print("")
    
    def test(self, testing_reviews, testing_labels):
        """
        Attempts to predict the labels for the given testing_reviews,
        and uses the test_labels to calculate the accuracy of those predictions.
        """
        
        # keep track of how many correct predictions we make
        correct = 0

        # we'll time how many predictions per second we make
        start = time.time()

        # Loop through each of the given reviews and call run to predict
        # its label. 
        for i in range(len(testing_reviews)):
            pred = self.run(testing_reviews[i])
            if(pred == testing_labels[i]):
                correct += 1
            
            # For debug purposes, print out our prediction accuracy and speed 
            # throughout the prediction process. 

            elapsed_time = float(time.time() - start)
            reviews_per_second = i / elapsed_time if elapsed_time > 0 else 0
            
            sys.stdout.write("\rProgress:" + str(100 * i/float(len(testing_reviews)))[:4] \
                             + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] \
                             + " #Correct:" + str(correct) + " #Tested:" + str(i+1) \
                             + " Testing Accuracy:" + str(correct * 100 / float(i+1))[:4] + "%")
    
    def run(self, review):
        """
        Returns a POSITIVE or NEGATIVE prediction for the given review.
        """
        # TODO: Run a forward pass through the network, like you did in the
        #       "train" function. That means use the given review to 
        #       update the input layer, then calculate values for the hidden layer,
        #       and finally calculate the output layer.
        #
        #       Note: The review passed into this function for prediction 
        #             might come from anywhere, so you should convert it 
        #             to lower case prior to using it.

        
        
        review_raw = review.lower().split(' ')
        # print("len(review_raw) = {}".format(len(review_raw)))
        reviews_on = []
        for word in review_raw:
            try:
                reviews_on.append(self.word2index[word])
            except KeyError:
                pass
        
        # reviews_on = [self.word2index[word] for word in review_raw] # contains the indices for words found in the review.
        # calculate hidden layer data
        # hidden_in = np.dot(self.layer_0, self.weights_0_1)
        hidden_in = self.layer_1 * 0 # I want it to have the same shape
        # print("hidden_in.shape = {}".format(hidden_in.shape))
        for index in reviews_on:
            # print("self.weights_0_1.shape = {}".format(self.weights_0_1.shape))
            # print("self.weights_0_1[{}].shape = {}".format(index, self.weights_0_1[index].shape))
            hidden_in += (self.weights_0_1[index]) 

        self.layer_1 = hidden_in # because activation == Id.
        # calculate output layer data
        output_in = np.dot(self.layer_1, self.weights_1_2)
        output_out = self.sigmoid(output_in) 
        
        
        
#         # create input
#         self.update_input_layer(review.lower())
#         # calculate hidden layer data
#         hidden_in = np.dot(self.layer_0, self.weights_0_1)
#         hidden_out = hidden_in # because activation == Id.
#         # calculate output layer data
#         output_in = np.dot(hidden_out, self.weights_1_2)
#         output_out = self.sigmoid(output_in) 
        
        # TODO: The output layer should now contain a prediction. 
        #       Return `POSITIVE` for predictions greater-than-or-equal-to `0.5`, 
        #       and `NEGATIVE` otherwise.
        if output_out >= 0.5:
            return 'CLEAN'
        else:
            return 'THREAT'


In [46]:
cyber_fc = CyberbullyingFullyConnectedNetwork(reviews=questions_answers, labels=labels,hidden_nodes=300,learning_rate=0.01)

In [47]:
cyber_fc.train(questions_answers, labels)

Progress:0.0% Speed(reviews/sec):0.0 #Correct:0 #Trained:1 Training Accuracy:0.0%
Progress:6.77% Speed(reviews/sec):5927. #Correct:2339 #Trained:2501 Training Accuracy:93.5%
Progress:13.5% Speed(reviews/sec):6088. #Correct:4784 #Trained:5001 Training Accuracy:95.6%
Progress:20.3% Speed(reviews/sec):6269. #Correct:7077 #Trained:7501 Training Accuracy:94.3%
Progress:27.1% Speed(reviews/sec):6384. #Correct:9484 #Trained:10001 Training Accuracy:94.8%
Progress:33.8% Speed(reviews/sec):5951. #Correct:11828 #Trained:12501 Training Accuracy:94.6%
Progress:40.6% Speed(reviews/sec):5176. #Correct:14262 #Trained:15001 Training Accuracy:95.0%
Progress:47.4% Speed(reviews/sec):4270. #Correct:16723 #Trained:17501 Training Accuracy:95.5%
Progress:54.2% Speed(reviews/sec):4459. #Correct:19028 #Trained:20001 Training Accuracy:95.1%
Progress:61.0% Speed(reviews/sec):4498. #Correct:21467 #Trained:22501 Training Accuracy:95.4%
Progress:67.7% Speed(reviews/sec):4506. #Correct:23901 #Trained:25001 Training 

In [48]:
len(cyber_fc.word2index)

15643