# Run neural networks to detect cyberbullying in Formspring data



In [1]:
# Reload edited project modules automatically before each cell is executed,
# so changes to the local .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import math
import random
import sys
import os
from util.Util import *
from util import formspring_data_parser
import pandas as pd
from sklearn import model_selection

In [3]:
# Put the project checkout on sys.path, presumably so the project modules
# imported below (nn, fc_nn) resolve.
# The location can be overridden with the CYBERBULLYING_DETECTION_HOME
# environment variable; the default is the original author's checkout.
import os
import sys
project_root = os.environ.get(
    "CYBERBULLYING_DETECTION_HOME",
    "/Users/luisd/dev/cyberbullying-detection")
# Re-running this cell must not keep appending the same entry.
if project_root not in sys.path:
    sys.path.append(project_root)
from nn import *
import nn
import fc_nn

Using TensorFlow backend.


In [4]:
# sys.path.append(...) # in case we want to add something else

In [5]:
import phonemes_from_graphemes as pg
from words_2_vectors import * 
from util import formspring_data_parser

In [6]:
# import importlib
# importlib.reload(pg)
from phonemes_from_graphemes import *

### Where the data is expected to live

In [7]:
# Root folder for input data; passed to ModelWrapper and SoundsDict further down.
data_dir = "./data"

In [8]:
# One shared logger for the whole notebook; its startup message reports
# INFO+ going to the console and DEBUG+ going to the named log file.
common_logger = get_logger(
    name="common_logger",
    debug_log_file_name="common_logger.log",
)

2017-06-14 06:07:23,993 - common_logger - INFO - 'common_logger': logging 'INFO'+ logs to Console, 'DEBUG'+ logs to '/Users/luisd/dev/cyberbullying-detection/common_logger.log'


Logger created
Creating debug handler at '/Users/luisd/dev/cyberbullying-detection/common_logger.log'
'common_logger': logging 'INFO'+ logs to Console, 'DEBUG'+ logs to '/Users/luisd/dev/cyberbullying-detection/common_logger.log'


In [9]:
# Smoke test for the debug channel. Per the logger's startup message, DEBUG
# records go to the log file only, so nothing is expected on the console.
common_logger.debug("hello, this is a debug msg")

In [10]:
# Show which file the debug records are written to.
# Assumes handlers[1] is the file handler (handlers[0] presumably the console
# one) — TODO confirm against get_logger.
common_logger.handlers[1].baseFilename

'/Users/luisd/dev/cyberbullying-detection/common_logger.log'

## Load Word2Vec model trained on Google News 

In [11]:
# Load the pretrained Google News Word2Vec vectors from data_dir.
# Slow cell: in the recorded run the load took roughly two and a half minutes.
mw = ModelWrapper.from_google_news_model(data_dir=data_dir, alogger=common_logger)

2017-06-14 06:07:24,081 - common_logger - INFO - Loading model from ./data/GoogleNews-vectors-negative300.bin.gz...
2017-06-14 06:09:50,037 - common_logger - INFO - Model succesfully loaded
2017-06-14 06:09:50,040 - common_logger - INFO - Sort all the words in the model, so that we can auto-complete queries quickly...


In [12]:
m = mw.model # keep a short alias to the model object held by the wrapper

In [13]:
# Number of entries in the model's vocabulary (3,000,000 in the recorded run).
len(m.vocab)

3000000

## Creating the sounds dictionary

### Let's read the sounds dictionary

In [14]:
# Build the sounds dictionary from data_dir. Judging by the 'ju:' lookup in the
# following cell, it maps a sound key to the set of spellings sharing that sound.
sounds_dict = SoundsDict(a_dir=data_dir, alogger=common_logger)

In [15]:
# Example lookup: every spelling stored under the sound key 'ju:'.
sounds_dict['ju:']

{'YOU', 'YOu', 'You', 'you'}

In [16]:
# Hand the sounds dictionary to the model wrapper; presumably this is what
# backs mw.sound_to_word — TODO confirm in ModelWrapper.
mw.set_sounds_dict(sounds_dict=sounds_dict)

In [17]:
# Same lookup through the wrapper; in the recorded run it returns the same set
# as sounds_dict['ju:'].
mw.sound_to_word('ju:')

{'YOU', 'YOu', 'You', 'you'}

## Read data from XML  

In [18]:
import os

# Location of the labelled Formspring XML file. Override it with the
# FORMSPRING_XML environment variable; the default is the original author's
# local path, so existing setups keep working unchanged.
xml_file_name = os.environ.get(
    "FORMSPRING_XML",
    '/Users/luisd/Downloads/FormspringLabeledForCyberbullying/XMLMergedFile.xml')
phonemesFactory = PhonemesFromGraphemes(alogger=common_logger)
# The parser reuses the logger already attached to the model wrapper.
parser = formspring_data_parser.Formspring_Data_Parser(
    xml_file_name, pg=phonemesFactory, mw=mw, alogger=mw.alogger)

### Let's parse (or read) the whole document 

* To generate it: `qal = parser.all_questions_answers_labels()` (and then you can save it with `qal.to_csv("a_file_name.csv")`) 
* To read it from somewhere: `all_my_data = pd.read_csv("from_somewhere.csv")`

In [19]:
# all_of_them = parser.questions_answers_labels(an_id = 1)
# all_of_them

In [20]:
# really_all = parser.all_questions_answers_labels()

In [21]:
# Load the previously parsed dataset from its CSV cache instead of re-parsing the XML.
# NOTE(review): pandas' to_csv writes the row index by default, so a file saved
# with the commented-out to_csv call in this notebook likely carries an extra
# unnamed index column — confirm.
really_all = pd.read_csv("really_all_of_them.csv")

In [22]:
# Keep only the rows whose "threat" flag is set.
threatening_items = really_all[really_all["threat"] == True]

In [23]:
# (rows, columns) of the threat-only subset.
threatening_items.shape

(4879, 7)

In [24]:
# (rows, columns) of the full dataset, for comparison with the threat-only
# subset: in the recorded run 4879 of 36885 rows (about 13%) are threats.
really_all.shape

(36885, 7)

In [25]:
# really_all.to_csv("really_all_of_them.csv")

In [26]:
# Join each question with its answer into a single "question; answer" string.
questions_answers = list(map(
    "{}; {}".format,
    really_all['question'].tolist(),
    really_all['answer'].tolist()))

In [27]:
# Peek at the first ten combined strings.
# NOTE(review): rows with a missing question/answer show up as 'nan; nan'.
questions_answers[:10]

["what's your favorite song laughter; I like too many songs to have a favorite",
 '----------------------------------------------------------_3; haha jk',
 "'3; haha jk",
 'angel you duh; Really Thanks haha',
 'nan; nan',
 'nan; nan',
 'any makeup tips i suck at doing my makeup laughter; Sure Like tell me wht u know Like wht do you use',
 "I miss It's Emma btw haha; EMMA laughter I yu",
 "I miss It's Emma btw haha; EMMA laughter I ue",
 "I miss It's Emma btw haha; EMMA laughter I ew"]

In [28]:
# Translate the truthiness of each "threat" value into a text label.
labels = list(map(
    lambda is_threat: "THREAT" if is_threat else "CLEAN",
    really_all['threat'].tolist()))

In [29]:
# Peek at the first ten labels.
labels[:10]

['CLEAN',
 'CLEAN',
 'CLEAN',
 'CLEAN',
 'CLEAN',
 'CLEAN',
 'CLEAN',
 'CLEAN',
 'CLEAN',
 'CLEAN']

## Let's build a fully connected network to do this job

In [31]:
# Fully connected network over the combined question/answer strings.
cyber_fc = fc_nn.CyberbullyingFullyConnectedNetwork(
    reviews=questions_answers,
    labels=labels,
    hidden_nodes=300,
    learning_rate=0.01,
)

In [32]:
# Train on every row (no held-out split is used here), so the figure printed by
# the progress log is training accuracy only.
# For scale: about 87% of the rows are CLEAN in the recorded data, so always
# answering CLEAN would already score roughly 87%.
cyber_fc.train(questions_answers, labels)

Progress:0.0% Speed(reviews/sec):0.0 #Correct:0 #Trained:1 Training Accuracy:0.0%
Progress:6.77% Speed(reviews/sec):5479. #Correct:2339 #Trained:2501 Training Accuracy:93.5%
Progress:13.5% Speed(reviews/sec):5581. #Correct:4784 #Trained:5001 Training Accuracy:95.6%
Progress:20.3% Speed(reviews/sec):5817. #Correct:7077 #Trained:7501 Training Accuracy:94.3%
Progress:27.1% Speed(reviews/sec):5920. #Correct:9484 #Trained:10001 Training Accuracy:94.8%
Progress:33.8% Speed(reviews/sec):5533. #Correct:11828 #Trained:12501 Training Accuracy:94.6%
Progress:40.6% Speed(reviews/sec):4824. #Correct:14262 #Trained:15001 Training Accuracy:95.0%
Progress:47.4% Speed(reviews/sec):3970. #Correct:16723 #Trained:17501 Training Accuracy:95.5%
Progress:54.2% Speed(reviews/sec):4167. #Correct:19028 #Trained:20001 Training Accuracy:95.1%
Progress:61.0% Speed(reviews/sec):4194. #Correct:21467 #Trained:22501 Training Accuracy:95.4%
Progress:67.7% Speed(reviews/sec):4206. #Correct:23901 #Trained:25001 Training 

In [55]:
# Number of entries in the network's word-to-index mapping.
len(cyber_fc.word2index)

15643

## P-CNN section 

In [33]:
import spacy # nl processing
# NOTE(review): 'en' is a shortcut model name; spaCy 3+ no longer supports
# shortcut names and needs the full package name (e.g. 'en_core_web_sm') —
# confirm against the installed spaCy version.
nlp = spacy.load('en')

In [34]:
import util.Data as util_data

# Wrap the raw strings in the project's review/label types and hand them,
# together with the word-vector wrapper and the spaCy pipeline, to the encoder.
sents_encoder = util_data.reviews_labels_encoder(
    mw,
    n_words_in_review=10,
    reviews=[util_data.review(text) for text in questions_answers],
    labels=[util_data.label(tag) for tag in labels],
    spacy_nlp=nlp)

### Let's create the network

In [35]:
# Build the convolutional network. words_in_review=10 matches the
# n_words_in_review given to the encoder; features_in_words=300 matches the
# last dimension of the encoded reviews.
# The layer summary appears in this cell's output although summary() is
# commented out — presumably the constructor prints it; TODO confirm.
cnn_k = CyberbullyingDetectionnNN(features_in_words=300, words_in_review=10)
# cnn_k.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_1 (InputLayer)             (None, 10, 300, 1)    0                                            
____________________________________________________________________________________________________
conv2D_1x1 (Conv2D)              (None, 10, 300, 100)  200                                          
____________________________________________________________________________________________________
conv2D_2x2 (Conv2D)              (None, 10, 300, 100)  500                                          
____________________________________________________________________________________________________
conv2D_3x3 (Conv2D)              (None, 10, 300, 100)  1000                                         
___________________________________________________________________________________________

### If you need a sanity check, run the next cell

In [36]:
# Optional: run the sanity check shipped with the project's nn module. In the
# recorded run it printed a layer summary like the one shown for cnn_k.
nn.sanity_check()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
input_2 (InputLayer)             (None, 10, 300, 1)    0                                            
____________________________________________________________________________________________________
conv2D_1x1 (Conv2D)              (None, 10, 300, 100)  200                                          
____________________________________________________________________________________________________
conv2D_2x2 (Conv2D)              (None, 10, 300, 100)  500                                          
____________________________________________________________________________________________________
conv2D_3x3 (Conv2D)              (None, 10, 300, 100)  1000                                         
___________________________________________________________________________________________

In [37]:
# Encode all reviews into a single array for the network.
reviews_as_matrix = sents_encoder.reviews_as_matrix()

In [38]:
# Recorded shape is (36885, 10, 300): one entry per review, 10 words per review
# (n_words_in_review), 300 features per word.
reviews_as_matrix.shape

(36885, 10, 300)

In [39]:
# Encode all labels into a single array aligned with reviews_as_matrix.
labels_as_matrix = sents_encoder.labels_as_matrix()

In [40]:
# Recorded shape is (36885, 2): one row per review and two columns —
# presumably a one-hot CLEAN/THREAT encoding, TODO confirm.
labels_as_matrix.shape

(36885, 2)

### Split the data
TODO: the classes are very imbalanced (about 13% of the rows are threats), so we should over- or under-sample the training data, e.g. with [imbalanced-learn](https://github.com/scikit-learn-contrib/imbalanced-learn).  

In [41]:
# Hold out a third of the rows for evaluation. The classes are heavily
# imbalanced, so stratify on the class index to keep the threat ratio the same
# in the training and test parts.
# Assumes labels_as_matrix is a NumPy array with one row per review and one
# column per class (one-hot) — TODO confirm against sents_encoder.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    reviews_as_matrix,
    labels_as_matrix,
    test_size=0.33,
    random_state=42,
    stratify=labels_as_matrix.argmax(axis=1))

In [44]:
# Fit on the training split only. The reshape appends a trailing axis of
# size 1, matching the (None, 10, 300, 1) input layer in the model summary.
cnn_k.fit(
    x_train=X_train.reshape(X_train.shape + (1,)),
    y_train=y_train,
    batch_size=800,
    epochs=5,
)

Epoch 1/5


KeyboardInterrupt: 

In [66]:
# Fit on ALL rows with a small batch size.
# NOTE(review): this includes the rows held out as X_test / y_test, so any
# later evaluation on that split would not measure generalization — confirm
# this is intended.
cnn_k.fit(
    x_train=reviews_as_matrix.reshape(reviews_as_matrix.shape + (1,)),
    y_train=labels_as_matrix,
    batch_size=16,
    epochs=5,
)

Epoch 1/5
 1264/36885 [>.............................] - ETA: 1284s - loss: 0.0099 - acc: 0.9953

KeyboardInterrupt: 