# Training of complex Neural Network

In [1]:
%load_ext autoreload
%autoreload 2

# external imports
import numpy as np
import pandas as pd
import seaborn as sb
import gensim
from gensim.scripts.glove2word2vec import glove2word2vec
import csv
import scipy
import os.path
import sklearn as sk
import keras
import random
import pickle
import gc
import time

# internal imports
import helpers as HL
import glove_module as GV
import neural_nets as NN
import tokenizing as TO



# Constants: locations of the gensim-format GloVe vector files, one per embedding size.
DATA_FOLDER = "gensim_data_folder"
DATA_25DIM = f"{DATA_FOLDER}/gensim_glove_vectors_25dim.txt"
DATA_50DIM = f"{DATA_FOLDER}/gensim_glove_vectors_50dim.txt"
DATA_100DIM = f"{DATA_FOLDER}/gensim_glove_vectors_100dim.txt"
DATA_200DIM = f"{DATA_FOLDER}/gensim_glove_vectors_200dim.txt"

Using TensorFlow backend.
  return f(*args, **kwds)


#### Import corpus

In [2]:
# Loading corpus

# The pre-processed corpus is stored as a pickle. Open it in a `with` block so the
# file handle is closed as soon as loading finishes instead of being leaked.
# NOTE(review): pickle.load can execute arbitrary code - only load corpus files that
# were produced by this project.
with open("stopword100_corpus_n2_SHM_E_SN_H_HK.pkl", "rb") as corpus_file:
    awesome_corpus = pickle.load(corpus_file)
print(len(awesome_corpus))

# Number of labelled tweets per class. The total is derived from the two class counts
# so the three numbers cannot drift apart; the vectorization cell uses it to split the
# corpus into the training part and the part to predict.
nr_pos_tweets = 1250000
nr_neg_tweets = 1250000
total_training_tweets = nr_pos_tweets + nr_neg_tweets

2510000


#### Import global vectors

In [3]:
# Load the word vectors from the 200-dimensional gensim-format GloVe .txt file
# (DATA_200DIM) so that they can be looked up when vectorizing the tweets.
# NOTE(review): the return type of GV.make_glove is not visible here; a later cell reads
# `global_vectors.syn0`, so it is presumably a gensim word2vec/KeyedVectors-style
# object - confirm in glove_module.
global_vectors = GV.make_glove(DATA_200DIM)

#### Vectorize corpus

In [4]:
# Import the scaler explicitly: `import sklearn as sk` alone does not guarantee that the
# `preprocessing` submodule is loaded, so `sk.preprocessing` only works if some other
# import happened to pull it in.
from sklearn.preprocessing import scale

start = time.time()

###### build vectors of all the tweets ######
# Dimensionality of the word vectors (number of columns of the embedding matrix).
# NOTE(review): `syn0` is an older gensim attribute name - confirm it exists in the
# installed gensim version.
num_of_dim = global_vectors.syn0.shape[1]
# Separate training data and test data: the first `total_training_tweets` documents are
# used for training, everything after them is kept for the prediction step.
train_corpus = awesome_corpus[:total_training_tweets]
predict_corpus = awesome_corpus[total_training_tweets:]
# Free the full corpus. This makes the cell non-idempotent: re-running it requires
# re-running the corpus-loading cell first.
del awesome_corpus

# Build one vector per tweet from the vectors of its words
vectors = np.zeros(len(train_corpus), dtype=object)
for i, doc in enumerate(train_corpus):
    if (i % 50000) == 0:
        print("tweets processed: %.0f  of total number of tweets: %.0f" % (i,len(train_corpus)))
    vectors[i] = GV.buildWordVector(doc, num_of_dim, global_vectors)
# The embeddings are large and no longer needed in this cell. (`doc` is intentionally not
# deleted: it only aliases an element of train_corpus, so deleting it frees nothing and
# would raise NameError on an empty corpus.)
del global_vectors
print("done with making the trainvectors")

# Stack the per-tweet vectors into one 2-D array (one row per tweet)
train_document_vecz = np.concatenate(vectors)
del vectors
print("done with concatenating the trainvectors")

# Standardize the features, then drop the unscaled copy to save memory
train_document_vecs = scale(train_document_vecz)
del train_document_vecz
print("done with scaling the trainvectors")

labels = GV.create_labels(total_training_tweets, nr_pos_tweets)
print("done with creating the labels")
print("time used one the ordeal:", time.time() - start)

#############################################

tweets processed: 0  of total number of tweets: 2500000
tweets processed: 50000  of total number of tweets: 2500000
tweets processed: 100000  of total number of tweets: 2500000
tweets processed: 150000  of total number of tweets: 2500000
tweets processed: 200000  of total number of tweets: 2500000
tweets processed: 250000  of total number of tweets: 2500000
tweets processed: 300000  of total number of tweets: 2500000
tweets processed: 350000  of total number of tweets: 2500000
tweets processed: 400000  of total number of tweets: 2500000
tweets processed: 450000  of total number of tweets: 2500000
tweets processed: 500000  of total number of tweets: 2500000
tweets processed: 550000  of total number of tweets: 2500000
tweets processed: 600000  of total number of tweets: 2500000
tweets processed: 650000  of total number of tweets: 2500000
tweets processed: 700000  of total number of tweets: 2500000
tweets processed: 750000  of total number of tweets: 2500000
tweets processed: 800000  of t

#### Define the neural network model

In [5]:
# Defining the model :)
# Number of input features = number of columns of the scaled tweet vectors
input_dimensions = train_document_vecs.shape[1]
width = 500
depth = 2
# NOTE(review): epochs, n_folds and split are not passed to any call in this notebook
# (GV.train_NN is called with only the model, the vectors and the labels, and its log
# shows 100000 epochs and a 2000000/500000 split), so they currently have no effect.
epochs = 60
n_folds = 2
split = 0.9
dropout_rate=0.4
funnel=0.3

#model = NN.deep_HB(input_dimensions)
model = NN.dynamic_dense(input_dimensions, width, depth, dropout_rate=dropout_rate, activation='relu', funnel=funnel)
# summary() prints the layer table itself and returns None, so wrapping it in print()
# only appended a stray "None" line to the output.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 500)               100500    
_________________________________________________________________
dropout_1 (Dropout)          (None, 500)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 150)               75150     
_________________________________________________________________
dropout_2 (Dropout)          (None, 150)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 151       
Total params: 175,801
Trainable params: 175,801
Non-trainable params: 0
_________________________________________________________________
None


#### Train model on dataset
This can take a while. Training should stop on its own through early stopping (patience=10), but it can also be stopped prematurely by interrupting the kernel. In that case it returns the last model it was working on (BUT NOT NECESSARILY THE BEST); use the "train_NN_dynamic_model.hdf5" file saved by ModelCheckpoint instead. It may be worth trying a short run once before running it for a long time.

In [6]:
# Train the network on the scaled tweet vectors and their labels. GV.train_NN returns the
# resulting model together with a second value that is bound to `history` here.
# According to the traceback stored with this cell, train_NN calls model.fit with
# batch_size=1024, early-stopping and model-checkpoint callbacks, and uses the tail of
# the data (from `split_size` onwards) as validation data.
improved_model, history = GV.train_NN(model, train_document_vecs, labels)
# Save a backup of the returned model (HDF5 file) in case something goes wrong later
improved_model.save('Backup_of_further_training_model.h5')

Train on 2000000 samples, validate on 500000 samples
Epoch 1/100000
Epoch 2/100000
Epoch 3/100000
Epoch 4/100000
Epoch 5/100000
Epoch 6/100000
Epoch 7/100000

time spent training: 22022.44162297249


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/Users/havardbjornoy/EPFL_Jupyter/Machine_Learning/CD-433-Project-2/glove_module.py", line 290, in train_NN
    history = model.fit(allX[:split_size], allY[:split_size], epochs=epochs, batch_size=1024, verbose=1, callbacks=[early_stopping, model_checkpoint], validation_data=(allX[split_size:], allY[split_size:]))
  File "/Users/havardbjornoy/anaconda3/lib/python3.6/site-packages/keras/models.py", line 960, in fit
    validation_steps=validation_steps)
  File "/Users/havardbjornoy/anaconda3/lib/python3.6/site-packages/keras/engine/training.py", line 1657, in fit
    validation_steps=validation_steps)
  File "/Users/havardbjornoy/anaconda3/lib/python3.6/site-packages/keras/engine/training.py", line 1202, in _fit_loop
    ins_batch = _slice_arrays(ins[:-1], batch_ids) + [ins[-1]]
  File "/Users/havardbjornoy/anaconda3/lib/python3.6/site-packages/keras/engine/training.py", line 415, in _slice_arrays
    return [None if x is None else x[start] for 

TypeError: must be str, not list

## Submit to Kaggle when you are happy with your model

In [None]:
# FOR THE KAGGLE SUBMISSION
# Import the scaler explicitly: `import sklearn as sk` alone does not guarantee that
# `sk.preprocessing` is available as an attribute.
from sklearn.preprocessing import scale

# The vectorization cell deletes `global_vectors` to free memory, so on a straight
# top-to-bottom run the name no longer exists here. Reload the embeddings if needed.
if "global_vectors" not in globals():
    global_vectors = GV.make_glove(DATA_200DIM)

# Build one vector per tweet to predict and stack them into a single 2-D array
test_document_vecs = np.concatenate([GV.buildWordVector(doc, num_of_dim, global_vectors) for doc in predict_corpus])
# NOTE(review): the test vectors are standardized with their OWN mean/std, not with the
# statistics of the training data - confirm this is intended.
test_document_vecs = scale(test_document_vecs)

# NOTE(review): this predicts with `model`, not with `improved_model` returned by
# GV.train_NN or the checkpoint file written during training - confirm which is wanted.
pred = model.predict(test_document_vecs)

# Map every model output above 0.5 to label 1 and everything else to label -1
pred_ones = np.where(np.ravel(pred) > 0.5, 1, -1).tolist()

#CREATING SUBMISSION
# Ids run from 1 to the number of predictions (10000 for the corpus loaded above)
ids = list(range(1, len(pred_ones) + 1))
HL.create_csv_submission(ids, pred_ones,"best_proc_corpus_dynamic_dense.txt")