# Optimizing dynamic neural network

In [1]:
%load_ext autoreload
%autoreload 2

# external imports
import numpy as np
import pandas as pd
import seaborn as sb
import gensim
from gensim.scripts.glove2word2vec import glove2word2vec
import pylab as pl
import matplotlib.pyplot as plt
import csv
import scipy
import os.path
import sklearn as sk
import keras
from keras.layers import *
from keras.layers.core import *
import random
import pickle
import gc

# internal imports
import helpers as HL
import cleaning as CL
import glove_module as GV
import neural_nets as NN
import tokenizing as TO



# Constants: paths of the GloVe vector files, one per embedding dimensionality.
# NOTE(review): the file names suggest these were produced with glove2word2vec — confirm.
DATA_FOLDER = "gensim_data_folder"
DATA_25DIM = DATA_FOLDER + "/gensim_glove_vectors_25dim.txt"
DATA_50DIM = DATA_FOLDER + "/gensim_glove_vectors_50dim.txt"
DATA_100DIM = DATA_FOLDER + "/gensim_glove_vectors_100dim.txt"
DATA_200DIM = DATA_FOLDER + "/gensim_glove_vectors_200dim.txt"

Using TensorFlow backend.
  return f(*args, **kwds)


# TODO-list:
    - make dynamic work on exampledata
    - be able to save and load weights
    - alter a model instead of creating a new one
    - check out weight decay

In [2]:
# Load the vectors stored in the 200-dimensional gensim-format .txt file;
# global_vectors is what GV.buildWordVector receives in the cells below.
global_vectors = GV.make_glove(DATA_200DIM)

## Make TRAININGSET ready for neural network

In [3]:
# Training set: the files that make up the corpus (in this order) and the
# number of tweets taken from each class.
corpus_filenames = ['train_pos.txt', 'train_neg.txt', 'test_data.txt']
nr_pos_tweets = nr_neg_tweets = 100000
# total is the sum of both classes
total_training_tweets = nr_pos_tweets + nr_neg_tweets

In [4]:
# Build one corpus from all files in corpus_filenames. corpus_file_lengths is
# printed below; the recorded output shows one length per input file.
full_corpus, corpus_file_lengths = HL.create_corpus(corpus_filenames)

print("Length full corpus", len(full_corpus))
print("File lengths:", corpus_file_lengths)

Length full corpus 210000
File lengths: [100000, 100000, 10000]


In [5]:
# --- Corpus selection ---
processed_corpus = full_corpus

# --- Split: the first total_training_tweets entries are training data, ---
# --- everything after them is the set to predict.                      ---
num_of_dim = global_vectors.syn0.shape[1]
train_corpus = processed_corpus[:total_training_tweets]
predict_corpus = processed_corpus[total_training_tweets:]

# --- One GV.buildWordVector result per training tweet ---
vectors = np.zeros(len(train_corpus), dtype=object)
for i, doc in enumerate(train_corpus):
    if not i % 50000:
        # progress message every 50000 tweets
        print("tweets processed: %.0f  of total number of tweets: %.0f" % (i,len(train_corpus)))
    vectors[i] = GV.buildWordVector(doc, num_of_dim, global_vectors)
train_document_vecs = sk.preprocessing.scale(np.concatenate(vectors))
labels = GV.create_labels(total_training_tweets, nr_pos_tweets)

# --- Same steps for the tweets that go into the Kaggle submission ---
# NOTE(review): these are standardised with their own statistics, not with the
# ones of the training set — confirm this is intended.
test_document_vecs = sk.preprocessing.scale(
    np.concatenate([GV.buildWordVector(doc, num_of_dim, global_vectors) for doc in predict_corpus])
)


tweets processed: 0  of total number of tweets: 200000
tweets processed: 50000  of total number of tweets: 200000
tweets processed: 100000  of total number of tweets: 200000
tweets processed: 150000  of total number of tweets: 200000


## Make FULLSET ready for Neural network

In [None]:
# Load the pre-processed full corpus. The context manager closes the file
# handle, which the original bare open() call never did.
# NOTE: pickle.load can execute arbitrary code — only load files created locally.
with open("FULL_so_far_best_corpus.pkl", "rb") as corpus_file:
    processed_corpus = pickle.load(corpus_file)
print(len(processed_corpus))

# Class sizes used when slicing and labelling the full data set
nr_pos_tweets = 1250000
nr_neg_tweets = 1250000
total_training_tweets = 2500000

In [None]:
###### Choose the corpus
# processed_corpus is the object loaded from the pickle file in the cell above;
# the commented line is a disabled alternative source.
#processed_corpus = processed_full_corpus

###### build vectors of all the tweets ######
num_of_dim = global_vectors.syn0.shape[1]
# separate training data and test data: the first total_training_tweets
# entries are used for training, the remainder is kept for prediction
train_corpus = processed_corpus[:total_training_tweets:] 
predict_corpus = processed_corpus[total_training_tweets::]
del processed_corpus

# Build a vector of all the words in a tweet
vectors = np.zeros(len(train_corpus), dtype=object)
for i, doc in enumerate(train_corpus):
    if (i % 50000) == 0:
        print("tweets processed: %.0f  of total number of tweets: %.0f" % (i,len(train_corpus)))
    vectors[i] = GV.buildWordVector(doc, num_of_dim, global_vectors)
# Large intermediates are deleted as soon as this cell no longer needs them.
# NOTE(review): the KAGGLE cell further down passes global_vectors to
# GV.buildWordVector; after this `del` that name no longer exists, and this cell
# does not build test_document_vecs — confirm the intended run order.
del global_vectors
del doc
print("done with making the trainvectors")

train_document_vecz = np.concatenate(vectors)
del vectors
print("done with concatenating the trainvectors")

train_document_vecs = sk.preprocessing.scale(train_document_vecz)
del train_document_vecz
print("done with scaling the trainvectors")

labels = GV.create_labels(total_training_tweets, nr_pos_tweets)
print("done with creating the labels")

#############################################


In [None]:
# Store the scaled training vectors together with their labels so they do not
# have to be rebuilt. The context manager flushes and closes the file, which
# the original bare open() call left to the garbage collector.
with open("FULL_train_document_vecs_and_labels.pkl", "wb") as vecs_file:
    pickle.dump([train_document_vecs, labels], vecs_file)

In [None]:

# Trigger a garbage-collection pass; the cells above delete several large objects.
gc.collect()

## Only run this one to save time and memory

## Making a perfect neural network

In [6]:
# Short alias: dd(...) is called in the next cell to build the model.
dd = NN.dynamic_dense

In [23]:

# Settings for this experiment; all of them are passed on to dd(...) or
# GV.testing_for_dd(...) below.
input_dimensions = train_document_vecs.shape[1]
width = 200
depth = 8
epochs = 60
n_folds = 2
split = 0.9
dropout_rate=0.4
funnel=0.75

#model = NN.deep_HB(input_dimensions)
model = dd(input_dimensions, width, depth, dropout_rate=dropout_rate, activation='relu', funnel=funnel)
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only appended a stray "None" to the output.
model.summary()

final_model, cv_scores, histories = GV.testing_for_dd(model, train_document_vecs, labels, epochs, n_folds, split)


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_55 (Dense)             (None, 200)               40200     
_________________________________________________________________
dropout_43 (Dropout)         (None, 200)               0         
_________________________________________________________________
dense_56 (Dense)             (None, 150)               30150     
_________________________________________________________________
dropout_44 (Dropout)         (None, 150)               0         
_________________________________________________________________
dense_57 (Dense)             (None, 113)               17063     
_________________________________________________________________
dropout_45 (Dropout)         (None, 113)               0         
_________________________________________________________________
dense_58 (Dense)             (None, 85)                9690      
__________

Epoch 20/60
Epoch 21/60
Epoch 22/60
Epoch 23/60
Epoch 00023: early stopping
Train on 90000 samples, validate on 90000 samples
Epoch 1/60
Epoch 2/60
Epoch 3/60
Epoch 4/60
Epoch 5/60
Epoch 6/60
Epoch 7/60
Epoch 8/60
Epoch 9/60
Epoch 10/60
Epoch 11/60
Epoch 12/60
Epoch 13/60
Epoch 14/60
Epoch 15/60
Epoch 16/60
Epoch 17/60
Epoch 18/60
Epoch 19/60
Epoch 20/60
Epoch 21/60
Epoch 00021: early stopping
Val_accuracies: 82.23% (+/- 0.1267)
Time taken:  6.066332117716471 

evaluate on train_data: (loss:0.35624 , acc:83.690%):
Unseen_accuracies: (loss:0.3811 , acc:82.3550%):
Time taken:  6.293135917186737 



In [None]:
# Gather the weights of every layer of the trained model, one list entry per
# layer, so their value range can be inspected afterwards
# (np.mean / np.std / np.max / np.min on an entry give the statistics).
weights = [layer.get_weights() for layer in final_model.layers]

In [None]:
# Investigate history and cv_scores
print(histories[0].history)
print(cv_scores)
# mean over all folds of the second entry of each cv score
print(np.mean([cv_score[1] for cv_score in cv_scores]))

conf = final_model.get_config()
print(type(conf))
# The original referenced the bound method without calling it, which
# serialises nothing; call it so the JSON description is produced and shown.
final_model.to_json()

In [None]:
# Scratch check: print a 3x3 array, shuffle it in place, print it again.
arr = np.array([[0, 1, 2],
                [3, 4, 5],
                [6, 7, 8]])
print(arr)
np.random.shuffle(arr)
print(arr)


## Make a new FULL_corpus if Hedda finds a better one

In [None]:
# Fetch the full corpus; the helper also returns the positive, negative and
# total training-tweet counts, which overwrite the values set earlier.
full_corpus, nr_pos_tweets, nr_neg_tweets, total_training_tweets=HL.get_corpus(full=True)

In [None]:
# Pre-processing switches, forwarded as keyword arguments to TO.preprocess_corpus.
input_={'hashtag': True, 'segmentation_hash': True,'hugs_and_kisses':True,'all_smilies':True,
        'numbers':True,'number_mention':True,'elongation':True, 'set_to_not':True,'exclamation':True}

better_corpus=TO.preprocess_corpus(full_corpus, **input_)

# Add n-grams (first argument 2), then drop the intermediate corpus to free memory.
better_corpus_with_ngrams = HL.creating_n_grams_corpus(2,better_corpus)
del better_corpus

# NOTE(review): the original passed `MaxDf=` without a value, which is a
# SyntaxError. 1.0 is a placeholder assumed to mean "no upper document-frequency
# cut-off" — confirm the intended value.
MAX_DF = 1.0
stopwords= TO.get_dynamic_stopwords(full_corpus, MinDf=0.00001, MaxDf=MAX_DF,sublinearTF=True,useIDF=False)

stopword_corpus=TO.remove_stopwords(better_corpus_with_ngrams, stopwords)
# The original ended with a bare `del` (also a SyntaxError). Nothing is deleted
# here because better_corpus_with_ngrams is still pickled by the next cell.

In [None]:
# Write the corpus to disk. The original opened the file with "rb" (read-only),
# so pickle.dump could never write to it; "wb" is required. The context manager
# also closes the handle.
with open("FULL_so_far_best_corpus.pkl", "wb") as corpus_file:
    pickle.dump(better_corpus_with_ngrams, corpus_file)

In [None]:
# Read the stored corpus back as a sanity check; the context manager closes the
# file handle that the original bare open() call leaked.
# NOTE: pickle.load can execute arbitrary code — only load files created locally.
with open("FULL_so_far_best_corpus.pkl", "rb") as corpus_file:
    jesus_corpus = pickle.load(corpus_file)
print(len(jesus_corpus))

#################################################################################################
#################################################################################################

## Plotting

In [None]:
def plot_history(history):
    """Plot the per-epoch curves recorded in a training history.

    The original body ignored ``history`` and plotted seaborn's "gammas"
    example dataset; this version plots the data that was passed in.

    Parameters
    ----------
    history : object with a ``history`` attribute, or dict
        Mapping from metric name to a list with one value per epoch (for
        example the ``history`` attribute of the objects in ``histories``
        that is printed elsewhere in this notebook). The bare dict is
        accepted as well.

    Returns
    -------
    The matplotlib Axes holding the plot. The original returned nothing, so
    callers that ignore the result are unaffected.

    Raises
    ------
    ValueError
        If no epochs were recorded.

    Not implemented (from the original notes): averaging the curves over the
    cross-validation folds.
    """
    # Accept both the history object and its underlying dict.
    metrics = getattr(history, "history", history)
    curves = pd.DataFrame(metrics)
    if curves.empty:
        raise ValueError("history contains no recorded epochs")

    # Epochs are numbered from 1 on the x axis.
    curves.index = pd.RangeIndex(1, len(curves) + 1, name="epoch")

    sb.set(style="darkgrid")
    ax = curves.plot()  # one line per recorded metric
    ax.set_xlabel("epoch")
    ax.set_ylabel("score")
    return ax
    

## Saving and loading

In [18]:
# example of how to save and load model
from keras.models import load_model

final_model.save('sunday0017_model.h5')  # creates the HDF5 file 'sunday0017_model.h5'
# final_model is deliberately NOT deleted any more: the k-fold cell and the
# Kaggle cells further down still reference it and failed with a NameError
# when the notebook was run top to bottom.


In [None]:
# Load the model back from the file written by final_model.save(...) above.
# The original asked for 'my_model.h5', a name this notebook never writes.
jesus_model = load_model('sunday0017_model.h5')

## Further training of model

In [None]:
print(jesus_model)

# further train it: the reloaded model is trained for two more epochs on the
# complete set of training vectors and labels
allX = train_document_vecs
allY = labels

improved_model, history = GV.train_NN(jesus_model, allX, allY, epochs=2)

In [None]:
# Run k-fold cross-validation on the trained model (passed as a one-element list)
epochs = 4
n_folds = 2

model_scores= GV.run_k_fold([final_model], train_document_vecs, labels, epochs, n_folds)


In [20]:
print(final_model)
# `model` is the name the KAGGLE cell below calls predict() on
model = final_model

<keras.models.Sequential object at 0x17a113da0>


## KAGGLE

In [21]:

# FOR THE KAGGLE SUBMISSION
# NOTE(review): the test tweets are standardised with their own statistics,
# not with those of the training set — confirm this is intended.
test_document_vecs = np.concatenate([GV.buildWordVector(doc, num_of_dim, global_vectors) for doc in predict_corpus])
test_document_vecs = sk.preprocessing.scale(test_document_vecs)

print("Hello world")
pred=model.predict(test_document_vecs)

# Outputs above 0.5 become +1, everything else -1 (same rule as the original
# loop, applied to the flattened prediction array in one step).
pred_ones = np.where(np.ravel(pred) > 0.5, 1, -1).tolist()

#CREATING SUBMISSION
# One id per prediction, counted from 1. Derived from the predictions instead
# of the hard-coded 10000 so ids and labels can never differ in length.
ids = list(range(1, len(pred_ones) + 1))
HL.create_csv_submission(ids, pred_ones,"RIGHTONEsunday0017_w200_d6_n3_dr04.txt")

Hello world


## NOT RELEVANT

In [None]:
# NOTE: run this only after the cells that define hyperparameter_improver,
# init_hyperparameters and step_sizes. The original misspelled the function
# name, omitted step_sizes and unpacked two of the three returned values.
model, hyperparameters, stepsizes = hyperparameter_improver(dd, train_document_vecs, labels, init_hyperparameters, step_sizes)
    

In [None]:
def hyperparameter_improver(model, X, Y, init_hyper, step_sizes, epochs=3, n_folds=2, split=0.7, activation='relu',
                            epoch_threshold=2, time_sensitivity=1, n_rounds=2):
    """Search around ``init_hyper`` by repeatedly training and scoring a model.

    Each round builds a model from the current hyperparameters, evaluates it
    with ``GV.testing_for_dd``, scores the outcome with ``score_state`` and,
    if another round follows, lets ``act_on_move`` change one hyperparameter.

    Parameters
    ----------
    model : callable
        Model builder, called as ``model(input_dimensions, width, depth,
        dropout_rate=..., activation=...)``. It is called once per round.
    X, Y : training vectors and labels; ``X.shape[1]`` is the input width.
    init_hyper : dict with the keys 'width', 'depth' and 'dropout_rate'.
        It is copied, so the caller's dict is left untouched.
    step_sizes : dict giving the step size for every key of ``init_hyper``.
    epochs, n_folds, split : passed through to ``GV.testing_for_dd``.
    activation : passed through to the model builder.
    epoch_threshold : currently unused; kept for interface compatibility.
    time_sensitivity : weight of the training time in the score.
    n_rounds : number of train/score rounds (the original hard-coded 2).

    Returns
    -------
    (final_model, hypers, step_sizes)
        The model object returned by the last ``GV.testing_for_dd`` call, the
        hyperparameters it was built with, and the step sizes.
    """
    import time  # local import: the notebook's import cell does not import `time`

    input_dimensions = X.shape[1]
    hypers = dict(init_hyper)

    # weights of the three parts of the score
    time_const = 0.1
    loss_const = 2
    acc_const = 0.5

    state_hist = []
    hyper_hist = [] # list of tuples like [('depth', 1), ('width', 0)...] where 1 is change up, and vice versa
    final_model = None

    for round_nr in range(n_rounds):
        start = time.time()

        # `model` stays bound to the builder; the original rebound it to the
        # built instance and therefore could not build a second candidate.
        candidate = model(input_dimensions, hypers['width'], hypers['depth'],
                          dropout_rate=hypers['dropout_rate'], activation=activation)
        # testing_for_dd returns three values (see the training cell above).
        final_model, cv_scores, _histories = GV.testing_for_dd(candidate, X, Y, epochs, n_folds, split)
        time_used = time.time() - start

        state, _time_dominant, _acc = score_state(cv_scores, time_used,
                                                  time_sensitivity=time_sensitivity,
                                                  time_punish_constant=time_const,
                                                  loss_const=loss_const,
                                                  acc_const=acc_const)
        state_hist.append(state)

        # Only propose a new setting when it will actually be evaluated, so the
        # returned hyperparameters match the returned model.
        if round_nr < n_rounds - 1:
            act_on_move(state_hist, hypers, hyper_hist, step_sizes=step_sizes)

    return final_model, hypers, step_sizes


def _resolve_step_sizes(step_sizes):
    """Return ``step_sizes``, or the notebook-level ``step_sizes`` global if None."""
    if step_sizes is not None:
        return step_sizes
    try:
        # Backward compatibility: the original helpers read this global.
        return globals()["step_sizes"]
    except KeyError:
        raise NameError("step_sizes was not passed and no global `step_sizes` is defined")


def choose_hyperparameter(hypers):
    """Return one key of ``hypers``, picked at random."""
    # random.choice needs an indexable sequence; a dict view is not one.
    key = random.choice(list(hypers))
    print("is this a key:", key)
    return key


def choose_new_value(hypers, hyper_hist, step_sizes=None):
    """Move a randomly chosen hyperparameter one step up or down.

    ``hypers`` is changed in place and the move is appended to ``hyper_hist``
    as ``(key, 1)`` for up or ``(key, 0)`` for down. ``step_sizes`` defaults to
    the notebook-level global of the same name.

    Returns a 1-tuple ``(key,)``, as the original's trailing comma did.
    """
    key = choose_hyperparameter(hypers)
    up_or_down = random.choice([0, 1])
    update_value(hypers, key, up_or_down, step_sizes=step_sizes, hyper_hist=hyper_hist)
    return (key,)


def update_value(hypers, key, up_or_down, step_sizes=None, hyper_hist=None):
    """Change ``hypers[key]`` by one step: up if ``up_or_down`` is truthy, else down.

    ``step_sizes`` defaults to the notebook-level global of the same name.
    When ``hyper_hist`` is given, the move is appended to it as ``(key, 1)``
    or ``(key, 0)``; the original referred to an undefined ``hyper_hist`` here.
    """
    step = _resolve_step_sizes(step_sizes)[key]
    if up_or_down:
        hypers[key] += step
    else:
        hypers[key] -= step
    if hyper_hist is not None:
        hyper_hist.append((key, 1 if up_or_down else 0))


def act_on_move(state_hist, hypers, hyper_hist, step_sizes=None, threshold=0.02):
    """Decide the next move from how the score changed in the last round.

    * Fewer than two scores, or no recorded move yet: make a random move.
    * Score dropped by more than ``threshold`` (low score is good, see
      ``score_state``): repeat the last move.
    * Otherwise: make a random move on a randomly chosen hyperparameter.

    Not implemented (from the original notes): halving the step size and
    stepping back after a bad move.

    NOTE(review): the original computed ``state_hist[-1] > - state_hist[-2]``,
    a boolean. It is read here as a mistyped difference, with a positive value
    meaning improvement — confirm the intended direction.
    """
    if len(state_hist) < 2 or not hyper_hist:
        choose_new_value(hypers, hyper_hist, step_sizes=step_sizes)
        return

    improvement = state_hist[-2] - state_hist[-1]
    if improvement > threshold:
        # repeat the previous move: same key, same direction
        key, up_or_down = hyper_hist[-1]
        update_value(hypers, key, up_or_down, step_sizes=step_sizes, hyper_hist=hyper_hist)
    else:
        choose_new_value(hypers, hyper_hist, step_sizes=step_sizes)


def score_state(cv_scores, time_used, time_sensitivity=1, time_punish_constant=0.1, loss_const=2, acc_const=0.5):
    """Combine loss, accuracy and training time into one score; low score is good.

    Parameters
    ----------
    cv_scores : sequence of per-fold scores; column 0 is used as the loss and
        column 1 as the accuracy.
    time_used : elapsed time of the round.
    time_sensitivity, time_punish_constant, loss_const, acc_const :
        weights of the individual terms. The defaults are the constants the
        original improver defined locally but never passed on.

    Returns
    -------
    (score, time_dominant, acc)
        ``score`` is the sum of the three punishments, ``time_dominant`` is the
        rounded ratio of the time term to the other two minus 0.5, and ``acc``
        is the mean accuracy.
    """
    cv = np.asarray(cv_scores, dtype=float)
    time_punishment = time_sensitivity * time_punish_constant * time_used
    loss_punishment = np.mean(cv[:, 0]) * loss_const
    acc = np.mean(cv[:, 1])
    acc_punishment = acc_const / acc
    time_dominant = round(time_punishment / (loss_punishment + acc_punishment) - 0.5)
    return (loss_punishment + acc_punishment + time_punishment), time_dominant, acc
    

In [None]:
# Neural net: the builder function itself (not an already built model) is
# handed to the improver, because hyperparameter_improver calls its first
# argument to construct every candidate model.
dd = NN.dynamic_dense
X = train_document_vecs
Y = labels

init_hyperparameters = {'width':200, 'depth':5, 'dropout_rate':0.2}
step_sizes = {'width':10, 'depth':1, 'dropout_rate':0.025}

# The original passed `model` here and left `dd` unused.
model, hyperparameters, stepsizes = hyperparameter_improver(dd, X, Y, init_hyperparameters, step_sizes, epochs=3, n_folds=2, split=0.7, activation='relu',
                            epoch_threshold=2, time_sensitivity=1)