### ideas
- We *could* use PCA to visualize the word2vec embeddings

# Designing neural networks
##### Content:
- Import the pretrained GloVe vector space
- Import our own data
- Classify with a Keras feed-forward neural network (FFNN)


##### possible additional steps
- clustering
- preprocessing
- tf-idf
- experiment with different neural networks
- visualize the vector space with PCA
- visualize end results


In [1]:
%load_ext autoreload
%autoreload 2

# external imports
import numpy as np
import pandas as pd
import seaborn as sb
import gensim
from gensim.scripts.glove2word2vec import glove2word2vec
import pylab as pl
import matplotlib.pyplot as plt
import csv
import scipy
import os.path
import sklearn as sk
import keras
from keras.layers import *
from keras.layers.core import *

# internal imports
import helpers as HL
import cleaning as CL
import glove_module as GV
import neural_nets as NN



# Constants
# Folder holding the gensim-converted GloVe files, and one path per
# embedding dimensionality (25, 50, 100 and 200).
DATA_FOLDER = "gensim_data_folder"
DATA_25DIM = "{}/gensim_glove_vectors_25dim.txt".format(DATA_FOLDER)
DATA_50DIM = "{}/gensim_glove_vectors_50dim.txt".format(DATA_FOLDER)
DATA_100DIM = "{}/gensim_glove_vectors_100dim.txt".format(DATA_FOLDER)
DATA_200DIM = "{}/gensim_glove_vectors_200dim.txt".format(DATA_FOLDER)

Using TensorFlow backend.
  return f(*args, **kwds)


## Import pretrained GloVe with gensim
One can use gensim's word2vec functions to check word similarity and perform other interesting operations:
https://radimrehurek.com/gensim/models/word2vec.html

In [2]:
# Only needs to be run the first time the pretrained GloVe vectors are imported.
# Creates a gensim word2vec file in the same folder.
# NOTE(review): the call below converts the 25-dim file, while the next cell
# loads DATA_200DIM -- pass the constant that matches the file you want to load.
# GV.create_gensim_word2vec_file(DATA_25DIM)

In [3]:
# Loads the gensim-format .txt file (here the 200-dimensional one) into the
# word-vector model used by the rest of the notebook.
# Presumably a gensim word2vec/KeyedVectors object -- TODO confirm in glove_module.
global_vectors = GV.make_glove(DATA_200DIM)

## Get own data ready for classification

#### Some variables

In [4]:
# Settings for the training set.
# The file order matters: the vectorisation cell treats the first
# `total_training_tweets` documents of the combined corpus as training data
# and everything after them as the data to predict.
corpus_filenames = ['train_pos.txt', 'train_neg.txt', 'test_data.txt']
nr_pos_tweets = 100000
nr_neg_tweets = 100000
# Derived rather than hard-coded so it cannot drift from the two class counts.
total_training_tweets = nr_pos_tweets + nr_neg_tweets

#### Create corpus from files

In [5]:
# Build one combined corpus from all files listed in `corpus_filenames`;
# the helper also returns a per-file length for each input file.
full_corpus, corpus_file_lengths = HL.create_corpus(corpus_filenames)

# Sanity check: the total should equal the sum of the per-file lengths.
print("Length full corpus", len(full_corpus))
print("File lengths:", corpus_file_lengths)

Length full corpus 210000
File lengths: [100000, 100000, 10000]


#### Possible preprocessing

In [6]:
# Optional preprocessing: presumably builds an n-gram (n=2) version of the
# corpus -- TODO confirm in helpers.
# NOTE(review): the result is only used if `processed_corpus` in the next cell
# is set to `ngram_full_corpus`; as written, that cell uses `full_corpus`.
ngram_full_corpus = HL.creating_n_grams_cropus(2, full_corpus)

## Running the neural net classifier

In [7]:
###### Choose the corpus
processed_corpus = full_corpus

###### build vectors of all the tweets ######
# `import sklearn as sk` alone does not load the `preprocessing` submodule, so
# `sk.preprocessing` only resolves if some other module happened to import it.
# Import it explicitly so this cell also works on a fresh kernel.
import sklearn.preprocessing

# Embedding dimensionality = number of columns of the word-vector matrix.
# NOTE(review): `syn0` is deprecated in newer gensim releases -- confirm
# against the installed version before upgrading.
num_of_dim = global_vectors.syn0.shape[1]
# Separate train data (the first `total_training_tweets` documents) from the
# data to predict (everything after them).
train_corpus = processed_corpus[:total_training_tweets]
predict_corpus = processed_corpus[total_training_tweets:]
# Build one vector per tweet from the words it contains.
vectors = []
for i, doc in enumerate(train_corpus):
    if (i % 50000) == 0:
        print("tweets processed: %.0f  of total number of tweets: %.0f" % (i,len(train_corpus)))
    vectors.append(GV.buildWordVector(doc, num_of_dim, global_vectors))
# Join the per-tweet arrays along the first axis into a single array.
train_document_vecs = np.concatenate(vectors)
# Standardise the features before feeding them to the networks.
train_document_vecs = sk.preprocessing.scale(train_document_vecs)
# Labels for the training tweets; presumably the first `nr_pos_tweets` are
# marked positive and the rest negative -- TODO confirm in glove_module.
labels = GV.create_labels(total_training_tweets, nr_pos_tweets)
#############################################

tweets processed: 0  of total number of tweets: 200000
tweets processed: 50000  of total number of tweets: 200000
tweets processed: 100000  of total number of tweets: 200000
tweets processed: 150000  of total number of tweets: 200000


In [8]:
# Choose the neural networks to evaluate: each entry is an attribute of the
# `neural_nets` module, and the whole list is handed to GV.run_k_fold in the
# next cell.
neural_nets_functions = [NN.conv1d, NN.deep_HB, NN.basic_model_adam]

In [9]:
# Evaluate every selected network on the training vectors and labels.
# Presumably runs k-fold cross-validation with `n_folds` folds and `epochs`
# training epochs per fold -- inferred from the helper's name; TODO confirm.
epochs = 1
n_folds = 2

model_scores= GV.run_k_fold(neural_nets_functions, train_document_vecs, labels, epochs, n_folds)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_1 (Conv1D)            (None, 194, 128)          1024      
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 100)               12900     
_________________________________________________________________
activation_1 (Activation)    (None, 100)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 40)                4040      
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 41        
_________________________________________________________________
activation_2 (Activation)    (None, 1)                 0         
Total para

In [12]:
# Show the scores returned by GV.run_k_fold.
# Presumably one (mean accuracy in %, standard deviation) tuple per network in
# `neural_nets_functions` -- TODO confirm in glove_module.
print(model_scores)

[(57.761000000000003, 0.53500000000000369), (80.676500000000004, 0.53450000000000131), (80.340000000000003, 0.59499999999999886)]


### Result board
- ngram=2 || deep_HB: 82.64% (0.86%)

In [11]:
# Stand-alone sanity check: build a tiny Conv1D regression model and fit it on
# random data. Independent of the tweet data used in the rest of the notebook.
from keras.models import Model
from keras.layers import Conv1D, Dense, MaxPool1D, Flatten, Input
import numpy as np

# Input (5, 1) -> Conv1D -> MaxPool1D -> Flatten -> Dense(1)
inp =  Input(shape=(5, 1))
conv = Conv1D(filters=2, kernel_size=2)(inp)
pool = MaxPool1D(pool_size=2)(conv)
flat = Flatten()(pool)
dense = Dense(1)(flat)
model = Model(inp, dense)
model.compile(loss='mse', optimizer='adam')

# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
model.summary()

# get some data: 10 random samples of shape (5, 1) with random scalar targets
X = np.expand_dims(np.random.randn(10, 5), axis=2)
y = np.random.randn(10)
print(X.shape, y.shape)


# fit model
# NOTE(review): steps_per_epoch=200 with only 10 samples held in NumPy arrays
# is questionable; its meaning for array inputs differs between Keras
# versions -- confirm against the installed version.
history = model.fit(X, y, epochs=10, verbose=1, steps_per_epoch=200)
# fit() returns the History object; use it instead of reaching into `model.history`.
print(history.history)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 5, 1)              0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 4, 2)              6         
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 2, 2)              0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 4)                 0         
_________________________________________________________________
dense_16 (Dense)             (None, 1)                 5         
Total params: 11
Trainable params: 11
Non-trainable params: 0
_________________________________________________________________
None
(10, 5, 1) (10,)
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
{'loss': [0.5