### ideas
- We *could* do PCA to vizualize word2vec technology 

# GloVe with keras classification
##### Content:
- Import pretrained GloVe vectorspace
- Import our own data
- classify with keras FFNN(feedforward)


##### possible additional steps
- clustering
- preprossessing
- tf-idf
- experiment with different neural networks
- PCA vizualize vectorspace
- visualize end results


In [40]:
# external imports
import numpy as np
import pandas as pd
import seaborn as sb
import gensim as gs
from gensim.scripts.glove2word2vec import glove2word2vec
import pylab as pl
import matplotlib.pyplot as plt
import csv
import scipy
import os.path
import sklearn as sk
import keras
from keras.layers import *
from keras.layers.core import *

# internal imports
import helpers as HL
import cleaning as CL



# Constants
DATA_FOLDER = os.path.join("..", "..","glove_twitter_27B") #Spesialt for min location..opsops
DATA_25DIM = DATA_FOLDER + "/glove_twitter_27B_25d.txt"
DATA_50DIM = DATA_FOLDER + "/glove_twitter_27B_50d.txt"
DATA_100DIM = DATA_FOLDER + "/glove_twitter_27B_100d.txt"
DATA_200DIM = DATA_FOLDER + "/glove_twitter_27B_200d.txt"

## Import pretrained GloVe with gensim
one can use gensims word2vec functions to check similarity and other interesting functions
https://radimrehurek.com/gensim/models/word2vec.html

In [2]:
# spits out a .txt-file with the vectors in gensim format
glove2word2vec(glove_input_file=DATA_50DIM, word2vec_output_file="gensim_glove_vectors_50dim.txt")

(1193514, 50)

In [3]:
# uses the created gensim-.txt file to create the word2vec so one can operate on it
# from gensim.models.keyedvectors import KeyedVectors
glove_model = gs.models.KeyedVectors.load_word2vec_format("gensim_glove_vectors_50dim.txt", binary=False)

In [None]:
glove_model.similar_by_word("racism")

In [None]:
# supposed to show queen, to demonstrate the power of word2vec, but failed #blameTwitterDataset
glove_model.most_similar_cosmul(positive=['woman', 'king'], negative=['man'])

### Example of how one can use a multiple words with the same meaning to predict similarity

In [6]:
tweet = create_topic_vector(['i', 'fucking', 'hate', 'trump'])
glove_model.similar_by_vector(tweet)

[('think', 0.9476158618927002),
 ("n't", 0.9369821548461914),
 ('like', 0.9318934679031372),
 ('just', 0.9307540059089661),
 ('really', 0.9307530522346497),
 ('why', 0.9288229942321777),
 ('hate', 0.9266778826713562),
 ('know', 0.9264548420906067),
 ('that', 0.9209468364715576),
 ('when', 0.9199150204658508)]

#### Storing as ONLY wordvectors to save RAM

In [7]:
global_vectors = glove_model.wv
del glove_model

## Get own data ready for classification

#### Some functions

In [18]:
def buildWordVector(tokens, size, model):
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in tokens.split():
        try:       
            word = word.decode('utf-8')
            word_vec = model[word].reshape((1, size))             
            #idf_weighted_vec = word_vec * tfidf_dict[word]
            vec += word_vec
            count += 1.
        except KeyError: # handling the case where the token is not
                         # in the corpus. useful for testing.
            continue
    if count != 0:
        vec /= count
    return vec

In [9]:
class IterableCorpus():
    
    def __init__(self, corpus):
        self.corpus = corpus
        
    def __iter__(self):
        for tweet in self.corpus:
            tweet_words = tweet.split()
            yield [word.decode('utf-8') for word in tweet_words]

#### Some variables

In [16]:
#Global variables
N_DIMENSIONS = 50 #the global_vector_space is 50dim
test_set_tweets = 10000

#FOR TRAINING_SET
corpus_filenames = ['train_pos.txt', 'train_neg.txt','test_data.txt'] 
nr_pos_tweets = 100000
nr_neg_tweets = 100000
total_training_tweets = 200000

#### Create corpus from files

In [11]:
full_corpus, corpus_file_lengths = HL.create_corpus(corpus_filenames)

print("Length full corpus", len(full_corpus))
print("File lengths:", corpus_file_lengths)

Length full corpus 210000
File lengths: [100000, 100000, 10000]


#### Tokenizing hedda-style

In [66]:
import tolken
tokenized_full_corpus = tolken.replace_words(full_corpus)

#### N-gram

In [93]:
ngram_corpus = HL.creating_n_grams_cropus(corpus=tokenized_full_corpus, n_gram=4) #2GramsForLife 

#### Creating labels for the training files. Used to perform validation of the predictions

In [94]:
#Making labels
labels = np.zeros(total_training_tweets);
labels[0:nr_pos_tweets]=1;
labels[nr_pos_tweets:total_training_tweets]=0; 

print("labels shape: ", labels.shape)

labels shape:  (200000,)


#### Splitting the corpus into train and prediction - parts

We're done training the word2vec, so all "common" operations are finished

In [104]:
# SPLITTING THE W2V model into training and predict
train_clusterised_corpus = tokenized_full_corpus[:total_training_tweets:]
predict_clusterised_corpus = tokenized_full_corpus[total_training_tweets::]

print(len(train_clusterised_corpus))
print(len(predict_clusterised_corpus))

200000
10000


In [105]:
train_document_vecs = np.concatenate([buildWordVector(z, N_DIMENSIONS, global_vectors) for z in train_clusterised_corpus])
train_document_vecs = sk.preprocessing.scale(train_document_vecs)

In [106]:
print("Train w2v shape:",train_document_vecs.shape)

Train w2v shape: (200000, 50)


## Running the neural net classifier

In [107]:
#TING FOR Å SIKRE REPRODUSERBARHET ( ikke alt er nødv. nødvendig )

import os
import random
import time
os.environ['PYTHONHASHSEED'] = '0'
random.seed(12345) # NO IDEA WHAT THIS DOES

seed = 7
np.random.seed(seed)

#### Defining some neural net models

In [112]:
def basic_model():
    model = keras.models.Sequential()
    model.add(Dense(100, input_dim=N_DIMENSIONS, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

def basic_model_adam():
    model = keras.models.Sequential()
    model.add(Dense(100, input_dim=N_DIMENSIONS, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

def wide_model():
    model = keras.models.Sequential()
    model.add(Dense(150, input_dim=N_DIMENSIONS, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

def deep_1_model():
    model = keras.models.Sequential()
    model.add(Dense(100, input_dim=N_DIMENSIONS, kernel_initializer='normal', activation='relu'))
    model.add(Dense(60, input_dim=N_DIMENSIONS, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

def deep_2_model():
    model = keras.models.Sequential()
    model.add(Dense(100, input_dim=N_DIMENSIONS, kernel_initializer='normal', activation='relu'))
    model.add(Dense(60, input_dim=N_DIMENSIONS, kernel_initializer='normal', activation='relu'))
    model.add(Dense(30, input_dim=N_DIMENSIONS, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

def deep_HB():
    model = keras.models.Sequential()
    model.add(Dense(150, input_dim=N_DIMENSIONS, kernel_initializer='normal', activation='relu'))
    model.add(Dense(100, input_dim=N_DIMENSIONS, kernel_initializer='normal', activation='relu'))
    model.add(Dense(80, input_dim=N_DIMENSIONS, kernel_initializer='normal', activation='relu'))
    model.add(Dense(40, input_dim=N_DIMENSIONS, kernel_initializer='normal', activation='relu'))
    model.add(Dense(20, input_dim=N_DIMENSIONS, kernel_initializer='normal', activation='relu'))
    model.add(Dense(40, input_dim=N_DIMENSIONS, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model
    


def recurrent_model():
    # Start neural network
    model = keras.models.Sequential()

    # Add an embedding layer
    model.add(Embedding(input_dim=N_DIMENSIONS, output_dim=128))

    # Add a long short-term memory layer with 128 units
    model.add(LSTM(units=128))

    # Add fully connected layer with a sigmoid activation function
    model.add(Dense(units=1, activation='sigmoid'))
    
    # Compile neural network
    model.compile(loss='binary_crossentropy', # Cross-entropy
                optimizer='Adam', # Adam optimization
                metrics=['accuracy']) # Accuracy performance metric

    return model

def convolutional_model():
    
    model = keras.models.Sequential()
    model.add(Conv1D(32,  activation='elu', padding='same', input_shape=(133332, N_DIMENSIONS)))
    model.add(Conv1D(32,  activation='elu', padding='same'))
    model.add(Conv1D(32,  activation='elu', padding='same'))
    model.add(Conv1D(32,  activation='elu', padding='same'))
    model.add(Dropout(0.25))
    model.add(Conv1D(32,  activation='elu', padding='same'))
    model.add(Conv1D(32,  activation='elu', padding='same'))
    model.add(Conv1D(32,  activation='elu', padding='same'))
    model.add(Conv1D(32,  activation='elu', padding='same'))
    model.add(Dropout(0.25))
    model.add(Flatten())
    model.add(Dense(256, activation='tanh'))
    model.add(Dense(256, activation='tanh'))
    model.add(Dropout(0.5))
    model.add(Dense(2, activation='softmax'))

    model.compile(loss='binary_crossentropy', optimizer=Adam(lr=0.0001, decay=1e-6), metrics=['accuracy'])
    
    return model

#### Crossvalidate this bitch

In [144]:
def run_k_fold(models, X, Y, epochs, n_folds, seed):
    
    for neural_model in models:
        
        model_name = neural_model.__name__
        
        model = neural_model()
        
        start = time.time()

        kfold = sklearn.model_selection.StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed)
        cv_scores = []
        
        for train, test in kfold.split(X, Y):
            
            early_stopping = keras.callbacks.EarlyStopping(monitor='loss', patience=3)
            
            model.fit(X[train], Y[train], epochs=epochs, batch_size=1024, verbose=0, callbacks=[early_stopping])
            
            scores = model.evaluate(X[test], Y[test], verbose=0)
        
        
        # To analyze if it is unbalanced classifying
        labels = Y[test]
        pred = model.predict(X[test])
        pos_right = 0
        neg_right = 0
        for i, label in enumerate(labels):
            if label == 1 and label == pred[i]:
                pos_right += 1
            elif label == 0 and label == pred[i]:
                neg_right += 1
        pos_perc = pos_right/(len(labels)*0.5)
        neg_perc = neg_right/(len(labels)*0.5)
        print("Model: ", model_name)
        print("%.2f%% (+/- %.2f%%)" % (np.mean(cv_scores), np.std(cv_scores)))
        print("Negative sentiment: %.2f%%  Positive sentiment: %.2f%%" % (neg_perc, pos_perc))
        print("Time taken: ", (time.time() - start) / 60, "\n")

In [145]:
models = [ basic_model, basic_model_adam, wide_model, deep_1_model, deep_2_model, deep_HB]#, recurrent_model, convolutional_model]

run_k_fold(models, train_document_vecs, labels, epochs=10, n_folds=3, seed=7)

Model:  basic_model
nan% (+/- nan%)
Negative sentiment: 0.00%  Positive sentiment: 0.00%
Time taken:  0.4995209455490112 



  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)
  keepdims=keepdims)
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
  ret = ret.dtype.type(ret / rcount)


Model:  basic_model_adam
nan% (+/- nan%)
Negative sentiment: 0.00%  Positive sentiment: 0.00%
Time taken:  0.48916691541671753 

Model:  wide_model
nan% (+/- nan%)
Negative sentiment: 0.00%  Positive sentiment: 0.00%
Time taken:  0.545284636815389 

Model:  deep_1_model
nan% (+/- nan%)
Negative sentiment: 0.00%  Positive sentiment: 0.00%
Time taken:  0.6302354017893473 

Model:  deep_2_model
nan% (+/- nan%)
Negative sentiment: 0.00%  Positive sentiment: 0.00%
Time taken:  0.6828426122665405 

Model:  deep_HB
nan% (+/- nan%)
Negative sentiment: 0.00%  Positive sentiment: 0.00%
Time taken:  1.1112324635187785 



In [None]:
print("Negative sentiment: %.2f%%  Positive sentiment: %.2f%%" % labels[i==0][np.argmax(pred)]/len(labels))#, pred[0][np.argmax(pred)] * 100))