### ideas
- We *could* use PCA to visualize the word2vec vector space

# GloVe with keras classification
##### Content:
- Import pretrained GloVe vectorspace
- Import our own data
- Classify with a Keras feedforward neural network (FFNN)


##### possible additional steps
- clustering
- preprocessing
- tf-idf
- experiment with different neural networks
- visualize the vector space with PCA
- visualize end results


In [3]:
# external imports
import numpy as np
import pandas as pd
import seaborn as sb
import gensim
from gensim.scripts.glove2word2vec import glove2word2vec
import pylab as pl
import matplotlib.pyplot as plt
import csv
import scipy
import os.path
import sklearn as sk
import keras
from keras.layers import *
from keras.layers.core import *

# internal imports
import helpers as HL
import cleaning as CL
import glove_module as GV
import neural_nets as NN



# Constants: locations of the pretrained GloVe Twitter embedding files.
# The paths are assembled with os.path.join rather than a hard-coded "/" so
# they use the separator of the platform the notebook runs on.
DATA_FOLDER = "glove.twitter.27B"
DATA_25DIM = os.path.join(DATA_FOLDER, "glove.twitter.27B.25d.txt")
DATA_50DIM = os.path.join(DATA_FOLDER, "glove.twitter.27B.50d.txt")
DATA_100DIM = os.path.join(DATA_FOLDER, "glove.twitter.27B.100d.txt")
DATA_200DIM = os.path.join(DATA_FOLDER, "glove.twitter.27B.200d.txt")
#gensim_25

## Import pretrained GloVe with gensim
Once the vectors are loaded, gensim's word2vec functions can be used to check word similarity and run other interesting queries:
https://radimrehurek.com/gensim/models/word2vec.html

In [2]:
# Only needs to be run the first time the pretrained GloVe vectors are imported.
# Creates a gensim word2vec file in the same folder (per the original note;
# the helper lives in glove_module, so the exact output name is not visible
# here — TODO confirm).
# Uncomment the other lines to convert the higher-dimensional files as well.

GV.create_gensim_word2vec_file(DATA_25DIM)
#GV.create_gensim_word2vec_file(DATA_50DIM)
#GV.create_gensim_word2vec_file(DATA_100DIM)
#GV.create_gensim_word2vec_file(DATA_200DIM)

In [None]:
# Runs GV.method1 on the 100-dimensional gensim vector file.
# NOTE(review): full_corpus, total_training_tweets and nr_pos_tweets are
# assigned in later cells of this notebook, so those cells must be executed
# before this one.
GV.method1("gensim_global_vectors_100dim.txt", full_corpus, total_training_tweets, nr_pos_tweets, all_neural_nets=True)


In [None]:
# Loads the previously created gensim .txt file into a word-vector model that
# the following cells query and pass to the classifiers.
# NOTE(review): this reads the 50-dimensional file, while the conversion cell
# above only converts DATA_25DIM unless its DATA_50DIM line is uncommented —
# confirm the file exists.
global_vectors = GV.make_glove("gensim_global_vectors_50dim.txt")

In [None]:
# Analogy query (king - man + woman). It was expected to return "queen" to
# demonstrate word2vec, but did not with these Twitter-trained vectors.
global_vectors.most_similar(positive=['woman', 'king'], negative=['man'])

## Get own data ready for classification

#### Some variables

In [None]:

# Training-set configuration.
# Files that make up the corpus, in the order they are concatenated.
# NOTE(review): presumably the first two are the labelled training tweets and
# the third is the data to predict on — confirm against HL.create_corpus.
corpus_filenames = ['train_pos.txt', 'train_neg.txt', 'test_data.txt']
nr_pos_tweets = 100000
nr_neg_tweets = 100000
# Derived instead of hard-coded so it cannot drift out of sync with the two
# per-class counts above.
total_training_tweets = nr_pos_tweets + nr_neg_tweets

#### Create corpus from files

In [None]:
# Build the full corpus from the files listed in corpus_filenames.
# HL.create_corpus returns the corpus together with per-file lengths
# (exact element types are defined in helpers — not visible here).
full_corpus, corpus_file_lengths = HL.create_corpus(corpus_filenames)

# Quick sanity check of what was loaded.
print("Length full corpus", len(full_corpus))
print("File lengths:", corpus_file_lengths)

# TESTING COMPLETE FUNCTION

In [None]:
# Model factories from neural_nets (NN) that are handed to
# GV.classify_with_neural_networks in the next cell.
models2 = [NN.basic_model, NN.basic_model_adam, NN.wide_model, NN.deep_2_model, NN.deep_HB]

In [None]:
# Run the complete classification pipeline from glove_module on the models in
# models2. Requires global_vectors, full_corpus and the tweet counts from the
# cells above.
GV.classify_with_neural_networks(models2, global_vectors, full_corpus, total_training_tweets, nr_pos_tweets)

#### Tokenizing hedda-style

In [None]:
# `tolken` is presumably a project-local module (not visible here); its
# replace_words function is applied to the whole corpus.
import tolken
tokenized_full_corpus = tolken.replace_words(full_corpus)

#### N-gram

In [None]:
# Build the 2-gram version of the tokenized corpus.
# NOTE(review): ngram_corpus is not used by any later visible cell; the split
# below works on tokenized_full_corpus — confirm which one is intended.
ngram_corpus = HL.creating_n_grams_cropus(corpus=tokenized_full_corpus, n_gram=2) #2GramsForLife 

#### Creating labels for the training files. Used to perform validation of the predictions

In [None]:
# Label vector for the training tweets: 1.0 for the first nr_pos_tweets
# entries (the positive tweets), 0.0 for the rest up to total_training_tweets.
labels = np.zeros(total_training_tweets)
labels[:nr_pos_tweets] = 1

print("labels shape: ", labels.shape)

#### Splitting the corpus into training and prediction parts

We're done training the word2vec, so all "common" operations are finished

In [None]:
# Split the tokenized corpus at total_training_tweets: everything before that
# index is used for training, everything from it onwards is predicted on.
split_at = total_training_tweets
train_clusterised_corpus = tokenized_full_corpus[:split_at]
predict_clusterised_corpus = tokenized_full_corpus[split_at:]

# Report the size of each part.
for part in (train_clusterised_corpus, predict_clusterised_corpus):
    print(len(part))

In [None]:
# Explicit submodule import: `import sklearn as sk` on its own does not
# guarantee that `sk.preprocessing` has been loaded.
from sklearn.preprocessing import scale

# Embedding dimensionality, read once instead of once per tweet.
# NOTE(review): `syn0` is the older gensim attribute name — confirm it exists
# on the object returned by GV.make_glove for the installed gensim version.
vector_dim = global_vectors.syn0.shape[1]

# Build a vector for every training tweet and stack them into one array.
# NOTE(review): presumably GV.buildWordVector returns a (1, vector_dim) array
# per tweet — confirm in glove_module.
train_document_vecs = np.concatenate(
    [GV.buildWordVector(z, vector_dim, global_vectors) for z in train_clusterised_corpus]
)
# Standardize the features before feeding them to the neural networks.
train_document_vecs = scale(train_document_vecs)

In [None]:
# Sanity check: shape of the scaled training document-vector array.
print("Train w2v shape:",train_document_vecs.shape)

In [None]:
# Inspect the label vector built above.
print(labels)

## Running the neural net classifier

In [None]:
# Things to ensure reproducibility (not all of it is necessarily needed).

import os
import random
import time

# Seed shared with NumPy's global random generator below.
seed = 7

# Seed Python's built-in pseudo-random number generator.
random.seed(12345)

# Seed NumPy's global random generator.
np.random.seed(seed)

# NOTE(review): PYTHONHASHSEED is read when the interpreter starts, so setting
# it here probably only affects child processes — confirm whether it is needed.
os.environ.update({'PYTHONHASHSEED': '0'})

#### Defining some neural net models

#### Cross-validate the models

In [None]:
def run_k_fold(models, X, Y, epochs, n_folds, seed):
    """Cross-validate every model in `models` with stratified k-fold and print a report.

    A fresh model is built for every fold, so weights learned on one fold
    cannot leak into the evaluation of the next one. Per-class accuracy is
    accumulated over all folds and uses the real number of samples per class.

    Args:
        models: iterable of zero-argument callables, each returning a compiled
            Keras model; the callable's ``__name__`` is used in the report.
        X: feature array, indexable with the index arrays produced by
            ``StratifiedKFold.split``.
        Y: binary label array (0 = negative, 1 = positive), indexable like X.
        epochs: maximum number of training epochs per fold (early stopping on
            the training loss may end a fold sooner).
        n_folds: number of cross-validation folds.
        seed: random state used when shuffling the data into folds.

    Returns:
        None. The results are printed for each model.
    """
    # Imported locally so the function does not depend on notebook cell order.
    # The file only binds scikit-learn as `sk`, so the name `sklearn` used by
    # the previous version was never defined.
    import time
    from sklearn.model_selection import StratifiedKFold

    for neural_model in models:
        model_name = neural_model.__name__
        start = time.time()

        kfold = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed)
        cv_scores = []

        # Per-class tallies, accumulated over all folds.
        pos_right = neg_right = 0
        pos_total = neg_total = 0

        for train, test in kfold.split(X, Y):
            # Fresh, untrained model for every fold.
            model = neural_model()

            early_stopping = keras.callbacks.EarlyStopping(monitor='loss', patience=3)
            model.fit(X[train], Y[train], epochs=epochs, batch_size=1024,
                      verbose=0, callbacks=[early_stopping])

            # evaluate() returns a scalar loss or a list [loss, metric, ...];
            # keep the last entry and express it as a percentage.
            # NOTE(review): assumes the models are compiled with accuracy as
            # their (last) metric — confirm in neural_nets.
            score = model.evaluate(X[test], Y[test], verbose=0)
            cv_scores.append(float(np.atleast_1d(score)[-1]) * 100)

            # Class-balance check: threshold the raw outputs at 0.5 before
            # comparing them with the 0/1 labels.
            # NOTE(review): assumes one sigmoid-style output per sample — confirm.
            fold_labels = np.asarray(Y[test]).ravel()
            fold_pred = (np.asarray(model.predict(X[test])).ravel() >= 0.5).astype(int)
            is_pos = fold_labels == 1
            is_neg = fold_labels == 0
            pos_right += int(np.sum(fold_pred[is_pos] == 1))
            neg_right += int(np.sum(fold_pred[is_neg] == 0))
            pos_total += int(np.sum(is_pos))
            neg_total += int(np.sum(is_neg))

        # Divide by the real class sizes; guard against a class that never occurs.
        pos_perc = 100.0 * pos_right / pos_total if pos_total else float('nan')
        neg_perc = 100.0 * neg_right / neg_total if neg_total else float('nan')

        print("Model: ", model_name)
        print("%.2f%% (+/- %.2f%%)" % (np.mean(cv_scores), np.std(cv_scores)))
        print("Negative sentiment: %.2f%%  Positive sentiment: %.2f%%" % (neg_perc, pos_perc))
        print("Time taken: ", (time.time() - start) / 60, "\n")

In [None]:
# Model factories to cross-validate. They are referenced through the
# neural_nets module (imported as NN), like the models2 lists in this
# notebook; the bare names are not defined in any visible cell.
# NOTE(review): NN.deep_1_model is not referenced anywhere else here —
# confirm it exists in neural_nets.
models = [
    NN.basic_model,
    NN.basic_model_adam,
    NN.wide_model,
    NN.deep_1_model,
    NN.deep_2_model,
    NN.deep_HB,
]  # recurrent_model and convolutional_model are currently left out

run_k_fold(models, train_document_vecs, labels, epochs=10, n_folds=3, seed=7)

In [None]:
# Toy example for the per-class accuracy computation: count how many positive
# (label 1) and negative (label 0) samples are classified correctly when the
# predicted scores are thresholded at 0.5.
pos_scores = []
neg_scores = []
labs = [0, 1, 1, 1, 1, 0]
prd = [0.32, 0.49, 0.50, 0.6232, 0.81, 0.9]
pos_right = 0
neg_right = 0
for lab, p in zip(labs, prd):
    if lab == 1 and p >= 0.5:
        pos_right += 1
    elif lab == 0 and p < 0.5:
        neg_right += 1

# Divide by the real number of samples in each class. Dividing by half the
# sample count is only correct when both classes are exactly the same size,
# which is not the case for this toy data (4 positives, 2 negatives).
n_pos = labs.count(1)
n_neg = labs.count(0)
pos_scores.append(pos_right / n_pos * 100 if n_pos else float('nan'))
neg_scores.append(neg_right / n_neg * 100 if n_neg else float('nan'))
print("neggies:", np.mean(neg_scores))
print("possies:", np.mean(pos_scores))

In [None]:
# Vectorised version of the per-class check on the toy data (uses labs and prd
# from the toy cell above).
n_labs = np.array(labs)
# Threshold at 0.5, the same rule as the loop version. np.round is avoided
# because it rounds exact halves to the nearest even value, turning 0.5 into 0.
n_prd = (np.array(prd) >= 0.5).astype(int)
print(n_prd)
# Indices of the positive and negative samples.
arg1 = np.argwhere(n_labs == 1)
arg0 = np.argwhere(n_labs == 0)
# Fraction of correctly classified samples within each class.
pos_corr = np.mean(n_prd[arg1] == 1)
neg_corr = np.mean(n_prd[arg0] == 0)

print(pos_corr)
print(neg_corr)

In [None]:
# NOTE(review): this scratch line looks unfinished and will not run as written:
# the format string has two placeholders but only one operand, `%` is applied
# before the `/ len(labels)` division, and `pred` is only assigned inside
# run_k_fold in the visible cells. Intent unclear — fix or remove.
print("Negative sentiment: %.2f%%  Positive sentiment: %.2f%%" % labels[i==0][np.argmax(pred)]/len(labels))#, pred[0][np.argmax(pred)] * 100))

In [None]:
# Same model-factory list as the models2 assignment earlier in the notebook.
# NOTE(review): the call in the next cell passes `models`, not `models2` —
# confirm which list is intended.
models2 = [NN.basic_model, NN.basic_model_adam, NN.wide_model, NN.deep_2_model, NN.deep_HB]


In [None]:
# NOTE(review): processed_corpus is not assigned in any visible cell, and the
# earlier call to this function also passes total_training_tweets and
# nr_pos_tweets — confirm the current signature and the intended corpus.
GV.classify_with_neural_networks(models, global_vectors, processed_corpus)