# GloVe with preprocessing

## Loading Libraries

In [33]:
import random as rn
import tensorflow as tf
from keras import backend as K

# external imports
import numpy as np
import time
import pandas as pd
import seaborn as sb
import gensim
from gensim.scripts.glove2word2vec import glove2word2vec
import pylab as pl
import matplotlib.pyplot as plt
import csv
import scipy
import os.path

import keras
from keras.layers import *
from keras.layers.core import *
from keras import backend as K

import sklearn as sk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import ParameterGrid

# internal imports
import helpers as HL
import cleaning as CL
import glove_module as GV
import neural_nets as NN
import tokenizing as TO
import tokenizing_ekphrasis as TE

%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Defining Data Paths 

### Files used to create model:

In [4]:
# Constants: folder holding the GloVe Twitter embedding files, and the path
# of each file (the file names differ only in the 25d/50d/100d/200d part).
# Paths are built with os.path.join instead of a hard-coded "/" separator.
DATA_FOLDER = "glove.twitter.27B"
DATA_25DIM = os.path.join(DATA_FOLDER, "glove.twitter.27B.25d.txt")
DATA_50DIM = os.path.join(DATA_FOLDER, "glove.twitter.27B.50d.txt")
DATA_100DIM = os.path.join(DATA_FOLDER, "glove.twitter.27B.100d.txt")
DATA_200DIM = os.path.join(DATA_FOLDER, "glove.twitter.27B.200d.txt")

### Data files:

In [5]:
# Training files, one per sentiment class.
training_set_pos, training_set_neg = "train_pos.txt", "train_neg.txt"

# Full-size variants of the training files.
training_set_pos_full, training_set_neg_full = (
    "train_pos_full.txt",
    "train_neg_full.txt",
)

# Test data file.
test_set = "test_data.txt"

## Import pretrained GloVe with gensim
One can use gensim's word2vec functions to check word similarity and explore other interesting functionality: https://radimrehurek.com/gensim/models/word2vec.html

## Create the word embeddings from the generated gensim-format .txt file.

In [23]:
start = time.time()

# Pick one embedding file: the higher the dimension, the better the result
# and the longer the computation. Alternatives:
#   "data/gensim_global_vectors_25dim.txt"
#   "data/gensim_global_vectors_50dim.txt"
#   "data/gensim_global_vectors_100dim.txt"
glove_path = "data/gensim_global_vectors_200dim.txt"
global_vectors = GV.make_glove(glove_path)

# Elapsed wall-clock time, converted from seconds to minutes.
elapsed_minutes = (time.time() - start) / 60
print("Time: ", elapsed_minutes)

Time:  1487730303.511215


## Creating corpus:

In [24]:
start = time.time()

# Files used when testing. To use the full data set, swap in:
# inputfiles = [training_set_pos_full, training_set_neg_full, test_set]
inputfiles = [training_set_pos, training_set_neg, test_set]

full_corpus, file_lengths = HL.create_corpus(inputfiles)

# The first two entries are taken as the positive and negative tweet counts
# (presumably file_lengths follows the order of inputfiles; verify in helpers).
nr_pos_tweets, nr_neg_tweets = file_lengths[0], file_lengths[1]
total_training_tweets = nr_pos_tweets + nr_neg_tweets

# Elapsed wall-clock time, converted from seconds to minutes.
elapsed_minutes = (time.time() - start) / 60
print("Time: ", elapsed_minutes)

Time:  1487730299.4267645


## Testing unprocessed corpus on neural nets to find best neural net: 

In [None]:
# Candidate models to compare on the unprocessed corpus.
neural_nets = [
    NN.basic_model,
    NN.basic_model_adam,
    NN.wide_model,
    NN.deep_2_model,
    NN.deep_HB,
]

GV.classify_with_neural_networks(
    neural_nets,
    global_vectors,
    full_corpus,
    total_training_tweets,
    nr_pos_tweets,
    epochs=10,
    n_folds=3,
)

#### We decide to keep the deep_HB-model. 

We now test different combinations of preprocessing to see what has the best results with the chosen neural net model. 

Preprocessing we're testing: 
- N-grams 
- Word cluster
- Stemming 
- Tweet feature creation

Other, larger approaches:
- Sklearn TfidfVectorizer
- ekphrasis?

In [None]:
# Preparation for pipeline testing: build the cluster dictionary that
# run_pipeline uses when clustering is enabled.
cluster_file = "50mpaths2.txt"
cluster_dictionary = CL.create_dictionary(cluster_file)

In [None]:
# Ekphrasis-tokenized version of the full corpus; run_pipeline reads this
# global when its `ekphrasis` flag is set.
TE_corpus = TE.tokenizing_ekphrasis(full_corpus)

In [26]:
# From here on, only the deep_HB_dropout model is evaluated.
neural_nets = [NN.deep_HB_dropout]

In [29]:
"""
NOTES ( USING 25 DIM DATA )

- clustering decreases the acc by ish 10% each time. Always setting it to false

COMMON BEST FOUND FACTORS EARLIER RUN: 
- Specialfeatures:False
- Stem:True
- N_gram:2

BEST SO FAR:

Ekphrasis:False
N_gram:0
Cluster:False
Stem:True
Special_features:False
Model:  deep_HB
85.11% (+/- 1.91%)
"""

def cross_validate_preprocessing(corpus):
    """Run the classification pipeline for every preprocessing combination.

    Each combination in the parameter grid below is passed to
    ``run_pipeline``, which prints the configuration and evaluates the
    models in the notebook-level ``neural_nets`` list.

    Args:
        corpus: The corpus handed to ``run_pipeline`` for every combination.

    Returns:
        A list of ``(params, model_scores)`` tuples, one per grid point, where
        ``params`` is the parameter dict and ``model_scores`` is whatever
        ``run_pipeline`` returned for it. (Previously the scores were
        discarded and the function returned ``None``.)
    """
    # Edit the value lists to widen the search; every combination is run.
    param_grid = ParameterGrid(param_grid={
        'ekphrasis': [False],
        'n_gram': [0],
        'cluster': [False],
        'stem': [False],
        'special_features': [False],
    })

    results = []
    for params in param_grid:
        model_scores = run_pipeline(
            corpus,
            params['ekphrasis'],
            params['cluster'],
            params['stem'],
            params['special_features'],
            params['n_gram'],
        )
        results.append((params, model_scores))

    return results
        
def run_pipeline(corpus, ekphrasis, cluster, stem, special_features, n_gram=0):
    """Apply the selected preprocessing steps and evaluate the models.

    Relies on notebook-level globals: ``TE_corpus``, ``cluster_dictionary``,
    ``neural_nets``, ``global_vectors``, ``total_training_tweets`` and
    ``nr_pos_tweets``.

    Args:
        corpus: The corpus to preprocess and classify.
        ekphrasis: If true, replace ``corpus`` with the precomputed global
            ``TE_corpus`` (the passed corpus is not re-tokenized).
        cluster: If true, clusterize the corpus with ``cluster_dictionary``.
        stem: If true, stem the corpus via ``TO.preprocess_corpus``.
        special_features: If true, enable the smiley/hug/heart/hashtag/
            number/exclamation features of ``TO.preprocess_corpus``.
        n_gram: n-gram size passed to ``HL.creating_n_grams_cropus``;
            0 skips the n-gram step.

    Returns:
        The value returned by ``GV.classify_with_neural_networks``.
    """
    if ekphrasis:
        corpus = TE_corpus
        print("Corpus tokenized!")

    # Stemming used to be applied only when special_features was also set, so
    # stem=True with special_features=False was silently ignored. Now the
    # preprocessing runs when either is requested; the feature flags follow
    # special_features, so with special_features=True the call is unchanged.
    # NOTE(review): assumes preprocess_corpus with every feature flag False
    # only performs the stemming - confirm in tokenizing.py.
    if special_features or stem:
        corpus = TO.preprocess_corpus(
            corpus,
            stemming=stem,
            all_smilies=False,
            pos_smilies=special_features,
            neg_smilies=special_features,
            other_smilies=special_features,
            hugs_and_kisses=special_features,
            hearts=special_features,
            hashtag=special_features,
            hashtag_mention=special_features,
            numbers=special_features,
            number_mention=special_features,
            exclamation=special_features,
            set_to_not=False,
        )
        if special_features:
            print("Special features integrated!")

    if n_gram != 0:
        corpus = HL.creating_n_grams_cropus(n_gram, corpus)
        print("N_grams made!")

    if cluster:
        corpus = CL.create_clusterized_corpus(corpus, cluster_dictionary)
        print("Corpus Clusterized!")

    print("\nEkphrasis:{}\n N_gram:{}\nCluster:{}\nStem:{}\nSpecial_features:{}".format(
        ekphrasis, n_gram, cluster, stem, special_features))

    return GV.classify_with_neural_networks(
        neural_nets,
        global_vectors,
        corpus,
        total_training_tweets,
        nr_pos_tweets,
        epochs=10,
        n_folds=5,
    )
    

In [None]:
# Run the preprocessing cross-validation on the full corpus.
cross_validate_preprocessing(full_corpus)

In [None]:
#for n_gram in [10,20,30,40,50]:
#    score = run_pipeline(full_corpus, False, False, False, False, n_gram=n_gram)[0]
#    print(score)

### Everything below is every "feature" run by itself 

## Cleaning tweets using ekphrasis 

In [37]:
# Tokenize the full corpus with the ekphrasis-based tokenizer module.
TE_corpus=TE.tokenizing_ekphrasis(full_corpus)

Reading twitter - 1grams ...
Reading twitter - 2grams ...
Reading twitter - 1grams ...


In [38]:
# Evaluate the current models on the ekphrasis-tokenized corpus.
GV.classify_with_neural_networks(
    neural_nets,
    global_vectors,
    TE_corpus,
    total_training_tweets,
    nr_pos_tweets,
    epochs=5,
    n_folds=2,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Model:  deep_HB_dropout
81.86% (+/- 0.03%)
Negative sentiment: 75.71%  Positive sentiment: 88.02%
Percentage of positive classifications (should be 50%ish): 56.155
Time taken:  1.0371862967809042 



[81.86099999999999]

In [39]:
# Same evaluation as the previous cell, run a second time.
GV.classify_with_neural_networks(
    neural_nets,
    global_vectors,
    TE_corpus,
    total_training_tweets,
    nr_pos_tweets,
    epochs=5,
    n_folds=2,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Model:  deep_HB_dropout
81.86% (+/- 0.03%)
Negative sentiment: 75.71%  Positive sentiment: 88.02%
Percentage of positive classifications (should be 50%ish): 56.155
Time taken:  1.057654583454132 



[81.86099999999999]

## Cleaning tweets using customized tokenizing function

### Not really doing anything, just checking

In [None]:
# Baseline: every preprocessing option switched off.
tokenized_corpus = TO.preprocess_corpus(
    TE_corpus,
    stemming=False,
    all_smilies=False,
    pos_smilies=False,
    neg_smilies=False,
    other_smilies=False,
    hugs_and_kisses=False,
    hearts=False,
    hashtag=False,
    hashtag_mention=False,
    numbers=False,
    number_mention=False,
    exclamation=False,
    set_to_not=False,
)

In [None]:
# Evaluate on the baseline corpus, using the function's default epochs/folds.
GV.classify_with_neural_networks(
    neural_nets,
    global_vectors,
    tokenized_corpus,
    total_training_tweets,
    nr_pos_tweets,
)

### Finding positive smilies

In [None]:
# Only the positive-smiley option enabled.
tokenized_corpus_pos_smile = TO.preprocess_corpus(
    TE_corpus,
    stemming=False,
    all_smilies=False,
    pos_smilies=True,
    neg_smilies=False,
    other_smilies=False,
    hugs_and_kisses=False,
    hearts=False,
    hashtag=False,
    hashtag_mention=False,
    numbers=False,
    number_mention=False,
    exclamation=False,
    set_to_not=False,
)

In [None]:
# Evaluate on the positive-smiley corpus, using the default epochs/folds.
GV.classify_with_neural_networks(
    neural_nets,
    global_vectors,
    tokenized_corpus_pos_smile,
    total_training_tweets,
    nr_pos_tweets,
)

### Finding negative and positive smilies

In [None]:
# Positive- and negative-smiley options enabled.
tokenized_corpus_pos_neg_smile = TO.preprocess_corpus(
    TE_corpus,
    stemming=False,
    all_smilies=False,
    pos_smilies=True,
    neg_smilies=True,
    other_smilies=False,
    hugs_and_kisses=False,
    hearts=False,
    hashtag=False,
    hashtag_mention=False,
    numbers=False,
    number_mention=False,
    exclamation=False,
    set_to_not=False,
)

In [None]:
# Evaluate on the positive+negative-smiley corpus, using the default epochs/folds.
GV.classify_with_neural_networks(
    neural_nets,
    global_vectors,
    tokenized_corpus_pos_neg_smile,
    total_training_tweets,
    nr_pos_tweets,
)

### Finding all smilies

In [None]:
# Only the all-smilies option enabled.
tokenized_corpus_all_smile = TO.preprocess_corpus(
    TE_corpus,
    stemming=False,
    all_smilies=True,
    pos_smilies=False,
    neg_smilies=False,
    other_smilies=False,
    hugs_and_kisses=False,
    hearts=False,
    hashtag=False,
    hashtag_mention=False,
    numbers=False,
    number_mention=False,
    exclamation=False,
    set_to_not=False,
)

In [None]:
# Evaluate on the all-smilies corpus, using the default epochs/folds.
GV.classify_with_neural_networks(
    neural_nets,
    global_vectors,
    tokenized_corpus_all_smile,
    total_training_tweets,
    nr_pos_tweets,
)

### Finding all hearts

In [None]:
# Only the hearts option enabled.
tokenized_corpus_hearts = TO.preprocess_corpus(
    TE_corpus,
    stemming=False,
    all_smilies=False,
    pos_smilies=False,
    neg_smilies=False,
    other_smilies=False,
    hugs_and_kisses=False,
    hearts=True,
    hashtag=False,
    hashtag_mention=False,
    numbers=False,
    number_mention=False,
    exclamation=False,
    set_to_not=False,
)

In [None]:
# Evaluate on the hearts corpus, using the default epochs/folds.
GV.classify_with_neural_networks(
    neural_nets,
    global_vectors,
    tokenized_corpus_hearts,
    total_training_tweets,
    nr_pos_tweets,
)

## Testing clusters

I have realized that this no longer makes much sense... If we want to test this, we would have to replace each word with one of the words in its cluster, not just with the cluster ID. The cluster ID carries no meaning by itself, so the word2vec vector for it carries no meaning either.

In [None]:
# Corpus used as the starting point for the cluster experiment below.
best_corpus_so_far=TE_corpus

In [None]:
# Build the word-cluster dictionary from the cluster file.
cluster_file = "50mpaths2.txt"
cluster_dictionary = CL.create_dictionary(cluster_file)

In [None]:
# Clusterized version of the current best corpus.
clusteded_corpus = CL.create_clusterized_corpus(
    best_corpus_so_far,
    cluster_dictionary,
)

In [None]:
# Evaluate on the clusterized corpus, using the default epochs/folds.
GV.classify_with_neural_networks(
    neural_nets,
    global_vectors,
    clusteded_corpus,
    total_training_tweets,
    nr_pos_tweets,
)

## Testing n-grams

In [None]:
# Corpus used as the starting point for the n-gram experiments below.
best_corpus_so_far=TE_corpus

In [None]:
# n-gram corpus with n = 2.
n_grams = 2
n_grams_corpus = HL.creating_n_grams_cropus(n_grams, best_corpus_so_far)

In [None]:
# Evaluate on the n = 2 corpus, using the default epochs/folds.
GV.classify_with_neural_networks(
    neural_nets,
    global_vectors,
    n_grams_corpus,
    total_training_tweets,
    nr_pos_tweets,
)

In [None]:
# n-gram corpus with n = 3.
n_grams = 3
n_grams_corpus3 = HL.creating_n_grams_cropus(n_grams, best_corpus_so_far)

In [None]:
# Evaluate on the n = 3 corpus, using the default epochs/folds.
GV.classify_with_neural_networks(
    neural_nets,
    global_vectors,
    n_grams_corpus3,
    total_training_tweets,
    nr_pos_tweets,
)

In [None]:
# n-gram corpus with n = 4.
n_grams = 4
n_grams_corpus4 = HL.creating_n_grams_cropus(n_grams, best_corpus_so_far)

In [None]:
# Evaluate on the n = 4 corpus, using the default epochs/folds.
GV.classify_with_neural_networks(
    neural_nets,
    global_vectors,
    n_grams_corpus4,
    total_training_tweets,
    nr_pos_tweets,
)

In [None]:
# Plot a graph for the n-grams. There is no point in doing this until repeated runs give the same value every time.

# Making Kaggle submission

As before, to create a Kaggle submission!

In [34]:
# File name passed to GV.get_prediction (presumably the submission CSV it
# writes - verify in glove_module).
kaggle_name = "keggle_glove_TE_dropout.csv"

# Corpus used for the submission. Insert any preprocessing here, e.g.
# delivery_corpus = n_grams_corpus.
# NOTE(review): the file name mentions TE, but the unprocessed full_corpus is
# used here rather than TE_corpus - confirm this is intended.
delivery_corpus = full_corpus

pred = GV.get_prediction(
    NN.deep_HB_dropout,
    global_vectors,
    delivery_corpus,
    total_training_tweets,
    nr_pos_tweets,
    kaggle_name,
    epochs=20,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Hello world


In [40]:
print(sum(pred))

# NOTE(review): prints the sum of the predictions; what it means depends on
# the label encoding returned by GV.get_prediction (with 0/1 labels it would
# be the number of positive predictions) - TODO confirm.

970
