# Glove with preprocessing 

## Loading Libraries

In [1]:
# external imports
import numpy as np
import pandas as pd
import seaborn as sb
import gensim
from gensim.scripts.glove2word2vec import glove2word2vec
import pylab as pl
import matplotlib.pyplot as plt
import csv
import scipy
import os.path
import sklearn as sk
import keras
from keras.layers import *
from keras.layers.core import *
import tensorflow as tf
import random as rn
from keras import backend as K
from sklearn.feature_extraction.text import TfidfVectorizer
# internal imports
import helpers as HL
import cleaning as CL
import glove_module as GV
import neural_nets as NN
import tokenizing as TO
import tokenizing_ekphrasis as TE

Using TensorFlow backend.
  return f(*args, **kwds)


### Need the following to get consistent results

In [2]:
# Seed every random number generator in use (Python's `random`, NumPy and
# TensorFlow) with the same fixed value.
rn.seed(7)
np.random.seed(7)
tf.set_random_seed(7)

# Run TensorFlow with a single thread in both thread pools and register that
# session with Keras.
# NOTE(review): presumably this is meant to remove thread-scheduling
# nondeterminism between runs — confirm against the Keras reproducibility FAQ.
session_conf = tf.ConfigProto(
    intra_op_parallelism_threads=1,
    inter_op_parallelism_threads=1,
)
sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
K.set_session(sess)

## Defining Data Paths 

### Files used to create model:

In [3]:
# Constants: locations of the pretrained GloVe Twitter embeddings.
# The folder name is a plain string (a single-argument os.path.join is a no-op),
# and each file path is assembled with os.path.join instead of concatenating a
# hard-coded "/" separator.
DATA_FOLDER = "glove.twitter.27B"
DATA_25DIM = os.path.join(DATA_FOLDER, "glove.twitter.27B.25d.txt")
DATA_50DIM = os.path.join(DATA_FOLDER, "glove.twitter.27B.50d.txt")
DATA_100DIM = os.path.join(DATA_FOLDER, "glove.twitter.27B.100d.txt")
DATA_200DIM = os.path.join(DATA_FOLDER, "glove.twitter.27B.200d.txt")

### Data files:

In [4]:
# Small training sets (positive / negative tweets).
training_set_pos = "train_pos.txt"
training_set_neg = "train_neg.txt"

# Full training sets (positive / negative tweets).
training_set_pos_full = "train_pos_full.txt"
training_set_neg_full = "train_neg_full.txt"

# Unlabelled tweets to predict.
test_set = "test_data.txt"

## Import pretrained GloVe with gensim
One can use gensim's word2vec functions to check word similarity and explore other features; see https://radimrehurek.com/gensim/models/word2vec.html

In [5]:
# ONLY NEED TO DO THIS THE FIRST TIME ONE IMPORTS THE PRETRAINED GLOVE
# Creates a gensim_word2vec_file in the same folder
#GV.create_gensim_word2vec_file(DATA_200DIM)

## Create the word embeddings using the created gensim-.txt file.

In [6]:
# Pick exactly one embedding file. The higher the dimension, the better the
# result and the longer the computation time.

# global_vectors = GV.make_glove("gensim_global_vectors_25dim.txt")
# global_vectors = GV.make_glove("gensim_global_vectors_50dim.txt")
# global_vectors = GV.make_glove("gensim_global_vectors_100dim.txt")
global_vectors = GV.make_glove("gensim_global_vectors_200dim.txt")

## Defining which neural nets to use: 

In [7]:
# Models to evaluate. Only deep_HB is active; the full candidate list is kept
# below for reference.
neural_nets = [
    NN.deep_HB,
]
# neural_nets = [NN.basic_model, NN.basic_model_adam, NN.wide_model, NN.deep_2_model, NN.deep_HB]

## Creating corpus:

In [8]:
# Small data set, used while testing:
inputfiles = [training_set_pos, training_set_neg, test_set]

# Full data set:
# inputfiles = [training_set_pos_full, training_set_neg_full, test_set]

# Build one corpus from all input files.
# NOTE(review): file_lengths presumably holds one length per input file, in the
# order of `inputfiles` (positive, negative, test) — confirm in helpers.
full_corpus, file_lengths = HL.create_corpus(inputfiles)

nr_pos_tweets = file_lengths[0]
nr_neg_tweets = file_lengths[1]
total_training_tweets = nr_pos_tweets + nr_neg_tweets

## Testing unprocessed corpus: 


In [None]:
# Run the selected networks on the raw, unprocessed corpus.
GV.classify_with_neural_networks(
    neural_nets,
    global_vectors,
    full_corpus,
    total_training_tweets,
    nr_pos_tweets,
)

In [None]:
# Second, identical run on the unprocessed corpus, to see whether the results
# are the same from one run to the next.
GV.classify_with_neural_networks(
    neural_nets,
    global_vectors,
    full_corpus,
    total_training_tweets,
    nr_pos_tweets,
)

## Cleaning tweets using ekphrasis 

In [9]:
# Pass the raw corpus through the ekphrasis-based tokenizing helper.
TE_corpus = TE.tokenizing_ekphrasis(full_corpus)

Reading twitter - 1grams ...
Reading twitter - 2grams ...
Reading twitter - 1grams ...


In [10]:
# Run the selected networks on the ekphrasis-processed corpus.
GV.classify_with_neural_networks(
    neural_nets,
    global_vectors,
    TE_corpus,
    total_training_tweets,
    nr_pos_tweets,
)

Model:  deep_HB
84.17% (+/- 1.49%)
Negative sentiment: 83.69%  Positive sentiment: 84.64%
Percentage of positive classifications (should be 50%ish): 50.4779762053
Time taken:  1.7471990664800008 



## Cleaning tweets using customized tokenizing function

### Baseline: all preprocessing options switched off (sanity check only)

In [None]:
# Custom preprocessing of the ekphrasis corpus with every option disabled.
tokenized_corpus = TO.preprocess_corpus(
    TE_corpus,
    stemming=False,
    all_smilies=False,
    pos_smilies=False,
    neg_smilies=False,
    other_smilies=False,
    hugs_and_kisses=False,
    hearts=False,
    hashtag=False,
    hashtag_mention=False,
    numbers=False,
    number_mention=False,
    exclamation=False,
    set_to_not=False,
)

In [None]:
# Run the selected networks on the corpus preprocessed with all options off.
GV.classify_with_neural_networks(
    neural_nets,
    global_vectors,
    tokenized_corpus,
    total_training_tweets,
    nr_pos_tweets,
)

### Finding positive smilies

In [None]:
# Custom preprocessing of the ekphrasis corpus with only `pos_smilies` enabled.
tokenized_corpus_pos_smile = TO.preprocess_corpus(
    TE_corpus,
    stemming=False,
    all_smilies=False,
    pos_smilies=True,
    neg_smilies=False,
    other_smilies=False,
    hugs_and_kisses=False,
    hearts=False,
    hashtag=False,
    hashtag_mention=False,
    numbers=False,
    number_mention=False,
    exclamation=False,
    set_to_not=False,
)

In [None]:
# Run the selected networks on the corpus preprocessed with `pos_smilies` on.
GV.classify_with_neural_networks(
    neural_nets,
    global_vectors,
    tokenized_corpus_pos_smile,
    total_training_tweets,
    nr_pos_tweets,
)

### Finding negative and positive smilies

In [None]:
# Custom preprocessing of the ekphrasis corpus with `pos_smilies` and
# `neg_smilies` enabled; every other option stays off.
tokenized_corpus_pos_neg_smile = TO.preprocess_corpus(
    TE_corpus,
    stemming=False,
    all_smilies=False,
    pos_smilies=True,
    neg_smilies=True,
    other_smilies=False,
    hugs_and_kisses=False,
    hearts=False,
    hashtag=False,
    hashtag_mention=False,
    numbers=False,
    number_mention=False,
    exclamation=False,
    set_to_not=False,
)

In [None]:
# Run the selected networks on the corpus preprocessed with `pos_smilies` and
# `neg_smilies` on.
GV.classify_with_neural_networks(
    neural_nets,
    global_vectors,
    tokenized_corpus_pos_neg_smile,
    total_training_tweets,
    nr_pos_tweets,
)

### Finding all smilies

In [11]:
# Custom preprocessing of the ekphrasis corpus with only `all_smilies` enabled.
tokenized_corpus_all_smile = TO.preprocess_corpus(
    TE_corpus,
    stemming=False,
    all_smilies=True,
    pos_smilies=False,
    neg_smilies=False,
    other_smilies=False,
    hugs_and_kisses=False,
    hearts=False,
    hashtag=False,
    hashtag_mention=False,
    numbers=False,
    number_mention=False,
    exclamation=False,
    set_to_not=False,
)

In [12]:
# Run the selected networks on the corpus preprocessed with `all_smilies` on.
GV.classify_with_neural_networks(
    neural_nets,
    global_vectors,
    tokenized_corpus_all_smile,
    total_training_tweets,
    nr_pos_tweets,
)

Model:  deep_HB
83.54% (+/- 1.33%)
Negative sentiment: 79.93%  Positive sentiment: 87.16%
Percentage of positive classifications (should be 50%ish): 53.6149979157
Time taken:  1.8113451202710469 



### Finding all hearts

In [None]:
# Custom preprocessing of the ekphrasis corpus with only `hearts` enabled.
tokenized_corpus_hearts = TO.preprocess_corpus(
    TE_corpus,
    stemming=False,
    all_smilies=False,
    pos_smilies=False,
    neg_smilies=False,
    other_smilies=False,
    hugs_and_kisses=False,
    hearts=True,
    hashtag=False,
    hashtag_mention=False,
    numbers=False,
    number_mention=False,
    exclamation=False,
    set_to_not=False,
)

In [None]:
# Run the selected networks on the corpus preprocessed with `hearts` on.
GV.classify_with_neural_networks(
    neural_nets,
    global_vectors,
    tokenized_corpus_hearts,
    total_training_tweets,
    nr_pos_tweets,
)

## Testing clusters

I have realized that this no longer makes much sense. If we want to test this, we would have to replace each word with one of the words in its cluster, not just with the cluster ID. The cluster ID carries no meaning as a word, so its word2vec vector carries no meaning either.

In [22]:
# The ekphrasis-processed corpus is the starting point for the cluster test.
best_corpus_so_far = TE_corpus

In [23]:
# Build the cluster dictionary from the cluster file.
cluster_file = "50mpaths2.txt"
cluster_dictionary = CL.create_dictionary(cluster_file)

In [24]:
# Produce the clusterized version of the current best corpus.
clusteded_corpus = CL.create_clusterized_corpus(
    best_corpus_so_far,
    cluster_dictionary,
)

In [25]:
# Run the selected networks on the clusterized corpus.
GV.classify_with_neural_networks(
    neural_nets,
    global_vectors,
    clusteded_corpus,
    total_training_tweets,
    nr_pos_tweets,
)

Model:  deep_HB
64.49% (+/- 0.26%)
Negative sentiment: 55.19%  Positive sentiment: 73.80%
Percentage of positive classifications (should be 50%ish): 59.3005004869
Time taken:  2.121539612611135 



## Testing n-grams

In [13]:
# The ekphrasis-processed corpus is the starting point for the n-gram tests.
best_corpus_so_far = TE_corpus

In [14]:
# Build the n-gram corpus for n = 2.
n_grams = 2
n_grams_corpus = HL.creating_n_grams_cropus(n_grams, best_corpus_so_far)

In [15]:
# Run the selected networks on the n = 2 n-gram corpus.
GV.classify_with_neural_networks(
    neural_nets,
    global_vectors,
    n_grams_corpus,
    total_training_tweets,
    nr_pos_tweets,
)

Model:  deep_HB
84.22% (+/- 1.31%)
Negative sentiment: 83.21%  Positive sentiment: 85.23%
Percentage of positive classifications (should be 50%ish): 51.0104802854
Time taken:  1.880251367886861 



In [16]:
# Build the n-gram corpus for n = 3.
n_grams = 3
n_grams_corpus3 = HL.creating_n_grams_cropus(n_grams, best_corpus_so_far)

In [18]:
# Run the selected networks on the n = 3 n-gram corpus.
GV.classify_with_neural_networks(
    neural_nets,
    global_vectors,
    n_grams_corpus3,
    total_training_tweets,
    nr_pos_tweets,
)

Model:  deep_HB
84.17% (+/- 1.31%)
Negative sentiment: 82.74%  Positive sentiment: 85.59%
Percentage of positive classifications (should be 50%ish): 51.4260025903
Time taken:  1.9127514322598775 



In [19]:
# Build the n-gram corpus for n = 4.
n_grams = 4
n_grams_corpus4 = HL.creating_n_grams_cropus(n_grams, best_corpus_so_far)

In [20]:
# Run the selected networks on the n = 4 n-gram corpus.
GV.classify_with_neural_networks(
    neural_nets,
    global_vectors,
    n_grams_corpus4,
    total_training_tweets,
    nr_pos_tweets,
)

Model:  deep_HB
83.99% (+/- 1.36%)
Negative sentiment: 81.79%  Positive sentiment: 86.18%
Percentage of positive classifications (should be 50%ish): 52.1959893505
Time taken:  1.883493419488271 



In [None]:
# Plot a graph for the n-grams. No point in doing this until the runs give the same value every time.

## TF-IDF

In [30]:
# The n = 3 n-gram corpus is the input for the TF-IDF experiment.
best_corpus_so_far = n_grams_corpus3

In [29]:

# Term-frequency vectorizer for the tweet corpus. Because use_idf=False, no
# inverse-document-frequency weighting is applied: despite the class name the
# result is sub-linearly scaled term frequency only.
# TfidfVectorizer is already imported in the imports cell at the top of the
# notebook, so it is not imported again here.
vectorizer = TfidfVectorizer(
    min_df=11,          # ignore terms found in fewer than 11 tweets (earlier setting: 5)
    max_df=1.0,         # no upper document-frequency cut-off; a float max_df must lie
                        # in [0.0, 1.0], so the previous 1.5 was out of range
                        # (earlier setting: 0.6)
    sublinear_tf=True,  # replace tf with 1 + log(tf)
    use_idf=False,      # idf re-weighting disabled (earlier setting: True)
)


In [33]:
# Learn the vocabulary from the corpus and vectorize it in one step. A plain
# transform() only works on a vectorizer that has already been fitted, and no
# cell in this notebook fits it, so a fresh "Restart & Run All" would fail here.
# NOTE(review): fitting on the complete corpus (training and test tweets) is an
# assumption about what the original, unrecorded fit used — confirm.
corpus_tf_idf = vectorizer.fit_transform(best_corpus_so_far)

In [36]:
# Show the vectorized corpus. The recorded output lists "(row, column) value"
# entries and elides the middle of the matrix.
print(corpus_tf_idf)

  (0, 194)	0.203630843509
  (0, 487)	0.203630843509
  (0, 1055)	0.203630843509
  (0, 1515)	0.203630843509
  (0, 3056)	0.203630843509
  (0, 3786)	0.203630843509
  (0, 4141)	0.203630843509
  (0, 4439)	0.254188975973
  (0, 4682)	0.203630843509
  (0, 5276)	0.254188975973
  (0, 5418)	0.203630843509
  (0, 6069)	0.203630843509
  (0, 6131)	0.203630843509
  (0, 6455)	0.203630843509
  (0, 6672)	0.203630843509
  (0, 6712)	0.203630843509
  (0, 6828)	0.203630843509
  (0, 6863)	0.203630843509
  (0, 7851)	0.203630843509
  (0, 9704)	0.203630843509
  (0, 10315)	0.203630843509
  (0, 10729)	0.203630843509
  (0, 10963)	0.203630843509
  (1, 1012)	0.242951161823
  (1, 2406)	0.242951161823
  :	:
  (209997, 10067)	0.218217890236
  (209997, 10177)	0.218217890236
  (209997, 10305)	0.218217890236
  (209997, 10748)	0.218217890236
  (209997, 10774)	0.218217890236
  (209998, 1936)	0.408248290464
  (209998, 3945)	0.408248290464
  (209998, 4454)	0.408248290464
  (209998, 4885)	0.408248290464
  (209998, 6712)	0.408248

In [32]:
# NOTE(review): this call fails — the recorded output below is
# "AttributeError: split not found". corpus_tf_idf is the matrix returned by
# the vectorizer, not a corpus like the ones passed in the other calls.
# Presumably classify_with_neural_networks calls .split() on its corpus
# argument — confirm in glove_module before re-running this cell.
GV.classify_with_neural_networks(neural_nets, global_vectors, corpus_tf_idf, total_training_tweets, nr_pos_tweets)

AttributeError: split not found

In [None]:
## Make plot

# Making Kaggle submission

In [27]:
# Corpus and output name used for the Kaggle submission: the n = 2 n-gram
# corpus with the deep_HB model.
kaggle_name = "keggle_glove_TE_n2.csv"
final_corpus = n_grams_corpus

# NOTE(review): presumably get_prediction writes the submission to
# `kaggle_name` and returns the predictions — confirm in glove_module.
pred = GV.get_prediction(
    NN.deep_HB,
    global_vectors,
    final_corpus,
    total_training_tweets,
    nr_pos_tweets,
    kaggle_name,
)

In [None]:
# Print the sum over all predictions.
# NOTE(review): the meaning of this sum depends on how get_prediction encodes
# the labels — confirm in glove_module.
print(sum(pred))