# Glove with preprocessing 

## Loading Libraries

In [15]:
%load_ext autoreload
%autoreload 2

import random as rn
import tensorflow as tf
from keras import backend as K

# external imports
import numpy as np
import pandas as pd
import seaborn as sb
import gensim
from gensim.scripts.glove2word2vec import glove2word2vec
import pylab as pl
import matplotlib.pyplot as plt
import csv
import scipy
import os.path
import pickle

import keras
from keras.layers import *
from keras.layers.core import *
from keras import backend as K

import sklearn as sk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import ParameterGrid
import time

# internal imports
import helpers as HL
import cleaning as CL
import glove_module as GV
import neural_nets as NN
import tokenizing as TO
import tokenizing_ekphrasis as TE



The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Defining Data Paths 

### Files used to create model:

In [2]:
# Locations of the pretrained GloVe Twitter embeddings, one file per vector size.
DATA_FOLDER = "glove.twitter.27B"
DATA_25DIM, DATA_50DIM, DATA_100DIM, DATA_200DIM = (
    DATA_FOLDER + ("/glove.twitter.27B.%dd.txt" % dim)
    for dim in (25, 50, 100, 200)
)

### Data files:

In [3]:
# Training tweets: the small sample files and the full-size files.
training_set_pos, training_set_neg = "train_pos.txt", "train_neg.txt"
training_set_pos_full, training_set_neg_full = "train_pos_full.txt", "train_neg_full.txt"
# File with the test tweets.
test_set = "test_data.txt"

## Import pretrained GloVe with gensim
One can use gensim's word2vec functions to check word similarity and explore other useful features: https://radimrehurek.com/gensim/models/word2vec.html

## Create the word embeddings using the created gensim-.txt file.

In [4]:
# Pick one embedding file: the higher the dimension, the better the result and
# the longer the computation time.
# Alternatives: "data/gensim_global_vectors_25dim.txt",
#               "gensim_global_vectors_50dim.txt",
#               "gensim_global_vectors_200dim.txt"
GLOVE_GENSIM_FILE = "gensim_global_vectors_100dim.txt"
global_vectors = GV.make_glove(GLOVE_GENSIM_FILE)

## Creating corpus:

In [5]:
# Small sample files for quick experiments. For the full data set use
# [training_set_pos_full, training_set_neg_full, test_set] instead.
inputfiles = [training_set_pos, training_set_neg, test_set]

full_corpus, file_lengths = HL.create_corpus(inputfiles)

# The first two lengths are taken as the positive and negative tweet counts
# (presumably file_lengths follows the order of inputfiles - confirm in helpers).
nr_pos_tweets, nr_neg_tweets = file_lengths[0], file_lengths[1]
total_training_tweets = nr_pos_tweets + nr_neg_tweets

In [6]:
# Network builders passed to GV.classify_with_neural_networks for every corpus variant.
neural_nets=[NN.deep_HB]

In [None]:
# Result collectors, filled by the evaluation cell further down.
accuracies, stds = [], []

# Corpus variants and their labels, kept index-aligned. Entry 0 is the
# unprocessed corpus, which later cells compare against as the baseline.
corpuses = [full_corpus]
names = ['full_corpus']


## Testing unprocessed corpus : 

# Best combo: seg_hash + hash mention, set_to_not, elongation! (Applying the first two together and then the other two gives 81.67% (+/- 0.58%) with the 100-dimensional vectors.)
Doing everything at once: 81.68% (+/- 0.52%)

## Cleaning tweets using a customized tokenizing function

### Testing that we get the same result when all parameters are set to False:

In [None]:
# Sanity-check variant: every preprocessing step is switched off.
test_flags = dict.fromkeys(
    ("stemming", "all_smilies", "pos_smilies", "neg_smilies", "other_smilies",
     "hugs_and_kisses", "hearts", "hashtag", "hashtag_mention", "numbers",
     "number_mention", "exclamation", "set_to_not", "segmentation_hash",
     "spelling", "elongation", "remove_signs"),
    False,
)
test_corpus = TO.preprocess_corpus(full_corpus, **test_flags)

In [None]:
# Register the variant and its label at the same index.
corpuses += [test_corpus]
names += ['test_corpus']

### Segmentation hash

In [None]:
# Start from "everything off" and enable only the steps under test.
seg_hash_flags = dict.fromkeys(
    ("stemming", "all_smilies", "pos_smilies", "neg_smilies", "other_smilies",
     "hugs_and_kisses", "hearts", "hashtag", "hashtag_mention", "numbers",
     "number_mention", "exclamation", "set_to_not", "segmentation_hash",
     "spelling", "elongation", "remove_signs"),
    False,
)
seg_hash_flags.update(hashtag=True, segmentation_hash=True)
seg_hash_corpus = TO.preprocess_corpus(full_corpus, **seg_hash_flags)

In [None]:
# Register the variant and its label at the same index.
corpuses += [seg_hash_corpus]
names += ['seg_hash_corpus']

### Seg hash and hash mention

In [None]:
# Start from "everything off" and enable only the steps under test.
seg_hash_m_flags = dict.fromkeys(
    ("stemming", "all_smilies", "pos_smilies", "neg_smilies", "other_smilies",
     "hugs_and_kisses", "hearts", "hashtag", "hashtag_mention", "numbers",
     "number_mention", "exclamation", "set_to_not", "segmentation_hash",
     "spelling", "elongation", "remove_signs"),
    False,
)
seg_hash_m_flags.update(hashtag=True, hashtag_mention=True, segmentation_hash=True)
seg_hash_m_corpus = TO.preprocess_corpus(full_corpus, **seg_hash_m_flags)

In [None]:
# Register the variant and its label at the same index.
corpuses += [seg_hash_m_corpus]
names += ['seg_hash_m_corpus']

### Pos smilies

In [None]:
# Start from "everything off" and enable only the step under test.
pos_s_flags = dict.fromkeys(
    ("stemming", "all_smilies", "pos_smilies", "neg_smilies", "other_smilies",
     "hugs_and_kisses", "hearts", "hashtag", "hashtag_mention", "numbers",
     "number_mention", "exclamation", "set_to_not", "segmentation_hash",
     "spelling", "elongation", "remove_signs"),
    False,
)
pos_s_flags.update(pos_smilies=True)
pos_s_corpus = TO.preprocess_corpus(full_corpus, **pos_s_flags)

In [None]:
# Register the variant and its label at the same index.
corpuses += [pos_s_corpus]
names += ['pos_s_corpus']

### Neg smilies

In [None]:
# Start from "everything off" and enable only the step under test.
neg_s_flags = dict.fromkeys(
    ("stemming", "all_smilies", "pos_smilies", "neg_smilies", "other_smilies",
     "hugs_and_kisses", "hearts", "hashtag", "hashtag_mention", "numbers",
     "number_mention", "exclamation", "set_to_not", "segmentation_hash",
     "spelling", "elongation", "remove_signs"),
    False,
)
neg_s_flags.update(neg_smilies=True)
neg_s_corpus = TO.preprocess_corpus(full_corpus, **neg_s_flags)

In [None]:
# Register the variant and its label at the same index.
corpuses += [neg_s_corpus]
names += ['neg_s_corpus']

### Other Smilies

In [None]:
# Start from "everything off" and enable only the step under test.
other_s_flags = dict.fromkeys(
    ("stemming", "all_smilies", "pos_smilies", "neg_smilies", "other_smilies",
     "hugs_and_kisses", "hearts", "hashtag", "hashtag_mention", "numbers",
     "number_mention", "exclamation", "set_to_not", "segmentation_hash",
     "spelling", "elongation", "remove_signs"),
    False,
)
other_s_flags.update(other_smilies=True)
other_s_corpus = TO.preprocess_corpus(full_corpus, **other_s_flags)

In [None]:
# Register the variant and its label at the same index.
corpuses += [other_s_corpus]
names += ['other_s_corpus']

### Hugs and kisses

In [None]:
# Start from "everything off" and enable only the step under test.
h_k_flags = dict.fromkeys(
    ("stemming", "all_smilies", "pos_smilies", "neg_smilies", "other_smilies",
     "hugs_and_kisses", "hearts", "hashtag", "hashtag_mention", "numbers",
     "number_mention", "exclamation", "set_to_not", "segmentation_hash",
     "spelling", "elongation", "remove_signs"),
    False,
)
h_k_flags.update(hugs_and_kisses=True)
h_k_corpus = TO.preprocess_corpus(full_corpus, **h_k_flags)

In [None]:
# Register the variant and its label at the same index.
corpuses += [h_k_corpus]
names += ['h_k_corpus']

### Hearts

In [None]:
# Start from "everything off" and enable only the step under test.
h_flags = dict.fromkeys(
    ("stemming", "all_smilies", "pos_smilies", "neg_smilies", "other_smilies",
     "hugs_and_kisses", "hearts", "hashtag", "hashtag_mention", "numbers",
     "number_mention", "exclamation", "set_to_not", "segmentation_hash",
     "spelling", "elongation", "remove_signs"),
    False,
)
h_flags.update(hearts=True)
h_corpus = TO.preprocess_corpus(full_corpus, **h_flags)

In [None]:
# Register the variant and its label at the same index.
corpuses += [h_corpus]
names += ['h_corpus']

### Numbers

In [None]:
# Start from "everything off" and enable only the step under test.
number_flags = dict.fromkeys(
    ("stemming", "all_smilies", "pos_smilies", "neg_smilies", "other_smilies",
     "hugs_and_kisses", "hearts", "hashtag", "hashtag_mention", "numbers",
     "number_mention", "exclamation", "set_to_not", "segmentation_hash",
     "spelling", "elongation", "remove_signs"),
    False,
)
number_flags.update(numbers=True)
number_corpus = TO.preprocess_corpus(full_corpus, **number_flags)

In [None]:
# Register the variant and its label at the same index.
corpuses += [number_corpus]
names += ['number_corpus']

### Numbers and number mentions

In [None]:
# Start from "everything off" and enable only the steps under test.
number_m_flags = dict.fromkeys(
    ("stemming", "all_smilies", "pos_smilies", "neg_smilies", "other_smilies",
     "hugs_and_kisses", "hearts", "hashtag", "hashtag_mention", "numbers",
     "number_mention", "exclamation", "set_to_not", "segmentation_hash",
     "spelling", "elongation", "remove_signs"),
    False,
)
number_m_flags.update(numbers=True, number_mention=True)
number_m_corpus = TO.preprocess_corpus(full_corpus, **number_m_flags)

In [None]:
# Register the variant and its label at the same index.
corpuses += [number_m_corpus]
names += ['number_m_corpus']

### Exclamation

In [None]:
# Start from "everything off" and enable only the step under test.
ex_flags = dict.fromkeys(
    ("stemming", "all_smilies", "pos_smilies", "neg_smilies", "other_smilies",
     "hugs_and_kisses", "hearts", "hashtag", "hashtag_mention", "numbers",
     "number_mention", "exclamation", "set_to_not", "segmentation_hash",
     "spelling", "elongation", "remove_signs"),
    False,
)
ex_flags.update(exclamation=True)
ex_corpus = TO.preprocess_corpus(full_corpus, **ex_flags)

In [None]:
# Register the variant and its label at the same index.
corpuses += [ex_corpus]
names += ['ex_corpus']

### Set to not

In [None]:
# Start from "everything off" and enable only the step under test.
not_flags = dict.fromkeys(
    ("stemming", "all_smilies", "pos_smilies", "neg_smilies", "other_smilies",
     "hugs_and_kisses", "hearts", "hashtag", "hashtag_mention", "numbers",
     "number_mention", "exclamation", "set_to_not", "segmentation_hash",
     "spelling", "elongation", "remove_signs"),
    False,
)
not_flags.update(set_to_not=True)
not_corpus = TO.preprocess_corpus(full_corpus, **not_flags)

In [None]:
# Register the variant and its label at the same index.
corpuses += [not_corpus]
names += ['not_corpus']

### spelling

In [None]:
# Start from "everything off" and enable only the step under test.
spelling_flags = dict.fromkeys(
    ("stemming", "all_smilies", "pos_smilies", "neg_smilies", "other_smilies",
     "hugs_and_kisses", "hearts", "hashtag", "hashtag_mention", "numbers",
     "number_mention", "exclamation", "set_to_not", "segmentation_hash",
     "spelling", "elongation", "remove_signs"),
    False,
)
spelling_flags.update(spelling=True)
spelling_corpus = TO.preprocess_corpus(full_corpus, **spelling_flags)

In [None]:
# Register the variant and its label at the same index.
corpuses += [spelling_corpus]
names += ['spelling_corpus']

### elongation

In [None]:
# Start from "everything off" and enable only the step under test.
en_flags = dict.fromkeys(
    ("stemming", "all_smilies", "pos_smilies", "neg_smilies", "other_smilies",
     "hugs_and_kisses", "hearts", "hashtag", "hashtag_mention", "numbers",
     "number_mention", "exclamation", "set_to_not", "segmentation_hash",
     "spelling", "elongation", "remove_signs"),
    False,
)
en_flags.update(elongation=True)
en_corpus = TO.preprocess_corpus(full_corpus, **en_flags)

In [None]:
# Register the variant and its label at the same index.
corpuses += [en_corpus]
names += ['en_corpus']

### remove_signs

In [None]:
# Start from "everything off" and enable only the step under test.
sign_flags = dict.fromkeys(
    ("stemming", "all_smilies", "pos_smilies", "neg_smilies", "other_smilies",
     "hugs_and_kisses", "hearts", "hashtag", "hashtag_mention", "numbers",
     "number_mention", "exclamation", "set_to_not", "segmentation_hash",
     "spelling", "elongation", "remove_signs"),
    False,
)
sign_flags.update(remove_signs=True)
sign_corpus = TO.preprocess_corpus(full_corpus, **sign_flags)

In [None]:
# Register the variant and its label at the same index.
corpuses += [sign_corpus]
names += ['sign_corpus']

### 2-ngrams

In [None]:
# n-gram variant (n = 2) derived from the unprocessed corpus.
n_grams = 2
n_grams_corpus2 = HL.creating_n_grams_cropus(n_grams, full_corpus)

In [None]:
# Register the variant and its label at the same index.
corpuses += [n_grams_corpus2]
names += ['n_grams_corpus2']

### 3-ngrams

In [None]:
# n-gram variant (n = 3) derived from the unprocessed corpus.
n_grams = 3
n_grams_corpus3 = HL.creating_n_grams_cropus(n_grams, full_corpus)

In [None]:
# Register the variant and its label at the same index.
corpuses += [n_grams_corpus3]
names += ['n_grams_corpus3']

### 4-ngrams

In [None]:
# n-gram variant (n = 4) derived from the unprocessed corpus.
n_grams = 4
n_grams_corpus4 = HL.creating_n_grams_cropus(n_grams, full_corpus)

In [None]:
# Register the variant and its label at the same index.
corpuses += [n_grams_corpus4]
names += ['n_grams_corpus4']

### Dynamic stopword list

In [None]:
# Stopword list computed from the corpus itself rather than taken from a fixed list.
# NOTE(review): MinDf/MaxDf look like document-frequency bounds - confirm in cleaning.py.
stopwords = CL.get_dynamic_stopwords(
    full_corpus,
    MinDf=0.01,
    MaxDf=0.99,
    sublinearTF=True,
    useIDF=False,
)

In [None]:
# Corpus variant produced by passing the unprocessed corpus and the dynamic stopword list to CL.remove_stopwords.
stopword_corpus=CL.remove_stopwords(full_corpus, stopwords)

In [None]:
# Register the variant and its label at the same index.
corpuses += [stopword_corpus]
names += ['stopword_corpus']

In [None]:
# Start from empty result lists so that re-running this cell does not append a
# second set of scores and push them out of alignment with `names`/`corpuses`.
accuracies = []
stds = []

# Evaluate every corpus variant with the same networks and settings; results
# are stored in the same order as `corpuses`.
for corpus in corpuses:
    acc, std = GV.classify_with_neural_networks(
        neural_nets, global_vectors, corpus,
        total_training_tweets, nr_pos_tweets,
        epochs=5, n_folds=2,
    )
    accuracies.append(acc)
    stds.append(std)

In [None]:
# Raw accuracies, appended in the order the corpus variants were evaluated.
print(accuracies)

In [None]:
# Keep every variant whose accuracy beats the unprocessed baseline at index 0.
good = bad = 0
good_corpuses, good_names = [], []
for idx, acc in enumerate(accuracies):
    beats_baseline = idx > 0 and acc > accuracies[0]
    if not beats_baseline:
        # The baseline entry itself (idx 0) is counted here as well.
        bad += 1
        continue
    print('Method',names[idx], 'improves, with acc=', acc,'and std:', stds[idx],'\n')
    good += 1
    good_corpuses.append(corpuses[idx])
    good_names.append(names[idx])

# Ratio of improving to non-improving variants.
print(good/bad)

In [None]:
# Labels of the variants that beat the baseline.
print(good_names)

In [None]:
# Combine each improving variant with the hashtag / hashtag-mention /
# segmentation steps and evaluate the result.
#
# The seg-hash variants are skipped by name. Skipping the first two entries of
# good_corpuses by position only works when both seg-hash variants improved
# and nothing registered before them did, which depends on the scores of the
# current run.
seg_hash_variants = ('seg_hash_corpus', 'seg_hash_m_corpus')

seg_has_combos = []
seg_has_combos_names = []
seg_has_combo_accs = []
seg_has_combo_stds = []

for name, corpus in zip(good_names, good_corpuses):
    if name in seg_hash_variants:
        continue

    seg_hash_combo_corpus = TO.preprocess_corpus(
        corpus, stemming=False,
        all_smilies=False, pos_smilies=False, neg_smilies=False, other_smilies=False,
        hugs_and_kisses=False, hearts=False,
        hashtag=True, hashtag_mention=True,
        numbers=False, number_mention=False,
        exclamation=False,
        set_to_not=False,
        segmentation_hash=True,
        spelling=False,
        elongation=False,
        remove_signs=False,
    )
    seg_has_combos.append(seg_hash_combo_corpus)
    seg_has_combos_names.append(name)

    acc, std = GV.classify_with_neural_networks(
        neural_nets, global_vectors, seg_hash_combo_corpus,
        total_training_tweets, nr_pos_tweets,
        epochs=5, n_folds=2,
    )
    seg_has_combo_accs.append(acc)
    seg_has_combo_stds.append(std)

In [None]:
# Keep the combinations whose accuracy beats the first combination (index 0).
# NOTE(review): the reference here is simply the first evaluated combination,
# not the seg-hash-only accuracy - confirm this is the intended baseline.
good = 0
bad = 0
good_corpuses2 = []
good_names2 = []
for idx, acc in enumerate(seg_has_combo_accs):
    if idx > 0 and acc > seg_has_combo_accs[0]:
        print('Method',seg_has_combos_names[idx], 'improves, with acc=', acc,'and std:', seg_has_combo_stds[idx],'\n')
        good += 1
        good_corpuses2.append(seg_has_combos[idx])
        good_names2.append(seg_has_combos_names[idx])
    else:
        bad += 1

# `bad` is zero only when no combinations were evaluated; guard the division
# so that case reports itself instead of raising ZeroDivisionError.
if bad:
    print(good/bad)
else:
    print('No combinations were evaluated.')

In [None]:
# Add segmentation and elongation handling on top of the first improving
# combination. good_corpuses2[0] raises IndexError if no combination improved.
# NOTE(review): segmentation_hash is enabled while hashtag stays off, unlike
# the other segmentation cells - confirm this is intended.
final_flags = dict.fromkeys(
    ("stemming", "all_smilies", "pos_smilies", "neg_smilies", "other_smilies",
     "hugs_and_kisses", "hearts", "hashtag", "hashtag_mention", "numbers",
     "number_mention", "exclamation", "set_to_not", "segmentation_hash",
     "spelling", "elongation", "remove_signs"),
    False,
)
final_flags.update(segmentation_hash=True, elongation=True)
final_corpus = TO.preprocess_corpus(good_corpuses2[0], **final_flags)

In [None]:
# Score the stepwise-built corpus with the same settings as the earlier runs.
acc, std = GV.classify_with_neural_networks(
    neural_nets, global_vectors, final_corpus,
    total_training_tweets, nr_pos_tweets,
    epochs=5, n_folds=2,
)

In [None]:
# Apply all selected steps in a single pass over the unprocessed corpus.
final2_flags = dict.fromkeys(
    ("stemming", "all_smilies", "pos_smilies", "neg_smilies", "other_smilies",
     "hugs_and_kisses", "hearts", "hashtag", "hashtag_mention", "numbers",
     "number_mention", "exclamation", "set_to_not", "segmentation_hash",
     "spelling", "elongation", "remove_signs"),
    False,
)
final2_flags.update(
    hashtag=True,
    hashtag_mention=True,
    set_to_not=True,
    segmentation_hash=True,
    elongation=True,
)
final2_corpus = TO.preprocess_corpus(full_corpus, **final2_flags)

In [None]:
# Score the single-pass corpus with the same settings as the earlier runs.
acc, std = GV.classify_with_neural_networks(
    neural_nets, global_vectors, final2_corpus,
    total_training_tweets, nr_pos_tweets,
    epochs=5, n_folds=2,
)

# Testing n-grams once more

In [None]:
# n-gram sizes to try on top of every registered corpus variant.
ns = [2, 3, 4]

# Parallel result lists: accuracy, std, source corpus label and n for each run.
n_accs, n_stds, n_names, n_ns = [], [], [], []

for corpus_idx, base_corpus in enumerate(corpuses):
    for n in ns:
        ngram_corpus = HL.creating_n_grams_cropus(n, base_corpus)
        acc, std = GV.classify_with_neural_networks(
            neural_nets, global_vectors, ngram_corpus,
            total_training_tweets, nr_pos_tweets,
            epochs=5, n_folds=2,
        )
        n_accs.append(acc)
        n_stds.append(std)
        n_names.append(names[corpus_idx])
        n_ns.append(n)
        

# Making Kaggle submission

Same procedure as before to create a Kaggle submission!

In [None]:
# File name handed to GV.get_prediction (presumably where the submission is written).
kaggle_name = "keggle_glove_12_12.csv"

# Predict with the deep_HB network on the single-pass preprocessed corpus.
pred = GV.get_prediction(
    NN.deep_HB, global_vectors, final2_corpus,
    total_training_tweets, nr_pos_tweets, kaggle_name,
    epochs=10,
)

In [None]:
# Sum of the returned predictions - presumably a quick look at the class balance; confirm the label encoding.
print(sum(pred))