# Glove with preprocessing 

## Loading Libraries

In [2]:
%load_ext autoreload
%autoreload 2

import random as rn
import tensorflow as tf
from keras import backend as K

# external imports
import numpy as np
import pandas as pd
import seaborn as sb
import gensim
from gensim.scripts.glove2word2vec import glove2word2vec
import pylab as pl
import matplotlib.pyplot as plt
import csv
import scipy
import os.path
import pickle

import keras
from keras.layers import *
from keras.layers.core import *
from keras import backend as K

import sklearn as sk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import ParameterGrid
import time

# internal imports
import helpers as HL
import cleaning as CL
import glove_module as GV
import neural_nets as NN
import tokenizing as TO
import tokenizing_ekphrasis as TE

import maketextfile as MT



The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Defining Data Paths 

### Files used to create model:

In [None]:
# Constants
# Folder with the GloVe Twitter vector files and one path per embedding
# dimension (25/50/100/200, as encoded in the file names). Paths are built
# with os.path.join so the separator matches the host OS instead of a
# hard-coded "/".
DATA_FOLDER = "glove.twitter.27B"
DATA_25DIM = os.path.join(DATA_FOLDER, "glove.twitter.27B.25d.txt")
DATA_50DIM = os.path.join(DATA_FOLDER, "glove.twitter.27B.50d.txt")
DATA_100DIM = os.path.join(DATA_FOLDER, "glove.twitter.27B.100d.txt")
DATA_200DIM = os.path.join(DATA_FOLDER, "glove.twitter.27B.200d.txt")

### Data files:

In [3]:
# Sample training files: positive and negative tweets.
training_set_pos, training_set_neg = "train_pos.txt", "train_neg.txt"
# Full-size training files: positive and negative tweets.
training_set_pos_full, training_set_neg_full = "train_pos_full.txt", "train_neg_full.txt"
# Test data file.
test_set = "test_data.txt"

## Import pretrained GloVe with gensim
One can use gensim's word2vec functions to check word similarity and explore other useful functionality: https://radimrehurek.com/gensim/models/word2vec.html

## Create the word embeddings using the created gensim-.txt file.

In [None]:
# Load the word vectors from a gensim-format text file via GV.make_glove.
# Pick exactly one line: higher dimensions give better results but take
# longer to compute.
# NOTE(review): only the 25-dim alternative uses a "data/" prefix; confirm
# where the converted files actually live before switching.

#global_vectors=GV.make_glove("data/gensim_global_vectors_25dim.txt")
#global_vectors=GV.make_glove("gensim_global_vectors_50dim.txt")
#global_vectors=GV.make_glove("gensim_global_vectors_100dim.txt")
global_vectors=GV.make_glove("gensim_global_vectors_200dim.txt")

In [4]:
# Load the vectors stored in "global_vectors.txt".
# NOTE(review): this rebinds `global_vectors`, so it replaces the vectors
# loaded by the previous cell if that cell was run. Every later cell uses
# whichever of the two assignments ran last.
global_vectors=GV.make_glove("global_vectors.txt")

## Creating corpus:

In [8]:
# Files that make up the corpus, in this order: positive training tweets,
# negative training tweets, test tweets.
# Sample data set (quick experiments):
inputfiles = [training_set_pos, training_set_neg, test_set]

# Full data set (swap in for final runs):
# inputfiles = [training_set_pos_full, training_set_neg_full, test_set]

# HL.create_corpus returns the corpus and a per-file length list; the first
# two lengths belong to the positive and negative training files.
full_corpus, file_lengths = HL.create_corpus(inputfiles)
nr_pos_tweets, nr_neg_tweets = file_lengths[0], file_lengths[1]
total_training_tweets = nr_pos_tweets + nr_neg_tweets

## Picking the neural net

In [6]:
# Networks (from the neural_nets module) handed to
# GV.classify_with_neural_networks; only NN.deep_HB is evaluated here.
neural_nets=[NN.deep_HB]

In [9]:
# Baseline run on the unprocessed corpus: 5 epochs, n_folds=3.
model_score = GV.classify_with_neural_networks(
    neural_nets,
    global_vectors,
    full_corpus,
    total_training_tweets,
    nr_pos_tweets,
    epochs=5,
    n_folds=3,
)

tweets processed: 0  of total number of tweets: 200000
tweets processed: 50000  of total number of tweets: 200000
tweets processed: 100000  of total number of tweets: 200000
tweets processed: 150000  of total number of tweets: 200000
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Model:  deep_HB
63.36% (+/- 0.09%)
Negative sentiment: 40.16%  Positive sentiment: 86.56%
Percentage of positive classifications (should be 50%ish): 73.2009728749
Time taken:  0.9043079813321432 



In [10]:
# Sanity check of the loaded embedding: list the words it considers most
# similar to 'cat' (the output shows (word, score) pairs).
# NOTE(review): the neighbours in the saved output ('till', 'pool', 'maybe',
# ...) look unrelated to 'cat' — worth confirming the quality of the loaded
# vectors.
global_vectors.most_similar('cat')

[('till', 0.7419776916503906),
 ('pool', 0.7236136198043823),
 ('maybe', 0.7194472551345825),
 ('loved', 0.7174627184867859),
 ('camera', 0.7156307101249695),
 ('few', 0.7143616676330566),
 ('bro', 0.7082592248916626),
 ('@', 0.7029222249984741),
 ('500', 0.7022773027420044),
 ('nazis', 0.7009774446487427)]

## Initializing variables to apply all preprocessing techniques:

In [None]:
# Collection of corpora to evaluate; index 0 is the unprocessed corpus,
# which later serves as the baseline.
corpuses = [full_corpus]


In [None]:
# Display names for the corpora, in the same order in which `corpuses` is
# filled: the original corpus, then one entry per dict in `inputs`, then the
# three n-gram corpora (n = 2, 3, 4).
# Fixed label typo: 'NS__corpus' -> 'NS_corpus' (consistent with the others).
names = [
    'original_corpus',
    'SH_corpus',    # hashtag + segmentation_hash
    'SHM_corpus',   # hashtag + segmentation_hash + hashtag_mention
    'H_corpus',     # hearts
    'HK_corpus',    # hugs_and_kisses
    'PS_corpus',    # pos_smilies
    'NS_corpus',    # neg_smilies
    'OS_corpus',    # other_smilies
    'N_corpus',     # numbers
    'NM_corpus',    # numbers + number_mention
    'ST_corpus',    # stemming
    'SP_corpus',    # spelling
    'E_corpus',     # elongation
    'SN_corpus',    # set_to_not
    'RS_corpus',    # remove_signs
    'N-2_corpus',
    'N-3_corpus',
    'N-4_corpus',
]

In [None]:
# Keyword-argument sets for TO.preprocess_corpus: each dict switches on one
# preprocessing variant and yields one additional corpus.
inputs = [
    dict(hashtag=True, segmentation_hash=True),
    dict(hashtag=True, segmentation_hash=True, hashtag_mention=True),
    dict(hearts=True),
    dict(hugs_and_kisses=True),
    dict(pos_smilies=True),
    dict(neg_smilies=True),
    dict(other_smilies=True),
    dict(numbers=True),
    dict(numbers=True, number_mention=True),
    dict(stemming=True),
    # WARNING: spelling=True takes extremely long; recommended to keep it
    # False, which means this entry does not actually enable spell correction.
    dict(spelling=False),
    dict(elongation=True),
    dict(set_to_not=True),
    dict(remove_signs=True),
]

## Applying all preprocessing techniques to the original corpus: 

In [None]:

# Build one preprocessed corpus per option set and append them in order.
# Note: re-running this cell appends the same corpora again.
corpuses.extend(
    TO.preprocess_corpus(full_corpus, **options) for options in inputs
)
        

In [None]:
# Append one n-gram corpus per n, in the order 2, 3, 4.
ns = [2, 3, 4]
corpuses.extend(HL.creating_n_grams_corpus(size, full_corpus) for size in ns)

## Testing all corpuses: 

In [None]:
# Evaluate every corpus with the same settings (5 epochs, n_folds=3) and
# record the first two values of the first result entry; they are stored as
# accuracy and standard deviation respectively.
accuracies, stds = [], []

for corpus in corpuses:
    model_score = GV.classify_with_neural_networks(
        neural_nets,
        global_vectors,
        corpus,
        total_training_tweets,
        nr_pos_tweets,
        epochs=5,
        n_folds=3,
    )
    first_result = model_score[0]
    accuracies.append(first_result[0])
    stds.append(first_result[1])

In [None]:
# Number of recorded accuracies; one is appended per entry of `corpuses`
# by the evaluation loop.
print(len(accuracies))

## Determine which preprocessing techniques improved the accuracy, and keep them:

In [None]:
# Compare every variant against the baseline (index 0 = original corpus) and
# keep those whose accuracy is at least the baseline's. Ties count as
# "improved" because the comparison is >=.
corpuses_1, names_1, stds_1, acc_1 = [], [], [], []
print('The original corpus gave accuracy of: ',accuracies[0],'\n')
baseline_acc = accuracies[0]

for i, acc in enumerate(accuracies[1:], start=1):
    if acc >= baseline_acc:
        corpuses_1.append(corpuses[i])
        names_1.append(names[i])
        stds_1.append(stds[i])
        acc_1.append(acc)
        verdict = 'IMPROVED:  '
    else:
        verdict = 'Not better:'
    print(verdict,names[i],', score:',acc,'std:',stds[i])
        

In [None]:
# Names of the corpora whose accuracy was at least the baseline accuracy.
print(names_1)

## Check how many epochs can be run before overfitting:

In [None]:
# Epoch sweep on the unprocessed corpus: one run (n_folds=3) for each epoch
# count in range(3, 10), i.e. 3 through 9, giving seven results.
accuracies_E, stds_E = [], []

for epochs_ in range(3, 10):
    model_score = GV.classify_with_neural_networks(
        neural_nets,
        global_vectors,
        full_corpus,
        total_training_tweets,
        nr_pos_tweets,
        epochs=epochs_,
        n_folds=3,
    )
    first_result = model_score[0]
    accuracies_E.append(first_result[0])
    stds_E.append(first_result[1])

## Plotting the results:

In [None]:
# X axis for the epoch plot. It must list exactly the epoch counts evaluated
# by the sweep, which iterates range(3, 10) (3 through 9). The previous
# hard-coded list ran to 11 and had two more entries than there are results,
# so building the DataFrame from it failed with a length mismatch.
epoch_values = list(range(3, 10))  # X


In [None]:
print(accuracies_E) # Y: one accuracy per epoch setting of the sweep
print(stds_E) # error: the matching spread values, used as error bars

In [None]:
# Collect the epoch-sweep results into one DataFrame, one row per epoch
# count and indexed by epoch count, so they can be plotted.
# Fail early with a readable message if the three lists are out of sync.
if not (len(epoch_values) == len(accuracies_E) == len(stds_E)):
    raise ValueError(
        "epoch_values, accuracies_E and stds_E must have the same length, "
        "got {}, {} and {}".format(
            len(epoch_values), len(accuracies_E), len(stds_E)
        )
    )

df = pd.DataFrame(
    {
        'epoch_values': epoch_values,
        'accuracies_E': accuracies_E,
        'stds_E': stds_E,
    },
    index=epoch_values,
)

print(df)


In [None]:

sb.set(style="whitegrid")

# Plot accuracy against the number of training epochs, with stds_E as
# vertical error bars. Points and error bars come from a single errorbar
# call, so they share the same numeric x axis. The previous version drew a
# seaborn factorplot (categorical axis, deprecated API) and then mapped
# plt.errorbar at the numeric epoch values, which put the bars at different
# x positions than the points.
fig, ax = plt.subplots()
ax.errorbar(
    df["epoch_values"],
    df["accuracies_E"],
    yerr=df["stds_E"],
    marker="o",
    capsize=3,
)
ax.set_xticks(list(df["epoch_values"]))
ax.set_xlabel("epoch_values")
ax.set_ylabel("accuracies_E")
ax.set_title("Accuracy vs. number of epochs")
plt.show()

# Best combo: segmentation_hash, hashtag_mention, set_to_not, elongation! (Applying the first two together and then the other two gives 81.67% (+/- 0.58%) with 100)
Doing everything at once: 81.68% (+/- 0.52%)

### Dynamic stopword list

In [None]:
# Derive a corpus-specific stopword list from the full corpus.
# NOTE(review): MinDf/MaxDf are presumably document-frequency bounds and
# sublinearTF/useIDF TF-IDF weighting switches — confirm against
# cleaning.get_dynamic_stopwords.
stopwords= CL.get_dynamic_stopwords(full_corpus, MinDf=0.01, MaxDf=0.99,sublinearTF=True,useIDF=False)

In [None]:
# Apply the stopword list to the full corpus via CL.remove_stopwords and
# keep the result as a separate corpus.
stopword_corpus=CL.remove_stopwords(full_corpus, stopwords)

In [None]:
# Register the stopword corpus alongside the other variants.
# NOTE(review): in notebook order this runs after the evaluation loop, so
# `accuracies`/`stds` have no entry for it unless that loop is re-run.
# Re-running this cell appends duplicates.
corpuses.append(stopword_corpus)
names.append('stopword_corpus')

# Making Kaggle submission

Same procedure as before, this time to create a Kaggle submission!

In [None]:
# Final corpus: all chosen preprocessing options applied in a single call.
final_corpus = TO.preprocess_corpus(
    full_corpus,
    segmentation_hash=True,
    hashtag=True,
    hashtag_mention=True,
    set_to_not=True,
    elongation=True,
)


In [None]:
# Score the final corpus with 6 epochs and n_folds=3 before predicting.
model_score = GV.classify_with_neural_networks(
    neural_nets,
    global_vectors,
    final_corpus,
    total_training_tweets,
    nr_pos_tweets,
    epochs=6,
    n_folds=3,
)

In [None]:
# File name handed to GV.get_prediction (presumably where the submission CSV
# is written — confirm in glove_module).
kaggle_name = "keggle_glove_13_12_full.csv"

# An earlier experiment swapped in an n-gram corpus at this point; it is
# disabled, and `n_grams_corpus` is not defined in the visible notebook.
# final_corpus = n_grams_corpus

# Predict with NN.deep_HB on the final corpus, 6 epochs.
pred = GV.get_prediction(
    NN.deep_HB,
    global_vectors,
    final_corpus,
    total_training_tweets,
    nr_pos_tweets,
    kaggle_name,
    epochs=6,
)

In [None]:
# Sum of the returned predictions.
# NOTE(review): presumably a quick check of the class balance — what the sum
# means depends on how GV.get_prediction encodes labels; confirm.
print(sum(pred))

In [None]:
# Create an enchant dictionary for "en_US".
# NOTE(review): this import sits outside the import cell at the top of the
# notebook, and `d` is not used by any visible cell.
import enchant
d = enchant.Dict("en_US")