# Determining best combination of preprocessing techniques  

## Loading Libraries

In [4]:
%load_ext autoreload
%autoreload 2

import numpy as np
import csv
import pickle

# internal imports
import helpers as HL
import glove_module as GV
import neural_nets as NN
#import tokenizing as TO


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Import pretrained GloVe with gensim
One can use gensim's word2vec functions to check word similarity and explore other interesting functionality: https://radimrehurek.com/gensim/models/word2vec.html

## Create the word embeddings using the created gensim-.txt file.

In [5]:
# Pick one embedding dimension: per the original note, the higher the dimension,
# the better the result and the longer the computation time.
global_vectors=HL.get_global_vectors(200)

## Creating corpus:
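In addition to the actual corpus, some additional information is needed: the number of positive tweets, the number of negative tweets, and the total number of training tweets.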

In [7]:
# Load the full corpus together with the tweet counts returned alongside it;
# the evaluation cells pass two of these counts on to the classifier.
(
    full_corpus,
    nr_pos_tweets,
    nr_neg_tweets,
    total_training_tweets,
) = HL.get_corpus(full=True)

## Picking the neural net

In [None]:
# Networks handed to GV.classify_with_neural_networks in the evaluation cells
# (only a single one is tried in this notebook).
neural_nets=[NN.basic_model_adam]

## Initializing variables to apply all preprocessing techniques:

In [None]:
# Start the list of candidate corpuses with the untouched corpus, so that
# index 0 is the baseline the later comparison is made against.
corpuses=[full_corpus]

In [None]:
# Display names of the candidate corpuses, in the order they are appended to
# `corpuses`: the original corpus, one entry per preprocessing technique,
# then the three n-gram variants.
names=[
    'original_corpus',
    'SH_corpus',
    'SHM_corpus',
    'H_corpus',
    'HK_corpus',
    'PS_corpus',
    'NS__corpus',
    'OS_corpus',
    'N_corpus',
    'NM_corpus',
    'ST_corpus',
    'SP_corpus',
    'E_corpus',
    'SN_corpus',
    'RS_corpus',
    'EX_corpus',
    'N-2_corpus',
    'N-3_corpus',
    'N-4_corpus',
]

In [None]:
# Keyword-argument sets for the preprocessing function, one per technique.
# Entry k here lines up with names[k+1] (names[0] is the original corpus).
inputs=[
    dict(hashtag=True, segmentation_hash=True),
    dict(hashtag=True, segmentation_hash=True, hashtag_mention=True),
    dict(hearts=True),
    dict(hugs_and_kisses=True),
    dict(pos_smilies=True),
    dict(neg_smilies=True),
    dict(other_smilies=True),
    dict(numbers=True),
    dict(numbers=True, number_mention=True),
    dict(stemming=True),
    # Warning: when True, spelling takes approx. 149 minutes on the test set.
    # Recommended to always leave it set to False.
    dict(spelling=False),
    dict(elongation=True),
    dict(set_to_not=True),
    dict(remove_signs=True),
    dict(exclamation=True),
]

## Applying all preprocessing techniques to the original corpus: 

In [None]:

# Run each technique on its own against the untouched corpus and queue the
# result for evaluation.
# NOTE(review): `TO` is not bound by the visible import cell (the line
# `import tokenizing as TO` is commented out there) — confirm it is imported
# before running this cell.
for preprocessing_kwargs in inputs:
    corpuses.append(TO.preprocess_corpus(full_corpus, **preprocessing_kwargs))
        

In [None]:
# Queue one n-gram corpus built from the untouched corpus for each n in ns.
ns=[2,3,4]
for n in ns:
    corpuses.append(HL.creating_n_grams_corpus(n,full_corpus))

## Round 1: Testing all preprocessing techniques: 

In [None]:
# Score every candidate corpus (epochs=100, n_folds=3) and record the two
# values read from the first entry of the returned score.
accuracies=[]
stds=[]

for corpus in corpuses:
    model_score=GV.classify_with_neural_networks(
        neural_nets,
        global_vectors,
        corpus,
        total_training_tweets,
        nr_pos_tweets,
        epochs=100,
        n_folds=3,
    )
    # presumably model_score[0] is (accuracy, std) for the single net in
    # neural_nets — TODO confirm against GV.classify_with_neural_networks
    first_score=model_score[0]
    accuracies.append(first_score[0])
    stds.append(first_score[1])

## Determine which preprocessing techniques improved the accuracy, and keep them:

In [None]:
# Keep every candidate whose accuracy is at least that of the original corpus
# (index 0), and report the verdict for each one.
names_1=[]
corpuses_1=[]
acc_1=[]
stds_1=[]

baseline_accuracy=accuracies[0]
print('The original corpus gave accuracy of: ',baseline_accuracy, 'std:', stds[0],'\n')

for idx, accuracy in enumerate(accuracies[1:], start=1):
    if not accuracy>=baseline_accuracy:
        print('Not better:',names[idx],', score:',accuracy,'std:',stds[idx])
        continue
    corpuses_1.append(corpuses[idx])
    names_1.append(names[idx])
    stds_1.append(stds[idx])
    acc_1.append(accuracy)
    print('IMPROVED:  ',names[idx],', score:',accuracy,'std:',stds[idx])
        

In [None]:
# Show the names of the techniques that passed the baseline comparison.
print(names_1)

# The string below appears to be console output pasted from an earlier run of
# the selection cell, kept for reference. It is a bare string expression, not a
# comment.
"""
IMPROVED:   SH_corpus , score: 0.825355 std: 0.000791580697087
IMPROVED:   SHM_corpus , score: 0.825075 std: 0.00108085614214
Not better: H_corpus , score: 0.82204 std: 0.00155889383859
IMPROVED:   HK_corpus , score: 0.82262 std: 0.00126653069446
IMPROVED:   PS_corpus , score: 0.82248 std: 0.00136009190866
IMPROVED:   NS__corpus , score: 0.822385 std: 0.00153407952858
IMPROVED:   OS_corpus , score: 0.822515 std: 0.00137109445335
IMPROVED:   N_corpus , score: 0.822245 std: 0.00134651401775
IMPROVED:   NM_corpus , score: 0.822745 std: 0.00184183332579
Not better: ST_corpus , score: 0.812625 std: 0.000972239682383
IMPROVED:   SP_corpus , score: 0.822245 std: 0.00134651401775
IMPROVED:   E_corpus , score: 0.82334 std: 0.00197917912277
IMPROVED:   SN_corpus , score: 0.825595 std: 0.00118473625757
Not better: RS_corpus , score: 0.814735 std: 0.00153155150093
IMPROVED:   EX_corpus , score: 0.82229 std: 0.00142571034926
IMPROVED:   N-2_corpus , score: 0.823265 std: 0.00156073700539
Not better: N-3_corpus , score: 0.82214 std: 0.00145487112831
Not better: N-4_corpus , score: 0.816695 std: 0.00155212757208
"""

In [None]:
# We do not keep SHM, as SH is better. 
# We do not keep N as NM is better 
# We do not keep SP due to time

# Round 2: Apply all techniques that contributed positively

In [None]:
# Combined settings for round 2: every technique kept after round 1, applied
# together in one preprocessing pass.
input_=dict(
    hashtag=True,
    segmentation_hash=True,
    hugs_and_kisses=True,
    all_smilies=True,
    numbers=True,
    number_mention=True,
    elongation=True,
    set_to_not=True,
    exclamation=True,
)


In [None]:
# Apply the combined round-2 settings to the untouched corpus in a single pass.
# NOTE(review): `TO` is not bound by the visible import cell (the line
# `import tokenizing as TO` is commented out there) — confirm it is imported.
all_prepr_corpus=TO.preprocess_corpus(full_corpus, **input_)

In [None]:
# Restart the candidate list for round 2 with the fully preprocessed corpus.
corpuses=[all_prepr_corpus]
ns=[2]
for n in ns:
    # NOTE(review): the n-gram corpus is built but never appended (the append
    # was disabled in the original), so `corpuses` holds exactly one entry
    # after this cell.
    corpus=HL.creating_n_grams_corpus(n,all_prepr_corpus)

In [None]:
# Score the round-2 candidates with the same settings as round 1
# (epochs=100, n_folds=3).
accuracies2=[]
stds2=[]

for corpus in corpuses:
    model_score=GV.classify_with_neural_networks(
        neural_nets,
        global_vectors,
        corpus,
        total_training_tweets,
        nr_pos_tweets,
        epochs=100,
        n_folds=3,
    )
    # presumably model_score[0] is (accuracy, std) for the single net in
    # neural_nets — TODO confirm against GV.classify_with_neural_networks
    first_score=model_score[0]
    accuracies2.append(first_score[0])
    stds2.append(first_score[1])

In [None]:
# Report the round-2 result for each evaluated corpus.
for idx in range(len(corpuses)):
    accuracy=accuracies2[idx]
    print('Accuracy:', accuracy, 'std:', stds2[idx], '\n')

In [None]:
# Pick the round-2 corpus with the highest recorded accuracy.
# The original hard-coded corpuses[1], which raises IndexError because the
# round-2 list only holds one entry (the n-gram append is commented out).
# Selecting by score works for any number of evaluated corpuses.
best_index=max(range(len(corpuses)), key=lambda i: accuracies2[i])
best_corpus=corpuses[best_index]

### Dynamic stopword list

In [None]:
# Try every (MaxDf, MinDf) combination for the dynamic stopword list, score the
# stopword-filtered corpus each time, and log the outcome.
# NOTE(review): `TO` is not bound by the visible import cell (the line
# `import tokenizing as TO` is commented out there) — confirm it is imported.
mindfs= [2,3,5,10,20]
maxdfs=[0.8, 0.999]
accuracies_stop=[]
stds_stop=[]
stop_lens=[]
vocabs=[]  # never filled: the vocab append was disabled in the original


for max_ in maxdfs:
    for min_ in mindfs:
        stopwords, vocab=TO.get_dynamic_stopwords(
            best_corpus,
            MinDf=min_,
            MaxDf=max_,
            sublinearTF=True,
            useIDF=False,
        )
        stopword_corpus=TO.remove_stopwords(best_corpus, stopwords)
        model_score=GV.classify_with_neural_networks(
            neural_nets,
            global_vectors,
            stopword_corpus,
            total_training_tweets,
            nr_pos_tweets,
            epochs=100,
            n_folds=3,
        )
        # presumably model_score[0] is (accuracy, std) — TODO confirm
        first_score=model_score[0]
        accuracies_stop.append(first_score[0])
        stds_stop.append(first_score[1])
        n_stopwords=len(stopwords)
        stop_lens.append(n_stopwords)

        print('This is min',min_, 'this is max', max_, 'this is len stopwords', n_stopwords, 'this is acc: ', first_score[0],'\n')


In [None]:
# Show how many stopwords each grid setting produced, in grid order.
print(stop_lens)

In [None]:
# Summarise every stopword-grid run: accuracy, std and stopword count.
for idx in range(len(stds_stop)):
    accuracy=accuracies_stop[idx]
    print('Accuracy:', accuracy, 'std:', stds_stop[idx], 'stopwords:', stop_lens[idx], '\n')

# Testing "best preprocessing" with full dataset: 

As before, in order to make a Kaggle submission!

In [None]:
# Build the corpus for the final check with a hand-picked subset of techniques.
# NOTE(review): `TO` is not bound by the visible import cell (the line
# `import tokenizing as TO` is commented out there) — confirm it is imported.
final_corpus=TO.preprocess_corpus(
    full_corpus,
    segmentation_hash=True,
    hashtag=True,
    hashtag_mention=True,
    set_to_not=True,
    elongation=True,
)


In [None]:
# Score the final corpus; note that this run uses epochs=6 rather than the 100
# used in the earlier rounds.
model_score=GV.classify_with_neural_networks(
    neural_nets,
    global_vectors,
    final_corpus,
    total_training_tweets,
    nr_pos_tweets,
    epochs=6,
    n_folds=3,
)

# Making kaggle submission: 

In [None]:
# Train on the round-2 corpus and produce predictions for the submission.
# presumably GV.get_prediction writes them to `kaggle_name` — TODO confirm
kaggle_name="keggle_glove_test_adam.csv"

pred=GV.get_prediction(
    NN.basic_model_adam,
    global_vectors,
    all_prepr_corpus,
    total_training_tweets,
    nr_pos_tweets,
    kaggle_name,
    epochs=100,
)

In [None]:
# Print the sum of the predictions — presumably a quick sanity check on the
# predicted class balance; TODO confirm the label encoding.
print(sum(pred))

In [2]:
# Load a previously pickled corpus from disk instead of recomputing it
# (judging by the file name, the fully preprocessed corpus with bigrams).
# The context manager closes the file handle, which the original left open.
# NOTE(review): pickle.load can execute arbitrary code — only load files that
# were produced by this project.
with open('full_corpus_prepro_with_n2_and_all_other.pkl', 'rb') as corpus_file:
    corpus = pickle.load(corpus_file)

In [8]:
# Train on the corpus loaded from the pickle file and produce predictions for
# the submission.
# presumably GV.get_prediction writes them to `kaggle_name` — TODO confirm
kaggle_name="keggle_glove_test_adam_full_set_thomas.csv"

pred=GV.get_prediction(
    NN.basic_model_adam,
    global_vectors,
    corpus,
    total_training_tweets,
    nr_pos_tweets,
    kaggle_name,
    epochs=100,
)

Epoch 1/100
Epoch 2/100
   7168/2500000 [..............................] - ETA: 53s - loss: 0.3485 - acc: 0.8436 



Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100


OSError: Unable to open file (unable to open file: name = 'best_neural_model_prediction_model.hdf5', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)