# Determining Best Combination of Preprocessing Techniques on Test Set

## Loading Libraries

In [1]:
%load_ext autoreload
%autoreload 2

# internal imports
import helpers as HL
import glove_module as GV
import neural_nets as NN
import validation_and_prediction as VP
import tokenizing as TO

  return f(*args, **kwds)
Using TensorFlow backend.


## Create the word embeddings using the created gensim-.txt file.

In [2]:
# Pick one embedding dimension: the higher the dimension, the better the
# result, but the longer the computation time.
global_vectors=HL.get_global_vectors(200)

## Creating corpus:
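In addition to the actual corpus, some additional information is needed: the number of positive tweets, the number of negative tweets, and the total number of training tweets.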

In [3]:
# Load the corpus (test=True) together with the tweet counts returned
# alongside it; some of the counts are passed to the cross-validation calls.
(
    full_corpus,
    nr_pos_tweets,
    nr_neg_tweets,
    total_training_tweets,
) = HL.get_corpus(test=True)

## Picking the neural net
At this stage, we want to use the simple neural net

In [4]:
# List of nets to evaluate; it is passed as the first argument to
# VP.classify_with_neural_networks in the cells below.
neural_nets=[NN.basic_model_adam]

## Initializing variables to apply all preprocessing techniques:

In [5]:
# Every corpus variant is collected in this list; index 0 holds the
# untouched original corpus.
corpuses = [full_corpus]

In [6]:
# Labels for the corpuses, listed in the order in which the corpuses are built:
# the original, one per preprocessing technique, then the 2-, 3- and 4-gram variants.
corpus_tags = [
    'original',
    'SH', 'SHM', 'H', 'HK', 'PS',
    'NS_',  # yields the existing label 'NS__corpus' unchanged
    'OS', 'N', 'NM', 'ST', 'SP', 'E', 'SN', 'RS', 'EX',
    'N-2', 'N-3', 'N-4',
]
names = [tag + '_corpus' for tag in corpus_tags]

In [7]:
# Keyword arguments for TO.preprocess_corpus, one dict per technique.
# The order matters: entry k here produces the corpus labelled names[k + 1].
inputs = [
    dict(hashtag=True, segmentation_hash=True),
    dict(hashtag=True, segmentation_hash=True, hashtag_mention=True),
    dict(hearts=True),
    dict(hugs_and_kisses=True),
    dict(pos_smilies=True),
    dict(neg_smilies=True),
    dict(other_smilies=True),
    dict(numbers=True),
    dict(numbers=True, number_mention=True),
    dict(stemming=True),
    # Warning: spelling correction takes approx. 2.5 h on the test set;
    # it is recommended to keep it switched off.
    dict(spelling=True),
    dict(elongation=True),
    dict(set_to_not=True),
    dict(remove_signs=True),
    dict(exclamation=True),
]

## Applying all preprocessing techniques to the original corpus: 

In [8]:

# Build one preprocessed corpus per technique, each derived from the
# untouched original corpus.
preprocessed_corpuses = []
for input_ in inputs:
    corpus = TO.preprocess_corpus(full_corpus, **input_)
    preprocessed_corpuses.append(corpus)

# Store the results in slots 1..len(inputs) in a single step instead of
# appending one by one. Re-running this cell (for example after an interrupt)
# then replaces the earlier results rather than adding duplicates, so
# `corpuses` stays aligned with `names`. An interrupted run leaves `corpuses`
# unchanged.
corpuses[1:1 + len(inputs)] = preprocessed_corpuses
        

Reading twitter - 1grams ...
Reading twitter - 2grams ...


KeyboardInterrupt: 

In [9]:
# n-gram orders to evaluate.
ns = [2, 3, 4]

# Build one n-gram corpus per order from the untouched original corpus.
ngram_corpuses = []
for n in ns:
    corpus = TO.creating_n_grams_corpus(n, full_corpus)
    ngram_corpuses.append(corpus)

# The n-gram corpuses belong after the original corpus and the len(inputs)
# preprocessed corpuses. Assigning to that tail (instead of appending) makes
# the cell safe to re-run: earlier n-gram results are replaced, not duplicated.
# If `corpuses` is still shorter than 1 + len(inputs), the slice assignment
# simply adds the new corpuses at the end, exactly as appending would.
corpuses[1 + len(inputs):] = ngram_corpuses

## Testing all preprocessing techniques: 

In [11]:
# Cross-validation score and standard deviation per corpus, in `corpuses` order.
accuracies = []
stds = []

for corpus in corpuses:
    model_score = VP.classify_with_neural_networks(
        neural_nets,
        global_vectors,
        corpus,
        total_training_tweets,
        nr_pos_tweets,
        epochs=100,
        n_folds=3,
    )
    # Only the first entry of the result is used: it holds the
    # (accuracy, std) pair that is recorded for this corpus.
    first_net_result = model_score[0]
    accuracies.append(first_net_result[0])
    stds.append(first_net_result[1])

tweets processed: 0  of total number of tweets: 200000
tweets processed: 50000  of total number of tweets: 200000
tweets processed: 100000  of total number of tweets: 200000
tweets processed: 150000  of total number of tweets: 200000
Train on 133332 samples, validate on 66668 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 00012: early stopping
Train on 133334 samples, validate on 66666 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 00010: early stopping
Train on 133334 samples, validate on 66666 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100


Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 00009: early stopping
Model:  basic_model_adam
0.81% (+/- 0.01%)
tweets processed: 0  of total number of tweets: 200000
tweets processed: 50000  of total number of tweets: 200000
tweets processed: 100000  of total number of tweets: 200000


TypeError: ufunc 'add' output (typecode 'O') could not be coerced to provided output parameter (typecode 'd') according to the casting rule ''same_kind''

## Determine which preprocessing techniques improved the accuracy, and keep them:

In [None]:
# Corpuses (with their labels, stds and accuracies) that scored at least as
# well as the original corpus.
corpuses_1, names_1, stds_1, acc_1 = [], [], [], []

# Index 0 is the original corpus and serves as the baseline.
baseline_accuracy = accuracies[0]
print('The original corpus gave accuracy of: ', baseline_accuracy, 'std:', stds[0], '\n')

for i in range(1, len(accuracies)):
    at_least_as_good = accuracies[i] >= baseline_accuracy
    if not at_least_as_good:
        print('Not better:', names[i], ', score:', accuracies[i], 'std:', stds[i])
        continue
    # Keep everything belonging to a technique that matched or beat the baseline.
    corpuses_1.append(corpuses[i])
    names_1.append(names[i])
    stds_1.append(stds[i])
    acc_1.append(accuracies[i])
    print('IMPROVED:  ', names[i], ', score:', accuracies[i], 'std:', stds[i])
        

In [None]:
# We do not keep SH, as SHM is better.
# We do not keep N, as NM is better.
# We do not keep SP because of its running time (158.26 min).
# We do not keep techniques that improved the accuracy by less than 0.05 percentage points.

## Apply all techniques that contributed positively

In [None]:
# Techniques that are kept; every one of them is switched on in the keyword
# arguments handed to TO.preprocess_corpus.
kept_techniques = [
    'hashtag',
    'segmentation_hash',
    'hashtag_mention',
    'hearts',
    'hugs_and_kisses',
    'elongation',
    'set_to_not',
]
input_ = dict.fromkeys(kept_techniques, True)


In [None]:
# Apply the techniques we decided to keep to the original corpus.
best_prepr_corpus = TO.preprocess_corpus(full_corpus, **input_)

# Build the 2-gram corpus on top of the preprocessed corpus. The n-gram helper
# is called through TO, the same module the 2-/3-/4-gram corpuses of the
# comparison were built with, so both steps use one implementation.
best_corpus = TO.creating_n_grams_corpus(2, best_prepr_corpus)

In [None]:
# Cross-validate on the best preprocessed corpus. `best_corpus` is passed
# explicitly: the name `corpus` is only a leftover loop variable from earlier
# cells and does not hold the corpus built from the kept techniques.
model_score = VP.classify_with_neural_networks(
    neural_nets,
    global_vectors,
    best_corpus,
    total_training_tweets,
    nr_pos_tweets,
    epochs=100,
    n_folds=3,
)

In [None]:
print('Accuracy:',  model_score[0][0], 'std:',model_score[0][1],'\n' )

## Dynamic stopword list

In [None]:
# MinDf / MaxDf values to try for the dynamic stopword list.
mindfs = [0, 2, 3, 5, 10, 20, 40, 60, 100, 140, 200]
maxdfs = [0.8]

# One entry per (max_, min_) combination, appended in loop order.
accuracies_stop, stds_stop, stop_lens, vocabs = [], [], [], []

for max_ in maxdfs:
    for min_ in mindfs:
        # Derive the stopword list for this setting and strip it from the corpus.
        stopwords, vocab = TO.get_dynamic_stopwords(
            best_corpus, MinDf=min_, MaxDf=max_
        )
        stopword_corpus = TO.remove_stopwords(best_corpus, stopwords)

        # Cross-validate on the reduced corpus and record the outcome.
        model_score = VP.classify_with_neural_networks(
            neural_nets,
            global_vectors,
            stopword_corpus,
            total_training_tweets,
            nr_pos_tweets,
            epochs=100,
            n_folds=3,
        )
        first_net_result = model_score[0]
        accuracies_stop.append(first_net_result[0])
        stds_stop.append(first_net_result[1])
        stop_lens.append(len(stopwords))
        vocabs.append(vocab)


In [None]:
# The results were appended with the MinDf values as the inner loop, so result
# i belongs to mindfs[i % len(mindfs)]. Indexing mindfs[i] directly would raise
# IndexError as soon as maxdfs holds more than one value.
for i in range(len(stop_lens)):
    print('min', mindfs[i % len(mindfs)], 'accuracy', accuracies_stop[i], 'std', stds_stop[i], '\n')

# Sanity check: stopword count and vocabulary size are printed with their sum.
for i in range(len(stop_lens)):
    print('number of stop words', stop_lens[i], 'lenght of vocabolary', len(vocabs[i]), 'sum for checking', stop_lens[i] + len(vocabs[i]), '\n')

## Testing the best combination with the simple and the complex neural net

In [None]:
# Build the final corpus: compute the dynamic stopword list with MinDf=5 and
# MaxDf=0.8, then remove those stopwords from the best corpus.
stopwords, vocab = TO.get_dynamic_stopwords(
    best_corpus,
    MinDf=5,
    MaxDf=0.8,
)
final_corpus = TO.remove_stopwords(best_corpus, stopwords)

In [None]:
# VP.classify_with_neural_networks receives a list of nets in the other calls
# of this notebook, and its result is indexed per net (model_score[0]), so the
# single net is wrapped in a list here as well.
model_score = VP.classify_with_neural_networks(
    [NN.basic_model_adam],
    global_vectors,
    final_corpus,
    total_training_tweets,
    nr_pos_tweets,
    epochs=100,
    n_folds=3,
)

print('Applying the best combination of preprocessing, using pre-trained global vectors with 200 dimensions, an cros-validatino accuracy of', model_score[0][0],'+-',model_score[0][1], 'was achieved using the simple neural net')

In [None]:
# VP.classify_with_neural_networks receives a list of nets in the other calls
# of this notebook, and its result is indexed per net (model_score[0]), so the
# single net is wrapped in a list here as well.
model_score = VP.classify_with_neural_networks(
    [NN.complex_model],
    global_vectors,
    final_corpus,
    total_training_tweets,
    nr_pos_tweets,
    epochs=100,
    n_folds=3,
)

print('Applying the best combination of preprocessing, using pre-trained global vectors with 200 dimensions, an cros-validatino accuracy of', model_score[0][0],'+-',model_score[0][1], 'was achieved using the compelx neural net')