# Comparing Self- and Pre-trained Word Vectors

## Importing libraries and files: 

In [None]:
%load_ext autoreload
%autoreload 2


import tensorflow as tf
from keras import backend as K

# external imports
import numpy as np

from gensim.scripts.glove2word2vec import glove2word2vec

import glove_solution as GS
import glove_module as GV
import helpers as HL
import neural_nets as NN
import maketextfile as MT

## Creating corpus

In [None]:
# Load the tweet corpus together with the counts needed later for labelling
# and for the cross-validation call.
# NOTE(review): test=True presumably makes the helper include the test tweets
# in the corpus — confirm in helpers.get_corpus.
(
    full_corpus,
    nr_pos_tweets,
    nr_neg_tweets,
    total_training_tweets,
) = HL.get_corpus(test=True)

## Defining the neural net used to compare:

In [None]:
# A single network is used so that every embedding is judged by the same
# classifier; the list form is what GV.classify_with_neural_networks receives.
neural_nets = [
    NN.basic_model_adam,
]

## Creating global vectors from the co-occurrence matrix
This only has to be done once. It assumes that the cooc_full.pkl file has been created and is available.

In [None]:
# Run GS.glove on the same co-occurrence pickle once per embedding size.
# NOTE(review): the second argument is presumably the embedding dimension and
# the third the output file stem — confirm in glove_solution.glove.
for dim in (50, 100, 200):
    GS.glove('cooc_full.pkl', dim, 'embeddings_full{}'.format(dim))


## Creating datafiles of the embeddings: 
This only has to be done once. It assumes that the embedding files and the vocab_full.pkl file are available.

In [None]:
# For each embedding size, combine the saved embedding matrix with the shared
# vocabulary file and write the result to a text file that GV.make_glove
# reads in a later cell.
for dim in (50, 100, 200):
    MT.make_file(
        'embeddings_full{}.npy'.format(dim),
        'vocab_full.pkl',
        'global_vectors_full_unprepro{}.txt'.format(dim),
    )

## Make global vectors of all embeddings: 

In [None]:
# Pre-trained vectors, one object per dimension (50, 100, 200).
global_vectors_pre_50, global_vectors_pre_100, global_vectors_pre_200 = [
    GV.make_glove("gensim_global_vectors_{}dim.txt".format(dim))
    for dim in (50, 100, 200)
]

# Self-trained vectors, loaded from the text files produced by MT.make_file.
global_vectors_self_50, global_vectors_self_100, global_vectors_self_200 = [
    GV.make_glove('global_vectors_full_unprepro{}.txt'.format(dim))
    for dim in (50, 100, 200)
]
                                      

In [None]:

# All embeddings under comparison. The order matters: results are later paired
# by position with the labels in `names` (pre-trained first, then self-trained,
# each by ascending dimension).
global_vectors = [
    global_vectors_pre_50,
    global_vectors_pre_100,
    global_vectors_pre_200,
    global_vectors_self_50,
    global_vectors_self_100,
    global_vectors_self_200,
]

In [None]:
# Display labels for the embeddings, in the same order as `global_vectors`:
# pre-trained first, then self-trained, each by ascending dimension.
names = [
    '{}{}'.format(kind, dim)
    for kind in ('pre', 'self')
    for dim in (50, 100, 200)
]

## Cross-validate with each set of global vectors to compare them

In [None]:
# Per-embedding results, filled in the same order as `global_vectors`.
accs = []
stds = []

for global_vector in global_vectors:
    # 5-fold cross-validation, 100 epochs, same network list for every embedding.
    model_score = GV.classify_with_neural_networks(
        neural_nets,
        global_vector,
        full_corpus,
        total_training_tweets,
        nr_pos_tweets,
        epochs=100,
        n_folds=5,
    )
    # NOTE(review): model_score presumably holds one (accuracy, std) entry per
    # network; with a single network only the first entry is read — confirm
    # against glove_module.classify_with_neural_networks.
    first_net_score = model_score[0]
    accs.append(first_net_score[0])
    stds.append(first_net_score[1])

In [None]:


# Report one line per embedding. `names`, `accs` and `stds` are parallel lists
# (same order as `global_vectors`), so iterate them in lockstep rather than by
# index.
for name, acc, std in zip(names, accs, stds):
    print('Global vectors: ', name, 'Accuracy: ', acc, '+- :', std, '\n')