In [1]:
import numpy as np
import gensim 
from gensim.models.word2vec import Word2Vec 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import LinearSVC
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split

from keras.models import Sequential
from keras.layers import Activation, Dense

import time

import helpers as HL
import cleaning as CL

Using TensorFlow backend.


In [2]:
filenames = ['train_pos.txt', 'train_neg.txt'] 

corpus, file_lengths = HL.create_corpus(filenames)

In [5]:
cluster_file_path = '50mpaths2.txt'
cluster_dict = CL.create_dictionary(cluster_file_path)

clusterised_corpus = CL.create_clusterized_corpus(corpus, cluster_dict)

#clusterised_corpus[0]

In [6]:
class IterableCorpus():
    
    def __init__(self, corpus):
        self.corpus = corpus
        
    def __iter__(self):
        for tweet in self.corpus:
            tweet_words = tweet.split()
            yield [word.decode('utf-8') for word in tweet_words]

In [8]:
iterable_corpus = IterableCorpus(clusterised_corpus)

# SMÆKK INN HYPERPARAMETERE HER!!! 
w2v_model = Word2Vec(iterable_corpus, size=100, window=5, min_count=10)

In [9]:
#Training-set numbers
nr_of_positive_training_examples = 100000
nr_of_training_examples = 200000

#Making labels
labels = np.zeros(nr_of_training_examples);
labels[0:nr_of_positive_training_examples]=1;
labels[nr_of_positive_training_examples:nr_of_training_examples]=0; 

In [10]:
x_train, x_test, y_train, y_test = train_test_split(np.array(clusterised_corpus),
                                                    np.array(labels), test_size=0.2)


#### So we have our word2vec model, giving us a representation for each word in the whole corpus. We now need to create a representation of each Tweet in the dataset. 

We will do this with the following steps: 
- Creating a TFIDF model for each word
- Combining the word2vec-vectors for each word, weighting them by their TFIDF score
- Run that shit through a neural network fuck yeaaaaah

In [11]:
#Creating a vectorizer to use for Tf-IDF analysis on words
vectorizer = TfidfVectorizer(
        min_df = 10, # removing word that occure less then 10 times 
        max_df = 1.5, # remove words that are too frequent ( more then 1.5 * number of tweets )
        sublinear_tf=True, # scale the term frequency in logarithmic scale
        use_idf =True
        #stop_words = custom_stop_words # Removing stop-words
    )

In [12]:
corpus_tf_idf = vectorizer.fit_transform(clusterised_corpus)

<200000x1470 sparse matrix of type '<class 'numpy.float64'>'
	with 2741796 stored elements in Compressed Sparse Row format>

In [14]:
tfidf_dict = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))

print('Size: ', len(tfidf_dict))

Size:  1470


#### Now we want to create a vector for each tweet, by combining the word vecs weighted by the tfidf-dict 

In [15]:
def buildWordVector(tokens, size):
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in tokens.split():
        try:       
            word = word.decode('utf-8')
            word_vec = w2v_model[word].reshape((1, size))             
            idf_weighted_vec = word_vec * tfidf_dict[word]
            vec += idf_weighted_vec
            count += 1.
        except KeyError: # handling the case where the token is not
                         # in the corpus. useful for testing.
            continue
    if count != 0:
        vec /= count
    return vec

In [18]:
# NEEDS TO BE THE SAME AS WHEN CREATING WORD2VEC
n_dim = 100

total_score = 0

train_vecs_w2v = np.concatenate([buildWordVector(z, n_dim) for z in x_train])
train_vecs_w2v = scale(train_vecs_w2v)

test_vecs_w2v = np.concatenate([buildWordVector(z, n_dim) for z in x_test])
test_vecs_w2v = scale(test_vecs_w2v)


model = Sequential()
model.add(Dense(32, activation='relu', input_dim=n_dim))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='rmsprop',
      loss='binary_crossentropy',
      metrics=['accuracy'])

model.fit(train_vecs_w2v, y_train, epochs=5, batch_size=32, verbose=1)

score = model.evaluate(test_vecs_w2v, y_test, batch_size=128, verbose=1)

print(score)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
[0.44134737029075621, 0.78385000000000005]


In [20]:
model.metrics_names 

['loss', 'acc']