In [33]:
def buildWordVector(tokens, size, model):
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in tokens.split():
        try:       
            word = word.decode('utf-8')
            word_vec = model[word].reshape((1, size))             
            idf_weighted_vec = word_vec * tfidf_dict[word]
            vec += idf_weighted_vec
            count += 1.
        except KeyError: # handling the case where the token is not
                         # in the corpus. useful for testing.
            continue
    if count != 0:
        vec /= count
    return vec

In [18]:
class IterableCorpus():
    
    def __init__(self, corpus):
        self.corpus = corpus
        
    def __iter__(self):
        for tweet in self.corpus:
            tweet_words = tweet.split()
            yield [word.decode('utf-8') for word in tweet_words]

In [97]:
import numpy as np
import gensim 
from gensim.models.word2vec import Word2Vec 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import LinearSVC
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict

from keras.wrappers.scikit_learn import KerasRegressor

from keras.models import Sequential
from keras.layers import Activation, Dense, Conv1D, Embedding, LSTM

from keras.layers.core import Dense, Dropout, Flatten

from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint

from keras.layers.convolutional import Conv1D

from keras.constraints import maxnorm
from keras.optimizers import SGD, Adam

import numpy as np
import tensorflow as tf
import random as rn

import time

import helpers as HL
import cleaning as CL



# MEANING doc-vecs not necessary with gensim word2vec bcof gensim distance mesaure sim_score = cosing distance

#### Global variables [ choose sub_set or full_set ]

In [10]:
#Global variables
N_DIMENTIONS = 200 
test_set_tweets = 10000

#FOR TRAINING_SET
corpus_filenames = ['train_pos.txt', 'train_neg.txt','test_data.txt'] 
nr_pos_tweets = 100000
nr_neg_tweets = 100000
total_training_tweets = 200000

#FOR FULL SET
#corpus_filenames = ['train_pos_full.txt', 'train_neg_full.txt','test_data.txt'] 
#nr_pos_tweets = 1250000
#nr_neg_tweets = 1250000
#total_training_tweets = 2500000

#### Create corpus from files

In [12]:
full_corpus, corpus_file_lengths = HL.create_corpus(corpus_filenames)

print("Length full corpus", len(full_corpus))
print("File lengths:", corpus_file_lengths)

Length full corpus 210000
File lengths: [100000, 100000, 10000]


#### Creating clusterizing dictionary

In [13]:
cluster_file_path = '50mpaths2.txt'
cluster_dict = CL.create_dictionary(cluster_file_path)

print("Len cluster dict:", len(cluster_dict))

Len cluster dict: 216856


#### Clusterizing corpus 

In practice quite equal to stemming the corpus 

In [15]:
clusterised_full_corpus = CL.create_clusterized_corpus(full_corpus, cluster_dict)

print("Len clusterised training corpus: ", len(clusterised_full_corpus))

Len clusterised training corpus:  210000


#### N-grams

In [16]:
ngram_corpus = HL.creating_n_grams_cropus(corpus=clusterised_full_corpus, n_gram=2) #2GramsForLife 

#### Creating vectorizer to create idf_weighting of each word in the corpus  

In [26]:
vectorizer = TfidfVectorizer(
        sublinear_tf=True, # scale the term frequency in logarithmic scale
        use_idf =True
    )

corpus_tf_idf = vectorizer.fit_transform(ngram_corpus)

tfidf_dict = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_)) 

#### Creating word2vec for all tweets, train and test

In [41]:
#Making the corpus iterable to be able to feed it to word2vec from gensim
iterable_full_corpus = IterableCorpus(ngram_corpus)

w2v_model = Word2Vec(iterable_full_corpus, size=N_DIMENTIONS, window=5, min_count=10, workers=4)

print("W2v model:", w2v_model)

W2v model: Word2Vec(vocab=44487, size=200, alpha=0.025)


#### Using a different form of the word2vec to use a LOT less RAM when running

In [28]:
word_vectors = w2v_model.wv
del w2v_model

#### Creating labels for the training files. Used to perform validation of the predictions

In [21]:
#Making labels
labels = np.zeros(total_training_tweets);
labels[0:nr_pos_tweets]=1;
labels[nr_pos_tweets:total_training_tweets]=0; 

print("labels shape: ", labels.shape)

labels shape:  (200000,)


#### Splitting the corpus into train and prediction - parts

We're done training the word2vec, so all "common" operations are finished

In [30]:
# SPLITTING THE W2V model into training and predict
train_clusterised_corpus = clusterised_full_corpus[:total_training_tweets:]
predict_clusterised_corpus = clusterised_full_corpus[total_training_tweets::]

print(len(train_clusterised_corpus))
print(len(predict_clusterised_corpus))

200000
10000


#### Creating document wecs for training and test-set

In other words a vector for each tweet, representing the content of that tweet. Each word in each tweet is weighted by its idf-score found in the 'tfidf_dict'

In [35]:
train_document_vecs = np.concatenate([buildWordVector(z, N_DIMENTIONS, word_vectors) for z in train_clusterised_corpus])
train_document_vecs = scale(train_document_vecs)

In [36]:
# ONLY FOR KAGGLE
test_document_vecs = np.concatenate([buildWordVector(z, N_DIMENTIONS, word_vectors) for z in predict_clusterised_corpus])
test_document_vecs = scale(test_document_vecs)

In [37]:
print("Train w2v shape:",train_document_vecs.shape)
print("Train w2v shape:",test_document_vecs.shape)

Train w2v shape: (200000, 200)
Train w2v shape: (10000, 200)


### Running the neural net classifier

In [39]:
#TING FOR Å SIKRE REPRODUSERBARHET ( ikke alt er nødv. nødvendig )

import os
os.environ['PYTHONHASHSEED'] = '0'
rn.seed(12345) # NO IDEA WHAT THIS DOES

seed = 7
np.random.seed(seed)

#### Defining some neural net models

In [95]:
def basic_model():
    model = Sequential()
    model.add(Dense(100, input_dim=N_DIMENTIONS, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

def basic_model_adam():
    model = Sequential()
    model.add(Dense(100, input_dim=N_DIMENTIONS, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

def wide_model():
    model = Sequential()
    model.add(Dense(150, input_dim=N_DIMENTIONS, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

def deep_1_model():
    model = Sequential()
    model.add(Dense(100, input_dim=N_DIMENTIONS, kernel_initializer='normal', activation='relu'))
    model.add(Dense(60, input_dim=N_DIMENTIONS, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

def deep_2_model():
    model = Sequential()
    model.add(Dense(100, input_dim=N_DIMENTIONS, kernel_initializer='normal', activation='relu'))
    model.add(Dense(60, input_dim=N_DIMENTIONS, kernel_initializer='normal', activation='relu'))
    model.add(Dense(30, input_dim=N_DIMENTIONS, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

def deep_2_model():
    model = Sequential()
    model.add(Dense(100, input_dim=N_DIMENTIONS, kernel_initializer='normal', activation='relu'))
    model.add(Dense(60, input_dim=N_DIMENTIONS, kernel_initializer='normal', activation='relu'))
    model.add(Dense(30, input_dim=N_DIMENTIONS, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model


def recurrent_model():
    # Start neural network
    model = Sequential()

    # Add an embedding layer
    model.add(Embedding(input_dim=N_DIMENTIONS, output_dim=128))

    # Add a long short-term memory layer with 128 units
    model.add(LSTM(units=128))

    # Add fully connected layer with a sigmoid activation function
    model.add(Dense(units=1, activation='sigmoid'))
    
    # Compile neural network
    model.compile(loss='binary_crossentropy', # Cross-entropy
                optimizer='Adam', # Adam optimization
                metrics=['accuracy']) # Accuracy performance metric

    return model

def convolutional_model():
    
    model = Sequential()
    model.add(Conv1D(32,  activation='elu', padding='same', input_shape=(133332, N_DIMENTIONS)))
    model.add(Conv1D(32,  activation='elu', padding='same'))
    model.add(Conv1D(32,  activation='elu', padding='same'))
    model.add(Conv1D(32,  activation='elu', padding='same'))
    model.add(Dropout(0.25))
    model.add(Conv1D(32,  activation='elu', padding='same'))
    model.add(Conv1D(32,  activation='elu', padding='same'))
    model.add(Conv1D(32,  activation='elu', padding='same'))
    model.add(Conv1D(32,  activation='elu', padding='same'))
    model.add(Dropout(0.25))
    model.add(Flatten())
    model.add(Dense(256, activation='tanh'))
    model.add(Dense(256, activation='tanh'))
    model.add(Dropout(0.5))
    model.add(Dense(2, activation='softmax'))

    model.compile(loss='binary_crossentropy', optimizer=Adam(lr=0.0001, decay=1e-6), metrics=['accuracy'])
    
    return model

#Convolution 
#Prøve netverk uten noen av pre-processing
#Recurrent?
#LSTM network? Long short term memory? 
#Dropout? 

#### Running some nets 

In [100]:
def run_k_fold(models, X, Y, epochs, n_folds, seed):
    
    for neural_model in models:
        
        model_name = neural_model.__name__
        
        model = neural_model()
        
        start = time.time()

        kfold = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed)
        cv_scores = []
        
        for train, test in kfold.split(X, Y):
            
            early_stopping = EarlyStopping(monitor='loss', patience=3)
            
            model.fit(X[train], Y[train], epochs=epochs, batch_size=1024, verbose=0, callbacks=[early_stopping])

            scores = model.evaluate(X[test], Y[test], verbose=0)

            cv_scores.append(scores[1] * 100)

        print("Model: ", model_name)
        print("%.2f%% (+/- %.2f%%)" % (np.mean(cv_scores), np.std(cv_scores)))
        print("Time taken: ", (time.time() - start) / 60, "\n")

In [67]:
models = [basic_model, basic_model_adam, wide_model, deep_1_model, deep_2_model]

run_k_fold(models, train_document_vecs, labels, epochs=10, n_folds=3, seed=7)

Model:  basic_model
80.45% (+/- 0.49%)
Time taken:  0.7064207275708516 

Model:  basic_model_adam
80.86% (+/- 0.52%)
Time taken:  1.143131975332896 

Model:  wide_model
80.93% (+/- 0.73%)
Time taken:  1.0753306984901427 

Model:  deep_1_model
80.13% (+/- 1.16%)
Time taken:  1.1953313549359639 

Model:  deep_2_model
79.79% (+/- 0.87%)
Time taken:  1.1410720268885295 



In [102]:
#NOT WORKING YET
#models = [recurrent_model]

#run_k_fold(models, train_document_vecs, labels, epochs=3, n_folds=3, seed=7)

### Stuff for making keggle predictions! Probably needs updating, not working yet! Look at old_stuff to see how to make a prediction

In [None]:
predictions = model.predict(test_vecs_w2v)

print(predictions)

In [None]:
rounded = [-1 if round(x[0])==0 else 1 for x in predictions]

print(rounded)

In [None]:
ids = list(range(1,test_set_tweets+1))
y_pred = rounded
name = "keggle_submission_neural.csv"

HL.create_csv_submission(ids, y_pred, name)

# OLD STUFF, PROBABLY NOT TO BE USED

In [None]:
#KAN BRUKES TIL NÅR MAN SKAL GJØRE EN MODEL FIT, NÅR MAN HAR FUNNET RIKTIG MODELL Å BRUKE OVER

# Smaller batch equalls longer run times, but less epochs for convergence 
# DENNE GA 80% PÅ KAGGLE

#from keras.callbacks import EarlyStopping
#from keras.callbacks import ModelCheckpoint

#early_stopping = EarlyStopping(monitor='loss', patience=2)
#checkpointer = ModelCheckpoint(filepath='tmp/saved_weights.hdf5', verbose=1, save_best_only=True)

#model_4 = Sequential()
#model_4.add(Dropout(0.05, input_shape=(N_DIMENTIONS,)))
#model_4.add(Dense(100, activation='relu', input_dim=N_DIMENTIONS))
#model_4.add(Dense(60, activation='relu'))
#model_4.add(Dense(30, activation='relu'))
#model_4.add(Dropout(0, input_shape=(N_DIMENTIONS,)))
#model_4.add(Dense(1, activation='sigmoid'))

#sgd = SGD(lr=0.1, momentum=0.9, decay=0.0, nesterov=False)

#model_4.compile(optimizer='rmsprop',
#      loss='binary_crossentropy',
#      metrics=['accuracy'])

#model_4.fit(training_vecs_w2v, labels, epochs=30, batch_size=1024, verbose=1, callbacks=[early_stopping, checkpointer], validation_split=0.15)

In [None]:
#from keras.models import load_model

#best_model = load_model('tmp/saved_weights.hdf5')