In [1]:
def buildWordVector(tokens, size, model):
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in tokens.split():
        try:       
            word = word.decode('utf-8')
            word_vec = model[word].reshape((1, size))             
            idf_weighted_vec = word_vec * tfidf_dict[word]
            vec += idf_weighted_vec
            count += 1.
        except KeyError: # handling the case where the token is not
                         # in the corpus. useful for testing.
            continue
    if count != 0:
        vec /= count
    return vec

In [2]:
class IterableCorpus():
    
    def __init__(self, corpus):
        self.corpus = corpus
        
    def __iter__(self):
        for tweet in self.corpus:
            tweet_words = tweet.split()
            yield [word.decode('utf-8') for word in tweet_words]

In [3]:
import numpy as np
import gensim 
from gensim.models.word2vec import Word2Vec 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import LinearSVC
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict

from keras.wrappers.scikit_learn import KerasRegressor

from keras.models import Sequential
from keras.layers import Activation, Dense, Conv1D, Embedding, LSTM

from keras.layers.core import Dense, Dropout, Flatten

from keras.callbacks import EarlyStopping
from keras.callbacks import ModelCheckpoint

from keras.layers.convolutional import Conv1D

from keras.constraints import maxnorm
from keras.optimizers import SGD, Adam

import numpy as np
import tensorflow as tf
import random as rn

import time

import helpers as HL
import cleaning as CL
import preprocessing as PP

Using TensorFlow backend.


#### Global variables [ choose sub_set or full_set ]

In [4]:
#Global variables
N_DIMENTIONS = 200 
test_set_tweets = 10000

#FOR TRAINING_SET
corpus_filenames = ['train_pos.txt', 'train_neg.txt','test_data.txt'] 
nr_pos_tweets = 100000
nr_neg_tweets = 100000
total_training_tweets = 200000

#FOR FULL SET
#corpus_filenames = ['train_pos_full.txt', 'train_neg_full.txt','test_data.txt'] 
#nr_pos_tweets = 1250000
#nr_neg_tweets = 1250000
#total_training_tweets = 2500000

#### Create corpus from files

In [8]:
full_corpus, corpus_file_lengths = HL.create_corpus(corpus_filenames)

print("Length full corpus", len(full_corpus))
print("File lengths:", corpus_file_lengths)

Length full corpus 210000
File lengths: [100000, 100000, 10000]


#### Creating clusterizing dictionary

In [9]:
cluster_file_path = '50mpaths2.txt'
cluster_dict = CL.create_dictionary(cluster_file_path)

print("Len cluster dict:", len(cluster_dict))

Len cluster dict: 216856


### Preprocessing

In [10]:
# Clusterizing corpus 
# clusterised_full_corpus = CL.create_clusterized_corpus(full_corpus, cluster_dict)

corpus = full_corpus

#Adding features
corpus = PP.add_features(corpus)

#Stemming words
corpus = PP.stem_words(corpus)

#### Creating vectorizer to create idf_weighting of each word in the corpus  

In [11]:
my_stopword_list = ['and','to','the','of','in','there']

vectorizer = TfidfVectorizer(
    min_df = 10, # removing word that occure less then 10 times 
    max_df = 0.3, # remove words that are too frequent ( more then 0.8 * number of files )
    sublinear_tf=True, # scale the term frequency in logarithmic scale
    max_features = 5000,
    use_idf = True, 
    stop_words = my_stopword_list,
    ngram_range=(1,2)
)

corpus_tf_idf = vectorizer.fit_transform(corpus)

tfidf_dict = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_)) 

#### Creating word2vec for all tweets, train and test

In [12]:
#Making the corpus iterable to be able to feed it to word2vec from gensim
iterable_full_corpus = IterableCorpus(corpus)

w2v_model = Word2Vec(iterable_full_corpus, size=N_DIMENTIONS, window=5, min_count=10, workers=4)

print("W2v model:", w2v_model)

W2v model: Word2Vec(vocab=10448, size=200, alpha=0.025)


#### Using a different form of the word2vec to use a LOT less RAM when running

In [13]:
word_vectors = w2v_model.wv
del w2v_model

#### Creating labels for the training files. Used to perform validation of the predictions

In [14]:
#Making labels
labels = HL.create_labels(nr_pos_tweets, nr_pos_tweets)

#### Splitting the corpus into train and prediction - parts

We're done training the word2vec, so all "common" operations are finished

In [15]:
# SPLITTING THE W2V model into training and predict
train_clusterised_corpus = corpus[:total_training_tweets:]
predict_clusterised_corpus = corpus[total_training_tweets::]

print(len(train_clusterised_corpus))
print(len(predict_clusterised_corpus))

200000
10000


#### Creating document wecs for training and test-set

In other words a vector for each tweet, representing the content of that tweet. Each word in each tweet is weighted by its idf-score found in the 'tfidf_dict'

In [16]:
train_document_vecs = np.concatenate([buildWordVector(z, N_DIMENTIONS, word_vectors) for z in train_clusterised_corpus])
train_document_vecs = scale(train_document_vecs)

In [None]:
# ONLY FOR KAGGLE
test_document_vecs = np.concatenate([buildWordVector(z, N_DIMENTIONS, word_vectors) for z in predict_clusterised_corpus])
test_document_vecs = scale(test_document_vecs)

In [None]:
print("Train w2v shape:",train_document_vecs.shape)
print("Train w2v shape:",test_document_vecs.shape)

### Running the neural net classifier

In [17]:
#TING FOR Å SIKRE REPRODUSERBARHET ( ikke alt er nødv. nødvendig )

import os
os.environ['PYTHONHASHSEED'] = '0'
rn.seed(12345) # NO IDEA WHAT THIS DOES

seed = 7
np.random.seed(seed)

#### Defining some neural net models

In [25]:
def basic_model():
    model = Sequential()
    model.add(Dense(100, input_dim=N_DIMENTIONS, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

def basic_model_adam():
    model = Sequential()
    model.add(Dense(100, input_dim=N_DIMENTIONS, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

def wide_model():
    model = Sequential()
    model.add(Dense(150, input_dim=N_DIMENTIONS, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

def deep_1_model():
    model = Sequential()
    model.add(Dense(100, input_dim=N_DIMENTIONS, kernel_initializer='normal', activation='relu'))
    model.add(Dense(60, input_dim=N_DIMENTIONS, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

def deep_2_model():
    model = Sequential()
    model.add(Dense(100, input_dim=N_DIMENTIONS, kernel_initializer='normal', activation='relu'))
    model.add(Dense(60, input_dim=N_DIMENTIONS, kernel_initializer='normal', activation='relu'))
    model.add(Dense(30, input_dim=N_DIMENTIONS, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

def deep_wide():
    model = Sequential()
    model.add(Dense(150, input_dim=N_DIMENTIONS, kernel_initializer='normal', activation='relu'))
    model.add(Dense(150, input_dim=N_DIMENTIONS, kernel_initializer='normal', activation='relu'))
    model.add(Dense(150, input_dim=N_DIMENTIONS, kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal', activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model


#### Running some nets 

In [19]:
def run_k_fold(models, X, Y, epochs, n_folds, seed):
    
    for neural_model in models:
        
        model_name = neural_model.__name__
        
        model = neural_model()
        
        start = time.time()

        kfold = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed)
        cv_scores = []
        
        for train, test in kfold.split(X, Y):
            
            early_stopping = EarlyStopping(monitor='loss', patience=3)
            
            model.fit(X[train], Y[train], epochs=epochs, batch_size=1024, verbose=0, callbacks=[early_stopping])

            scores = model.evaluate(X[test], Y[test], verbose=0)

            cv_scores.append(scores[1] * 100)

        print("Model: ", model_name)
        print("%.2f%% (+/- %.2f%%)" % (np.mean(cv_scores), np.std(cv_scores)))
        print("Time taken: ", (time.time() - start) / 60, "\n")

In [20]:
models = [basic_model, basic_model_adam, wide_model, deep_1_model, deep_2_model]

run_k_fold(models, train_document_vecs, labels, epochs=10, n_folds=3, seed=7)

Model:  basic_model
79.19% (+/- 0.63%)
Time taken:  1.8317458709081014 

Model:  basic_model_adam
79.23% (+/- 0.45%)
Time taken:  1.8012408852577209 

Model:  wide_model
79.43% (+/- 0.69%)
Time taken:  1.6502403140068054 

Model:  deep_1_model
79.61% (+/- 0.65%)
Time taken:  1.7633070906003316 

Model:  deep_2_model
79.53% (+/- 0.75%)
Time taken:  1.899647319316864 



In [24]:
models=[deep_wide]
run_k_fold(models, train_document_vecs, labels, epochs=30, n_folds=2, seed=7)

Model:  deep_wide
77.28% (+/- 0.79%)
Time taken:  2.010883013407389 



### Stuff for making keggle predictions! choose the best model to be 'model', then run .predict! :-) 

In [None]:
predictions = model.predict(test_vecs_w2v)

print(predictions)

In [None]:
rounded = [-1 if round(x[0])==0 else 1 for x in predictions]

print(rounded)

In [None]:
ids = list(range(1,test_set_tweets+1))
y_pred = rounded
name = "keggle_submission_neural.csv"

HL.create_csv_submission(ids, y_pred, name)

# OLD STUFF, PROBABLY NOT TO BE USED

In [None]:
#KAN BRUKES TIL NÅR MAN SKAL GJØRE EN MODEL FIT, NÅR MAN HAR FUNNET RIKTIG MODELL Å BRUKE OVER

# Smaller batch equalls longer run times, but less epochs for convergence 
# DENNE GA 80% PÅ KAGGLE

#from keras.callbacks import EarlyStopping
#from keras.callbacks import ModelCheckpoint

#early_stopping = EarlyStopping(monitor='loss', patience=2)
#checkpointer = ModelCheckpoint(filepath='tmp/saved_weights.hdf5', verbose=1, save_best_only=True)

#model_4 = Sequential()
#model_4.add(Dropout(0.05, input_shape=(N_DIMENTIONS,)))
#model_4.add(Dense(100, activation='relu', input_dim=N_DIMENTIONS))
#model_4.add(Dense(60, activation='relu'))
#model_4.add(Dense(30, activation='relu'))
#model_4.add(Dropout(0, input_shape=(N_DIMENTIONS,)))
#model_4.add(Dense(1, activation='sigmoid'))

#sgd = SGD(lr=0.1, momentum=0.9, decay=0.0, nesterov=False)

#model_4.compile(optimizer='rmsprop',
#      loss='binary_crossentropy',
#      metrics=['accuracy'])

#model_4.fit(training_vecs_w2v, labels, epochs=30, batch_size=1024, verbose=1, callbacks=[early_stopping, checkpointer], validation_split=0.15)

In [None]:
#from keras.models import load_model

#best_model = load_model('tmp/saved_weights.hdf5')