is_duplicate - the target variable, set to 1 if question1 and question2 have essentially the same meaning, and 0 otherwise.

Data source: https://www.kaggle.com/c/quora-question-pairs/data

!brew install wget
!wget -c “https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz”

In [1]:
########################################
## import packages
########################################
import os
import re
import csv
import codecs
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from string import punctuation
from gensim.models import KeyedVectors
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers.merge import concatenate
from keras.models import Model
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint
import sys

Using TensorFlow backend.


In [248]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [256]:
########################################
## set directories and parameters
########################################
BASE_DIR = '/Users/omer/Downloads/RNN_ex/'
EMBEDDING_FILE = BASE_DIR + 'GoogleNews-vectors-negative300.bin'
TRAIN_DATA_FILE = BASE_DIR + 'train.csv'
TEST_DATA_FILE = BASE_DIR + 'test.csv'
MAX_SEQUENCE_LENGTH = 15
MAX_NB_WORDS = 200000
EMBEDDING_DIM = 300
VALIDATION_SPLIT = 0.1

In [257]:
num_lstm = np.random.randint(175, 275)
num_dense = np.random.randint(100, 150)
rate_drop_lstm = 0.15 + np.random.rand() * 0.25
rate_drop_dense = 0.15 + np.random.rand() * 0.25

In [258]:
act = 'relu'
re_weight = True # whether to re-weight classes to fit the 17.5% share in test set

STAMP = 'lstm_%d_%d_%.2f_%.2f'%(num_lstm, num_dense, rate_drop_lstm, \
        rate_drop_dense)

In [259]:
########################################
## index word vectors
########################################
print('Indexing word vectors')
word2vec = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, \
        binary=True)
print('Found %s word vectors of word2vec' % len(word2vec.vocab))

Indexing word vectors
Found 3000000 word vectors of word2vec


### Let's see how to access word vectors

In [260]:
## Dictionary of all words from GoogleNews
word2vec.vocab

{'</s>': <gensim.models.keyedvectors.Vocab at 0x1b1ec58860>,
 'in': <gensim.models.keyedvectors.Vocab at 0x1b1ec58b00>,
 'for': <gensim.models.keyedvectors.Vocab at 0x1b1ec58be0>,
 'that': <gensim.models.keyedvectors.Vocab at 0x1b1ec58cc0>,
 'is': <gensim.models.keyedvectors.Vocab at 0x1b1ec58da0>,
 'on': <gensim.models.keyedvectors.Vocab at 0x1b1ec58e80>,
 '##': <gensim.models.keyedvectors.Vocab at 0x1b1ec58f60>,
 'The': <gensim.models.keyedvectors.Vocab at 0x1caf21c0b8>,
 'with': <gensim.models.keyedvectors.Vocab at 0x1caf21c2e8>,
 'said': <gensim.models.keyedvectors.Vocab at 0x1caf21c518>,
 'was': <gensim.models.keyedvectors.Vocab at 0x1caf21c748>,
 'the': <gensim.models.keyedvectors.Vocab at 0x1caf21c978>,
 'at': <gensim.models.keyedvectors.Vocab at 0x1caf21cba8>,
 'not': <gensim.models.keyedvectors.Vocab at 0x1caf20b4a8>,
 'as': <gensim.models.keyedvectors.Vocab at 0x1caf20b278>,
 'it': <gensim.models.keyedvectors.Vocab at 0x1caf20b908>,
 'be': <gensim.models.keyedvectors.Vocab at

In [261]:
# Access a single word 
word2vec['child']

array([ 1.65039062e-01, -6.39648438e-02, -1.78527832e-03,  1.84570312e-01,
       -1.18164062e-01,  1.16210938e-01,  2.77343750e-01,  4.61425781e-02,
        3.34472656e-02, -1.78710938e-01,  2.48046875e-01, -1.29882812e-01,
        1.22558594e-01, -5.66406250e-02, -1.61132812e-01,  1.24511719e-01,
       -1.44531250e-01,  1.58203125e-01, -9.57031250e-02,  8.39843750e-02,
        1.02050781e-01,  2.33154297e-02,  2.53906250e-01, -1.32446289e-02,
        2.34375000e-02, -2.98828125e-01,  4.45556641e-03,  8.49609375e-02,
        2.04101562e-01, -1.88476562e-01, -9.13085938e-02, -1.55273438e-01,
       -4.17968750e-01, -1.23535156e-01, -3.28125000e-01,  4.58984375e-02,
        3.49609375e-01, -1.04980469e-01, -1.59179688e-01,  1.37695312e-01,
       -1.96289062e-01,  7.37304688e-02,  1.53320312e-01, -5.46875000e-02,
        1.57226562e-01,  1.95312500e-02,  2.13867188e-01,  1.45507812e-01,
        2.50244141e-02,  1.41601562e-02,  1.02539062e-01,  3.27148438e-02,
       -4.66308594e-02,  

In [262]:
########################################
## process texts in datasets
########################################
print('Processing text dataset')

# The function "text_to_wordlist" is from
# https://www.kaggle.com/currie32/quora-question-pairs/the-importance-of-cleaning-text
def text_to_wordlist(text, remove_stopwords=False, stem_words=False):
    # Clean the text, with the option to remove stopwords and to stem words.
    
    # Convert words to lower case and split them
    text = text.lower().split()

    # Optionally, remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = [w for w in text if not w in stops]
    
    text = " ".join(text)

    # Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    
    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)
    
    # Return a list of words
    return(text)

Processing text dataset


In [263]:
## create list of lists where each sentence is a list containing a list of its words
count = 1 
texts_1 = [] 
texts_2 = []
labels = []
with codecs.open(TRAIN_DATA_FILE, encoding='utf-8') as f:
    reader = csv.reader(f, delimiter=',')
    header = next(reader)
    for values in reader:
        if count == 1:
            print(values)
        texts_1.append(text_to_wordlist(values[3]))
        texts_2.append(text_to_wordlist(values[4]))
        labels.append(int(values[5]))
        count += 1
print('Found %s texts in train.csv' % len(texts_1))

test_texts_1 = []
test_texts_2 = []
test_ids = []
with codecs.open(TEST_DATA_FILE, encoding='utf-8') as f:
    reader = csv.reader(f, delimiter=',')
    header = next(reader)
    for values in reader:
        test_texts_1.append(text_to_wordlist(values[1]))
        test_texts_2.append(text_to_wordlist(values[2]))
        test_ids.append(values[0])
print('Found %s texts in test.csv' % len(test_texts_1))



['0', '1', '2', 'What is the step by step guide to invest in share market in india?', 'What is the step by step guide to invest in share market?', '0']
Found 404290 texts in train.csv
Found 2345796 texts in test.csv


#### We will take 4 sentences as a toy example

In [264]:
texts_1[1:5]

['what is the story of kohinoor koh - i - noor diamond ',
 'how can i increase the speed of my internet connection while using a vpn ',
 'why am i mentally very lonely how can i solve it ',
 'which one dissolve in water quikly sugar salt methane and carbon di oxide ']

In [265]:
texts_2[1:5]

['what would happen if the indian government stole the kohinoor koh - i - noor diamond back ',
 'how can internet speed be increased by hacking through dns ',
 'find the remainder when math 23 ^ 24 math is divided by 24 23 ',
 'which fish would survive in salt water ']

In [266]:
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts_1[1:5] + texts_2[1:5] + test_texts_1[1:5] + test_texts_2[1:5])

In [267]:
tokenizer

<keras_preprocessing.text.Tokenizer at 0x1b1ec58358>

In [268]:
# summarize what was learned
## This is ordered sentence by sentence with word count
print(tokenizer.word_counts)

OrderedDict([('what', 5), ('is', 3), ('the', 7), ('story', 1), ('of', 2), ('kohinoor', 2), ('koh', 2), ('i', 7), ('noor', 2), ('diamond', 2), ('how', 7), ('can', 4), ('increase', 1), ('speed', 2), ('my', 1), ('internet', 2), ('connection', 1), ('while', 1), ('using', 1), ('a', 2), ('vpn', 1), ('why', 1), ('am', 1), ('mentally', 1), ('very', 1), ('lonely', 1), ('solve', 1), ('it', 2), ('which', 3), ('one', 1), ('dissolve', 1), ('in', 2), ('water', 2), ('quikly', 1), ('sugar', 1), ('salt', 2), ('methane', 1), ('and', 1), ('carbon', 1), ('di', 1), ('oxide', 1), ('would', 3), ('happen', 1), ('if', 1), ('indian', 1), ('government', 1), ('stole', 1), ('back', 1), ('be', 1), ('increased', 1), ('by', 2), ('hacking', 1), ('through', 1), ('dns', 1), ('find', 1), ('remainder', 1), ('when', 1), ('math', 2), ('23', 2), ('24', 3), ('divided', 1), ('fish', 1), ('survive', 1), ('should', 1), ('have', 1), ('hair', 2), ('transplant', 2), ('at', 1), ('age', 1), ('much', 2), ('cost', 2), ('but', 1), ('bes

In [269]:
## mapping words to the number of documents/texts they appeared on 
print(tokenizer.word_docs)

defaultdict(<class 'int'>, {'diamond': 2, 'is': 3, 'what': 5, 'the': 5, 'i': 6, 'kohinoor': 2, 'noor': 2, 'story': 1, 'koh': 2, 'of': 2, 'how': 7, 'increase': 1, 'connection': 1, 'can': 4, 'vpn': 1, 'while': 1, 'my': 1, 'internet': 2, 'using': 1, 'speed': 2, 'a': 2, 'very': 1, 'mentally': 1, 'solve': 1, 'it': 2, 'lonely': 1, 'why': 1, 'am': 1, 'and': 1, 'one': 1, 'dissolve': 1, 'oxide': 1, 'methane': 1, 'in': 2, 'sugar': 1, 'which': 3, 'carbon': 1, 'salt': 2, 'di': 1, 'water': 2, 'quikly': 1, 'government': 1, 'would': 3, 'indian': 1, 'back': 1, 'stole': 1, 'if': 1, 'happen': 1, 'increased': 1, 'through': 1, 'hacking': 1, 'by': 2, 'dns': 1, 'be': 1, 'divided': 1, 'when': 1, '23': 1, 'find': 1, '24': 2, 'remainder': 1, 'math': 1, 'fish': 1, 'survive': 1, 'should': 1, 'much': 2, 'transplant': 2, 'at': 1, 'have': 1, 'hair': 2, 'age': 1, 'cost': 2, 'best': 1, 'to': 2, 'from': 1, 'send': 2, 'way': 1, 'money': 2, 'china': 2, 'us': 1, 'but': 1, 'not': 1, 'emulsifiers': 1, 'food': 1, 'aberystwy

In [270]:
print(tokenizer.word_index)

{'the': 1, 'i': 2, 'how': 3, 'what': 4, 'can': 5, 'is': 6, 'which': 7, 'would': 8, '24': 9, 'to': 10, 'of': 11, 'kohinoor': 12, 'koh': 13, 'noor': 14, 'diamond': 15, 'speed': 16, 'internet': 17, 'a': 18, 'it': 19, 'in': 20, 'water': 21, 'salt': 22, 'by': 23, 'math': 24, '23': 25, 'hair': 26, 'transplant': 27, 'much': 28, 'cost': 29, 'send': 30, 'money': 31, 'china': 32, 'start': 33, 'reading': 34, 'story': 35, 'increase': 36, 'my': 37, 'connection': 38, 'while': 39, 'using': 40, 'vpn': 41, 'why': 42, 'am': 43, 'mentally': 44, 'very': 45, 'lonely': 46, 'solve': 47, 'one': 48, 'dissolve': 49, 'quikly': 50, 'sugar': 51, 'methane': 52, 'and': 53, 'carbon': 54, 'di': 55, 'oxide': 56, 'happen': 57, 'if': 58, 'indian': 59, 'government': 60, 'stole': 61, 'back': 62, 'be': 63, 'increased': 64, 'hacking': 65, 'through': 66, 'dns': 67, 'find': 68, 'remainder': 69, 'when': 70, 'divided': 71, 'fish': 72, 'survive': 73, 'should': 74, 'have': 75, 'at': 76, 'age': 77, 'but': 78, 'best': 79, 'way': 80,

In [271]:
print(tokenizer.document_count)

16


In [272]:
print(tokenizer.index_word)

{1: 'the', 2: 'i', 3: 'how', 4: 'what', 5: 'can', 6: 'is', 7: 'which', 8: 'would', 9: '24', 10: 'to', 11: 'of', 12: 'kohinoor', 13: 'koh', 14: 'noor', 15: 'diamond', 16: 'speed', 17: 'internet', 18: 'a', 19: 'it', 20: 'in', 21: 'water', 22: 'salt', 23: 'by', 24: 'math', 25: '23', 26: 'hair', 27: 'transplant', 28: 'much', 29: 'cost', 30: 'send', 31: 'money', 32: 'china', 33: 'start', 34: 'reading', 35: 'story', 36: 'increase', 37: 'my', 38: 'connection', 39: 'while', 40: 'using', 41: 'vpn', 42: 'why', 43: 'am', 44: 'mentally', 45: 'very', 46: 'lonely', 47: 'solve', 48: 'one', 49: 'dissolve', 50: 'quikly', 51: 'sugar', 52: 'methane', 53: 'and', 54: 'carbon', 55: 'di', 56: 'oxide', 57: 'happen', 58: 'if', 59: 'indian', 60: 'government', 61: 'stole', 62: 'back', 63: 'be', 64: 'increased', 65: 'hacking', 66: 'through', 67: 'dns', 68: 'find', 69: 'remainder', 70: 'when', 71: 'divided', 72: 'fish', 73: 'survive', 74: 'should', 75: 'have', 76: 'at', 77: 'age', 78: 'but', 79: 'best', 80: 'way',

In [273]:
encoded_docs = tokenizer.texts_to_matrix((texts_1[1:5] + texts_2[1:5] + test_texts_1[1:5] + test_texts_2[1:5]), mode='count')
print(encoded_docs)

[[0. 1. 1. ... 0. 0. 0.]
 [0. 1. 1. ... 0. 0. 0.]
 [0. 0. 2. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 1. ... 0. 0. 0.]]


In [274]:
encoded_docs_tfidf = tokenizer.texts_to_matrix((texts_1[1:5] + texts_2[1:5] + test_texts_1[1:5] + test_texts_2[1:5]), mode='tfidf')
print(encoded_docs_tfidf)

[[0.         1.29928298 1.18958407 ... 0.         0.         0.        ]
 [0.         1.29928298 1.18958407 ... 0.         0.         0.        ]
 [0.         0.         2.01414091 ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         1.18958407 ... 0.         0.         0.        ]]


In [275]:
## Transform each text in texts to a sequence of its associated index.
sequences_1 = tokenizer.texts_to_sequences(texts_1[1:5])
sequences_2 = tokenizer.texts_to_sequences(texts_2[1:5])
test_sequences_1 = tokenizer.texts_to_sequences(test_texts_1[1:5])
test_sequences_2 = tokenizer.texts_to_sequences(test_texts_2[1:5])

word_index = tokenizer.word_index
print('Found %s unique tokens' % len(word_index))

Found 92 unique tokens


In [279]:
sequences_1

[[4, 6, 1, 35, 11, 12, 13, 2, 14, 15],
 [3, 5, 2, 36, 1, 16, 11, 37, 17, 38, 39, 40, 18, 41],
 [42, 43, 2, 44, 45, 46, 3, 5, 2, 47, 19],
 [7, 48, 49, 20, 21, 50, 51, 22, 52, 53, 54, 55, 56]]

#### check that it's correct:

In [280]:
sorted(sequences_1[0])

[1, 2, 4, 6, 11, 12, 13, 14, 15, 35]

In [281]:
for word, idx in word_index.items():     
    if idx in  sequences_1[0]:
        print(word, idx)

the 1
i 2
what 4
is 6
of 11
kohinoor 12
koh 13
noor 14
diamond 15
story 35


In [282]:
## How to get vector representation based on its index. Example:
print(index_word[4])
word2vec[index_word[4]]

what


array([ 0.13964844, -0.00616455,  0.21484375,  0.07275391, -0.16113281,
        0.07568359,  0.16796875, -0.20117188,  0.12597656,  0.00915527,
        0.05249023, -0.15136719, -0.02758789,  0.04199219, -0.234375  ,
        0.13867188, -0.02600098,  0.07910156,  0.02746582, -0.13085938,
       -0.02478027,  0.10009766, -0.07910156, -0.07714844,  0.03759766,
        0.16894531,  0.05371094, -0.05200195,  0.14453125, -0.04370117,
       -0.12597656,  0.06884766, -0.10595703, -0.14550781, -0.00331116,
        0.01367188,  0.13964844,  0.01660156,  0.03417969,  0.16113281,
       -0.01080322,  0.06689453,  0.06835938, -0.15136719, -0.16894531,
        0.03295898, -0.06884766,  0.06787109, -0.07373047,  0.08300781,
        0.05761719,  0.14550781, -0.11865234, -0.13671875,  0.12402344,
        0.04296875, -0.11962891, -0.08154297,  0.06494141, -0.05639648,
       -0.04394531,  0.1484375 , -0.07714844,  0.04614258, -0.02624512,
       -0.06591797,  0.04980469,  0.08886719, -0.01647949, -0.02

In [283]:
### Notice that sequences are of varying lengths. How would we use them in a model??
### solution: padding 
data_1 = pad_sequences(sequences_1, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
data_2 = pad_sequences(sequences_2, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
labels = np.array(labels)
print('Shape of data tensor:', data_1.shape)
print('Shape of label tensor:', labels[1:5].shape)

test_data_1 = pad_sequences(test_sequences_1, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
test_data_2 = pad_sequences(test_sequences_2, maxlen=MAX_SEQUENCE_LENGTH, padding='post')
test_ids = np.array(test_ids)

Shape of data tensor: (4, 15)
Shape of label tensor: (4,)


In [284]:
data_1

array([[ 4,  6,  1, 35, 11, 12, 13,  2, 14, 15,  0,  0,  0,  0,  0],
       [ 3,  5,  2, 36,  1, 16, 11, 37, 17, 38, 39, 40, 18, 41,  0],
       [42, 43,  2, 44, 45, 46,  3,  5,  2, 47, 19,  0,  0,  0,  0],
       [ 7, 48, 49, 20, 21, 50, 51, 22, 52, 53, 54, 55, 56,  0,  0]],
      dtype=int32)

In [285]:
# Now we finally can put all the data in one array (3d) where 
# dimension 1: # sentences
# dimension 2: # max sequence length
# dimension 3: # w2v features (300 in our case)

# Step I: Create a data frame with # sentences X # max sequence length (15) rows and # word features + 1 columns
## First column represent the index of the word
sentences_to_word_idx_df = pd.DataFrame(np.concatenate((data_1.reshape(60, 1), np.zeros((60, 300))), axis=1))

# Step II: Run a for loop to find each word by its index and insert its w2v representation to the other 300 columns
for idx in range(sentences_to_word_idx_df.shape[0]):
    word_idx = sentences_to_word_idx_df.loc[idx, 0]
    try:
        word_vec = word2vec[index_word[word_idx]]
        sentences_to_word_idx_df.iloc[idx, 1:] = word_vec
    except:
        pass

# Step III: reshape to get 3d array
sentences_to_word_idx_array = np.array(sentences_to_word_idx_df.iloc[:, 1:]).reshape(-1, 15, 300)

In [292]:
sentences_to_word_idx_array[0, :, 1]

array([-0.00616455, -0.07324219,  0.10498047,  0.02453613,  0.        ,
        0.        ,  0.13476562, -0.01953125,  0.06738281,  0.23925781,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ])

In [293]:
### Option I: Take the mean of all w2v in each sentence to insert to other algorithms (logistic, random forest, etc)
sentences_to_word_idx_array[sentences_to_word_idx_array == 0] = np.nan
mean_sentence_vec = np.nanmean(sentences_to_word_idx_array, axis=1)

In [294]:
mean_sentence_vec[1]

array([ 7.24283854e-02,  1.99203491e-02,  6.24491374e-02,  8.19905599e-02,
       -8.51847331e-02,  7.00225830e-02, -5.56437174e-03, -7.72806803e-02,
        1.00926717e-01,  7.10856120e-02, -2.16871897e-02, -1.13592148e-01,
       -1.29629771e-01, -5.83089193e-02, -1.00789388e-01,  5.39143880e-04,
        2.37172445e-02,  5.86465200e-02,  4.33756510e-02, -8.66394043e-02,
       -5.62337240e-02,  5.01448313e-02,  2.30725606e-02,  7.04975128e-02,
       -4.65774536e-02, -2.46874491e-02, -8.30434163e-02,  1.18754069e-01,
        2.59526571e-03, -3.98279826e-02,  4.65037028e-02, -3.67431641e-02,
       -4.10528183e-02, -8.85086060e-02,  2.50469844e-02, -3.53597005e-02,
       -3.90688578e-02, -4.69868978e-02,  4.78731791e-02, -2.18963623e-03,
        2.26402283e-02,  1.04054769e-01,  1.21439616e-01,  4.63689168e-02,
        1.91726685e-02, -5.22041321e-02,  7.62939453e-03,  1.25732422e-02,
       -2.50651042e-02, -2.26999919e-02,  1.46586100e-02,  5.35939535e-02,
        2.62705485e-02, -

In [295]:
logistic = LogisticRegression(class_weight='balanced')

In [296]:
logistic.fit(mean_sentence_vec, labels[2:6])



LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='warn', n_jobs=None, penalty='l2', random_state=None,
          solver='warn', tol=0.0001, verbose=0, warm_start=False)

In [297]:
dtree = DecisionTreeClassifier()

In [298]:
dtree.fit(mean_sentence_vec, labels[2:6])

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [299]:
rforest = RandomForestClassifier()

In [300]:
rforest.fit(mean_sentence_vec, labels[2:6])



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [None]:
### Option II: Leave as is to use in other algorithms (Logistic, NN, RNN) 

In [None]:
### Option III: Concotenate in order to use in other algorithms (Logistic, NN, RNN) 

In [None]:
### Option IV: Use embedding layer 

In [57]:
########################################
## prepare embeddings
########################################
print('Preparing embedding matrix')

nb_words = min(MAX_NB_WORDS, len(word_index))+1

Preparing embedding matrix


In [58]:
nb_words

93

In [63]:
embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
for word, i in word_index.items():
    print(word, i)
    if word in word2vec.vocab:
        embedding_matrix[i] = word2vec.word_vec(word)
print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))

the 1
i 2
how 3
what 4
can 5
is 6
which 7
would 8
24 9
to 10
of 11
kohinoor 12
koh 13
noor 14
diamond 15
speed 16
internet 17
a 18
it 19
in 20
water 21
salt 22
by 23
math 24
23 25
hair 26
transplant 27
much 28
cost 29
send 30
money 31
china 32
start 33
reading 34
story 35
increase 36
my 37
connection 38
while 39
using 40
vpn 41
why 42
am 43
mentally 44
very 45
lonely 46
solve 47
one 48
dissolve 49
quikly 50
sugar 51
methane 52
and 53
carbon 54
di 55
oxide 56
happen 57
if 58
indian 59
government 60
stole 61
back 62
be 63
increased 64
hacking 65
through 66
dns 67
find 68
remainder 69
when 70
divided 71
fish 72
survive 73
should 74
have 75
at 76
age 77
but 78
best 79
way 80
from 81
us 82
food 83
not 84
emulsifiers 85
aberystwyth 86
does 87
require 88
you 89
foods 90
fibre 91
their 92
Null word embeddings: 11


In [255]:
embedding_matrix[1]

array([ 0.08007812,  0.10498047,  0.04980469,  0.0534668 , -0.06738281,
       -0.12060547,  0.03515625, -0.11865234,  0.04394531,  0.03015137,
       -0.05688477, -0.07617188,  0.01287842,  0.04980469, -0.08496094,
       -0.06347656,  0.00628662, -0.04321289,  0.02026367,  0.01330566,
       -0.01953125,  0.09277344, -0.171875  , -0.00131989,  0.06542969,
        0.05834961, -0.08251953,  0.0859375 , -0.00318909,  0.05859375,
       -0.03491211, -0.0123291 , -0.0480957 , -0.00302124,  0.05639648,
        0.01495361, -0.07226562, -0.05224609,  0.09667969,  0.04296875,
       -0.03540039, -0.07324219,  0.03271484, -0.06176758,  0.00787354,
        0.0035553 , -0.00878906,  0.0390625 ,  0.03833008,  0.04443359,
        0.06982422,  0.01263428, -0.00445557, -0.03320312, -0.04272461,
        0.09765625, -0.02160645, -0.0378418 ,  0.01190186, -0.01391602,
       -0.11328125,  0.09326172, -0.03930664, -0.11621094,  0.02331543,
       -0.01599121,  0.02636719,  0.10742188, -0.00466919,  0.09

In [65]:
########################################
## sample train/validation data
########################################
#np.random.seed(1234)
perm = np.random.permutation(len(data_1))
idx_train = perm[:int(len(data_1)*(1-VALIDATION_SPLIT))]
idx_val = perm[int(len(data_1)*(1-VALIDATION_SPLIT)):]

In [66]:
perm

array([0, 2, 3, 1])

In [67]:
idx_train

array([0, 2, 3])

In [68]:
idx_val

array([1])

In [69]:
data_1_train = np.vstack((data_1[idx_train], data_2[idx_train]))
data_2_train = np.vstack((data_2[idx_train], data_1[idx_train]))
labels_train = np.concatenate((labels[idx_train], labels[idx_train]))

In [235]:
data_2

array([[ 4,  8, 57, 58,  1, 59, 60, 61,  1, 12, 13,  2, 14, 15, 62],
       [ 3,  5, 17, 16, 63, 64, 23, 65, 66, 67,  0,  0,  0,  0,  0],
       [68,  1, 69, 70, 24, 25,  9, 24,  6, 71, 23,  9, 25,  0,  0],
       [ 7, 72,  8, 73, 20, 22, 21,  0,  0,  0,  0,  0,  0,  0,  0]],
      dtype=int32)

In [236]:
data_2_train

array([[ 4,  8, 57, 58,  1, 59, 60, 61,  1, 12, 13,  2, 14, 15, 62],
       [68,  1, 69, 70, 24, 25,  9, 24,  6, 71, 23,  9, 25,  0,  0],
       [ 7, 72,  8, 73, 20, 22, 21,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 4,  6,  1, 35, 11, 12, 13,  2, 14, 15,  0,  0,  0,  0,  0],
       [42, 43,  2, 44, 45, 46,  3,  5,  2, 47, 19,  0,  0,  0,  0],
       [ 7, 48, 49, 20, 21, 50, 51, 22, 52, 53, 54, 55, 56,  0,  0]],
      dtype=int32)

In [72]:
labels_train

array([0, 0, 0, 0, 0, 0])

In [70]:
data_1_val = np.vstack((data_1[idx_val], data_2[idx_val]))
data_2_val = np.vstack((data_2[idx_val], data_1[idx_val]))
labels_val = np.concatenate((labels[idx_val], labels[idx_val]))

In [71]:
data_1_train

array([[ 4,  6,  1, 35, 11, 12, 13,  2, 14, 15,  0,  0,  0,  0,  0],
       [42, 43,  2, 44, 45, 46,  3,  5,  2, 47, 19,  0,  0,  0,  0],
       [ 7, 48, 49, 20, 21, 50, 51, 22, 52, 53, 54, 55, 56,  0,  0],
       [ 4,  8, 57, 58,  1, 59, 60, 61,  1, 12, 13,  2, 14, 15, 62],
       [68,  1, 69, 70, 24, 25,  9, 24,  6, 71, 23,  9, 25,  0,  0],
       [ 7, 72,  8, 73, 20, 22, 21,  0,  0,  0,  0,  0,  0,  0,  0]],
      dtype=int32)

In [237]:
data_2_train

array([[ 4,  8, 57, 58,  1, 59, 60, 61,  1, 12, 13,  2, 14, 15, 62],
       [68,  1, 69, 70, 24, 25,  9, 24,  6, 71, 23,  9, 25,  0,  0],
       [ 7, 72,  8, 73, 20, 22, 21,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 4,  6,  1, 35, 11, 12, 13,  2, 14, 15,  0,  0,  0,  0,  0],
       [42, 43,  2, 44, 45, 46,  3,  5,  2, 47, 19,  0,  0,  0,  0],
       [ 7, 48, 49, 20, 21, 50, 51, 22, 52, 53, 54, 55, 56,  0,  0]],
      dtype=int32)

In [73]:
# weight_val = np.ones(len(labels_val))
# if re_weight:
#     weight_val *= 0.472001959
#     weight_val[labels_val==0] = 1.309028344

In [74]:
########################################
## define the model structure
########################################
embedding_layer = Embedding(nb_words,
        EMBEDDING_DIM,
        weights=[embedding_matrix],
        input_length=MAX_SEQUENCE_LENGTH,
        trainable=False)

W0721 17:38:17.253858 4555916736 deprecation_wrapper.py:119] From /Users/omer/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.



In [75]:
lstm_layer = LSTM(num_lstm, dropout=rate_drop_lstm, recurrent_dropout=rate_drop_lstm)

sequence_1_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences_1 = embedding_layer(sequence_1_input)
x1 = lstm_layer(embedded_sequences_1)
# x2 = lstm_layer(x1)

sequence_2_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences_2 = embedding_layer(sequence_2_input)
y1 = lstm_layer(embedded_sequences_2)

merged = concatenate([x1, y1])
merged = Dropout(rate_drop_dense)(merged)
merged = BatchNormalization()(merged)

merged = Dense(num_dense, activation=act)(merged)
merged = Dropout(rate_drop_dense)(merged)
merged = BatchNormalization()(merged)

preds = Dense(1, activation='sigmoid')(merged)

########################################
## add class weight
########################################
if re_weight:
    class_weight = {0: 1.309028344, 1: 0.472001959}
else:
    class_weight = None



W0721 17:40:36.483162 4555916736 deprecation_wrapper.py:119] From /Users/omer/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0721 17:40:36.500955 4555916736 deprecation_wrapper.py:119] From /Users/omer/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0721 17:40:36.515653 4555916736 deprecation_wrapper.py:119] From /Users/omer/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:174: The name tf.get_default_session is deprecated. Please use tf.compat.v1.get_default_session instead.

W0721 17:40:36.516503 4555916736 deprecation_wrapper.py:119] From /Users/omer/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:181: The name tf.ConfigProto is deprecated. Please use tf.compat.v1.ConfigProto instead.

W0721 17:40:36.7301

In [76]:
model = Model(inputs=[sequence_1_input, sequence_2_input], \
        outputs=preds)
model.compile(loss='binary_crossentropy',
        optimizer='nadam',
        metrics=['acc'])
print(model.summary())
print(STAMP)

W0721 17:41:05.451287 4555916736 deprecation_wrapper.py:119] From /Users/omer/anaconda3/lib/python3.7/site-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

W0721 17:41:05.475080 4555916736 deprecation.py:323] From /Users/omer/anaconda3/lib/python3.7/site-packages/tensorflow/python/ops/nn_impl.py:180: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 15)           0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 15)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 15, 300)      27900       input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
lstm_1 (LSTM)                   (None, 259)          580160      embedding_1[0][0]                
          

In [78]:
data_1_train, data_2_train

(array([[ 4,  6,  1, 35, 11, 12, 13,  2, 14, 15,  0,  0,  0,  0,  0],
        [42, 43,  2, 44, 45, 46,  3,  5,  2, 47, 19,  0,  0,  0,  0],
        [ 7, 48, 49, 20, 21, 50, 51, 22, 52, 53, 54, 55, 56,  0,  0],
        [ 4,  8, 57, 58,  1, 59, 60, 61,  1, 12, 13,  2, 14, 15, 62],
        [68,  1, 69, 70, 24, 25,  9, 24,  6, 71, 23,  9, 25,  0,  0],
        [ 7, 72,  8, 73, 20, 22, 21,  0,  0,  0,  0,  0,  0,  0,  0]],
       dtype=int32),
 array([[ 4,  8, 57, 58,  1, 59, 60, 61,  1, 12, 13,  2, 14, 15, 62],
        [68,  1, 69, 70, 24, 25,  9, 24,  6, 71, 23,  9, 25,  0,  0],
        [ 7, 72,  8, 73, 20, 22, 21,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 4,  6,  1, 35, 11, 12, 13,  2, 14, 15,  0,  0,  0,  0,  0],
        [42, 43,  2, 44, 45, 46,  3,  5,  2, 47, 19,  0,  0,  0,  0],
        [ 7, 48, 49, 20, 21, 50, 51, 22, 52, 53, 54, 55, 56,  0,  0]],
       dtype=int32))

In [3]:
'''
Single model may achieve LB scores at around 0.29+ ~ 0.30+
Average ensembles can easily get 0.28+ or less
Don't need to be an expert of feature engineering
All you need is a GPU!!!!!!!
'''

########################################
## train the model
########################################


early_stopping = EarlyStopping(monitor='val_loss', patience=3)
bst_model_path = STAMP + '.h5'
model_checkpoint = ModelCheckpoint(bst_model_path, save_best_only=True, save_weights_only=True)

hist = model.fit([data_1_train, data_2_train], labels_train, \
        validation_data=([data_1_val, data_2_val], labels_val, weight_val), \
        epochs=200, batch_size=2048, shuffle=True, \
        class_weight=class_weight, callbacks=[early_stopping, model_checkpoint])




Indexing word vectors


FileNotFoundError: [Errno 2] No such file or directory: '../input/GoogleNews-vectors-negative300.bin'

In [None]:
model.load_weights(bst_model_path)
bst_val_score = min(hist.history['val_loss'])

In [None]:

########################################
## Prediction
########################################
preds = model.predict([test_data_1, test_data_2], batch_size=8192, verbose=1)
preds += model.predict([test_data_2, test_data_1], batch_size=8192, verbose=1)
preds /= 2