In [1]:
'''
Example of an LSTM model with GloVe embeddings along with magic features

Tested under Keras 2.0 with Tensorflow 1.0 backend

Single model may achieve LB scores at around 0.18+, average ensembles can get 0.17+
'''

########################################
## import packages
########################################
import os
import re
import csv
import codecs
import numpy as np
import pandas as pd

from string import punctuation
from collections import defaultdict

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation,Reshape
from keras.layers.merge import concatenate
from keras.models import Model
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint

from sklearn.preprocessing import StandardScaler

#import sys
#reload(sys)
#sys.setdefaultencoding('utf-8')

Using TensorFlow backend.


In [2]:
########################################
## set directories and parameters
########################################
BASE_DIR = '../data/'
#EMBEDDING_FILE = BASE_DIR + 'glove.840B.300d.txt'
EMBEDDING_FILE = BASE_DIR + 'glove.6B.300d.txt'
TRAIN_DATA_FILE = BASE_DIR + 'train.csv'
TEST_DATA_FILE = BASE_DIR + 'test.csv'
MAX_SEQUENCE_LENGTH = 30
MAX_NB_WORDS = 200000
EMBEDDING_DIM = 300
VALIDATION_SPLIT = 0.1

num_lstm = np.random.randint(175, 275)
num_dense = np.random.randint(100, 150)
rate_drop_lstm = 0.15 + np.random.rand() * 0.25
rate_drop_dense = 0.15 + np.random.rand() * 0.25

act = 'relu'
re_weight = True # whether to re-weight classes to fit the 17.5% share in test set

STAMP = 'lstm_%d_%d_%.2f_%.2f'%(num_lstm, num_dense, rate_drop_lstm, \
        rate_drop_dense)
print(STAMP)

lstm_254_131_0.18_0.34


In [3]:
########################################
## index word vectors
########################################
print('Indexing word vectors')

embeddings_index = {}
f = open(EMBEDDING_FILE,encoding='utf-8')
count = 0
for line in f:
    if count == 0:
        count = 1
        continue
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %d word vectors of glove.' % len(embeddings_index))

Indexing word vectors
Found 399999 word vectors of glove.


In [4]:
## process texts in datasets
########################################
print('Processing text dataset')

# The function "text_to_wordlist" is from
# https://www.kaggle.com/currie32/quora-question-pairs/the-importance-of-cleaning-text
def text_to_wordlist(text, remove_stopwords=False, stem_words=False):
    # Clean the text, with the option to remove stopwords and to stem words.
    
    # Convert words to lower case and split them
    text = text.lower().split()

    # Optionally, remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = [w for w in text if not w in stops]
    
    text = " ".join(text)

    # Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " : ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)
    
    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)
    
    # Return a list of words
    return(text)

texts_1 = [] 
texts_2 = []
labels = []
with codecs.open(TRAIN_DATA_FILE, encoding='utf-8') as f:
    reader = csv.reader(f, delimiter=',')
    header = next(reader)
    for values in reader:
        texts_1.append(text_to_wordlist(values[3]))
        texts_2.append(text_to_wordlist(values[4]))
        labels.append(int(values[5]))
print('Found %s texts in train.csv' % len(texts_1))

test_texts_1 = []
test_texts_2 = []
test_ids = []
with codecs.open(TEST_DATA_FILE, encoding='utf-8') as f:
    reader = csv.reader(f, delimiter=',')
    header = next(reader)
    for values in reader:
        test_texts_1.append(text_to_wordlist(values[1]))
        test_texts_2.append(text_to_wordlist(values[2]))
        test_ids.append(values[0])
print('Found %s texts in test.csv' % len(test_texts_1))

tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts_1 + texts_2 + test_texts_1 + test_texts_2)

sequences_1 = tokenizer.texts_to_sequences(texts_1)
sequences_2 = tokenizer.texts_to_sequences(texts_2)
test_sequences_1 = tokenizer.texts_to_sequences(test_texts_1)
test_sequences_2 = tokenizer.texts_to_sequences(test_texts_2)

word_index = tokenizer.word_index
print('Found %s unique tokens' % len(word_index))

data_1 = pad_sequences(sequences_1, maxlen=MAX_SEQUENCE_LENGTH)
data_2 = pad_sequences(sequences_2, maxlen=MAX_SEQUENCE_LENGTH)
labels = np.array(labels)
print('Shape of data tensor:', data_1.shape)
print('Shape of label tensor:', labels.shape)

test_data_1 = pad_sequences(test_sequences_1, maxlen=MAX_SEQUENCE_LENGTH)
test_data_2 = pad_sequences(test_sequences_2, maxlen=MAX_SEQUENCE_LENGTH)
test_ids = np.array(test_ids)
########################################
print("ok")

Processing text dataset
Found 404290 texts in train.csv
Found 2345796 texts in test.csv
Found 120499 unique tokens
Shape of data tensor: (404290, 30)
Shape of label tensor: (404290,)
ok


In [None]:
np.save("../train/word_index.npy",word_index)
np.save("../train/data_1.npy",data_1)
np.save("../train/data_2.npy",data_2)
np.save("../train/labels.npy",labels)
np.save("../test/test_data_1.npy",test_data_1)
np.save("../test/test_data_2.npy",test_data_2)
np.save("../test/test_ids.npy",test_ids)

In [6]:
word_index=np.load("../train/word_index.npy")

data_1=np.load("../train/data_1.npy")
data_2=np.load("../train/data_1.npy")

labels = np.load("../train/labels.npy")

print('Shape of data tensor:', data_1.shape)
print('Shape of label tensor:', labels.shape)

test_data_1 = np.load("../test/test_data_1.npy")
test_data_2 = np.load("../test/test_data_2.npy")
test_ids = np.load("../test/test_ids.npy")

print('Shape of data tensor:', test_data_1.shape)
print('Shape of label tensor:', test_data_2.shape)

Shape of data tensor: (404290, 30)
Shape of label tensor: (404290,)
Shape of data tensor: (2345796, 30)
Shape of label tensor: (2345796, 30)


# Doc2vec

In [5]:
#Import Initial Packages
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from subprocess import check_output
import gensim
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from textblob import TextBlob
from nltk.stem import WordNetLemmatizer
import re 
from collections import namedtuple
import multiprocessing
import datetime
import os

tokenizer = RegexpTokenizer(r'\w+')
stopwords = stopwords.words("english")
lemmatizer = WordNetLemmatizer()



In [6]:
model = gensim.models.doc2vec.Doc2Vec.load("../kernels/20170531_A_doc_2_vec_model1")

In [7]:
doc_train1=np.empty((404290,100))
for i in range(404290):
    doc_train1[i]=(model.docvecs[i])
print(doc_train1.shape)

doc_train2=np.empty((404290,100))
for i in range(404290,808580):
    doc_train2[i-404290]=(model.docvecs[i])
print(doc_train2.shape)

(404290, 100)
(404290, 100)


df_test_set1 : (2345796, 1)
df_test_set2 : (2345796, 1)
df_test_set : (4691592, 1)

In [8]:
doc_test1=np.empty((2345796,100))
for i in range(808580,3154376):
    doc_test1[i-808580]=(model.docvecs[i])
print(doc_test1.shape)

doc_test2=np.empty((2345796,100))
for i in range(3154376,5500172):
    doc_test2[i-3154376]=(model.docvecs[i])
print(doc_test2.shape)

(2345796, 100)
(2345796, 100)


In [78]:
np.save("../train/doc_train1.npy",doc_train1)
np.save("../train/doc_train2.npy",doc_train2)

np.save("../test/doc_test1.npy",doc_test1)
np.save("../test/doc_test2.npy",doc_test2)

In [None]:
doc_train1=np.load("../train/doc_train1.npy")
doc_train2=np.load("../train/doc_train2.npy")

doc_test1=np.load("../test/doc_test1.npy")
doc_test2=np.load("../test/doc_test2.npy")

In [18]:
import gc
gc.collect()

97

# Feature part1:train_all

In [9]:
train_all = pd.read_csv('../train/train_all.csv')
test_all = pd.read_csv('../test/test_all.csv')

train_all = train_all.drop(['id','is_duplicate'],axis=1).fillna(0)
test_all = test_all.drop('test_id',axis=1).fillna(0)

In [10]:
print(train_all.shape)
print(test_all.shape)

(404290, 11)
(2345796, 11)


In [11]:
ss = StandardScaler()
ss.fit(np.vstack((train_all, test_all)))
train_all = ss.transform(train_all)
test_all = ss.transform(test_all)

# Feature part2:train_feature

In [14]:
train_features = pd.read_csv('../train/train_features.csv', encoding="ISO-8859-1")
train_features=  train_features.iloc[:, 2:]
train_features.drop(['euclidean_distance','jaccard_distance'], axis=1,inplace=True)

test_features = pd.read_csv('../test/test_features.csv', encoding="ISO-8859-1")
test_features=  test_features.iloc[:, 2:]
test_features.drop(['euclidean_distance','jaccard_distance'], axis=1,inplace=True)

print(train_features.shape)
print(test_features.shape)

(404290, 26)
(2345796, 26)


In [15]:
train_features.fillna(0,inplace=True)
test_features.fillna(0,inplace=True)

In [16]:
train_features=train_features.replace('inf',0)
test_features=test_features.replace('inf',0)

In [17]:
ss = StandardScaler()
ss.fit(np.vstack((train_features, test_features)))
train_features = ss.transform(train_features)
test_features = ss.transform(test_features)

# Feature part3:train5

In [18]:
train5=pd.read_csv('../train/train5.csv', encoding="ISO-8859-1")#之前总结的特征：train5
print(train5.shape)
train5=train5.iloc[:,8:]
print(train5.shape)

test5=pd.read_csv('../test/test5.csv', encoding="ISO-8859-1")#之前总结的特征：test5
print(test5.shape)
test5=test5.iloc[:,5:]
print(test5.shape)

(404290, 21)
(404290, 13)
(2345796, 18)
(2345796, 13)


In [19]:
ss = StandardScaler()
ss.fit(np.vstack((train5, test5)))
train5 = ss.transform(train5)
test5 = ss.transform(test5)

# Feature part4:train6

In [20]:
train6=pd.read_csv('../train/train6.csv')#
print(train6.shape)

test6=pd.read_csv('../test/test6.csv')#
print(test6.shape)

(404290, 16)
(2345796, 16)


In [21]:
train6.fillna(0,inplace=True)
test6.fillna(0,inplace=True)

In [22]:
ss = StandardScaler()
ss.fit(np.vstack((train6, test6)))
train6 = ss.transform(train6)
test6 = ss.transform(test6)

In [23]:
import gc
gc.collect()

51

# Prepare embeddings...

In [24]:
########################################
## prepare embeddings
########################################
print('Preparing embedding matrix')

nb_words = min(MAX_NB_WORDS, len(word_index))+1

embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))

Preparing embedding matrix
Null word embeddings: 43908


In [25]:
print(embedding_matrix.shape)

(120500, 300)


# Prepare Training...

In [26]:
########################################
## sample train/validation data
########################################
#np.random.seed(1234)
perm = np.random.permutation(len(data_1))
idx_train = perm[:int(len(data_1)*(1-VALIDATION_SPLIT))]
idx_val = perm[int(len(data_1)*(1-VALIDATION_SPLIT)):]

data_1_train = np.vstack((data_1[idx_train], data_2[idx_train]))
data_2_train = np.vstack((data_2[idx_train], data_1[idx_train]))

doc_1_train = np.vstack((doc_train1[idx_train], doc_train2[idx_train]))
doc_2_train = np.vstack((doc_train2[idx_train], doc_train1[idx_train]))

labels_train = np.concatenate((labels[idx_train], labels[idx_train]))

train_all_train = np.vstack((train_all[idx_train], train_all[idx_train]))
train_features_train = np.vstack((train_features[idx_train], train_features[idx_train]))
train5_train = np.vstack((train5[idx_train], train5[idx_train]))
train6_train = np.vstack((train6[idx_train], train6[idx_train]))


data_1_val = np.vstack((data_1[idx_val], data_2[idx_val]))
data_2_val = np.vstack((data_2[idx_val], data_1[idx_val]))

doc_1_val = np.vstack((doc_train1[idx_val], doc_train2[idx_val]))
doc_2_val = np.vstack((doc_train2[idx_val], doc_train1[idx_val]))

labels_val = np.concatenate((labels[idx_val], labels[idx_val]))

train_all_val = np.vstack((train_all[idx_val], train_all[idx_val]))
train_features_val = np.vstack((train_features[idx_val], train_features[idx_val]))
train5_val = np.vstack((train5[idx_val], train5[idx_val]))
train6_val = np.vstack((train6[idx_val], train6[idx_val]))

#leaks_val = np.vstack((leaks[idx_val], leaks[idx_val]))

weight_val = np.ones(len(labels_val))

#if re_weight:
#    weight_val *= 0.472001959
#    weight_val[labels_val==0] = 1.309028344
    
if re_weight:
    weight_val *= 0.4459459459
    weight_val[labels_val==0] = 1.3253968253

# LSTM TRY TRY

In [None]:
num_lstm = np.random.randint(175, 275)
num_dense = np.random.randint(100, 150)
rate_drop_lstm = 0.15 + np.random.rand() * 0.25
rate_drop_dense = 0.15 + np.random.rand() * 0.25

act = 'relu'
re_weight = True # whether to re-weight classes to fit the 17.5% share in test set

STAMP = 'lstm_%d_%d_%.2f_%.2f'%(num_lstm, num_dense, rate_drop_lstm, \
        rate_drop_dense)


In [None]:
keras.layers.recurrent.LSTM(units, 
                            activation='tanh', 
                            recurrent_activation='hard_sigmoid',
                            use_bias=True,
                            kernel_initializer='glorot_uniform',
                            recurrent_initializer='orthogonal', 
                            bias_initializer='zeros',
                            unit_forget_bias=True,
                            kernel_regularizer=None, 
                            recurrent_regularizer=None,
                            bias_regularizer=None, 
                            activity_regularizer=None, 
                            kernel_constraint=None,
                            recurrent_constraint=None,
                            bias_constraint=None,
                            dropout=0.0, 
                            recurrent_dropout=0.0)

In [22]:
embedded_sequences_2.shape

TensorShape([Dimension(None), Dimension(30), Dimension(300)])

In [145]:
embedded_sequences_1

<tf.Tensor 'embedding_8/Gather:0' shape=(?, 30, 300) dtype=float32>

In [143]:
embedding_layer_doc = Embedding(nb_words,
        100,
        input_length=100,
        trainable=False)

lstm_layer_doc=LSTM(num_lstm,dropout=rate_drop_lstm, recurrent_dropout=rate_drop_lstm,activation='sigmoid')
doc_1_input = Input(shape=(100,), dtype='int32')
embedded_doc_1 = embedding_layer_doc(doc_1_input)
doc1=lstm_layer_doc(embedded_doc_1)

doc_2_input = Input(shape=(100,), dtype='int32')
embedded_doc_2 = embedding_layer_doc(doc_2_input)
doc2=lstm_layer_doc(embedded_doc_2)

120500

In [80]:
#doc2vec lstm layer
lstm_layer_doc=LSTM(num_lstm,dropout=rate_drop_lstm, recurrent_dropout=rate_drop_lstm)

# Input Block
vocabulary_size=1000
doc1_input = Input(shape=(100,))
doc1_embedding = Embedding(vocabulary_size, 128, mask_zero=True)(doc1_input)
doc1=lstm_layer_doc(doc1_embedding)

doc2_input = Input(shape=(100,))
doc2_embedding = Embedding(vocabulary_size, 128, mask_zero=True)(doc2_input)
doc2=lstm_layer_doc(doc1_embedding)

In [77]:
embedding_layer.input_length

30

In [25]:
print(EMBEDDING_DIM)
print(MAX_SEQUENCE_LENGTH)
print(embedding_matrix.shape)

300
30
(120500, 300)


In [35]:
num_lstm

220

In [39]:
EMBEDDING_DIM

300

In [37]:
doc2_embedding

<tf.Tensor 'embedding_3/Gather:0' shape=(?, 100, 128) dtype=float32>

# LSTM GO DIE

In [27]:
########################################
## define the model structure
########################################
embedding_layer = Embedding(nb_words,
        EMBEDDING_DIM,
        weights=[embedding_matrix],
        input_length=MAX_SEQUENCE_LENGTH,
        trainable=False)
lstm_layer = LSTM(num_lstm, dropout=rate_drop_lstm, recurrent_dropout=rate_drop_lstm)

sequence_1_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences_1 = embedding_layer(sequence_1_input)
x1 = lstm_layer(embedded_sequences_1)

sequence_2_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences_2 = embedding_layer(sequence_2_input)
y1 = lstm_layer(embedded_sequences_2)

#doc2vec lstm layer
lstm_layer_doc=LSTM(100,dropout=rate_drop_lstm, recurrent_dropout=rate_drop_lstm)

# Input Block
doc1_input = Input(shape=(100,))
doc1_embedding = Embedding(nb_words, 64, mask_zero=True)(doc1_input)
doc1=lstm_layer_doc(doc1_embedding)

doc2_input = Input(shape=(100,))
doc2_embedding = Embedding(nb_words, 64, mask_zero=True)(doc2_input)
doc2=lstm_layer_doc(doc1_embedding)


merged = concatenate([x1, y1,doc1,doc2])
merged = Dropout(rate_drop_dense)(merged)
merged = BatchNormalization()(merged)

merged = Dense(num_dense, activation=act)(merged)
merged = Dropout(rate_drop_dense)(merged)
merged = BatchNormalization()(merged)

preds = Dense(1, activation='sigmoid')(merged)

########################################
## add class weight
########################################
#if re_weight:
#    class_weight = {0: 1.309028344, 1: 0.472001959}
#else:
#    class_weight = None
    
if re_weight:
    class_weight = {0: 1.3253968253, 1: 0.4459459459}
else:
    class_weight = None

In [None]:
########################################
## train the model
########################################
model = Model(inputs=[sequence_1_input, sequence_2_input,doc1_input,doc2_input], \
              outputs=preds)
model.compile(loss='binary_crossentropy',
              optimizer='nadam',
              metrics=['acc'])
#model.summary()
print(STAMP)

early_stopping =EarlyStopping(monitor='val_loss', patience=3)
bst_model_path = STAMP + '.h5'
model_checkpoint = ModelCheckpoint(bst_model_path, save_best_only=True, save_weights_only=True)

hist = model.fit([data_1_train, data_2_train,doc_1_train,doc_2_train], labels_train, \
        validation_data=([data_1_val, data_2_val,doc_1_val,doc_2_val], labels_val, weight_val), \
        epochs=200, batch_size=512, shuffle=True, \
        class_weight=class_weight, callbacks=[early_stopping, model_checkpoint])

model.load_weights(bst_model_path)
bst_val_score = min(hist.history['val_loss'])
print(bst_val_score)

lstm_220_141_0.29_0.40
Train on 727722 samples, validate on 80858 samples
Epoch 1/200


# LSTM GO OK

In [None]:
########################################
## define the model structure
########################################
embedding_layer = Embedding(nb_words,
        EMBEDDING_DIM,
        weights=[embedding_matrix],
        input_length=MAX_SEQUENCE_LENGTH,
        trainable=False)
lstm_layer = LSTM(num_lstm, dropout=rate_drop_lstm, recurrent_dropout=rate_drop_lstm)

sequence_1_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences_1 = embedding_layer(sequence_1_input)
x1 = lstm_layer(embedded_sequences_1)

sequence_2_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences_2 = embedding_layer(sequence_2_input)
y1 = lstm_layer(embedded_sequences_2)

#doc2vec lstm layer
lstm_layer_doc=LSTM(100,dropout=rate_drop_lstm, recurrent_dropout=rate_drop_lstm)

# Input Block
doc1_input = Input(shape=(100,))
doc1_embedding = Embedding(nb_words, 64, mask_zero=True)(doc1_input)
doc1=lstm_layer_doc(doc1_embedding)

doc2_input = Input(shape=(100,))
doc2_embedding = Embedding(nb_words, 64, mask_zero=True)(doc2_input)
doc2=lstm_layer_doc(doc1_embedding)


#other features for merge
train_all_input = Input(shape=(train_all.shape[1],),)
train_all_dense = Dense(int(num_dense/2), activation=act)(train_all_input)

train_features_input = Input(shape=(train_features.shape[1],),)
train_features_dense = Dense(int(num_dense/2), activation=act)(train_features_input)

train5_input = Input(shape=(train5.shape[1],),)
train5_dense = Dense(int(num_dense/2), activation=act)(train5_input)

train6_input = Input(shape=(train6.shape[1],),)
train6_dense = Dense(int(num_dense/2), activation=act)(train6_input)


merged = concatenate([x1, y1,doc1,doc2, train_all_dense,train_features_dense,train5_dense,train6_dense])
merged = Dropout(rate_drop_dense)(merged)
merged = BatchNormalization()(merged)

merged = Dense(num_dense, activation=act)(merged)
merged = Dropout(rate_drop_dense)(merged)
merged = BatchNormalization()(merged)

preds = Dense(1, activation='sigmoid')(merged)

########################################
## add class weight
########################################
#if re_weight:
#    class_weight = {0: 1.309028344, 1: 0.472001959}
#else:
#    class_weight = None
    
if re_weight:
    class_weight = {0: 1.3253968253, 1: 0.4459459459}
else:
    class_weight = None

# Train1

In [77]:
########################################
## train the model
########################################
model = Model(inputs=[sequence_1_input, sequence_2_input, train_all_input,train_features_input,train5_input,train6_input], \
              outputs=preds)
model.compile(loss='binary_crossentropy',
              optimizer='nadam',
              metrics=['acc'])
#model.summary()
print(STAMP)

early_stopping =EarlyStopping(monitor='val_loss', patience=3)
bst_model_path = STAMP + '.h5'
model_checkpoint = ModelCheckpoint(bst_model_path, save_best_only=True, save_weights_only=True)

hist = model.fit([data_1_train, data_2_train, train_all_train,train_features_train,train5_train,train6_train], labels_train, \
        validation_data=([data_1_val, data_2_val, train_all_val,train_features_val,train5_val,train6_val], labels_val, weight_val), \
        epochs=200, batch_size=512, shuffle=True, \
        class_weight=class_weight, callbacks=[early_stopping, model_checkpoint])

model.load_weights(bst_model_path)
bst_val_score = min(hist.history['val_loss'])
print(bst_val_score)

lstm_204_128_0.39_0.34
Train on 727722 samples, validate on 80858 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200


In [78]:
########################################
## make the submission
########################################
print('Start making the submission before fine-tuning')

preds = model.predict([test_data_1, test_data_2,test_all,test_features,test5,test6], batch_size=1024, verbose=1)
preds += model.predict([test_data_2, test_data_1,test_all,test_features,test5,test6], batch_size=1024, verbose=1)
preds /= 2

submission = pd.DataFrame({'test_id':test_ids, 'is_duplicate':preds.ravel()})
submission.to_csv('../output/LSTM20170528_%.4f_'%(bst_val_score)+STAMP+'.csv', index=False)

Start making the submission before fine-tuning


In [79]:
print(submission[(submission.is_duplicate>0.5)].shape)

(80496, 2)


# Train2

In [81]:
num_lstm = np.random.randint(175, 275)
num_dense = np.random.randint(100, 150)
rate_drop_lstm = 0.15 + np.random.rand() * 0.25
rate_drop_dense = 0.15 + np.random.rand() * 0.25

act = 'relu'
re_weight = True # whether to re-weight classes to fit the 17.5% share in test set

STAMP = 'lstm_%d_%d_%.2f_%.2f'%(num_lstm, num_dense, rate_drop_lstm, \
        rate_drop_dense)

In [85]:
########################################
## train the model
########################################
model = Model(inputs=[sequence_1_input, sequence_2_input, train_all_input,train_features_input,train5_input,train6_input], \
              outputs=preds)
model.compile(loss='binary_crossentropy',
              optimizer='nadam',
              metrics=['acc'])
#model.summary()
print(STAMP)

early_stopping =EarlyStopping(monitor='val_loss', patience=3)
bst_model_path = STAMP + '.h5'
model_checkpoint = ModelCheckpoint(bst_model_path, save_best_only=True, save_weights_only=True)

hist = model.fit([data_1_train, data_2_train, train_all_train,train_features_train,train5_train,train6_train], labels_train, \
        validation_data=([data_1_val, data_2_val, train_all_val,train_features_val,train5_val,train6_val], labels_val, weight_val), \
        epochs=200, batch_size=512, shuffle=True, \
        class_weight=class_weight, callbacks=[early_stopping, model_checkpoint])

model.load_weights(bst_model_path)
bst_val_score = min(hist.history['val_loss'])
print(bst_val_score)

lstm_208_136_0.21_0.21
Train on 727722 samples, validate on 80858 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200


In [86]:
########################################
## make the submission
########################################
print('Start making the submission before fine-tuning')
print(bst_val_score)

preds = model.predict([test_data_1, test_data_2,test_all,test_features,test5,test6], batch_size=1024, verbose=1)
preds += model.predict([test_data_2, test_data_1,test_all,test_features,test5,test6], batch_size=1024, verbose=1)
preds /= 2

submission = pd.DataFrame({'test_id':test_ids, 'is_duplicate':preds.ravel()})
submission.to_csv('../output/LSTM20170529_%.4f_'%(bst_val_score)+STAMP+'.csv', index=False)

Start making the submission before fine-tuning
0.156718803446


# Train3

In [89]:
########################################
## train the model
########################################
model = Model(inputs=[sequence_1_input, sequence_2_input, train_all_input,train_features_input,train5_input,train6_input], \
              outputs=preds)
model.compile(loss='binary_crossentropy',
              optimizer='nadam',
              metrics=['acc'])
#model.summary()
print(STAMP)

early_stopping =EarlyStopping(monitor='val_loss', patience=3)
bst_model_path = STAMP + '.h5'
model_checkpoint = ModelCheckpoint(bst_model_path, save_best_only=True, save_weights_only=True)

hist = model.fit([data_1_train, data_2_train, train_all_train,train_features_train,train5_train,train6_train], labels_train, \
        validation_data=([data_1_val, data_2_val, train_all_val,train_features_val,train5_val,train6_val], labels_val, weight_val), \
        epochs=200, batch_size=512, shuffle=True, \
        class_weight=class_weight, callbacks=[early_stopping, model_checkpoint])

model.load_weights(bst_model_path)
bst_val_score = min(hist.history['val_loss'])
print(bst_val_score)

lstm_245_117_0.18_0.40


In [31]:
########################################
## train the model
########################################
model = Model(inputs=[sequence_1_input, sequence_2_input, leaks_input], \
        outputs=preds)
model.compile(loss='binary_crossentropy',
        optimizer='nadam',
        metrics=['acc'])
#model.summary()
print(STAMP)

early_stopping =EarlyStopping(monitor='val_loss', patience=3)
bst_model_path = STAMP + '.h5'
model_checkpoint = ModelCheckpoint(bst_model_path, save_best_only=True, save_weights_only=True)

hist = model.fit([data_1_train, data_2_train, leaks_train], labels_train, \
        validation_data=([data_1_val, data_2_val, leaks_val], labels_val, weight_val), \
        epochs=200, batch_size=512, shuffle=True, \
        class_weight=class_weight, callbacks=[early_stopping, model_checkpoint])

model.load_weights(bst_model_path)
bst_val_score = min(hist.history['val_loss'])
print(bst_val_score)

lstm_238_144_0.29_0.35
Train on 727722 samples, validate on 80858 samples
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200


In [32]:
########################################
## make the submission
########################################
print('Start making the submission before fine-tuning')
print(bst_val_score)

preds = model.predict([test_data_1, test_data_2,test_all,test_features,test5,test6], batch_size=1024, verbose=1)
preds += model.predict([test_data_2, test_data_1,test_all,test_features,test5,test6], batch_size=1024, verbose=1)
preds /= 2

submission = pd.DataFrame({'test_id':test_ids, 'is_duplicate':preds.ravel()})
submission.to_csv('../output/LSTM20170529_%.4f_'%(bst_val_score)+STAMP+'.csv', index=False)

Start making the submission before fine-tuning


In [33]:
print('../output/%.4f_'%(bst_val_score)+STAMP+'.csv')

../output/0.1752_lstm_238_144_0.29_0.35.csv
