In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
import time
import sys
print(os.listdir("../input"))

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Reshape, Flatten, Concatenate, SpatialDropout1D
from keras.layers import BatchNormalization
from keras.layers import Dense, Input, CuDNNLSTM, Embedding, Dropout, Activation
from keras.layers import CuDNNGRU, Conv1D, MaxPool1D, AvgPool1D
from keras.layers import Bidirectional, GlobalMaxPool1D, GlobalMaxPooling1D
from keras.layers import GlobalAveragePooling1D
from keras.layers import concatenate
from keras.optimizers import Adam
from keras.models import Model, Sequential
from keras import backend as K
from keras.layers import Lambda
from keras.engine.topology import Layer
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, Callback
import tensorflow as tf

from sklearn import metrics
from sklearn.model_selection import train_test_split

import re
import gc

['sample_submission.csv', 'train.csv', 'test.csv', 'embeddings']


Using TensorFlow backend.


Some links:
1. Cleaning up the data – [How to: Preprocessing when using embeddings](https://www.kaggle.com/christofhenkel/how-to-preprocessing-when-using-embeddings)
2. More data clean-up – [Improve your Score with Text Preprocessing -- V2](https://www.kaggle.com/theoviel/improve-your-score-with-text-preprocessing-v2)
3. Using different embeddings – [A look at different embeddings.!](https://www.kaggle.com/sudalairajkumar/a-look-at-different-embeddings)
4. Simple LSTM with embedding – [LSTM is all you need! well, maybe embeddings also.](https://www.kaggle.com/mihaskalic/lstm-is-all-you-need-well-maybe-embeddings-also)
5. [Cyclic LRs, computing F-1 threshold on each epoch, skip connections](https://www.kaggle.com/shujian/single-rnn-with-4-folds-clr)
6. [InceptionCNN with flip](https://www.kaggle.com/christofhenkel/inceptioncnn-with-flip)

Some further ideas:
1. Mixing embeddings, Capsule layer? – [GRU capsule](https://www.kaggle.com/gmhost/gru-capsule)
2. CNNs and Attention – [Different embeddings with Attention! [Fork][Fork]](https://www.kaggle.com/shujian/different-embeddings-with-attention-fork-fork)
3. [Ideas for tabular features](https://www.kaggle.com/thebrownviking20/analyzing-quora-for-the-insinceres)

In [2]:
# Reference timestamp for the notebook; the timing cell after preprocessing
# subtracts it to report how long preprocessing took.
tc1 = time.perf_counter()

In [3]:
# Run-mode switches read by the rest of the notebook.
# Set `production = False` to select the experimentation profile below.
production = True

if production:
    no_f1 = True  # do not compute test/val f1, use fixed number of epochs
    # Set to False to keep a validation split even in production mode.
    # NOTE(review): the original comment read "use whole test set for training";
    # presumably the whole *training* set is meant -- confirm against the fit cell.
    no_val_split = True
    f1_per_epoch = False
else:
    no_f1 = False
    no_val_split = False
    f1_per_epoch = True

# When True, hand-crafted tabular text features are computed and stacked next
# to the padded token sequences; set to False to skip them.
use_tab = True

In [4]:
# Load both competition files and keep the test ids before the frames are merged.
df_train = pd.read_csv("../input/train.csv")
df_test = pd.read_csv("../input/test.csv")
test_qid = df_test["qid"].values

for split_name, split_frame in (("Train", df_train), ("Test", df_test)):
    print(f"{split_name} shape :  {split_frame.shape}")

# The test file has no label column; add a placeholder so both frames line up.
df_test['target'] = -1  # just a dummy value

n_train, n_test = len(df_train), len(df_test)

# easier to pre-process single DF, especially when building vocabulary
# (train rows come first, so the first n_train rows are the training data)
df_total = pd.concat([df_train, df_test])
del df_train, df_test
gc.collect()

Train shape :  (1306122, 3)
Test shape :  (56370, 2)


18

In [5]:
def load_embed(file):
    """Load a space-separated word-embedding text file into a dictionary.

    Every kept line is split on single spaces; the first field is the word and
    the remaining fields are converted to a ``float32`` vector.

    Args:
        file: Path to the embedding file.

    Returns:
        dict mapping each word (str) to a 1-D ``float32`` numpy array.
    """
    def get_coefs(word, *arr):
        return word, np.asarray(arr, dtype='float32')

    # The file is opened in a ``with`` block so the handle is closed as soon as
    # the dictionary has been built, instead of being left to the garbage collector.
    if file == '../input/embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec':
        # Lines of 100 characters or fewer are skipped for this file
        # (presumably the header line -- TODO confirm).
        with open(file) as handle:
            embeddings_index = dict(get_coefs(*o.split(" ")) for o in handle if len(o) > 100)
    else:
        with open(file, encoding='latin') as handle:
            embeddings_index = dict(get_coefs(*o.split(" ")) for o in handle)

    return embeddings_index

In [6]:
# Contraction -> expanded form. clean_contractions() looks up whole
# space-delimited tokens in this dict, so matching is exact and case-sensitive.
contraction_mapping = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",  "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", 
"we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",  "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have","you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have" }
# Characters that clean_special_chars() surrounds with spaces so that they end
# up as separate tokens. The backslash in front of '×' is spelled '\\' here:
# the resulting string is unchanged, but the source no longer contains the
# invalid escape sequence '\×' (a DeprecationWarning, SyntaxWarning on newer Pythons).
punct = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\\×™√²—–&'

# Character -> replacement text, applied by clean_special_chars() before the
# characters in `punct` are padded with spaces.
# NOTE: the '“' key is listed twice with the same value; the duplicate is harmless.
punct_mapping = {"‘": "'", "₹": "e", "´": "'", "°": "", "€": "e", "™": "tm", "√": " sqrt ", "×": "x", "²": "2", "—": "-", "–": "-", "’": "'", "_": "-", "`": "'", '“': '"', '”': '"', '“': '"', "£": "e", '∞': 'infinity', 'θ': 'theta', '÷': '/', 'α': 'alpha', '•': '.', 'à': 'a', '−': '-', 'β': 'beta', '∅': '', '³': '3', 'π': 'pi', }

# Misspelling / variant -> replacement. correct_spelling() applies these with
# plain str.replace, i.e. as substring replacements, so a key also matches
# inside longer words. Note that the 'youtu ' key ends with a space.
misspell_dict = {'colour': 'color', 'centre': 'center', 'favourite': 'favorite', 'travelling': 'traveling', 'counselling': 'counseling', 'theatre': 'theater', 'cancelled': 'canceled', 'labour': 'labor', 'organisation': 'organization', 'wwii': 'world war 2', 'citicise': 'criticize', 'youtu ': 'youtube ', 'Qoura': 'Quora', 'sallary': 'salary', 'Whta': 'What', 'narcisist': 'narcissist', 'howdo': 'how do', 'whatare': 'what are', 'howcan': 'how can', 'howmuch': 'how much', 'howmany': 'how many', 'whydo': 'why do', 'doI': 'do I', 'theBest': 'the best', 'howdoes': 'how does', 'mastrubation': 'masturbation', 'mastrubate': 'masturbate', "mastrubating": 'masturbating', 'pennis': 'penis', 'Etherium': 'Ethereum', 'narcissit': 'narcissist', 'bigdata': 'big data', '2k17': '2017', '2k18': '2018', 'qouta': 'quota', 'exboyfriend': 'ex boyfriend', 'airhostess': 'air hostess', "whst": 'what', 'watsapp': 'whatsapp', 'demonitisation': 'demonetization', 'demonitization': 'demonetization', 'demonetisation': 'demonetization', 'pokémon': 'pokemon'}

In [7]:
def build_vocab(texts):
    """Count how often each whitespace-separated token occurs.

    Args:
        texts: pandas Series of strings.

    Returns:
        dict mapping token -> number of occurrences over all texts.
    """
    counts = {}
    for tokens in texts.apply(lambda x: x.split()).values:
        for token in tokens:
            counts[token] = counts.get(token, 0) + 1
    return counts

def add_lower(embedding, vocab):
    """Give lowercase aliases to embedded vocabulary words (mutates `embedding`).

    For every word of `vocab` that has a vector while its lowercase form does
    not, the lowercase form is added pointing at the same vector. The number of
    added entries is printed.
    """
    added = 0
    for word in vocab:
        lowered = word.lower()
        if word not in embedding or lowered in embedding:
            continue
        embedding[lowered] = embedding[word]
        added += 1
    print(f"Added {added} words to embedding")
    

def clean_contractions(text, mapping):
    """Normalise apostrophe look-alikes, then expand contractions.

    Args:
        text: input string.
        mapping: dict of contraction -> expansion, matched against whole
            tokens obtained by splitting on single spaces.

    Returns:
        The rewritten string.
    """
    for apostrophe_like in ("’", "‘", "´", "`"):
        text = text.replace(apostrophe_like, "'")
    tokens = text.split(" ")
    return ' '.join(mapping.get(token, token) for token in tokens)

def unknown_punct(embedding, punct):
    """Return the characters of `punct` missing from `embedding`.

    Each missing character is followed by one space in the returned string.
    """
    return ''.join(p + ' ' for p in punct if p not in embedding)

def clean_special_chars(text, punct, mapping):
    for p in mapping:
        text = text.replace(p, mapping[p])
    
    for p in punct:
        text = text.replace(p, f' {p} ')
    
    specials = {'\u200b': ' ', '…': ' ... ', '\ufeff': '', 'करना': '', 'है': ''}  # Other special characters that I have to deal with in last
    for s in specials:
        text = text.replace(s, specials[s])
    
    return text

def clean_numbers(x):
    """Mask digit runs with '#' placeholders.

    Runs of five or more digits become '#####'; then groups of four, three and
    two digits become '####', '###' and '##'. Single digits are left alone.
    """
    masks = (
        ('[0-9]{5,}', '#####'),
        ('[0-9]{4}', '####'),
        ('[0-9]{3}', '###'),
        ('[0-9]{2}', '##'),
    )
    for pattern, placeholder in masks:
        x = re.sub(pattern, placeholder, x)
    return x

def correct_spelling(x, dic):
    """Apply every `dic` key -> value pair to `x` as a substring replacement."""
    for wrong, right in dic.items():
        x = x.replace(wrong, right)
    return x

In [8]:
def compute_len(df):
    """Add 'question_text_len': the character length of each 'question_text' (in place)."""
    df['question_text_len'] = df['question_text'].apply(len)


def count_chars_multiple(df, chars):
    """Add one column holding the summed occurrences of all `chars` per question.

    The column is named 'question_text_count_' followed by the characters
    joined with '_'. Nothing is added when `chars` is empty.
    """
    column = 'question_text_count_' + '_'.join(chars)
    for position, ch in enumerate(chars):
        occurrences = df['question_text'].apply(lambda x: x.count(ch))
        if position == 0:
            df[column] = occurrences
        else:
            df[column] += occurrences


def count_chars(df, chars):
    """Add one 'question_text_count_<ch>' column per character in `chars` (in place)."""
    questions = df['question_text']
    for ch in chars:
        column = 'question_text_count_' + ch
        df[column] = questions.apply(lambda x: x.count(ch))


def compute_chars_digits(df):
    """Add 'question_text_digits': number of digit characters per question (in place)."""
    def digit_count(text):
        return sum(1 for c in text if c.isdigit())

    df['question_text_digits'] = df['question_text'].apply(digit_count)


def count_capital_letters(df):
    """Add 'question_text_capital_letters': number of uppercase characters per question."""
    def upper_count(text):
        return sum(1 for c in text if c.isupper())

    df['question_text_capital_letters'] = df['question_text'].apply(upper_count)


def count_bad_punctuation(df, chars):
    """Count punctuation marks directly followed by a non-space, non-dash character.

    The per-character counts are summed into a single column named
    'question_text_count_badpc_' plus the characters joined with '_'.
    Each character is backslash-escaped before being put into the regex.
    """
    column = 'question_text_count_badpc_' + '_'.join(chars)
    for position, ch in enumerate(chars):
        pattern = re.compile('\\' + ch + r'[^\s-]')
        hits = df['question_text'].apply(lambda x: len(pattern.findall(x)))
        if position == 0:
            df[column] = hits
        else:
            df[column] += hits


def count_bad_punctuation2(df, chars):
    """Count punctuation marks directly preceded by a non-space, non-dash character.

    The per-character counts are summed into a single column named
    'question_text_count_badpc2_' plus the characters joined with '_'.
    Each character is backslash-escaped before being put into the regex.
    """
    column = 'question_text_count_badpc2_' + '_'.join(chars)
    for position, ch in enumerate(chars):
        pattern = re.compile(r'[^\s-]' + '\\' + ch)
        hits = df['question_text'].apply(lambda x: len(pattern.findall(x)))
        if position == 0:
            df[column] = hits
        else:
            df[column] += hits


def compute_wl(x):
    """Return the length of the longest whitespace-separated word in `x` (0 if none)."""
    words = x.split()
    return max(map(len, words)) if words else 0

def compute_wl2(x):
    """Return word-length statistics for `x` as a five-element list.

    Elements, in order: number of words, length of the longest word, number of
    words shorter than 4 characters, median word length, standard deviation of
    the word lengths. A text without words yields five zeros.
    """
    words = x.split()
    if not words:
        return [0] * 5

    lengths = [len(word) for word in words]
    n_short = sum(1 for size in lengths if size < 4)
    return [len(words), max(lengths), n_short, np.median(lengths), np.std(lengths)]

                
def longest_word(df):
    """Add 'question_text_longest': length of the longest word per question (in place)."""
    df['question_text_longest'] = df['question_text'].apply(compute_wl)
    

def word_lengths(df):
    """Add the word-length statistics from compute_wl2 as separate columns (in place).

    The raw five-element list is kept in 'question_text_longest_list'; its
    elements are then unpacked, in order, into the five named columns below.
    """
    df['question_text_longest_list'] = df['question_text'].apply(compute_wl2)

    stat_columns = (
        'question_text_numofwords',
        'question_text_longestword',
        'question_text_shortwords',
        'question_text_medwordlen',
        'question_text_stdwordlen',
    )
    for position, column in enumerate(stat_columns):
        df[column] = df['question_text_longest_list'].apply(lambda x: x[position])


def count_multiple_exclamation(df):
    """Add 'question_text_multipleexclamation': number of runs of 2+ '!' per question."""
    runs = re.compile(r'!{2,}')
    df['question_text_multipleexclamation'] = df['question_text'].apply(
        lambda x: len(runs.findall(x)))
    

def count_multiple_chars(df, chars):
    """Add one 'question_text_multiple_<ch>' column per entry of `chars`.

    Each entry is used verbatim as a regex fragment (callers must escape regex
    metacharacters themselves); the column counts runs of two or more matches.
    """
    for ch in chars:
        runs = re.compile(ch + r'{2,}')
        df['question_text_multiple_' + ch] = df['question_text'].apply(
            lambda x: len(runs.findall(x)))
    


def compute_percentage(df, column_names, divisor):
    """Add '<col>_div_<divisor>' ratio columns and return `df`.

    Each ratio is the column divided by (divisor column + 1); the +1 keeps the
    denominator non-zero when the divisor is 0.
    """
    denominator = df[divisor] + 1
    for col_name in column_names:
        ratio_name = col_name + '_div_' + divisor
        df[ratio_name] = df[col_name] / denominator
    return df

# Characters that get an individual occurrence-count column.
charslist = ['!', '?', '$', '&', '(', ')']
# Subset whose count / text-length ratio becomes a model feature; ')' is only
# counted so the unbalanced-parenthesis feature can be computed.
charslist_use = ['!', '?', '$', '&', '(']

if use_tab:

    # All helpers below add columns to df_total in place. They run on the raw
    # question text, i.e. before clean_question_text() rewrites it.
    compute_len(df_total)
    print('len')

    compute_chars_digits(df_total)
    print('digits')

    count_capital_letters(df_total)
    print('capital letters')

    count_chars(df_total, charslist)
    print('chars')

    word_lengths(df_total)

    count_bad_punctuation(df_total, ['.', ',', ')'])

    # NOTE: the columns produced by the next two calls are not added to
    # tab_features_list in this cell.
    count_bad_punctuation2(df_total, ['('])

    count_multiple_chars(df_total, ['!', r'\.', r'\?'])


    # Absolute difference between the number of '(' and ')' in each question.
    df_total['unbalanced_qt_('] = abs(df_total['question_text_count_('] - df_total['question_text_count_)'])

    # Names of the columns that are fed to the model as tabular features.
    tab_features_list = ['question_text_len', 'question_text_numofwords', 'question_text_longestword',
                         'question_text_shortwords', 'question_text_medwordlen',
                         'question_text_stdwordlen']

    # Per-character counts are used relative to the text length.
    for sch in charslist_use:
        # count_chars(df_total, [sch])
        compute_percentage(df_total, ['question_text_count_' + sch], 'question_text_len')
        tab_features_list += ['question_text_count_' + sch + '_div_question_text_len']


    compute_percentage(df_total, ['question_text_capital_letters',
                                  'question_text_digits', 'unbalanced_qt_(',
                                  'question_text_count_badpc_._,_)'], 'question_text_len')

    tab_features_list += ['question_text_capital_letters_div_question_text_len']
    tab_features_list += ['question_text_digits_div_question_text_len']
    tab_features_list += ['unbalanced_qt_(_div_question_text_len']
    tab_features_list += ['question_text_count_badpc_._,_)_div_question_text_len']

    # 6 base features + 5 character ratios + 4 further ratios = 15
    print(len(tab_features_list))
else:
    # No tabular features: downstream loops over this list become no-ops.
    tab_features_list = []

len
digits
capital letters
chars
15


In [9]:
# Standardise every tabular feature to zero mean / unit standard deviation.
# The statistics are taken over df_total, i.e. train and test rows together.
for tab_feat in tab_features_list:
    feature_values = df_total[tab_feat]
    tab_mean, tab_std = feature_values.mean(), feature_values.std()
    df_total[tab_feat] = (feature_values - tab_mean) / tab_std

In [10]:
# Outside production mode, print each feature's mean for target == 0 and for
# target == 1 (test rows carry the dummy target -1 and are excluded by both masks).
if not production:
    for tab_feat in tab_features_list:
        mean_target0 = df_total.loc[df_total['target'] == 0, tab_feat].mean()
        mean_target1 = df_total.loc[df_total['target'] == 1, tab_feat].mean()
        print(tab_feat, mean_target0, mean_target1)

In [11]:
def clean_question_text(df):
    """Run the text-cleaning steps over df['question_text'] (in place).

    Order of the steps: digit masking, contraction expansion, special
    character / punctuation handling, spelling corrections.
    """
    cleaning_steps = (
        clean_numbers,
        lambda x: clean_contractions(x, contraction_mapping),
        lambda x: clean_special_chars(x, punct, punct_mapping),
        lambda x: correct_spelling(x, misspell_dict),
    )
    for step in cleaning_steps:
        df['question_text'] = df['question_text'].apply(step)

In [12]:
def improve_embedding(df, embedding):
    """Add lowercase aliases to `embedding` for words found in df['question_text'].

    `embedding` is modified in place; see add_lower for the exact rule.
    """
    add_lower(embedding, build_vocab(df['question_text']))

In [13]:
# Vocabulary size: passed to the tokenizer as num_words and used as the number
# of rows of the embedding matrix.
len_voc = 95000
# Length every token sequence is padded to by pad_sequences.
max_len = 70


def make_data(X):
    """Tokenise the texts and return padded index sequences.

    Args:
        X: the texts to tokenise (the caller passes a pandas Series of str).

    Returns:
        (sequences, word_index): the output of pad_sequences with
        maxlen=max_len, and the fitted tokenizer's word -> index dict.
    """
    # filters='' disables the tokenizer's own punctuation stripping
    tokenizer = Tokenizer(num_words=len_voc, filters='')
    tokenizer.fit_on_texts(X)
    sequences = tokenizer.texts_to_sequences(X)
    padded = pad_sequences(sequences, maxlen=max_len)
    return padded, tokenizer.word_index

def make_embed_matrix(embeddings_index, word_index, len_voc):
    """Build the embedding matrix for the tokenizer vocabulary.

    Rows are first drawn from a normal distribution with the mean and standard
    deviation of all pre-trained coefficients; rows of words that have a
    pre-trained vector are then overwritten with that vector.

    Args:
        embeddings_index: dict word -> 1-D vector; all vectors must share one length.
        word_index: dict word -> integer row index.
        len_voc: number of rows of the matrix; words whose index is
            >= len_voc are ignored.

    Returns:
        numpy array of shape (len_voc, embedding dimension).
    """
    # np.stack expects a sequence of arrays; handing it a dict view relies on
    # deprecated behaviour, so the view is materialised first.
    all_embs = np.stack(list(embeddings_index.values()))
    emb_mean, emb_std = all_embs.mean(), all_embs.std()
    embed_size = all_embs.shape[1]
    # The stacked copy is only needed for the statistics above.
    del all_embs

    embedding_matrix = np.random.normal(emb_mean, emb_std, (len_voc, embed_size))

    for word, i in word_index.items():
        if i >= len_voc:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    return embedding_matrix

In [14]:
# Additional tabular-feature helpers.
#
# compute_len, count_chars_multiple, count_chars, count_capital_letters,
# count_bad_punctuation, count_bad_punctuation2, compute_wl, longest_word and
# count_multiple_exclamation are defined in the earlier feature cell (In [8])
# and are deliberately not re-declared here. The copies that used to live in
# this cell shadowed those definitions, and several of them called a bare
# `findall`, a name that is never defined (only the `re` module is imported).


def compute_chars_digits(df):
    """Add per-question digit and letter counts (in place).

    Extends the earlier definition of the same name: besides
    'question_text_digits' it also adds 'question_text_chars', the number of
    alphabetic characters.
    """
    df['question_text_digits'] = df['question_text'].apply(lambda x: sum(c.isdigit() for c in x))
    df['question_text_chars'] = df['question_text'].apply(lambda x: sum(c.isalpha() for c in x))


def count_words(df):
    """Add 'question_text_words': number of whitespace-separated words per question."""
    # The previous version wrapped this line in a loop over an undefined
    # `col_names`; the loop body never used the loop variable.
    df['question_text_words'] = df['question_text'].apply(lambda x: len(x.split()))


def count_english_chars(df):
    """Add 'question_text_eng_chars': number of ASCII letters (a-z, A-Z) per question."""
    df['question_text_eng_chars'] = df['question_text'].apply(lambda x: len(re.findall('[a-zA-Z]', x)))


def count_unique_words(df):
    """Add 'question_text_num_unique_words': number of distinct lowercased words per question."""
    df['question_text_num_unique_words'] = df['question_text'].apply(lambda x: len(set(w.lower() for w in x.split())))


In [15]:
# Embedding files to load; only GloVe is enabled, the other two are commented out.
embedding_files = [
    '../input/embeddings/glove.840B.300d/glove.840B.300d.txt',
#     '../input/embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec',
#     '../input/embeddings/paragram_300_sl999/paragram_300_sl999.txt'
]

# Cleans df_total['question_text'] in place; must run before tokenisation.
clean_question_text(df_total)
# One embedding matrix per entry of embedding_files, in the same order.
embeddings_list = []

# Token-id sequences for train + test rows, and the word -> index map.
X_total, word_index = make_data(df_total['question_text'])

for i, embedding_f in enumerate(embedding_files):
    print('Loading {}/{}'.format(i + 1, len(embedding_files)))
    embed = load_embed(embedding_f)

    # Adds lowercase aliases to `embed` in place, before the matrix is built.
    improve_embedding(df_total, embed)

    embeddings_list.append(make_embed_matrix(embed, word_index, len_voc))

# Free the large intermediates.
del word_index
# NOTE: `embed` is only bound inside the loop, so this line raises NameError
# if embedding_files is empty.
del embed

gc.collect()

Loading 1/1
Added 17690 words to embedding


  del sys.path[0]


0

In [16]:
# Pull the standardised tabular features out as a plain array.
# X_tab and len_tab are only defined when use_tab is True.
if use_tab:
    X_tab = df_total[tab_features_list].values
    print(X_tab.shape, X_total.shape)
    # Number of tabular features; make_model_capsule_wtab uses it to size and
    # split its input.
    len_tab = len(tab_features_list)

(1362492, 15) (1362492, 70)


In [17]:
# Append the tabular features as extra columns after the max_len token columns,
# so a single input array carries both parts.
if use_tab:
    X_total = np.hstack((X_total, X_tab))
    del X_tab

# df_total was built as train rows followed by test rows, so the first n_train
# rows are the training data and the remainder is the test data.
X_train_full = X_total[:n_train, :]
X_test = X_total[n_train:, :]

print(X_test.shape, n_test)
del X_total

# Labels of the training rows only (test rows hold the dummy value -1).
y = df_total['target'].values[:n_train]
print(y.shape)

del df_total

(56370, 85) 56370
(1306122,)


In [18]:
# Seconds elapsed between the start-of-notebook timestamp (tc1) and now.
tc_preprocess = time.perf_counter()
print(f'Time to preprocess {tc_preprocess - tc1}')

Time to preprocess 506.2980740000057


In [19]:
def f1(y_true, y_pred):
    """F1 score written with Keras backend ops, for use as a model metric.

    Values are clipped to [0, 1] and rounded before counting, and K.epsilon()
    is added to every denominator to avoid division by zero.
    """
    eps = K.epsilon()

    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))

    precision = true_positives / (predicted_positives + eps)
    recall = true_positives / (possible_positives + eps)
    return 2*((precision*recall)/(precision+recall+eps))

In [20]:
def squash(x, axis=-1):
    """Scale `x` to (approximately) unit L2 norm along `axis`.

    K.epsilon() is added under the square root so an all-zero vector does not
    divide by zero. An earlier variant used scale = sqrt(n) / (0.5 + n), with n
    the squared norm; it was dropped with the note that the squared norm is
    really small.
    """
    squared_norm = K.sum(K.square(x), axis, keepdims=True)
    return x / K.sqrt(squared_norm + K.epsilon())

# A Capsule Implement with Pure Keras
class Capsule(Layer):
    """Capsule layer with an iterative routing loop.

    Projects the input vectors into `num_capsule` output capsules of size
    `dim_capsule` and combines them over `routings` routing iterations.
    """

    def __init__(self, num_capsule, dim_capsule, routings=3, kernel_size=(9, 1), share_weights=True,
                 activation='default', **kwargs):
        """Store the layer hyper-parameters.

        Args:
            num_capsule: number of output capsules.
            dim_capsule: size of each output capsule vector.
            routings: number of routing iterations performed in call().
            kernel_size: stored on the instance but not used by the code in this class.
            share_weights: if True a single projection kernel is shared by all
                input positions; otherwise each position gets its own kernel.
            activation: 'default' selects squash(); any other value is wrapped
                in a Keras Activation layer.
        """
        super(Capsule, self).__init__(**kwargs)
        self.num_capsule = num_capsule
        self.dim_capsule = dim_capsule
        self.routings = routings
        self.kernel_size = kernel_size
        self.share_weights = share_weights
        if activation == 'default':
            self.activation = squash
        else:
            self.activation = Activation(activation)

    def build(self, input_shape):
        """Create the projection kernel self.W from the input shape."""
        super(Capsule, self).build(input_shape)
        input_dim_capsule = input_shape[-1]
        if self.share_weights:
            # One kernel of width 1, applied at every input position via conv1d.
            self.W = self.add_weight(name='capsule_kernel',
                                     shape=(1, input_dim_capsule,
                                            self.num_capsule * self.dim_capsule),
                                     # shape=self.kernel_size,
                                     initializer='glorot_uniform',
                                     trainable=True)
        else:
            # A separate kernel per input position (requires a fixed input_shape[-2]).
            input_num_capsule = input_shape[-2]
            self.W = self.add_weight(name='capsule_kernel',
                                     shape=(input_num_capsule,
                                            input_dim_capsule,
                                            self.num_capsule * self.dim_capsule),
                                     initializer='glorot_uniform',
                                     trainable=True)

    def call(self, u_vecs):
        """Project the inputs and run the routing iterations.

        Returns the activated capsule outputs computed in the last iteration.
        """
        if self.share_weights:
            u_hat_vecs = K.conv1d(u_vecs, self.W)
        else:
            u_hat_vecs = K.local_conv1d(u_vecs, self.W, [1], [1])

        batch_size = K.shape(u_vecs)[0]
        input_num_capsule = K.shape(u_vecs)[1]
        # Split the projected last axis into (num_capsule, dim_capsule).
        u_hat_vecs = K.reshape(u_hat_vecs, (batch_size, input_num_capsule,
                                            self.num_capsule, self.dim_capsule))
        u_hat_vecs = K.permute_dimensions(u_hat_vecs, (0, 2, 1, 3))
        # final u_hat_vecs.shape = [None, num_capsule, input_num_capsule, dim_capsule]

        # Routing logits start at zero.
        b = K.zeros_like(u_hat_vecs[:, :, :, 0])  # shape = [None, num_capsule, input_num_capsule]
        for i in range(self.routings):
            # Permute so that softmax (last axis) normalises over the output
            # capsules, then permute both tensors back.
            b = K.permute_dimensions(b, (0, 2, 1))  # shape = [None, input_num_capsule, num_capsule]
            c = K.softmax(b)
            c = K.permute_dimensions(c, (0, 2, 1))
            b = K.permute_dimensions(b, (0, 2, 1))
            # Weighted sum of the projected inputs over the input-capsule axis.
            outputs = self.activation(tf.keras.backend.batch_dot(c, u_hat_vecs, [2, 2]))
            if i < self.routings - 1:
                # New logits: dot product of the outputs with the projected inputs
                # over the capsule dimension. Skipped after the last iteration,
                # where the result would not be used.
                b = tf.keras.backend.batch_dot(outputs, u_hat_vecs, [2, 3])

        return outputs

    def compute_output_shape(self, input_shape):
        """Return (None, num_capsule, dim_capsule); the input shape is not consulted."""
        return (None, self.num_capsule, self.dim_capsule)

In [21]:
class CyclicLR(Callback):
    """This callback implements a cyclical learning rate policy (CLR).
    The method cycles the learning rate between two boundaries with
    some constant frequency, as detailed in this paper (https://arxiv.org/abs/1506.01186).
    The amplitude of the cycle can be scaled on a per-iteration or 
    per-cycle basis.
    This class has three built-in policies, as put forth in the paper.
    "triangular":
        A basic triangular cycle w/ no amplitude scaling.
    "triangular2":
        A basic triangular cycle that scales initial amplitude by half each cycle.
    "exp_range":
        A cycle that scales initial amplitude by gamma**(cycle iterations) at each 
        cycle iteration.
    For more detail, please see paper.
    
    # Example
        ```python
            clr = CyclicLR(base_lr=0.001, max_lr=0.006,
                                step_size=2000., mode='triangular')
            model.fit(X_train, Y_train, callbacks=[clr])
        ```
    
    Class also supports custom scaling functions:
        ```python
            clr_fn = lambda x: 0.5*(1+np.sin(x*np.pi/2.))
            clr = CyclicLR(base_lr=0.001, max_lr=0.006,
                                step_size=2000., scale_fn=clr_fn,
                                scale_mode='cycle')
            model.fit(X_train, Y_train, callbacks=[clr])
        ```    
    # Arguments
        base_lr: initial learning rate which is the
            lower boundary in the cycle.
        max_lr: upper boundary in the cycle. Functionally,
            it defines the cycle amplitude (max_lr - base_lr).
            The lr at any cycle is the sum of base_lr
            and some scaling of the amplitude; therefore 
            max_lr may not actually be reached depending on
            scaling function.
        step_size: number of training iterations per
            half cycle. Authors suggest setting step_size
            2-8 x training iterations in epoch.
        mode: one of {triangular, triangular2, exp_range}.
            Default 'triangular'.
            Values correspond to policies detailed above.
            If scale_fn is not None, this argument is ignored.
        gamma: constant in 'exp_range' scaling function:
            gamma**(cycle iterations)
        scale_fn: Custom scaling policy defined by a single
            argument lambda function, where 
            0 <= scale_fn(x) <= 1 for all x >= 0.
            mode parameter is ignored 
        scale_mode: {'cycle', 'iterations'}.
            Defines whether scale_fn is evaluated on 
            cycle number or cycle iterations (training
            iterations since start of cycle). Default is 'cycle'.

    # Raises
        ValueError: if scale_fn is None and mode is not one of the three
            built-in policies.
    """

    def __init__(self, base_lr=0.001, max_lr=0.006, step_size=2000., mode='triangular',
                 gamma=1., scale_fn=None, scale_mode='cycle'):
        super(CyclicLR, self).__init__()

        self.base_lr = base_lr
        self.max_lr = max_lr
        self.step_size = step_size
        self.mode = mode
        self.gamma = gamma
        if scale_fn is None:
            if self.mode == 'triangular':
                self.scale_fn = lambda x: 1.
                self.scale_mode = 'cycle'
            elif self.mode == 'triangular2':
                self.scale_fn = lambda x: 1/(2.**(x-1))
                self.scale_mode = 'cycle'
            elif self.mode == 'exp_range':
                self.scale_fn = lambda x: gamma**(x)
                self.scale_mode = 'iterations'
            else:
                # Fail here rather than with an AttributeError on the first
                # call to clr(), where scale_fn would be missing.
                raise ValueError(
                    "mode must be 'triangular', 'triangular2' or 'exp_range' "
                    "when scale_fn is None, got {!r}".format(mode))
        else:
            self.scale_fn = scale_fn
            self.scale_mode = scale_mode
        self.clr_iterations = 0.
        self.trn_iterations = 0.
        self.history = {}

        self._reset()

    def _reset(self, new_base_lr=None, new_max_lr=None,
               new_step_size=None):
        """Resets cycle iterations.
        Optional boundary/step size adjustment.
        """
        if new_base_lr is not None:
            self.base_lr = new_base_lr
        if new_max_lr is not None:
            self.max_lr = new_max_lr
        if new_step_size is not None:
            self.step_size = new_step_size
        self.clr_iterations = 0.
        
    def clr(self):
        """Return the learning rate for the current value of clr_iterations."""
        cycle = np.floor(1+self.clr_iterations/(2*self.step_size))
        # x runs 1 -> 0 -> 1 across one full cycle, so (1 - x) is the triangular wave.
        x = np.abs(self.clr_iterations/self.step_size - 2*cycle + 1)
        if self.scale_mode == 'cycle':
            return self.base_lr + (self.max_lr-self.base_lr)*np.maximum(0, (1-x))*self.scale_fn(cycle)
        else:
            return self.base_lr + (self.max_lr-self.base_lr)*np.maximum(0, (1-x))*self.scale_fn(self.clr_iterations)
        
    def on_train_begin(self, logs=None):
        """Set the optimizer's learning rate when training starts.

        Uses base_lr on a fresh cycle, otherwise resumes from clr().
        """
        logs = logs or {}

        if self.clr_iterations == 0:
            K.set_value(self.model.optimizer.lr, self.base_lr)
        else:
            K.set_value(self.model.optimizer.lr, self.clr())        
            
    def on_batch_end(self, epoch, logs=None):
        """Advance the iteration counters, record history, and set the next learning rate."""
        logs = logs or {}
        self.trn_iterations += 1
        self.clr_iterations += 1

        self.history.setdefault('lr', []).append(K.get_value(self.model.optimizer.lr))
        self.history.setdefault('iterations', []).append(self.trn_iterations)

        for k, v in logs.items():
            self.history.setdefault(k, []).append(v)
        
        K.set_value(self.model.optimizer.lr, self.clr())

In [22]:
class Attention(Layer):
    """Attention pooling over the step axis of a 3-D input.

    One score per step is computed as tanh(x . W + b); the scores are
    exponentiated, optionally masked, normalised over the steps, and used to
    form a weighted sum of the step vectors.
    """

    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        """Store the configuration.

        Args:
            step_dim: number of steps of the input (size of axis 1).
            W_regularizer, b_regularizer: regularizers for the weight / bias.
            W_constraint, b_constraint: constraints for the weight / bias.
            bias: whether a per-step bias is added to the scores.
        """
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        self.features_dim = 0
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create the weight vector W and, if enabled, the per-step bias b."""
        assert len(input_shape) == 3

        # The shape is passed by keyword (as Capsule does) instead of relying on
        # add_weight accepting the shape as its first positional argument.
        self.W = self.add_weight(shape=(input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]

        if self.bias:
            self.b = self.add_weight(shape=(input_shape[1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        """Return None: no mask is propagated past this layer."""
        return None

    def call(self, x, mask=None):
        """Return the attention-weighted sum of `x` over axis 1."""
        features_dim = self.features_dim
        step_dim = self.step_dim

        # Flatten to (batch * steps, features), project onto W, and fold back
        # to one score per step.
        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)),
                        K.reshape(self.W, (features_dim, 1))), (-1, step_dim))

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        # Masked steps get zero weight.
        if mask is not None:
            a *= K.cast(mask, K.floatx())

        # Normalise over the steps; epsilon guards against a zero sum.
        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        """Return (batch size, features_dim)."""
        return input_shape[0],  self.features_dim

In [23]:
# Metrics handed to model.compile(): none when F1 tracking is disabled,
# otherwise the custom f1 metric.
model_metrics = None if no_f1 else [f1]

In [24]:
def make_model_capsule(embedding_matrix, embed_size=300):
    """Build and compile the BiGRU -> Capsule binary classifier.

    The Keras session is cleared first, then the network is assembled as
    frozen Embedding -> SpatialDropout -> bidirectional CuDNNGRU -> Capsule ->
    Dense head with a single sigmoid output.

    Args:
        embedding_matrix: weights loaded into the non-trainable Embedding layer.
        embed_size: width of each embedding vector.

    Returns:
        The compiled Keras ``Model`` (binary cross-entropy, Adam,
        ``model_metrics``).
    """
    K.clear_session()

    tokens = Input(shape=(max_len,))
    embedded = Embedding(len_voc, embed_size, weights=[embedding_matrix],
                         trainable=False)(tokens)
    embedded = SpatialDropout1D(rate=0.1)(embedded)

    # Seeded initialisers for the recurrent layer.
    gru = CuDNNGRU(100, return_sequences=True,
                   kernel_initializer=initializers.glorot_normal(seed=12300),
                   recurrent_initializer=initializers.orthogonal(gain=1.0, seed=10000))
    sequence = Bidirectional(gru)(embedded)

    capsules = Capsule(num_capsule=10, dim_capsule=10, routings=4,
                       share_weights=True)(sequence)
    hidden = Flatten()(capsules)
    hidden = Dense(60, activation="elu",
                   kernel_initializer=initializers.glorot_normal(seed=12300))(hidden)
    hidden = Dropout(0.025)(hidden)
    hidden = BatchNormalization()(hidden)

    prob = Dense(1, activation="sigmoid")(hidden)
    model = Model(inputs=tokens, outputs=prob)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=model_metrics)
    return model

In [25]:
def make_model_capsule_wtab(embedding_matrix, embed_size=300, tab_hidden=16):
    """Build and compile the BiGRU -> Capsule classifier with a tabular branch.

    The single input row is sliced in two: its first ``max_len`` columns go
    through the frozen Embedding / BiGRU / Capsule stack, and the remaining
    ``len_tab`` columns go through a small Dense branch. Both branches are
    concatenated before the classification head.

    Args:
        embedding_matrix: weights loaded into the non-trainable Embedding layer.
        embed_size: width of each embedding vector.
        tab_hidden: number of units in the Dense layer of the tabular branch.

    Returns:
        The compiled Keras ``Model`` (binary cross-entropy, Adam,
        ``model_metrics``) with one sigmoid output.
    """
    K.clear_session()

    inp = Input(shape=(max_len + len_tab, ))

    # ``output_shape`` is the per-sample shape (batch axis excluded). The
    # tabular slice is 1-D per sample, so it is declared as ``(len_tab,)``.
    # The previous ``(None, len_tab)`` added a spurious axis to the static
    # shape, which then needed a corrective Reshape further down.
    x = Lambda(lambda zz: zz[:, :max_len], output_shape=(max_len,))(inp)
    tab_x = Lambda(lambda zz: zz[:, max_len:], output_shape=(len_tab,))(inp)

    tab_z = Dense(tab_hidden, activation="elu")(tab_x)
    tab_z = Dropout(0.1)(tab_z)

    x = Embedding(len_voc, embed_size, weights=[embedding_matrix], trainable=False)(x)
    x = SpatialDropout1D(rate=0.1)(x)
    x = Bidirectional(CuDNNGRU(100, return_sequences=True,
                               kernel_initializer=initializers.glorot_normal(seed=12300),
                               recurrent_initializer=initializers.orthogonal(gain=1.0,
                                                                             seed=10000)))(x)

    x = Capsule(num_capsule=10, dim_capsule=10, routings=4, share_weights=True)(x)
    x = Flatten()(x)

    conc = concatenate([x, tab_z])

    x = Dense(60, activation="elu",
              kernel_initializer=initializers.glorot_normal(seed=12300))(conc)
    x = Dropout(0.025)(x)
    x = BatchNormalization()(x)

    x = Dense(1, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=model_metrics)
    return model

In [26]:
def model_gru_atten_3(embedding_matrix, embed_size=300):
    """Build and compile a three-layer bidirectional GRU classifier with attention.

    Frozen Embedding -> SpatialDropout -> three stacked bidirectional
    CuDNNGRU layers (140, 100, 64 units) -> Attention -> sigmoid output.

    Args:
        embedding_matrix: weights loaded into the non-trainable Embedding layer.
        embed_size: width of each embedding vector.

    Returns:
        The compiled Keras ``Model`` (binary cross-entropy, Adam,
        ``model_metrics``).
    """
    tokens = Input(shape=(max_len,))

    seq = Embedding(len_voc, embed_size, weights=[embedding_matrix],
                    trainable=False)(tokens)
    seq = SpatialDropout1D(0.125)(seq)

    # Stacked recurrent layers, widest first.
    for units in (140, 100, 64):
        seq = Bidirectional(CuDNNGRU(units, return_sequences=True))(seq)

    pooled = Attention(max_len)(seq)
    prob = Dense(1, activation="sigmoid")(pooled)

    model = Model(inputs=tokens, outputs=prob)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=model_metrics)
    return model

In [27]:
def model_gru_atten_3_wtab(embedding_matrix, embed_size=300, tab_hidden=16):
    """Build and compile the three-layer BiGRU + attention classifier with a tabular branch.

    The single input row is sliced in two: its first ``max_len`` columns go
    through the frozen Embedding / stacked BiGRU / Attention path, and the
    remaining ``len_tab`` columns go through a small Dense branch. Both
    branches are concatenated before the classification head.

    Args:
        embedding_matrix: weights loaded into the non-trainable Embedding layer.
        embed_size: width of each embedding vector.
        tab_hidden: number of units in the Dense layer of the tabular branch.

    Returns:
        The compiled Keras ``Model`` (binary cross-entropy, Adam,
        ``model_metrics``) with one sigmoid output.
    """
    K.clear_session()
    inp = Input(shape=(max_len + len_tab, ))

    # ``output_shape`` is the per-sample shape (batch axis excluded). The
    # tabular slice is 1-D per sample, so it is declared as ``(len_tab,)``.
    # The previous ``(None, len_tab)`` added a spurious axis to the static
    # shape, which then needed a corrective Reshape further down.
    x = Lambda(lambda zz: zz[:, :max_len], output_shape=(max_len,))(inp)
    tab_x = Lambda(lambda zz: zz[:, max_len:], output_shape=(len_tab,))(inp)

    x = Embedding(len_voc, embed_size, weights=[embedding_matrix], trainable=False)(x)
    x = SpatialDropout1D(0.125)(x)
    x = Bidirectional(CuDNNGRU(140, return_sequences=True))(x)
    x = Bidirectional(CuDNNGRU(100, return_sequences=True))(x)
    x = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)

    x = Attention(max_len)(x)

    tab_z = Dense(tab_hidden, activation="elu")(tab_x)
    tab_z = Dropout(0.1)(tab_z)

    conc = concatenate([x, tab_z])
    conc = Dense(11, activation="elu")(conc)
    outp = Dense(1, activation="sigmoid")(conc)

    model = Model(inputs=inp, outputs=outp)

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=model_metrics)

    return model

In [28]:
def model_lstm_du(embedding_matrix, embed_size=300):
    """Build and compile a single bidirectional GRU classifier with avg+max pooling.

    Unlike the other builders in this notebook, the Embedding layer here is
    created without ``trainable=False``.

    Args:
        embedding_matrix: initial weights for the Embedding layer.
        embed_size: width of each embedding vector.

    Returns:
        The compiled Keras ``Model`` (binary cross-entropy, Adam,
        ``model_metrics``).
    """
    tokens = Input(shape=(max_len,))
    embedded = Embedding(len_voc, embed_size, weights=[embedding_matrix])(tokens)
    seq = Bidirectional(CuDNNGRU(64, return_sequences=True))(embedded)

    # Summarise the sequence two ways and join the summaries.
    pooled_avg = GlobalAveragePooling1D()(seq)
    pooled_max = GlobalMaxPooling1D()(seq)
    merged = concatenate([pooled_avg, pooled_max])

    hidden = Dense(40, activation="elu")(merged)
    hidden = Dropout(0.1)(hidden)
    prob = Dense(1, activation="sigmoid")(hidden)

    model = Model(inputs=tokens, outputs=prob)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=model_metrics)
    return model

In [29]:
def make_model_lstm_attn_2(embedding_matrix, embed_size=300):
    """Build and compile the BiLSTM -> BiGRU classifier with attention and pooling.

    The BiLSTM output feeds both a BiGRU and, as a skip connection, its own
    Attention layer. The BiGRU output is summarised by Attention, average
    pooling and max pooling. All four summaries are concatenated for the head.

    Args:
        embedding_matrix: weights loaded into the non-trainable Embedding layer.
        embed_size: width of each embedding vector.

    Returns:
        The compiled Keras ``Model`` (binary cross-entropy, Adam,
        ``model_metrics``).
    """
    tokens = Input(shape=(max_len,))
    embedded = Embedding(len_voc, embed_size, weights=[embedding_matrix],
                         trainable=False)(tokens)
    embedded = SpatialDropout1D(0.12)(embedded)

    lstm_seq = Bidirectional(CuDNNLSTM(85, return_sequences=True))(embedded)
    gru_seq = Bidirectional(CuDNNGRU(70, return_sequences=True))(lstm_seq)

    summaries = [
        Attention(max_len)(lstm_seq),        # skip connection from the LSTM output
        Attention(max_len)(gru_seq),
        GlobalAveragePooling1D()(gru_seq),
        GlobalMaxPooling1D()(gru_seq),
    ]

    hidden = concatenate(summaries)
    hidden = Dense(20, activation="elu")(hidden)
    hidden = Dropout(0.11)(hidden)
    prob = Dense(1, activation="sigmoid")(hidden)

    model = Model(inputs=tokens, outputs=prob)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=model_metrics)

    return model

In [30]:
def make_model_lstm_attn_2_wtab(embedding_matrix, embed_size=300, tab_hidden=16):
    """Build and compile the BiLSTM -> BiGRU attention classifier with a tabular branch.

    The single input row is sliced in two: its first ``max_len`` columns go
    through the frozen Embedding / BiLSTM / BiGRU path, and the remaining
    ``len_tab`` columns go through a small Dense branch. The text path is
    summarised by two Attention layers (one on the LSTM output as a skip
    connection, one on the GRU output) plus average and max pooling of the
    GRU output. Everything is concatenated for the classification head.

    Args:
        embedding_matrix: weights loaded into the non-trainable Embedding layer.
        embed_size: width of each embedding vector.
        tab_hidden: number of units in the Dense layer of the tabular branch.

    Returns:
        The compiled Keras ``Model`` (binary cross-entropy, Adam,
        ``model_metrics``) with one sigmoid output.
    """
    inp = Input(shape=(max_len + len_tab, ))

    # ``output_shape`` is the per-sample shape (batch axis excluded). The
    # tabular slice is 1-D per sample, so it is declared as ``(len_tab,)``.
    # The previous ``(None, len_tab)`` added a spurious axis to the static
    # shape, which then needed a corrective Reshape further down.
    x = Lambda(lambda zz: zz[:, :max_len], output_shape=(max_len,))(inp)
    tab_x = Lambda(lambda zz: zz[:, max_len:], output_shape=(len_tab,))(inp)

    x = Embedding(len_voc, embed_size, weights=[embedding_matrix], trainable=False)(x)
    x = SpatialDropout1D(0.12)(x)
    x = Bidirectional(CuDNNLSTM(85, return_sequences=True))(x)
    y = Bidirectional(CuDNNGRU(70, return_sequences=True))(x)

    atten_1 = Attention(max_len)(x)  # skip connection from the LSTM output
    atten_2 = Attention(max_len)(y)
    avg_pool = GlobalAveragePooling1D()(y)
    max_pool = GlobalMaxPooling1D()(y)

    tab_z = Dense(tab_hidden, activation="elu")(tab_x)
    tab_z = Dropout(0.1)(tab_z)

    conc = concatenate([atten_1, atten_2, avg_pool, max_pool, tab_z])
    conc = Dense(22, activation="elu")(conc)
    conc = Dropout(0.11)(conc)
    outp = Dense(1, activation="sigmoid")(conc)

    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=model_metrics)

    return model

In [31]:
filter_sizes = [1,2,4]
num_filters = 40
def make_model_cnn_3_wtab(embedding_matrix, embed_size=300, tab_hidden=16):
    """Build and compile the multi-width Conv1D classifier with a tabular branch.

    The single input row is sliced in two: its first ``max_len`` columns are
    embedded and convolved, and the remaining ``len_tab`` columns go through a
    small Dense branch. Four convolutional branches are max-pooled over their
    whole length, flattened, concatenated with the tabular branch and passed
    to the classification head.

    Args:
        embedding_matrix: weights loaded into the non-trainable Embedding layer.
        embed_size: width of each embedding vector.
        tab_hidden: number of units in the Dense layer of the tabular branch.

    Returns:
        The compiled Keras ``Model`` (binary cross-entropy, Adam,
        ``model_metrics``) with one sigmoid output.
    """
    inp = Input(shape=(max_len + len_tab, ))

    # ``output_shape`` is the per-sample shape (batch axis excluded).
    x = Lambda(lambda zz: zz[:, :max_len], output_shape=(max_len,))(inp)
    tab_x = Lambda(lambda zz: zz[:, max_len:], output_shape=(len_tab,))(inp)

    # Embed only the token slice. Feeding ``inp`` here would also look up the
    # tabular columns as if they were token ids.
    x = Embedding(len_voc, embed_size, weights=[embedding_matrix], trainable=False)(x)
    x = SpatialDropout1D(0.125)(x)

    conv_0 = Conv1D(num_filters, kernel_size=(filter_sizes[0]),
                    kernel_initializer='he_normal', activation='elu')(x)
    conv_1 = Conv1D(num_filters, kernel_size=(filter_sizes[1]),
                    kernel_initializer='he_normal', activation='elu')(x)
    conv_2 = Conv1D(num_filters, kernel_size=(filter_sizes[2]),
                    kernel_initializer='he_normal', activation='elu')(x)
    conv_3 = Conv1D(num_filters, kernel_size=(filter_sizes[2]),
                    kernel_initializer='he_normal', activation='elu')(conv_2)
    conv_0_1 = Conv1D(num_filters, kernel_size=(filter_sizes[1]),
                      kernel_initializer='he_normal', activation='elu')(conv_0)

    # An unpadded convolution of width k shortens the sequence by k - 1.
    # Pool over the full remaining length so every branch collapses to one
    # step without dropping its last position (two stacked convolutions lose
    # (k_a - 1) + (k_b - 1) steps, not k_a + k_b - 1).
    len_0 = max_len - filter_sizes[0] + 1
    len_1 = max_len - filter_sizes[1] + 1
    len_3 = max_len - 2 * (filter_sizes[2] - 1)   # conv_2 followed by conv_3
    len_0_1 = len_0 - filter_sizes[1] + 1         # conv_0 followed by conv_0_1

    maxpool_0 = MaxPool1D(pool_size=len_0)(conv_0)
    maxpool_1 = MaxPool1D(pool_size=len_1)(conv_1)
    maxpool_3 = MaxPool1D(pool_size=len_3)(conv_3)
    maxpool_0_1 = MaxPool1D(pool_size=len_0_1)(conv_0_1)

    tab_z = Dense(tab_hidden, activation="elu")(tab_x)
    tab_z = Dropout(0.1)(tab_z)

    z = Concatenate(axis=1)([maxpool_0, maxpool_1, maxpool_3, maxpool_0_1])

    z = Flatten()(z)
    z = Concatenate(axis=1)([z, tab_z])
    z = BatchNormalization()(z)
    z = Dense(16, activation="elu")(z)
    z = Dropout(0.1)(z)
    outp = Dense(1, activation="sigmoid")(z)

    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=model_metrics)

    return model

In [32]:
filter_sizes = [1,2,4]
num_filters = 40
def make_model_cnn_3(embedding_matrix, embed_size=300):
    """Build and compile the multi-width Conv1D text classifier.

    The embedded sequence feeds convolutions of the widths in
    ``filter_sizes``. Two of them are followed by a second convolution. Four
    branches are max-pooled over their whole length, concatenated, flattened
    and passed to the classification head.

    Args:
        embedding_matrix: weights loaded into the non-trainable Embedding layer.
        embed_size: width of each embedding vector.

    Returns:
        The compiled Keras ``Model`` (binary cross-entropy, Adam,
        ``model_metrics``) with one sigmoid output.
    """
    inp = Input(shape=(max_len, ))
    x = Embedding(len_voc, embed_size, weights=[embedding_matrix], trainable=False)(inp)
    x = SpatialDropout1D(0.125)(x)

    conv_0 = Conv1D(num_filters, kernel_size=(filter_sizes[0]),
                    kernel_initializer='he_normal', activation='elu')(x)
    conv_1 = Conv1D(num_filters, kernel_size=(filter_sizes[1]),
                    kernel_initializer='he_normal', activation='elu')(x)
    conv_2 = Conv1D(num_filters, kernel_size=(filter_sizes[2]),
                    kernel_initializer='he_normal', activation='elu')(x)
    conv_3 = Conv1D(num_filters, kernel_size=(filter_sizes[2]),
                    kernel_initializer='he_normal', activation='elu')(conv_2)
    conv_0_1 = Conv1D(num_filters, kernel_size=(filter_sizes[1]),
                      kernel_initializer='he_normal', activation='elu')(conv_0)

    # An unpadded convolution of width k shortens the sequence by k - 1.
    # Pool over the full remaining length so every branch collapses to one
    # step without dropping its last position (two stacked convolutions lose
    # (k_a - 1) + (k_b - 1) steps, not k_a + k_b - 1).
    len_0 = max_len - filter_sizes[0] + 1
    len_1 = max_len - filter_sizes[1] + 1
    len_3 = max_len - 2 * (filter_sizes[2] - 1)   # conv_2 followed by conv_3
    len_0_1 = len_0 - filter_sizes[1] + 1         # conv_0 followed by conv_0_1

    maxpool_0 = MaxPool1D(pool_size=len_0)(conv_0)
    maxpool_1 = MaxPool1D(pool_size=len_1)(conv_1)
    maxpool_3 = MaxPool1D(pool_size=len_3)(conv_3)
    maxpool_0_1 = MaxPool1D(pool_size=len_0_1)(conv_0_1)

    z = Concatenate(axis=1)([maxpool_0, maxpool_1, maxpool_3, maxpool_0_1])
    z = Flatten()(z)
    z = BatchNormalization()(z)
    z = Dense(16, activation="elu")(z)
    z = Dropout(0.1)(z)
    outp = Dense(1, activation="sigmoid")(z)

    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=model_metrics)

    return model

In [33]:
# Hold out 10% of the training data for validation unless no_val_split is set;
# in that case everything is used for training and X_val / y_val are not created.
if not no_val_split:
    X_train, X_val, y_train, y_val = train_test_split(
        X_train_full, y, test_size=0.1, random_state=420)
else:
    X_train, y_train = X_train_full, y

In [34]:
# Drop the name now that X_train (and X_val, when a split was made) exist.
# With no_val_split set, X_train still refers to the same object, so only the
# name goes away.
del X_train_full

In [35]:
def train_pred(model, train_X, train_y, val_X, val_y, epochs=2, callback=None, class_weight=1.5):
    """Train one epoch at a time and report the best validation F1 after each.

    After every epoch the validation set is scored and thresholds from 0.25
    to 0.50 (step 0.01) are tried; the best F1 found is printed.

    Args:
        model: compiled model exposing ``fit`` and ``predict``.
        train_X, train_y: training inputs and labels.
        val_X, val_y: validation inputs and labels.
        epochs: number of single-epoch ``fit`` calls to make.
        callback: list of callbacks forwarded to ``model.fit`` (or None).
        class_weight: either a ``{class: weight}`` dict forwarded as-is, or a
            single number taken as the weight of class 1 with class 0 at 1.0,
            mirroring how the training loop builds its dict. None is
            forwarded unchanged.

    Returns:
        ``(pred_val_y, best_score)`` from the last epoch, or ``(None, 0.0)``
        when ``epochs`` is 0.
    """
    # The scalar default cannot be handed to fit() directly; expand it first.
    if class_weight is not None and not isinstance(class_weight, dict):
        class_weight = {0: 1., 1: float(class_weight)}

    # Defined up front so the return below is valid even if the loop never runs.
    pred_val_y = None
    best_score = 0.0

    for e in range(epochs):
        model.fit(train_X, train_y, batch_size=256, epochs=1,
                  validation_data=(val_X, val_y), callbacks=callback,
                  class_weight=class_weight, verbose=0)
        pred_val_y = model.predict([val_X], batch_size=1024, verbose=0)

        best_thresh = 0.5
        best_score = 0.0
        for thresh in np.arange(0.25, 0.501, 0.01):
            thresh = np.round(thresh, 2)
            score = metrics.f1_score(val_y, (pred_val_y > thresh).astype(int))
            if score > best_score:
                best_thresh = thresh
                best_score = score
        print("Epoch: ", e, "-    Val F1 Score: {:.4f}".format(best_score))

    print('='*100)
    return pred_val_y, best_score

In [36]:
# Model builders to train, with their per-model settings (parallel lists).
models = [make_model_cnn_3_wtab, make_model_lstm_attn_2_wtab,
          model_gru_atten_3_wtab, make_model_capsule_wtab]
embeds = [0, 0, 0, 0]        # index into embeddings_list for each model
epochs = [4, 4, 4, 3]
cw = [1.5, 1.5, 1.5, 1.5]    # weight of class 1; class 0 is weighted 1.0

batch_size = 256

pred_val = []
pred_test = []

# NOTE(review): this one CyclicLR instance is reused for every model; whether
# its internal iteration state carries over between models depends on the
# CyclicLR implementation — confirm that this is intended.
clr = CyclicLR(base_lr=0.00115, max_lr=0.002,
               step_size=300., mode='exp_range',
               gamma=0.99994)

if no_val_split:
    validation_data = None
else:
    validation_data = (X_val, y_val)

# Checkpointing monitors val_f1, so it needs both the f1 metric and validation
# data, and it is only attached on the plain model.fit path below.
use_checkpoint = (not no_f1) and (not no_val_split) and (not f1_per_epoch)

for i in range(len(models)):
    print('Training model {}'.format(i))
    class_weight = {0: 1.,
                    1: cw[i]}

    model = models[i](embeddings_list[embeds[i]])

    # One checkpoint callback per model: each model gets its own file and its
    # own "best so far" value, and the path matches the one reloaded below.
    weights_path = 'weights_{}.hdf5'.format(i)
    if use_checkpoint:
        checkpoints = ModelCheckpoint(weights_path, monitor="val_f1", mode="max",
                                      verbose=True, save_best_only=True)
        callbacks = [checkpoints, clr]
    else:
        callbacks = [clr]

    if f1_per_epoch:
        # NOTE(review): this path trains for a fixed 6 epochs and ignores
        # epochs[i]; it also requires X_val / y_val to exist.
        train_pred(model, X_train, y_train, X_val, y_val, epochs=6, callback=[clr,],
                   class_weight=class_weight)
    else:
        history = model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs[i],
                            validation_data=validation_data, callbacks=callbacks,
                            class_weight=class_weight, verbose=0)

    # Restore the best epoch only when a checkpoint was actually written.
    if use_checkpoint:
        model.load_weights(weights_path)

    if not no_val_split:
        pred_val.append(model.predict(X_val, batch_size=batch_size, verbose=0))
    pred_test.append(model.predict(X_test, batch_size=batch_size, verbose=0))

Training model 0
Training model 1
Training model 2
Training model 3


In [37]:
# Elapsed time since tc_preprocess; tc2 then marks the start of the stacking step.
train_predict_elapsed = time.perf_counter() - tc_preprocess
print('Time to train, predict:', train_predict_elapsed)
tc2 = time.perf_counter()

Time to train, predict: 6734.524324940998


In [38]:
def tweak_threshold(pred, truth):
    """Grid-search the decision threshold that gives the highest F1.

    Thresholds from 0.25 to 0.50 in steps of 0.01 are tried; ``pred`` is
    binarised with ``pred > threshold`` and scored against ``truth``.

    Args:
        pred: predicted scores.
        truth: ground-truth labels.

    Returns:
        ``(best_f1, best_threshold)``; on ties the lowest threshold wins.
    """
    candidates = [np.round(raw, 2) for raw in np.arange(0.25, 0.501, 0.01)]
    f1_values = [metrics.f1_score(truth, (pred > cut).astype(int)) for cut in candidates]
    best_idx = np.argmax(f1_values)
    return np.max(f1_values), candidates[best_idx]

# Single-model run: report the best validation F1 directly. This needs a
# validation split, since pred_val is empty and y_val is not created without one.
if len(models) == 1 and not no_val_split:
    score_val, threshold_val = tweak_threshold(pred_val[0], y_val)
    print(f"F1={round(score_val, 4)} for thresh {threshold_val}")

In [39]:
from sklearn.linear_model import LinearRegression

if not no_val_split:
    # Stack the per-model predictions column-wise and fit a linear blend on
    # the validation set.
    X_test_stack = np.hstack(pred_test)
    X_val_stack = np.hstack(pred_val)
    reg = LinearRegression().fit(X_val_stack, y_val)
    print(reg.score(X_val_stack, y_val), reg.coef_)

    rcoef = reg.coef_
    rinter = reg.intercept_
    print("Finished training, LR coefficients:", rcoef, rinter)

    # Blend the validation predictions exactly as the test predictions are
    # blended later (intercept included), so the tuned threshold applies to
    # the same score scale.
    pred_val_mix = rinter + rcoef[0] * pred_val[0]
    for i in range(1, len(pred_val)):
        pred_val_mix += rcoef[i] * pred_val[i]

    score_val, threshold_val = tweak_threshold(pred_val_mix, y_val)
    # Always record the tuned threshold so best_threshold is defined afterwards.
    curr_max = score_val
    best_threshold = threshold_val
    print(f"F1={round(score_val, 4)} for thresh {threshold_val}")
else:
    # No validation data to fit on: fall back to fixed blend weights and
    # threshold (presumably recorded from an earlier run with a validation
    # split — confirm). One coefficient per entry in `models`.
    rcoef = [0.11670181, 0.30586663, 0.29109076, 0.18446921]
    rinter = -0.0034813136
    best_threshold = 0.41

In [40]:
# tc2 is taken after training and prediction, so this covers only the stacking step.
print('Time to fit LR and threshold:', time.perf_counter() - tc2)

Time to train, predict, fit LR and threshold: 0.06642998899769736


In [41]:
# Blend the per-model test predictions with the stacking weights.
pred_test_mix = rinter + rcoef[0] * pred_test[0]

for i in range(1, len(pred_test)):
    pred_test_mix += rcoef[i] * pred_test[i]

# Binarise at the tuned threshold. `np.int` was only an alias of the builtin
# and has been removed from NumPy, so the builtin `int` is used instead.
pred_test_mix = (pred_test_mix > best_threshold).astype(int)
print((pred_test_mix > 0).sum())

3569


In [42]:
# Write the submission file: one row per test question id with its predicted label.
submission_path = "submission.csv"
out_df = pd.DataFrame({"qid": test_qid})
out_df['prediction'] = pred_test_mix
out_df.to_csv(submission_path, index=False)

In [43]:
# Elapsed time since tc1.
total_elapsed = time.perf_counter() - tc1
print('Time total:', total_elapsed)

Time total: 7241.136747415003
