In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import time
import re
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn import metrics
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.models import Model
from keras.layers import Input, Embedding, Bidirectional, CuDNNLSTM, CuDNNGRU, Dense, Activation, Dropout, Conv2D,Conv1D
from keras.layers.core import RepeatVector
from keras.layers.wrappers import TimeDistributed
from keras.layers import concatenate, GlobalMaxPooling1D, GlobalAveragePooling1D, MaxPool2D,MaxPool1D,Concatenate
from keras.layers import Reshape, Flatten, Concatenate, Dropout, SpatialDropout1D, BatchNormalization
from keras.engine.topology import Layer
# Attention
from keras import initializers, regularizers, constraints
from keras import backend as K
from keras.callbacks import *
import gc
# find_best_weight
from scipy.optimize import minimize
from sklearn.metrics import mean_squared_error

Using TensorFlow backend.


In [2]:
test_flag=False # ローカル環境"True" Kernel登録時は"False"にする
blend_flag=True # ブレンドがあるときは"True"、ないときは"False"
data_flag=False # 事前に作ったデータを使用する"True" 事前のデータなし"False"
energy_saving_flag=True # 出力や計算を最小限にする"True" 多くのアウトプットをする"False"

### 特徴量の変数

In [3]:
max_features=120000 #Tokenizeで採用する単語の最大数+1(頻出順) 100に設定したら99単語を抽出
maxlen=70 # pad_sequencesで設定する最大文長
np.random.seed(seed=32)
test_size=0.1
post_fix="_f_"+str(max_features)+"_l_"+str(maxlen)

### データ取得

In [4]:
if data_flag==False:
    train_df = pd.read_csv("../input/train.csv")
    test_df = pd.read_csv("../input/test.csv")

### データ修正

In [5]:
# 特殊文字の間にスペースを入れる(それによって前後の単語を認識することができる)
puncts = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£', 
 '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', 
 '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', 
 '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', 
 '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', ]
def clean_text(x):
    x = str(x)
    for punct in puncts:
        x = x.replace(punct, f' {punct} ')
    return x

# 数字を一括マスキング
def clean_numbers(x):
    x = re.sub('[0-9]{5,}', '#####', x)
    x = re.sub('[0-9]{4}', '####', x)
    x = re.sub('[0-9]{3}', '###', x)
    x = re.sub('[0-9]{2}', '##', x)
    return x

# ミススペル修正
mispell_dict = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",  "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",  "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have","you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have", 'colour': 'color', 'centre': 'center', 'favourite': 'favorite', 'travelling': 'traveling', 'counselling': 'counseling', 'theatre': 'theater', 'cancelled': 'canceled', 'labour': 'labor', 'organisation': 'organization', 'wwii': 'world war 2', 'citicise': 'criticize', 'youtu ': 'youtube ', 'Qoura': 'Quora', 'sallary': 'salary', 'Whta': 'What', 'narcisist': 'narcissist', 'howdo': 'how do', 'whatare': 'what are', 'howcan': 'how can', 'howmuch': 'how much', 'howmany': 'how many', 'whydo': 'why do', 'doI': 'do I', 'theBest': 'the best', 'howdoes': 'how does', 'mastrubation': 'masturbation', 'mastrubate': 'masturbate', "mastrubating": 'masturbating', 'pennis': 'penis', 'Etherium': 'Ethereum', 'narcissit': 'narcissist', 'bigdata': 'big data', '2k17': '2017', '2k18': '2018', 'qouta': 'quota', 'exboyfriend': 'ex boyfriend', 'airhostess': 'air hostess', "whst": 'what', 'watsapp': 'whatsapp', 'demonitisation': 'demonetization', 'demonitization': 'demonetization', 'demonetisation': 'demonetization'}
def _get_mispell(mispell_dict):
    mispell_re = re.compile('(%s)' % '|'.join(mispell_dict.keys()))
    return mispell_dict, mispell_re

mispellings, mispellings_re = _get_mispell(mispell_dict)
def replace_typical_misspell(text):
    def replace(match):
        return mispellings[match.group(0)]
    return mispellings_re.sub(replace, text)

# 省略形を戻す
contraction_mapping = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",  "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",  "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have","you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have" }
def clean_contractions(text):
    specials = ["’", "‘", "´", "`"]
    for s in specials:
        # 一旦文章をリストにする
        text = text.replace(s, "'")
    text = ' '.join([contraction_mapping[t] if t in contraction_mapping else t for t in text.split(" ")])
    return text

In [6]:
if data_flag==False:
    #lower
    train_df["question_text"] = train_df["question_text"].apply(lambda x: x.lower())
    test_df["question_text"] = test_df["question_text"].apply(lambda x: x.lower())

    # Clean the text
    train_df["question_text"] = train_df["question_text"].apply(lambda x: clean_text(x))
    test_df["question_text"] = test_df["question_text"].apply(lambda x: clean_text(x))

    # Clean numbers
    train_df["question_text"] = train_df["question_text"].apply(lambda x: clean_numbers(x))
    test_df["question_text"] = test_df["question_text"].apply(lambda x: clean_numbers(x))  

    # Clean speelings
    train_df["question_text"] = train_df["question_text"].apply(lambda x: replace_typical_misspell(x))
    test_df["question_text"] = test_df["question_text"].apply(lambda x: replace_typical_misspell(x))  

### 必要なデータをデータフレームから抽出(データ分割込み)

In [7]:
if data_flag==False:
    X_train = train_df["question_text"]
    y_train = train_df["target"]
    X_test = test_df["question_text"]
    X_test_qid = test_df["qid"]

In [8]:
# テスト⇒どちらでも　本番⇒ブレンド(True)だけ
if data_flag==False:
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=test_size, random_state=2018)

In [9]:
if data_flag==False:
    del train_df,test_df
    gc.collect()
    time.sleep(5)

### 文章のトークン化

In [10]:
if data_flag==False:
    tokenizer=Tokenizer(num_words=max_features)
    # トレーニングデータで最適化
    tokenizer.fit_on_texts(X_train)
    
    X_train=tokenizer.texts_to_sequences(X_train)
    if test_flag or blend_flag:
        X_val=tokenizer.texts_to_sequences(X_val)
    if test_flag==False:
        X_test=tokenizer.texts_to_sequences(X_test)

### パディング

In [11]:
if data_flag==False:
    X_train=pad_sequences(X_train,maxlen=maxlen)
    X_val=pad_sequences(X_val,maxlen=maxlen)
    if test_flag==False:
        X_test=pad_sequences(X_test,maxlen=maxlen)

### Embeddings

In [12]:
def load_emb_google():
    EMBEDDING_FILE = '../input/embeddings/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin'
    from gensim.models import KeyedVectors
    embeddings_index = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=True)
    
    # 平均、分散計算用に全部の数字を保存
    all_embs=np.zeros((len(embeddings_index.vocab),300),dtype='float32')
    i=0
    for word in embeddings_index.vocab:
        all_embs[i] = embeddings_index[word]
        i=i+1
    
    # 平均分散を計算
    emb_mean,emb_std = all_embs.mean(), all_embs.std()
    embed_size = all_embs.shape[1]
    
    # モデルに渡す重み付け行列の初期化
    word_index = tokenizer.word_index
    nb_words = min(max_features, len(word_index) + 1)
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    
    # モデルに渡す重み付け行列にEmbeddingファイルの数値を格納
    for word, i in word_index.items():
        if i >= max_features: continue # max_features以上の単語はパスする
        try:
            embedding_matrix[i] = embeddings_index[word]
        except:
            pass
        
    return embedding_matrix

In [13]:
def load_emb_glove():
    EMBEDDING_FILE = '../input/embeddings/glove.840B.300d/glove.840B.300d.txt'
    def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in tqdm(open(EMBEDDING_FILE,'r',encoding='utf-8')))

    all_embs = np.stack(embeddings_index.values())
    emb_mean,emb_std = all_embs.mean(), all_embs.std()
    embed_size = all_embs.shape[1]

    word_index = tokenizer.word_index
    nb_words = min(max_features, len(word_index) + 1)
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    for word, i in word_index.items():
        if i >= max_features: continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None: embedding_matrix[i] = embedding_vector
            
    return embedding_matrix

In [14]:
def load_emb_para():
    EMBEDDING_FILE = '../input/embeddings/paragram_300_sl999/paragram_300_sl999.txt'
    def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in tqdm(open(EMBEDDING_FILE, encoding="utf8", errors='ignore')) if len(o)>100)

    all_embs = np.stack(embeddings_index.values())
    emb_mean,emb_std = all_embs.mean(), all_embs.std()
    embed_size = all_embs.shape[1]

    word_index = tokenizer.word_index
    nb_words = min(max_features, len(word_index) + 1)
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    for word, i in word_index.items():
        if i >= max_features: continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None: embedding_matrix[i] = embedding_vector
            
    return embedding_matrix

In [15]:
def load_emb_wiki():
    EMBEDDING_FILE = '../input/embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec'
    
    # ファイルからデータを取得
    def get_coefs(word,*arr): 
        return word, np.asarray(arr, dtype='float32')
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in tqdm(open(EMBEDDING_FILE,'r',encoding='utf-8')) if len(o)>100) # if len(o)>100 はファイルの最初に不要な文字列があるため削除する目的
   
    # embeddings_indexの特徴量を算出(対象ワードがなかったときの補間に使用)
    all_embs = np.stack(embeddings_index.values())
    emb_mean,emb_std = all_embs.mean(), all_embs.std()
    embed_size = all_embs.shape[1]
    
    # モデルに渡す重み付け行列の初期化
    word_index = tokenizer.word_index
    nb_words = min(max_features, len(tokenizer.word_index) + 1)
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    
    # モデルに渡す重み付け行列にEmbeddingファイルの数値を格納
    for word, i in word_index.items():
        if i >= max_features: continue # max_features以上の単語はパスする
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None: embedding_matrix[i] = embedding_vector
            
    return embedding_matrix

In [16]:
if data_flag==False:
#     embedding_matrix_google=load_emb_google() # 1
    embedding_matrix_glove=load_emb_glove() # 2
    embedding_matrix_para=load_emb_para() # 3
#     embedding_matrix_wiki=load_emb_wiki()# 4

2196017it [03:33, 10303.84it/s]
  
1703756it [02:46, 10256.89it/s]


In [17]:
embed_size=300 # ハードコード embedding_matrix_google.shape[1]他はすべて300

### モデル構築

In [18]:
# https://www.kaggle.com/suicaokhoailang/lstm-attention-baseline-0-652-lb

class Attention(Layer):
    def __init__(self, step_dim,
                 W_regularizer=None, b_regularizer=None,
                 W_constraint=None, b_constraint=None,
                 bias=True, **kwargs):
        self.supports_masking = True
        self.init = initializers.get('glorot_uniform')

        self.W_regularizer = regularizers.get(W_regularizer)
        self.b_regularizer = regularizers.get(b_regularizer)

        self.W_constraint = constraints.get(W_constraint)
        self.b_constraint = constraints.get(b_constraint)

        self.bias = bias
        self.step_dim = step_dim
        self.features_dim = 0
        super(Attention, self).__init__(**kwargs)

    def build(self, input_shape):
        assert len(input_shape) == 3

        self.W = self.add_weight((input_shape[-1],),
                                 initializer=self.init,
                                 name='{}_W'.format(self.name),
                                 regularizer=self.W_regularizer,
                                 constraint=self.W_constraint)
        self.features_dim = input_shape[-1]

        if self.bias:
            self.b = self.add_weight((input_shape[1],),
                                     initializer='zero',
                                     name='{}_b'.format(self.name),
                                     regularizer=self.b_regularizer,
                                     constraint=self.b_constraint)
        else:
            self.b = None

        self.built = True

    def compute_mask(self, input, input_mask=None):
        return None

    def call(self, x, mask=None):
        features_dim = self.features_dim
        step_dim = self.step_dim

        eij = K.reshape(K.dot(K.reshape(x, (-1, features_dim)),
                        K.reshape(self.W, (features_dim, 1))), (-1, step_dim))

        if self.bias:
            eij += self.b

        eij = K.tanh(eij)

        a = K.exp(eij)

        if mask is not None:
            a *= K.cast(mask, K.floatx())

        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())

        a = K.expand_dims(a)
        weighted_input = x * a
        return K.sum(weighted_input, axis=1)

    def compute_output_shape(self, input_shape):
        return input_shape[0],  self.features_dim

In [19]:
def model_bi_gru(embedding_matrix,embed_size):
    inp=Input(shape=(maxlen,))
    x = Embedding(min(max_features,embedding_matrix.shape[0]),embed_size,weights=[embedding_matrix])(inp)
    x = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)
    x = GlobalMaxPooling1D()(x)
    x = Dense(16)(x)
    x = Activation('relu')(x)
    x = Dense(1, activation='sigmoid')(x)
    model = Model(inputs = inp, outputs = x)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    epochs = -1
    return model, epochs

In [20]:
def model_bi_gru_without_emb(embed_size):
    inp=Input(shape=(maxlen,))
    x = Embedding(min(max_features,embedding_matrix.shape[0]),embed_size)(inp)
    x = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)
    x = GlobalMaxPooling1D()(x)
    x = Dense(16)(x)
    x = Activation('relu')(x)
    x = Dense(1, activation='sigmoid')(x)
    model = Model(inputs = inp, outputs = x)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    epochs = -1
    return model, epochs

In [21]:
def model_bi_gru_atten(embedding_matrix,embed_size):
    inp=Input(shape=(maxlen,))
    x = Embedding(min(max_features,embedding_matrix.shape[0]),embed_size,weights=[embedding_matrix])(inp)
    x = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)
    x = Attention(maxlen)(x)
#     x = GlobalMaxPooling1D()(x)
    x = Dense(16)(x)
    x = Activation('relu')(x)
    x = Dense(1, activation='sigmoid')(x)
    model = Model(inputs = inp, outputs = x)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    epochs = -1
    return model, epochs

In [22]:
def model_bi_gru_avepool(embedding_matrix,embed_size):
    inp=Input(shape=(maxlen,))
    x = Embedding(min(max_features,embedding_matrix.shape[0]),embed_size,weights=[embedding_matrix])(inp)
    x = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)
    x = GlobalAveragePooling1D()(x)
    x = Dense(16)(x)
    x = Activation('relu')(x)
    x = Dense(1, activation='sigmoid')(x)
    model = Model(inputs = inp, outputs = x)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    epochs=-1
    return model, epochs

In [23]:
def model_bi_gru_dupool(embedding_matrix,embed_size):
    inp=Input(shape=(maxlen,))
    x = Embedding(min(max_features,embedding_matrix.shape[0]),embed_size,weights=[embedding_matrix])(inp)
    x = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)
    
    max_pl = GlobalMaxPooling1D()(x)
    avg_pl = GlobalAveragePooling1D()(x)
    
    x = concatenate([max_pl, avg_pl])
    
    x = Dense(16)(x)
    x = Activation('relu')(x)
    x = Dense(1, activation='sigmoid')(x)
    model = Model(inputs = inp, outputs = x)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    epochs=-1
    return model, epochs

In [24]:
def model_bi_gru2(embedding_matrix,embed_size):
    inp=Input(shape=(maxlen,))
    x = Embedding(min(max_features,embedding_matrix.shape[0]),embed_size,weights=[embedding_matrix])(inp)
    x = Bidirectional(CuDNNGRU(128, return_sequences=True))(x)
    x = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)
    x = GlobalMaxPooling1D()(x)
    x = Dense(16)(x)
    x = Activation('relu')(x)
    x = Dense(1, activation='sigmoid')(x)
    model = Model(inputs = inp, outputs = x)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    epochs=2
    return model,epochs

In [25]:
def model_bi_gru2_avepool(embedding_matrix,embed_size):
    inp=Input(shape=(maxlen,))
    x = Embedding(min(max_features,embedding_matrix.shape[0]),embed_size,weights=[embedding_matrix])(inp)
    x = Bidirectional(CuDNNGRU(128, return_sequences=True))(x)
    x = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)
    x = GlobalAveragePooling1D()(x)
    x = Dense(16)(x)
    x = Activation('relu')(x)
    x = Dense(1, activation='sigmoid')(x)
    model = Model(inputs = inp, outputs = x)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    epochs=-1
    return model, epochs

In [26]:
def model_bi_gru2_dupool(embedding_matrix,embed_size):
    inp=Input(shape=(maxlen,))
    x = Embedding(min(max_features,embedding_matrix.shape[0]),embed_size,weights=[embedding_matrix])(inp)
    x = Bidirectional(CuDNNGRU(128, return_sequences=True))(x)
    x = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)
    
    max_pl = GlobalMaxPooling1D()(x)
    avg_pl = GlobalAveragePooling1D()(x)
    
    x = concatenate([max_pl, avg_pl])
    
    x = Dense(16)(x)
    x = Activation('relu')(x)
    x = Dense(1, activation='sigmoid')(x)
    model = Model(inputs = inp, outputs = x)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    epochs=2
    return model,epochs

In [27]:
def model_bi_gru2_atten(embedding_matrix,embed_size):
    inp=Input(shape=(maxlen,))
    x = Embedding(min(max_features,embedding_matrix.shape[0]),embed_size,weights=[embedding_matrix])(inp)
    x = Bidirectional(CuDNNGRU(128, return_sequences=True))(x)
    x = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)
    x = Attention(maxlen)(x)
#     x = GlobalMaxPooling1D()(x)
    x = Dense(16)(x)
    x = Activation('relu')(x)
    x = Dense(1, activation='sigmoid')(x)
    model = Model(inputs = inp, outputs = x)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    epochs = 2
    return model, epochs

In [28]:
def model_bi_gru2_atten_bn(embedding_matrix,embed_size):
    inp=Input(shape=(maxlen,))
    x = Embedding(min(max_features,embedding_matrix.shape[0]),embed_size,weights=[embedding_matrix])(inp)
    x = Bidirectional(CuDNNGRU(128, return_sequences=True))(x)
    x = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)
    x = Attention(maxlen)(x)
#     x = GlobalMaxPooling1D()(x)
    x = BatchNormalization()(x)
    x = Dense(16)(x)
    x = Activation('relu')(x)
    x = Dense(1, activation='sigmoid')(x)
    model = Model(inputs = inp, outputs = x)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    epochs = 2
    return model, epochs

In [29]:
def model_bi_gru3(embedding_matrix,embed_size):
    inp=Input(shape=(maxlen,))
    x = Embedding(min(max_features,embedding_matrix.shape[0]),embed_size,weights=[embedding_matrix])(inp)
    x = Bidirectional(CuDNNGRU(128, return_sequences=True))(x)
    x = Bidirectional(CuDNNGRU(100, return_sequences=True))(x)
    x = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)
    x = GlobalMaxPooling1D()(x)
    x = Dense(16)(x)
    x = Activation('relu')(x)
    x = Dense(1, activation='sigmoid')(x)
    model = Model(inputs = inp, outputs = x)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    epochs = 2
    return model,epochs

In [30]:
def model_bi_gru3_atten(embedding_matrix,embed_size):
    inp = Input(shape=(maxlen,))
    x = Embedding(min(max_features,embedding_matrix.shape[0]), embed_size, weights=[embedding_matrix], trainable=False)(inp)
    x = Bidirectional(CuDNNGRU(128, return_sequences=True))(x)
    x = Bidirectional(CuDNNGRU(100, return_sequences=True))(x)
    x = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)
    x = Attention(maxlen)(x)
    x = Dense(1, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    epochs = 3
    return model,epochs

In [31]:
def model_bi_gru3_atten_bn(embedding_matrix,embed_size):
    inp = Input(shape=(maxlen,))
    x = Embedding(min(max_features,embedding_matrix.shape[0]), embed_size, weights=[embedding_matrix], trainable=False)(inp)
    x = Bidirectional(CuDNNGRU(128, return_sequences=True))(x)
    x = Bidirectional(CuDNNGRU(100, return_sequences=True))(x)
    x = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)
    x = Attention(maxlen)(x)
    x = BatchNormalization()(x)
    x = Dense(16)(x)
    x = Activation('relu')(x)
    x = Dense(1, activation='sigmoid')(x)
    model = Model(inputs=inp, outputs=x)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    epochs = 3
    return model,epochs

In [32]:
def model_bi_gru3_atten_midify(embedding_matrix,embed_size):
    inp = Input(shape=(maxlen,))
    x = Embedding(min(max_features,embedding_matrix.shape[0]), embed_size, weights=[embedding_matrix], trainable=False)(inp)
    x = Bidirectional(CuDNNGRU(128, return_sequences=True))(x)
    x = Bidirectional(CuDNNGRU(100, return_sequences=True))(x)
    x = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)
    x = Attention(maxlen)(x)
    x = Dense(16)(x)
    x = Activation('relu')(x)    
    x = Dense(1, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    epochs = 3
    return model,epochs

In [33]:
def model_bi_lstm(embedding_matrix,embed_size):
    inp=Input(shape=(maxlen,))
    x = Embedding(min(max_features,embedding_matrix.shape[0]),embed_size,weights=[embedding_matrix])(inp)
    x = Bidirectional(CuDNNLSTM(64, return_sequences=True))(x)
    x = GlobalMaxPooling1D()(x)
    x = Dense(16)(x)
    x = Activation('relu')(x)
    x = Dense(1, activation='sigmoid')(x)
    model = Model(inputs = inp, outputs = x)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    epochs=-1
    return model, epochs

In [34]:
def model_bi_lstm2(embedding_matrix,embed_size):
    inp=Input(shape=(maxlen,))
    x = Embedding(min(max_features,embedding_matrix.shape[0]),embed_size,weights=[embedding_matrix])(inp)
    x = Bidirectional(CuDNNLSTM(128, return_sequences=True))(x)
    x = Bidirectional(CuDNNLSTM(64, return_sequences=True))(x)
    x = GlobalMaxPooling1D()(x)
    x = Dense(16)(x)
    x = Activation('relu')(x)
    x = Dense(1, activation='sigmoid')(x)
    model = Model(inputs = inp, outputs = x)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    epochs=1
    return model, epochs

In [35]:
def model_bi_lstm2_atten(embedding_matrix,embed_size):
    inp = Input(shape=(maxlen,))
    x = Embedding(min(max_features,embedding_matrix.shape[0]), embed_size, weights=[embedding_matrix], trainable=False)(inp)
    x = Bidirectional(CuDNNLSTM(128, return_sequences=True))(x)
    x = Bidirectional(CuDNNLSTM(64, return_sequences=True))(x)
    x = Attention(maxlen)(x)
    x = Dense(16)(x)
    x = Activation('relu')(x)
    x = Dense(1, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    epochs = 5
    return model, epochs

In [36]:
def model_bi_lstm2_atten_bn(embedding_matrix,embed_size):
    inp = Input(shape=(maxlen,))
    x = Embedding(min(max_features,embedding_matrix.shape[0]), embed_size, weights=[embedding_matrix], trainable=False)(inp)
    x = Bidirectional(CuDNNLSTM(128, return_sequences=True))(x)
    x = Bidirectional(CuDNNLSTM(64, return_sequences=True))(x)
    x = Attention(maxlen)(x)
    x = BatchNormalization()(x)
    x = Dense(16)(x)
    x = Activation('relu')(x)
    x = Dense(1, activation="sigmoid")(x)
    model = Model(inputs=inp, outputs=x)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    epochs = 5
    return model, epochs

In [37]:
def model_bi_lstm2_dupool(embedding_matrix,embed_size):
    inp=Input(shape=(maxlen,))
    x = Embedding(min(max_features,embedding_matrix.shape[0]),embed_size,weights=[embedding_matrix])(inp)
    x = Bidirectional(CuDNNLSTM(128, return_sequences=True))(x)
    x = Bidirectional(CuDNNLSTM(64, return_sequences=True))(x)
    
    max_pl = GlobalMaxPooling1D()(x)
    avg_pl = GlobalAveragePooling1D()(x)
    
    x = concatenate([max_pl, avg_pl])
    
    x = Dense(16)(x)
    x = Activation('relu')(x)
    x = Dense(1, activation='sigmoid')(x)
    model = Model(inputs = inp, outputs = x)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    epochs=2
    return model,epochs

In [38]:
def model_bi_lstm3(embedding_matrix,embed_size):
    inp=Input(shape=(maxlen,))
    x = Embedding(min(max_features,embedding_matrix.shape[0]),embed_size,weights=[embedding_matrix])(inp)
    x = Bidirectional(CuDNNLSTM(128, return_sequences=True))(x)
    x = Bidirectional(CuDNNLSTM(100, return_sequences=True))(x)
    x = Bidirectional(CuDNNLSTM(64, return_sequences=True))(x)
    x = GlobalMaxPooling1D()(x)
    x = Dense(16)(x)
    x = Activation('relu')(x)
    x = Dense(1, activation='sigmoid')(x)
    model = Model(inputs = inp, outputs = x)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    epochs=2
    return model,epochs

In [39]:
def model_bi_lstm3_atten(embedding_matrix,embed_size):
    inp=Input(shape=(maxlen,))
    x = Embedding(min(max_features,embedding_matrix.shape[0]),embed_size,weights=[embedding_matrix])(inp)
    x = Bidirectional(CuDNNLSTM(128, return_sequences=True))(x)
    x = Bidirectional(CuDNNLSTM(100, return_sequences=True))(x)
    x = Bidirectional(CuDNNLSTM(64, return_sequences=True))(x)
    x = Attention(maxlen)(x)
    x = Dense(16)(x)
    x = Activation('relu')(x)
    x = Dense(1, activation='sigmoid')(x)
    model = Model(inputs = inp, outputs = x)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    epochs=2
    return model,epochs

In [40]:
def model_bi_lstm3_atten_bn(embedding_matrix,embed_size):
    inp=Input(shape=(maxlen,))
    x = Embedding(min(max_features,embedding_matrix.shape[0]),embed_size,weights=[embedding_matrix])(inp)
    x = Bidirectional(CuDNNLSTM(128, return_sequences=True))(x)
    x = Bidirectional(CuDNNLSTM(100, return_sequences=True))(x)
    x = Bidirectional(CuDNNLSTM(64, return_sequences=True))(x)
    x = Attention(maxlen)(x)
    x = BatchNormalization()(x)
    x = Dense(16)(x)
    x = Activation('relu')(x)
    x = Dense(1, activation='sigmoid')(x)
    model = Model(inputs = inp, outputs = x)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    epochs=2
    return model,epochs

In [41]:
def model_bi_gru_lstm(embedding_matrix,embed_size):
    inp=Input(shape=(maxlen,))
    x = Embedding(min(max_features,embedding_matrix.shape[0]),embed_size,weights=[embedding_matrix])(inp)
    x = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)
    x = Bidirectional(CuDNNLSTM(64, return_sequences=True))(x)
    x = GlobalMaxPooling1D()(x)
    x = Dense(16)(x)
    x = Activation('relu')(x)
    x = Dense(1, activation='sigmoid')(x)
    model = Model(inputs = inp, outputs = x)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    epochs=-1
    return model, epochs

In [42]:
def model_bi_lstm_gru(embedding_matrix,embed_size):
    inp=Input(shape=(maxlen,))
    x = Embedding(min(max_features,embedding_matrix.shape[0]),embed_size,weights=[embedding_matrix])(inp)
    x = Bidirectional(CuDNNLSTM(64, return_sequences=True))(x)
    x = Bidirectional(CuDNNGRU(64, return_sequences=True))(x)
    x = GlobalMaxPooling1D()(x)
    x = Dense(16)(x)
    x = Activation('relu')(x)
    x = Dense(1, activation='sigmoid')(x)
    model = Model(inputs = inp, outputs = x)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    epochs=-1
    return model, epochs

In [43]:
#single-rnn-with-4-folds-cir
def model_sdp_lstm_gru_atten_dupool(embedding_matrix,embed_size):
    
    inp = Input(shape=(maxlen,))
    x = Embedding(min(max_features,embedding_matrix.shape[0]), embed_size, weights=[embedding_matrix], trainable=False)(inp)
    x = SpatialDropout1D(0.1)(x)
    x = Bidirectional(CuDNNLSTM(40, return_sequences=True))(x)
    y = Bidirectional(CuDNNGRU(40, return_sequences=True))(x)
    
    atten_1 = Attention(maxlen)(x) # skip connect
    atten_2 = Attention(maxlen)(y)
    avg_pool = GlobalAveragePooling1D()(y)
    max_pool = GlobalMaxPooling1D()(y)
    
    conc = concatenate([atten_1, atten_2, avg_pool, max_pool])
    conc = Dense(16, activation="relu")(conc)
    conc = Dropout(0.1)(conc)
    outp = Dense(1, activation="sigmoid")(conc)    

    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    epochs=5
    return model,epochs

### モデル読み込み

In [44]:
# embedding_matrix = np.mean([embedding_matrix_google,embedding_matrix_glove,embedding_matrix_para,embedding_matrix_wiki], axis = 0)

In [45]:
embedding_matrix = np.mean([embedding_matrix_glove,embedding_matrix_para], axis = 0)
del embedding_matrix_glove,embedding_matrix_para
gc.collect()
time.sleep(5)

In [46]:
model=[]

In [47]:
# model.append([model_sdp_lstm_gru_atten_dupool(embedding_matrix_google,embed_size),'model_sdp_lstm_gru_atten_dupool_emb_1'])
# model.append([model_sdp_lstm_gru_atten_dupool(embedding_matrix_glove,embed_size),'model_sdp_lstm_gru_atten_dupool_emb_2'])
# model.append([model_sdp_lstm_gru_atten_dupool(embedding_matrix_para,embed_size),'model_sdp_lstm_gru_atten_dupool_emb_3'])
# model.append([model_sdp_lstm_gru_atten_dupool(embedding_matrix_wiki,embed_size),'model_sdp_lstm_gru_atten_dupool_emb_4'])

In [48]:
model.append([model_bi_gru2_atten(embedding_matrix,embed_size),'model_bi_gru2_atten'])
model.append([model_bi_gru3_atten(embedding_matrix,embed_size),'model_bi_gru3_atten'])
model.append([model_bi_lstm2_atten(embedding_matrix,embed_size),'model_bi_lstm2_atten'])
model.append([model_bi_lstm3_atten(embedding_matrix,embed_size),'model_bi_lstm3_atten'])
model.append([model_sdp_lstm_gru_atten_dupool(embedding_matrix,embed_size),'model_sdp_lstm_gru_atten_dupool'])

In [49]:
batch_size=512
epochs=[]
for i in range(len(model)):
    epochs.append(model[i][0][1])

In [50]:
## 独自にエポック数を設定するときはここで
# for i in range(len(model)):
#     epochs[i]=5
# epochs[0]=1
# epochs[1]=1

In [51]:
for i in range(len(model)):
    print("モデル: {0}, epochs: {1}".format(model[i][1],epochs[i]))

モデル: model_bi_gru2_atten, epochs: 2
モデル: model_bi_gru3_atten, epochs: 3
モデル: model_bi_lstm2_atten, epochs: 5
モデル: model_bi_lstm3_atten, epochs: 2
モデル: model_sdp_lstm_gru_atten_dupool, epochs: 5


### モデル計算

In [52]:
hist=[]
pred_train = []
pred_test = []
if blend_flag:
    pred_val=[]

In [53]:
%%time
for i in range(len(model)):
    hist.append([])
    print(model[i][1])
    if test_flag:
        for j in range(epochs[i]):
            if j==0:
                verbose=1
            else:
                verbose=0
            hist[i].append(model[i][0][0].fit(X_train, y_train, batch_size=batch_size, epochs=1, validation_data=(X_val, y_val), verbose=verbose))
            tmp_pred_y_val=model[i][0][0].predict([X_val],batch_size=1024,verbose=0)
            for j in np.arange(0.1, 0.501, 0.01):
                print("thresh: {0:.2f} - F1 Score: {1:.4f}".format(j, metrics.f1_score(y_val,tmp_pred_y_val>j)))
            print('-' * 60)
        pred_val.append(model[i][0][0].predict([X_val],batch_size=1024,verbose=0))
        np.save('./saved_processed_input_data/pred_val_'+model[i][1]+post_fix,pred_val[i])
        pred_train.append(model[i][0][0].predict([X_train],batch_size=1024,verbose=0))
        np.save('./saved_processed_input_data/pred_train_'+model[i][1]+post_fix,pred_train[i])
    else:
        if blend_flag:
            for j in range(epochs[i]):
                hist[i].append(model[i][0][0].fit(X_train, y_train, batch_size=batch_size, epochs=1, validation_data=(X_val, y_val), verbose=0))                    
            pred_val.append(model[i][0][0].predict([X_val],batch_size=1024,verbose=0))
            if data_flag==False:
                pred_test.append(model[i][0][0].predict([X_test],batch_size=1024,verbose=0))
        else:
            for j in range(epochs[i]):
                hist[i].append(model[i][0][0].fit(X_train, y_train, batch_size=batch_size, epochs=1, verbose=0))
            pred_train.append(model[i][0][0].predict([X_train],batch_size=1024,verbose=0))
            pred_test.append(model[i][0][0].predict([X_test],batch_size=1024,verbose=0))

model_bi_gru2_atten
model_bi_gru3_atten
model_bi_lstm2_atten
model_bi_lstm3_atten
model_sdp_lstm_gru_atten_dupool
CPU times: user 1h 5min 13s, sys: 25min, total: 1h 30min 13s
Wall time: 1h 45min 11s


### ウエイト算出

In [54]:
def find_best_weight(preds, target,init=0.5):
    def _validate_func(weights):
        ''' scipy minimize will pass the weights as a numpy array '''
        final_prediction = 0
        for weight, prediction in zip(weights, preds):
                final_prediction += weight * prediction
        return np.sqrt(mean_squared_error(final_prediction, target))

    #the algorithms need a starting value, right not we chose 0.5 for all weights
    #its better to choose many random starting points and run minimize a few times
    starting_values = [init]*len(preds)

    #adding constraints and a different solver as suggested by user 16universe
    #https://kaggle2.blob.core.windows.net/forum-message-attachments/75655/2393/otto%20model%20weights.pdf?sv=2012-02-12&se=2015-05-03T21%3A22%3A17Z&sr=b&sp=r&sig=rkeA7EJC%2BiQ%2FJ%2BcMpcA4lYQLFh6ubNqs2XAkGtFsAv0%3D
    cons = ({'type':'eq','fun':lambda w: 1-sum(w)})
    #our weights are bound between 0 and 1
    bounds = [(0, 1)] * len(preds)

    res = minimize(_validate_func, starting_values, method='Nelder-Mead', bounds=bounds, constraints=cons)

    print('Ensemble Score: {best_score}'.format(best_score=(1-res['fun'])))
    print('Best Weights: {weights}'.format(weights=res['x']))

    return res

In [55]:
if blend_flag:
    res = find_best_weight(pred_val,y_val)['x']
    init=[0.4,0.6,0.3,0.7,0.2,0.8,0.1,0.9]
    for n in range(len(init)):
        check_flag = True
        for m in range(len(res)):
            if res[m]<=0 or res[m]>=1:
                check_flag = False
                print(n,m)
                res = find_best_weight(pred_val,y_val,init[n])['x']
                break
        if check_flag:
            break



Ensemble Score: 0.8358769319865883
Best Weights: [0.12814028 0.29353068 0.24471352 0.20555449 0.16191741]


### 閾値算出

In [56]:
def calc_pred_weight(pred, w):
    for i in range(len(pred)):
        if i ==0:
            pred_val = pred[i] * w[i]
        else:
            pred_val += pred[i] * w[i]
    return pred_val

In [57]:
if blend_flag:
    pred_val_w_ave = calc_pred_weight(pred_val, res)
    if test_flag or (test_flag==False and energy_saving_flag==False):
        pred_train_w_ave =  calc_pred_weight(pred_train, res)

In [58]:
if blend_flag:
    def bestThresshold(y_train,train_preds):
        tmp = [0,0,0] # idx, cur, max
        delta = 0
        for tmp[0] in tqdm(np.arange(0.1, 0.501, 0.01)):
            tmp[1] = metrics.f1_score(y_train, np.array(train_preds)>tmp[0])
            if tmp[1] > tmp[2]:
                delta = tmp[0]
                tmp[2] = tmp[1]
        print('best threshold is {:.4f} with F1 score: {:.4f}'.format(delta, tmp[2]))
        return delta

In [59]:
if blend_flag:
    delta=bestThresshold(y_val,pred_val_w_ave)

100%|██████████| 41/41 [00:00<00:00, 48.14it/s]

best threshold is 0.3500 with F1 score: 0.7060





### Submission

In [60]:
if test_flag==False:
    if blend_flag:
        for i in range(len(pred_val)):
            if i ==0:
                pred_test_w_ave = pred_test[i] * res[i]
            else:
                pred_test_w_ave += pred_test[i] * res[i]
        pred_test_w_ave_result = (pred_test_w_ave > delta ).astype(np.int)
        submission = pd.DataFrame.from_dict({'qid': X_test_qid})
        submission['prediction'] = pred_test_w_ave_result
    else:
        pred_test_result = (pred_test > delta ).astype(np.int)
        submission = pd.DataFrame.from_dict({'qid': X_test_qid})
        submission['prediction'] = pred_test_result        
    submission.to_csv('submission.csv', index=False)