# Imports

In [1]:
# standard imports
import time
import random
import os
import numpy as np
import pandas as pd
import warnings
import re

# pytorch imports
import torch
import torch.nn as nn
import torch.utils.data
from torch.optim.optimizer import Optimizer
from torch.utils.data import Dataset, DataLoader

# imports for preprocessing the questions
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import text_to_word_sequence

# cross validation and metrics
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler

# progress bars
from tqdm import tqdm
tqdm.pandas()

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

warnings.filterwarnings("ignore", message="F-score is ill-defined and being set to 0.0 due to no predicted samples.")
%matplotlib inline

import gc

Using TensorFlow backend.


# Loading the data

In [2]:
# Read the train and test CSV files from the ../input directory.
train_df = pd.read_csv("../input/train.csv")
test_df = pd.read_csv("../input/test.csv")

In [3]:
# When True, the tail of train_df is split off as a local test set in the next cell.
enable_local_test = False

In [4]:
if enable_local_test:
    # Hold out the last n_test rows of train_df (4x the number of test rows).
    n_test = len(test_df) * 4 #225480
    train_df, local_test_df = (train_df.iloc[:-n_test].reset_index(drop=True), 
                               train_df.iloc[-n_test:].reset_index(drop=True)) # take the last n_test rows
else:
    # Placeholder frame with two empty rows (target 0); n_test is set to its length.
    local_test_df = pd.DataFrame([[None, None, 0], [None, None, 0]], columns=['qid', 'question_text', 'target'])
    n_test = 2

In [5]:
# Report dataset sizes.
# NOTE(review): the first line prints len(test_df)*4 rather than the n_test variable,
# so it does not match n_test (2) when enable_local_test is False.
print('n_test = {}'.format(len(test_df)*4))
print('train_df rows: {}'.format(len(train_df)))
print('local_test_df rows: {}'.format(len(local_test_df)))
print('ture test_df rows: {}'.format(len(test_df)))

n_test = 225480
train_df rows: 1306122
local_test_df rows: 2
ture test_df rows: 56370


# Utility functions

In [6]:
def seed_everything(seed=1234):
    """Seed the Python, NumPy and PyTorch (CPU and CUDA) random generators.

    Also exports PYTHONHASHSEED and switches cuDNN to deterministic mode.

    Args:
        seed: integer seed applied to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    # The individual generators are independent, so they can be seeded in one pass.
    for apply_seed in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        apply_seed(seed)
    torch.backends.cudnn.deterministic = True
# Seed once with the default value as soon as the helper exists.
seed_everything()

In [7]:
def threshold_search(y_true, y_proba):
    """Scan thresholds 0.00, 0.01, ..., 0.99 and keep the one with the best F1.

    Args:
        y_true: ground-truth binary labels.
        y_proba: predicted scores; a sample counts as positive when its score
            is strictly greater than the threshold.

    Returns:
        dict with keys 'threshold' and 'f1'. Both remain 0 when no threshold
        produces an F1 above 0. On ties the lowest threshold wins.
    """
    result = {'threshold': 0, 'f1': 0}
    for step in tqdm(range(100), disable=True):
        candidate = step * 0.01
        candidate_f1 = f1_score(y_true=y_true, y_pred=y_proba > candidate)
        if candidate_f1 > result['f1']:
            result['threshold'] = candidate
            result['f1'] = candidate_f1
    return result

In [8]:
def sigmoid(x):
    """Logistic function 1 / (1 + e^(-x)), evaluated element-wise via NumPy."""
    exp_neg = np.exp(-x)
    return 1 / (1 + exp_neg)

# Processing input

- embed_sizeは300でいいと思う
- max_wordsを絞るとスコアが下がる気がするんだよね。
- maxlen=80だとNG?

In [9]:
embed_size = 300  # width of each embedding vector
max_features= 120000  # vocabulary cap passed to the tokenizer; other values tried: 200000, 95000
maxlen = 70  # sequences are padded to this length by pad_sequences
filters = '\t\n'  # characters handed to the Keras tokenizer as `filters`

In [10]:
# Strings that clean_punct surrounds with spaces so they become separate tokens.
# Includes ASCII punctuation, typographic symbols, full-width forms, math symbols
# and a few invisible/control characters written as escapes.
# NOTE(review): 'Σ' and 'Δ' appear to be listed twice, so they get padded twice.
puncts = (',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'",
          '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£', '·', 
          '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', 
          '§', '″', '′', 'Â', '█', '½', '…', '“', '★', '”', '–', '●', '►', '−', '¢', 
          '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', 
          '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', '¯', '♦', 
          '¤', '▲', '¸', '¾', 'Ã', '⋅', '‘', '∞', '∙', '）', '↓', '、', '│', '（', '»', '♪',
          '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√',
          '✔','，','？','℃','＞','！','。','／','；','„','‛','₹','÷','θ','π','Σ','Δ','ʃ','≠',
          '∈', '≡', '＝', 'Σ', 'Δ','∘','℅','≥','◦','ºF','∫','∠', '∑','∇','✓','∆',
          '\u200b', '\u200e', '\u202a', '\u202c', '\ufeff', '\uf0d8', '\u2061', 
          '\x10', '\x7f', '\x9d', '\xad', '\xa0',
         )
def clean_punct(x, puncts):
    """Surround every occurrence of each punctuation string with spaces.

    Args:
        x: text to process. Non-string values (None, or float NaN used by
            pandas for missing rows) are returned unchanged instead of
            raising, so the function is safe inside `Series.apply`.
        puncts: iterable of strings; each one is replaced by itself padded
            with one space on both sides, in iteration order.

    Returns:
        The padded string, or `x` itself when it is not a string.
    """
    if not isinstance(x, str):
        return x
    for punct in puncts:
        x = x.replace(punct, f' {punct} ')
    return x

In [11]:
# Keep copies of the frames before the next cell lower-cases question_text in place;
# the copies are tokenized later with the original casing.
train_question_row = train_df.copy()
test_question_row = test_df.copy()
local_question_row = local_test_df.copy()

In [12]:
%%time
for df in [train_df, test_df, local_test_df]:
    df["question_text"] = df["question_text"].str.lower()
    #df["question_text"] = df["question_text"].map(replace_typical_misspell)
    df["question_text"] = df["question_text"].apply(lambda x: clean_punct(x, puncts))
    df["question_text"].fillna("_##_", inplace=True)
    
x_train = train_df["question_text"].values
x_test = test_df["question_text"].values
x_test_local = local_test_df["question_text"].values

tokenizer = Tokenizer(num_words=max_features, filters=filters)
tokenizer.fit_on_texts(list(x_train) + list(x_test_local)+list(x_test))
x_train = tokenizer.texts_to_sequences(x_train)
x_test = tokenizer.texts_to_sequences(x_test)
x_test_local = tokenizer.texts_to_sequences(x_test_local)

x_train = pad_sequences(x_train, maxlen=maxlen)
x_test = pad_sequences(x_test, maxlen=maxlen)
x_test_local = pad_sequences(x_test_local, maxlen=maxlen)

y_train = train_df['target'].values
y_test = local_test_df['target'].values

CPU times: user 1min 53s, sys: 928 ms, total: 1min 54s
Wall time: 1min 54s


In [13]:
# Sanity check: inputs and labels should have matching row counts.
print('x_train rows {}'.format(len(x_train)))
print('y_train rows {}'.format(len(y_train)))
print('x_test_local rows {}'.format(len(x_test_local)))
print('y_test rows {}'.format(len(y_test)))

x_train rows 1306122
y_train rows 1306122
x_test_local rows 2
y_test rows 2


# Creating the embeddings matrix

In [14]:
## FUNCTIONS TAKEN FROM https://www.kaggle.com/gmhost/gru-capsule
def load_glove(word_index, embeddings_index=None):
    """Build an embedding matrix for `word_index` from GloVe vectors.

    Args:
        word_index: mapping of word -> integer index (e.g. `tokenizer.word_index`).
        embeddings_index: optional mapping of word -> vector. When missing or
            empty, the GloVe text file is parsed from disk.

    Returns:
        (embedding_matrix, word_in_embed):
        embedding_matrix has shape (nb_words, embed_size), where nb_words is
        min(max_features, len(word_index) + 1). Rows without a pretrained
        vector keep their random normal initialisation.
        word_in_embed lists the words for which a pretrained vector was found.
    """
    if not embeddings_index:
        EMBEDDING_FILE = '../input/embeddings/glove.840B.300d/glove.840B.300d.txt'
        def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')[:300]
        # Use a context manager so the file handle is closed after parsing.
        with open(EMBEDDING_FILE) as embedding_file:
            embeddings_index = dict(get_coefs(*o.split(" ")) for o in embedding_file)
    
    # Hard-coded mean/std used for the random initialisation
    # (presumably precomputed statistics of the GloVe vectors — TODO confirm).
    emb_mean,emb_std = -0.005838499,0.48782197

    # word_index = tokenizer.word_index
    nb_words = min(max_features, len(word_index)+1)
    word_in_embed = []
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    for word, i in word_index.items():
        # Indices at or above the vocabulary cap have no row in the matrix.
        if i >= max_features: continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None: 
            embedding_matrix[i] = embedding_vector
            word_in_embed.append(word)
            
    return embedding_matrix, word_in_embed

def load_para(word_index, embeddings_index=None):
    """Build an embedding matrix for `word_index` from Paragram vectors.

    Args:
        word_index: mapping of word -> integer index (e.g. `tokenizer.word_index`).
        embeddings_index: optional mapping of word -> vector. When missing or
            empty, the Paragram text file is parsed from disk.

    Returns:
        (embedding_matrix, word_in_embed):
        embedding_matrix has shape (nb_words, embed_size), where nb_words is
        min(max_features, len(word_index) + 1). Rows without a pretrained
        vector keep their random normal initialisation.
        word_in_embed lists the words for which a pretrained vector was found.
    """
    if not embeddings_index:
        EMBEDDING_FILE = '../input/embeddings/paragram_300_sl999/paragram_300_sl999.txt'
        def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
        # Use a context manager so the file handle is closed after parsing.
        # Undecodable bytes are ignored and lines of 100 characters or fewer are skipped.
        with open(EMBEDDING_FILE, encoding="utf8", errors='ignore') as embedding_file:
            embeddings_index = dict(get_coefs(*o.split(" ")) for o in embedding_file if len(o)>100)
    
    # Hard-coded mean/std used for the random initialisation
    # (presumably precomputed statistics of the Paragram vectors — TODO confirm).
    emb_mean,emb_std = -0.0053247833,0.49346462

    # word_index = tokenizer.word_index
    word_in_embed = []
    nb_words = min(max_features, len(word_index)+1)
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    for word, i in word_index.items():
        # Indices at or above the vocabulary cap have no row in the matrix.
        if i >= max_features: continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None: 
            embedding_matrix[i] = embedding_vector
            word_in_embed.append(word)
    
    return embedding_matrix, word_in_embed

In [15]:
%%time
word_in_glove = []
word_in_para = []

# NOTE: despite the `embeddings_index_*` names, these variables hold the embedding
# matrices returned by the loaders, not word -> vector dictionaries.
embeddings_index_glove, word_in_glove = load_glove(tokenizer.word_index)
embeddings_index_para, word_in_para = load_para(tokenizer.word_index)

# Element-wise average of the GloVe and Paragram matrices.
embedding_matrix = np.mean([embeddings_index_glove, embeddings_index_para], axis=0)
# Drop the per-source matrices once averaged.
del embeddings_index_glove ,embeddings_index_para
gc.collect()

np.shape(embedding_matrix)

CPU times: user 5min 33s, sys: 9.77 s, total: 5min 43s
Wall time: 5min 43s


In [16]:
def make_addtional_features(token, word_in_embed):
    '''Compute two hand-crafted features per tokenized sentence.

    Returned columns, in order:
        max_len_unknown - length of the longest lower-cased token that is not
                          in `word_in_embed` (0 when every token is known)
        captial_rate    - number of tokens for which `str.isupper()` is true,
                          divided by the number of tokens (0 for an empty sentence)

    Earlier versions also computed chars, words_vs_unique, unknown_count,
    min_len_unknown, unknown_vs_words and last_question, but never returned
    them. Those dead computations were removed; the last_question check also
    indexed `token` with a sentence length, which could raise IndexError.

    Args:
        token: sequence of sentences, each a list of string tokens with the
            original casing preserved.
        word_in_embed: container of known words supporting `in`
            (a set keeps the lookup fast).

    Returns:
        NumPy array with one row per sentence and two columns; shape (0, 2)
        when `token` is empty.
    '''
    features = []
    for sentence in tqdm(token):
        n_words = len(sentence)
        n_upper = sum(1 for word in sentence if word.isupper())
        captial_rate = n_upper / n_words if n_words != 0 else 0

        # Membership is tested on the lower-cased token.
        max_len_unknown = max(
            (len(word) for word in (raw.lower() for raw in sentence)
             if word not in word_in_embed),
            default=0,
        )

        features.append([max_len_unknown, captial_rate])

    # reshape is a no-op for non-empty input and gives (0, 2) for empty input.
    return np.array(features).reshape(-1, 2)

In [17]:
# Union of the words found in either embedding, stored as a set for membership tests.
word_in_embed = set(word_in_glove+word_in_para)

In [18]:
%%time
# Tokenize WITHOUT lower-casing here so that capitalization information is kept.
for df in [train_question_row, test_question_row, local_question_row]:
    df["question_text"] = df["question_text"].apply(lambda x: clean_punct(x, puncts))
    # Fills missing values in every column of the frame, not only question_text.
    df.fillna("_##_", inplace=True)

# One list of case-preserved tokens per question.
train_text_token = [text_to_word_sequence(text, lower=False, filters=filters) for text in train_question_row['question_text'].values]
test_text_token = [text_to_word_sequence(text, lower=False, filters=filters) for text in test_question_row['question_text'].values]
local_test_token = [text_to_word_sequence(text, lower=False, filters=filters) for text in local_question_row['question_text'].values]

CPU times: user 1min 12s, sys: 996 ms, total: 1min 13s
Wall time: 1min 13s


In [19]:
# train_text_token = [text_to_word_sequence(text) for text in train_df['question_text'].values]
# test_text_token = [text_to_word_sequence(text) for text in test_df['question_text'].values]
# local_test_token = [text_to_word_sequence(text) for text in local_test_df['question_text'].values]

In [20]:
# Hand-crafted features for the train, test and local-test token lists.
features = make_addtional_features(train_text_token,word_in_embed)
test_features = make_addtional_features(test_text_token,word_in_embed)
test_local_features =  make_addtional_features(local_test_token,word_in_embed)

100%|██████████| 1306122/1306122 [00:14<00:00, 93162.70it/s] 
100%|██████████| 56370/56370 [00:00<00:00, 110998.66it/s]
100%|██████████| 2/2 [00:00<00:00, 2778.60it/s]


In [21]:
# Fit the scaler on the training features only, then apply it to all three sets.
ss = StandardScaler()
ss.fit(features)
features = ss.transform(features)
test_features = ss.transform(test_features)
test_local_features = ss.transform(test_local_features)

In [22]:
# Re-read the labels from the frames (same assignments as at the end of the preprocessing cell).
y_train = train_df['target'].values
y_test = local_test_df['target'].values

In [23]:
# Shuffle the training rows with a fixed seed; the same permutation is applied
# to sequences, labels and features so they stay aligned.
np.random.seed(1029)
trn_idx = np.random.permutation(len(x_train))

x_train = x_train[trn_idx]
y_train = y_train[trn_idx]
features = features[trn_idx]

In [24]:
# Persist the prepared arrays as .npy files in the working directory.
# NOTE(review): x_test_local is not saved here.
np.save("x_train",x_train)
np.save("x_test",x_test)
np.save("y_train",y_train)

np.save("features",features)
np.save("test_features",test_features)
np.save('test_local_features', test_local_features)

In [25]:
# Reload the arrays written by the previous cell.
x_train = np.load("x_train.npy")
x_test = np.load("x_test.npy")
y_train = np.load("y_train.npy")
features = np.load("features.npy")
test_features = np.load("test_features.npy")
test_local_features = np.load('test_local_features.npy')

# Defining the model

In [26]:
class EarlyStopping:
    """Early stops the training if validation loss doesn't improve after a given patience."""
    def __init__(self, patience=0, verbose=False,filename='checkpoint.pt'):
        """
        Args:
            patience (int): How long to wait after last time validation loss improved.
                            Default: 0
            verbose (bool): If True, prints a message for each validation loss improvement. 
                            Default: False
            filename (str): Path the best model's state_dict is written to.
                            Default: 'checkpoint.pt'
        """
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_score = None
        self.early_stop = False
        # np.inf rather than the np.Inf alias, which NumPy 2.0 removed.
        self.val_loss_min = np.inf
        self.filename = filename

    def __call__(self, val_loss, model):
        """Record one validation result.

        Saves a checkpoint when `val_loss` is at least as good as the best
        seen so far; otherwise increments the counter and sets
        `early_stop` once it reaches `patience`.
        """
        # Higher score is better, so the loss is negated.
        score = -val_loss

        if self.best_score is None:
            self.best_score = score
            # save_checkpoint prints the verbose message itself.
            self.save_checkpoint(val_loss, model)
        elif score < self.best_score:
            self.counter += 1
            if self.verbose:
                print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            self.best_score = score
            self.save_checkpoint(val_loss, model)
            self.counter = 0

    def save_checkpoint(self, val_loss, model):
        '''Saves model when validation loss decrease.'''
        if self.verbose:
            print(f'Validation loss decreased ({self.val_loss_min:.6f} --> {val_loss:.6f}).  Saving model ...')
        torch.save(model.state_dict(), self.filename )
        self.val_loss_min = val_loss

In [27]:
# Stratified 4-fold split with a fixed seed, materialised so every fold can be reused.
n_splits = 4
splits = list(StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=1024).split(x_train, y_train))

In [28]:
class GaussianNoise(nn.Module):
    """Gaussian noise regularizer.

    Noise is only added in training mode and when `sigma` is non-zero; in
    eval mode the input is returned unchanged.

    Args:
        sigma (float, optional): relative standard deviation used to generate the
            noise. Relative means that it will be multiplied by the magnitude of
            the value your are adding the noise to. This means that sigma can be
            the same regardless of the scale of the vector.
        is_relative_detach (bool, optional): whether to detach the variable before
            computing the scale of the noise. If `False` then the scale of the noise
            won't be seen as a constant but something to optimize: this will bias the
            network to generate vectors with smaller values.
    """

    def __init__(self, sigma=0.1, is_relative_detach=True):
        super().__init__()
        self.sigma = sigma
        self.is_relative_detach = is_relative_detach
        # Kept so existing code that reads `.noise` still works. It is no longer
        # moved to CUDA here, so the module can be built on CPU-only machines.
        self.noise = torch.tensor(0 ,dtype=torch.float)

    def forward(self, x):
        """Return `x` plus zero-mean noise with standard deviation `sigma * |x|`."""
        if self.training and self.sigma != 0:
            scale = self.sigma * x.detach() if self.is_relative_detach else self.sigma * x
            # Sample on the same device and dtype as the input.
            sampled_noise = torch.randn_like(x) * scale
            x = x + sampled_noise
        return x 

In [29]:
class Attention(nn.Module):
    """Additive attention pooling over the time dimension.

    Scores each time step with a learned vector (plus an optional per-step
    bias), normalises the scores across steps and returns the weighted sum
    of the inputs.

    Args:
        feature_dim (int): size of the last dimension of the input.
        step_dim (int): number of time steps; the input is viewed as
            (-1, step_dim) when scoring.
        bias (bool): whether to add a learned per-step bias to the scores.
    """
    def __init__(self, feature_dim, step_dim, bias=True, **kwargs):
        super(Attention, self).__init__(**kwargs)
        
        self.supports_masking = True

        self.bias = bias
        self.feature_dim = feature_dim
        self.step_dim = step_dim
        self.features_dim = 0
        
        weight = torch.zeros(feature_dim, 1)
        nn.init.xavier_uniform_(weight)
        self.weight = nn.Parameter(weight)
        
        if bias:
            self.b = nn.Parameter(torch.zeros(step_dim))
        
    def forward(self, x, mask=None):
        """Pool `x` over its second dimension.

        Args:
            x: tensor whose last dimension is `feature_dim` and whose second
               dimension is `step_dim` (it is flattened to (-1, feature_dim)).
            mask: optional tensor multiplied into the unnormalised weights.

        Returns:
            Tensor with the step dimension summed out.
        """
        feature_dim = self.feature_dim
        step_dim = self.step_dim

        # One scalar score per (sample, step).
        eij = torch.mm(
            x.contiguous().view(-1, feature_dim), 
            self.weight
        ).view(-1, step_dim)
        
        if self.bias:
            eij = eij + self.b
            
        eij = torch.tanh(eij)
        a = torch.exp(eij)
        
        if mask is not None:
            a = a * mask

        # The epsilon belongs inside the denominator: it guards against division
        # by zero when every step of a row is masked out. Previously it was added
        # after the division, which left 0/0 = NaN for such rows.
        a = a / (torch.sum(a, 1, keepdim=True) + 1e-10)

        weighted_input = x * torch.unsqueeze(a, -1)
        return torch.sum(weighted_input, 1)

In [30]:
class NeuralNet(nn.Module):
    """BiLSTM -> BiGRU classifier with attention and pooling heads.

    The frozen pretrained embedding feeds a bidirectional LSTM followed by a
    bidirectional GRU. Attention over both recurrent outputs plus mean- and
    max-pooling of the GRU output are concatenated, perturbed with Gaussian
    noise, joined with two hand-crafted features, and passed through a small
    dense head that emits one logit per sample.

    Relies on the module-level `embedding_matrix`, `embed_size` and `maxlen`.
    """
    def __init__(self):
        super(NeuralNet, self).__init__()
        
        hidden_size = 90
        self.hidden_size = hidden_size
        
        self.embedding_weight = torch.FloatTensor(embedding_matrix)
        self.embedding = nn.Embedding.from_pretrained(self.embedding_weight, freeze=True)
        
        self.embedding_dropout = nn.Dropout2d(0.1)
        self.lstm = nn.LSTM(embed_size, hidden_size, bidirectional=True, batch_first=True)
        self.gru = nn.GRU(hidden_size * 2, hidden_size, bidirectional=True, batch_first=True)
        
        self.lstm_attention = Attention(hidden_size * 2, maxlen)
        self.gru_attention = Attention(hidden_size * 2, maxlen)
        
        self.gaussian = GaussianNoise(0.1)
        
        # Four pooled vectors of width hidden_size*2, plus the 2 hand-crafted features.
        self.linear = nn.Linear(hidden_size*2*4+2, 16)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.1)
        self.out = nn.Linear(16, 1)
    
    def forward(self, x):
        """Compute logits.

        Args:
            x: pair `[token_ids, features]`. `token_ids` is a LongTensor of
               padded sequences; `features` is array-like with two columns
               per sample.

        Returns:
            Tensor of logits with one column.
        """
        h_embedding = self.embedding(x[0])
        # Squeeze only the dimension that was added. A bare squeeze() would also
        # drop the batch dimension when a batch holds a single sample.
        h_embedding = self.embedding_dropout(torch.unsqueeze(h_embedding, 0)).squeeze(0)
        
        # Default (zero) initial hidden states are used for both recurrent layers.
        h_lstm, _ = self.lstm(h_embedding)
        h_gru, _ = self.gru(h_lstm)
        
        h_lstm_atten = self.lstm_attention(h_lstm)
        h_gru_atten = self.gru_attention(h_gru)
        
        avg_pool = torch.mean(h_gru, 1)
        max_pool, _ = torch.max(h_gru, 1)
        
        conc = torch.cat((h_lstm_atten, h_gru_atten, avg_pool, max_pool), 1)
        conc = self.gaussian(conc)
        
        # Place the features on the device the model runs on instead of
        # hard-coding .cuda(), so the network also works on CPU.
        f = torch.as_tensor(x[1], dtype=torch.float, device=conc.device)
        
        # The hand-crafted features are appended after the noise layer, so they stay unperturbed.
        conc = torch.cat((conc, f), 1)
        
        conc = self.relu(self.linear(conc))
        conc = self.dropout(conc)
        out = self.out(conc)
        
        return out

# Training

Regarding the training procedure, we use Cyclic LR with 5 epochs. I also made a separate function (`train_model`) to train the model because we are going to use it multiple times.

In [31]:
batch_size = 512  # samples per mini-batch for every loader
n_epochs = 5      # epochs trained per fold
clip = 5          # max gradient norm passed to clip_grad_norm_

In [32]:
class CyclicLR(object):
    """Cyclical learning-rate scheduler that is stepped once per batch.

    The learning rate of every parameter group oscillates between `base_lr`
    and `max_lr` in a triangular wave whose half-period is `step_size`
    batches. `step(loss)` can additionally be called with a loss value to
    shrink both bounds by `factor` (not below `min_lr`) whenever the loss is
    higher than on the previous call.

    Args:
        optimizer: wrapped `torch.optim.Optimizer`.
        base_lr: lower bound, scalar or one value per parameter group.
        max_lr: upper bound, scalar or one value per parameter group.
        step_size: number of batches in half a cycle.
        factor: multiplier applied to the bounds by `step` on a loss increase.
        min_lr: floor for the bounds when they are reduced.
        mode: 'triangular', 'triangular2' or 'exp_range'; ignored when
            `scale_fn` is given.
        gamma: base of the exponential decay in 'exp_range' mode.
        scale_fn: optional custom amplitude scaling function.
        scale_mode: 'cycle' or 'iterations'; what `scale_fn` receives.
            Only used together with a custom `scale_fn`.
        last_batch_iteration: index of the last batch, -1 to start fresh.

    Raises:
        TypeError: if `optimizer` is not an Optimizer.
        ValueError: on a bound list of the wrong length, or an unknown mode
            without `scale_fn`.
    """
    def __init__(self, optimizer, base_lr=1e-3, max_lr=6e-3,
                 step_size=2000, factor=0.6, min_lr=1e-4, mode='triangular', gamma=1.,
                 scale_fn=None, scale_mode='cycle', last_batch_iteration=-1):

        if not isinstance(optimizer, torch.optim.Optimizer):
            raise TypeError('{} is not an Optimizer'.format(
                type(optimizer).__name__))
        self.optimizer = optimizer

        self.base_lrs = self._per_group(base_lr, 'base_lr')
        self.max_lrs = self._per_group(max_lr, 'max_lr')

        self.step_size = step_size

        if mode not in ['triangular', 'triangular2', 'exp_range'] \
                and scale_fn is None:
            raise ValueError('mode is invalid and scale_fn is None')

        self.mode = mode
        self.gamma = gamma

        if scale_fn is None:
            if self.mode == 'triangular':
                self.scale_fn = self._triangular_scale_fn
                self.scale_mode = 'cycle'
            elif self.mode == 'triangular2':
                self.scale_fn = self._triangular2_scale_fn
                self.scale_mode = 'cycle'
            elif self.mode == 'exp_range':
                self.scale_fn = self._exp_range_scale_fn
                self.scale_mode = 'iterations'
        else:
            self.scale_fn = scale_fn
            self.scale_mode = scale_mode

        # State used by step(); initialised before the first batch_step.
        self.last_loss = np.inf
        self.min_lr = min_lr
        self.factor = factor

        # Apply the starting learning rate, then rewind the counter so the
        # first explicit batch_step() lands on the same iteration.
        self.batch_step(last_batch_iteration + 1)
        self.last_batch_iteration = last_batch_iteration

    def _per_group(self, value, name):
        """Expand a scalar, list or tuple bound into one value per parameter group."""
        n_groups = len(self.optimizer.param_groups)
        if isinstance(value, (list, tuple)):
            if len(value) != n_groups:
                raise ValueError("expected {} {}, got {}".format(
                    n_groups, name, len(value)))
            return list(value)
        return [value] * n_groups
        
    def batch_step(self, batch_iteration=None):
        """Advance one batch (or jump to `batch_iteration`) and update the optimizer's lr."""
        if batch_iteration is None:
            batch_iteration = self.last_batch_iteration + 1
        self.last_batch_iteration = batch_iteration
        for param_group, lr in zip(self.optimizer.param_groups, self.get_lr()):
            param_group['lr'] = lr

    def step(self, loss):
        """Shrink both lr bounds by `factor` when `loss` exceeds the previous loss.

        The loss is remembered on every call. Previously `last_loss` stayed at
        infinity forever, so the reduction could never trigger.
        """
        if loss > self.last_loss:
            self.base_lrs = [max(lr * self.factor, self.min_lr) for lr in self.base_lrs]
            self.max_lrs = [max(lr * self.factor, self.min_lr) for lr in self.max_lrs]
        self.last_loss = loss
            
    def _triangular_scale_fn(self, x):
        """Constant amplitude for every cycle."""
        return 1.

    def _triangular2_scale_fn(self, x):
        """Amplitude halves with each cycle `x` (1-based)."""
        return 1 / (2. ** (x - 1))

    def _exp_range_scale_fn(self, x):
        """Amplitude decays as gamma ** x, with `x` the batch iteration."""
        return self.gamma**(x)

    def get_lr(self):
        """Return the learning rate of every parameter group for the current iteration."""
        step_size = float(self.step_size)
        # 1-based index of the current cycle; a full cycle spans 2 * step_size batches.
        cycle = np.floor(1 + self.last_batch_iteration / (2 * step_size))
        # Distance from the cycle's peak: 1 at the ends, 0 at the peak.
        x = np.abs(self.last_batch_iteration / step_size - 2 * cycle + 1)

        lrs = []
        param_lrs = zip(self.optimizer.param_groups, self.base_lrs, self.max_lrs)
        for param_group, base_lr, max_lr in param_lrs:
            base_height = (max_lr - base_lr) * np.maximum(0, (1 - x))
            if self.scale_mode == 'cycle':
                lr = base_lr + base_height * self.scale_fn(cycle)
            else:
                lr = base_lr + base_height * self.scale_fn(self.last_batch_iteration)
            lrs.append(lr)
        return lrs

In [33]:
class MyDataset(Dataset):
    """Wrap a dataset of (data, target) pairs so each item also reports its index."""

    def __init__(self,dataset):
        self.dataset = dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, index):
        # The wrapped item must unpack into exactly two values.
        sample, label = self.dataset[index]
        return (sample, label, index)

In [34]:
# Test-set loader kept on the GPU, in original order (shuffle=False).
# NOTE(review): the next cell repeats these three statements verbatim.
x_test_cuda = torch.tensor(x_test, dtype=torch.long).cuda()
test = torch.utils.data.TensorDataset(x_test_cuda)
test_loader = torch.utils.data.DataLoader(test, batch_size=batch_size, shuffle=False)

In [35]:
# Loader for the real test set (unshuffled, so predictions line up with test_df rows).
x_test_cuda = torch.tensor(x_test, dtype=torch.long).cuda()
test = torch.utils.data.TensorDataset(x_test_cuda)
test_loader = torch.utils.data.DataLoader(test, batch_size=batch_size, shuffle=False)

# Same for the local hold-out set.
x_test_local_cuda = torch.tensor(x_test_local, dtype=torch.long).cuda()
test_local = torch.utils.data.TensorDataset(x_test_local_cuda)
test_local_loader = torch.utils.data.DataLoader(test_local, batch_size=batch_size, shuffle=False)

In [36]:
%%time
# K-fold training loop.
#   train_preds      - out-of-fold predictions for every training row
#   test_preds       - one column of test predictions per fold
#   test_preds_local - one column of local-test predictions per fold
train_preds = np.zeros(len(train_df))
test_preds = np.zeros((len(test_df), len(splits)))
test_preds_local = np.zeros((n_test, len(splits)))
seed = 6017

for ii, (train_idx, valid_idx) in enumerate(splits):
    fold_stime = time.time()
    
    # Token ids and labels for this fold live on the GPU; labels get a trailing axis.
    x_train_fold = torch.tensor(x_train[train_idx.astype(int)], dtype=torch.long).cuda()
    y_train_fold = torch.tensor(y_train[train_idx.astype(int), np.newaxis], dtype=torch.float32).cuda()
    
    # Hand-crafted features stay as NumPy arrays and are looked up per batch by index.
    kfold_X_features = features[train_idx.astype(int)]
    kfold_X_valid_features = features[valid_idx.astype(int)]
    x_val_fold = torch.tensor(x_train[valid_idx.astype(int)], dtype=torch.long).cuda()
    y_val_fold = torch.tensor(y_train[valid_idx.astype(int), np.newaxis], dtype=torch.float32).cuda()
    
    # A different seed per fold.
    seed_everything(seed + ii)
    
    model = NeuralNet()
    model.cuda()

    # Summed (not averaged) loss per batch, so the reported losses are per-batch sums.
    loss_fn = torch.nn.BCEWithLogitsLoss(reduction='sum')

    step_size = 300
    base_lr, max_lr = 0.001, 0.003   
    optimizer = torch.optim.Adam(model.parameters(), lr=max_lr)
    
    scheduler = CyclicLR(optimizer, base_lr=base_lr, max_lr=max_lr,step_size=step_size, 
                         mode='exp_range',gamma=0.99994)

    train = torch.utils.data.TensorDataset(x_train_fold, y_train_fold)
    valid = torch.utils.data.TensorDataset(x_val_fold, y_val_fold)
    
    # MyDataset adds the sample index to each item so the matching feature rows can be fetched.
    train = MyDataset(train)
    valid = MyDataset(valid)

    train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True)
    
    valid_loader = torch.utils.data.DataLoader(valid, batch_size=batch_size, shuffle=False)

    #early_stopping = EarlyStopping(patience=0,verbose=True)
    print(f'Fold {ii + 1}')
    
    # training start
    for epoch in range(n_epochs):
        start_time = time.time()
        model.train()
        avg_loss = 0.  
        
        for i, (x_batch, y_batch, index) in enumerate(train_loader):
            # Feature rows for the (shuffled) samples in this batch.
            f = kfold_X_features[index]
            y_pred = model([x_batch,f])
            
            # The learning rate is updated once per batch.
            if scheduler:
                scheduler.batch_step()
            
            loss = loss_fn(y_pred, y_batch)
            optimizer.zero_grad()
            loss.backward()
            
            nn.utils.clip_grad_norm_(model.parameters(), clip)
            
            optimizer.step()          
            avg_loss += loss.item() / len(train_loader)
            
        ### end of one epoch ###
        # validation
        # NOTE(review): the evaluation passes below run without torch.no_grad();
        # outputs are detached only after the forward pass.
        model.eval()
        
        valid_preds_fold = np.zeros((x_val_fold.size(0)))
        # NOTE(review): test_preds_fold and validate are assigned here but not used inside the epoch loop.
        test_preds_fold = np.zeros((len(test_df)))
        
        validate = True
        avg_val_loss = 0.

        for i, (x_batch, y_batch, index) in enumerate(valid_loader):
            f = kfold_X_valid_features[index]
            y_pred = model([x_batch,f]).detach()

            avg_val_loss += loss_fn(y_pred, y_batch).item() / len(valid_loader)
            # valid_loader is unshuffled, so positional slices match the batch order.
            valid_preds_fold[i * batch_size:(i+1) * batch_size] = sigmoid(y_pred.cpu().numpy())[:, 0]

        search_result = threshold_search(y_val_fold.cpu().numpy(), valid_preds_fold)
        val_f1, val_threshold = search_result['f1'], search_result['threshold']
        elapsed_time = time.time() - start_time
        print('Epoch {}/{} \t loss={:.4f} \t val_loss={:.4f} \t val_f1={:.4f} best_t={:.2f} \t time={:.2f}s'.format(
            epoch + 1, n_epochs, avg_loss, avg_val_loss, val_f1, val_threshold, elapsed_time))
        
#         early_stopping(1.-val_f1, model)
#         if early_stopping.early_stop:
#             print('Early stopping at',epoch+1,'epoch')
#             break
        
    #### end of one fold ####
    # The checkpoint reload is commented out, so the final-epoch weights are used below.
    #model.load_state_dict(torch.load('checkpoint.pt'))
    valid_preds_fold = np.zeros((x_val_fold.size(0)))
    
    avg_val_loss = 0.
    for i, (x_batch, y_batch, index) in enumerate(valid_loader):
        # Positional slice instead of `index`; equivalent here because the loader is unshuffled.
        f = kfold_X_valid_features[i * batch_size:(i+1) * batch_size]
        y_pred = model([x_batch, f]).detach()

        avg_val_loss += loss_fn(y_pred, y_batch).item() / len(valid_loader)
        valid_preds_fold[i * batch_size:(i+1) * batch_size] = sigmoid(y_pred.cpu().numpy())[:, 0]

    print('Validation loss: ', avg_val_loss)

    # Predict the real test set with this fold's model.
    test_preds_fold = np.zeros((len(test_loader.dataset)))
    
    for i, (x_batch,) in enumerate(test_loader):
        f = test_features[i * batch_size:(i+1) * batch_size]
        y_pred = model([x_batch,f]).detach()

        test_preds_fold[i * batch_size:(i+1) * batch_size] = sigmoid(y_pred.cpu().numpy())[:, 0]

    # Predict the local hold-out set with this fold's model.
    test_preds_local_fold = np.zeros((len(test_local_loader.dataset)))
    
    for i, (x_batch,) in enumerate(test_local_loader):
        f = test_local_features[i * batch_size:(i+1) * batch_size]
        y_pred = model([x_batch, f]).detach()

        test_preds_local_fold[i * batch_size:(i+1) * batch_size] = sigmoid(y_pred.cpu().numpy())[:, 0]
        
    # Store out-of-fold and per-fold predictions.
    train_preds[valid_idx] = valid_preds_fold
    test_preds[:, ii] = test_preds_fold
    test_preds_local[:, ii] = test_preds_local_fold
    elapsed_time = time.time() - fold_stime
    print()
    #break
    
#print('end')

Fold 1
Epoch 1/5 	 loss=65.6226 	 val_loss=51.6962 	 val_f1=0.6737 best_t=0.36 	 time=250.88s
Epoch 2/5 	 loss=57.6233 	 val_loss=50.6934 	 val_f1=0.6826 best_t=0.31 	 time=252.08s
Epoch 3/5 	 loss=54.1142 	 val_loss=50.2969 	 val_f1=0.6855 best_t=0.39 	 time=251.45s
Epoch 4/5 	 loss=50.4237 	 val_loss=51.2529 	 val_f1=0.6851 best_t=0.34 	 time=251.30s
Epoch 5/5 	 loss=46.6014 	 val_loss=52.5747 	 val_f1=0.6827 best_t=0.30 	 time=252.04s
Validation loss:  52.574651120224736

Fold 2
Epoch 1/5 	 loss=64.8794 	 val_loss=55.9611 	 val_f1=0.6683 best_t=0.18 	 time=251.63s
Epoch 2/5 	 loss=57.0850 	 val_loss=50.3772 	 val_f1=0.6828 best_t=0.37 	 time=252.07s
Epoch 3/5 	 loss=53.5185 	 val_loss=49.8198 	 val_f1=0.6823 best_t=0.39 	 time=252.68s
Epoch 4/5 	 loss=49.7282 	 val_loss=49.9711 	 val_f1=0.6866 best_t=0.32 	 time=251.75s
Epoch 5/5 	 loss=45.7953 	 val_loss=52.2739 	 val_f1=0.6846 best_t=0.32 	 time=252.69s
Validation loss:  52.273865786465755

Fold 3
Epoch 1/5 	 loss=65.9894 	 val_lo

# Evaluation

In [37]:
# Pick the global decision threshold from the out-of-fold predictions.
search_result = threshold_search(y_train, train_preds)
search_result

{'threshold': 0.34, 'f1': 0.6822392034271159}

In [38]:
# Correlation between the folds' predictions on the local test set.
pd.DataFrame(test_preds_local).corr()

Unnamed: 0,0,1,2,3
0,,,,
1,,,,
2,,,,
3,,,,


In [39]:
# F1 on the local test set, using the fold-averaged prediction and the threshold found above.
f1_score(y_test, test_preds_local.mean(axis=1) > search_result['threshold'])

0.0

In [40]:
# Build the submission: average the fold predictions and apply the threshold
# (the prediction column holds booleans).
submission = test_df[['qid']].copy()
submission['prediction'] = test_preds.mean(axis=1) > search_result['threshold']
submission.to_csv('submission.csv', index=False)