In [None]:
import time
import random
import pandas as pd
import numpy as np
import gc
import re
import torch
from torchtext import data
import spacy
from tqdm import tqdm_notebook, tnrange
from tqdm.auto import tqdm

tqdm.pandas(desc='Progress')
from collections import Counter
from textblob import TextBlob
from nltk import word_tokenize

import torch as t
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from torch.autograd import Variable
from torchtext.data import Example
from sklearn.metrics import f1_score
import torchtext
import os 
import string

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# cross validation and metrics
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
from torch.optim.optimizer import Optimizer
from unidecode import unidecode

In [None]:
# Wall-clock timestamp taken when this cell runs; subtract it from a later
# time.time() to get the elapsed run time in seconds (see the snippet below).
abs_start = time.time()
# time.time()-abs_start

In [None]:
# --- Text / training configuration -------------------------------------------
embed_size = 300 # how big is each word vector
max_features = 120000 # how many unique words to use (i.e num rows in embedding vector)
maxlen = 70 # max number of words in a question to use
batch_size = 512 # how many samples to process at once
n_epochs = 5 # how many times to iterate over all samples
n_splits = 5 # Number of K-fold Splits

# --- Settings read by ProjSumEmbedder1 / get_embedder --------------------------
proj_embed_sz = 300  # output size of each per-embedding linear projection
emb_dropout = 0.2  # dropout probability applied to the combined embedding
rnn_dim = 512  # not referenced in the visible part of the notebook -- TODO confirm usage
fc_dim = 1024  # not referenced in the visible part of the notebook -- TODO confirm usage
clf_dropout = 0.2  # not referenced in the visible part of the notebook -- TODO confirm usage
attnnet = 'dep_gating'  # attention variant: 'none', or '{dep,no_dep}_{softmax,gating}'

SEED = 1029  # seed for the training-set shuffle in load_and_prec

In [None]:
def seed_everything(seed=1029):
    """Seed the Python, NumPy and PyTorch (CPU + CUDA) random generators.

    Also exports ``PYTHONHASHSEED`` and switches cuDNN to deterministic mode.

    Args:
        seed: integer seed shared by every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    # Each of these callables takes the seed as its single argument.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seed_fn(seed)
    torch.backends.cudnn.deterministic = True


# Seed immediately with the default seed.
seed_everything()

In [None]:
def load(embedding_name,word_index):
    """Build an embedding matrix for ``word_index`` from a pretrained vector file.

    Args:
        embedding_name: one of ``'glove'``, ``'fasttext'`` or ``'para'``.
        word_index: mapping ``word -> integer index`` (the caller passes
            ``tokenizer.word_index``).

    Returns:
        ``numpy.ndarray`` of shape ``(min(max_features, len(word_index)), 300)``.
        Rows are drawn from ``N(emb_mean, emb_std)`` and then overwritten with the
        pretrained vector for every word found in the file (the capitalised form
        is tried as a fallback).

    Raises:
        ValueError: if ``embedding_name`` is not a supported embedding.
    """
    def get_coefs(word,*arr):
        # One text line -> (token, float32 vector).
        return word, np.asarray(arr, dtype='float32')

    # The files are opened in ``with`` blocks so the handles are closed as soon
    # as the (large) dictionaries are built.
    if embedding_name == 'glove':
        EMBEDDING_FILE = '../input/embeddings/glove.840B.300d/glove.840B.300d.txt'
        with open(EMBEDDING_FILE, encoding="utf8") as emb_file:
            # Vectors are truncated to their first 300 components.
            embeddings_index = {word: vec[:300] for word, vec in
                                (get_coefs(*o.split(" ")) for o in emb_file)}
        emb_mean,emb_std = -0.005838499,0.48782197
    elif embedding_name == 'fasttext':
        EMBEDDING_FILE = '../input/embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec'
        with open(EMBEDDING_FILE, encoding="utf8") as emb_file:
            # Lines of 100 characters or fewer are skipped (presumably the
            # "<count> <dim>" header line -- TODO confirm).
            embeddings_index = dict(get_coefs(*o.split(" ")) for o in emb_file if len(o)>100)
        emb_mean,emb_std = -0.0033469985,0.109855495
    elif embedding_name == 'para':
        EMBEDDING_FILE = '../input/embeddings/paragram_300_sl999/paragram_300_sl999.txt'
        with open(EMBEDDING_FILE, encoding="utf8", errors='ignore') as emb_file:
            embeddings_index = dict(get_coefs(*o.split(" ")) for o in emb_file if len(o)>100)
        emb_mean,emb_std = -0.0053247833,0.49346462
    else:
        raise ValueError("unknown embedding_name {!r}; expected 'glove', 'fasttext' or 'para'"
                         .format(embedding_name))

    embed_size = 300
    nb_words = min(max_features,len(word_index))

    # Punctuation missing from the pretrained vocabulary gets an all-zero vector
    # rather than a random one.
    punct = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\\×™√²—–&'
    for p in punct:
        if p not in embeddings_index:
            embeddings_index[p] = np.zeros(embed_size, dtype='float32')

    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))

    # NOTE(review): indices >= nb_words are skipped; if word_index is 1-based (as
    # with the Keras Tokenizer -- TODO confirm) row 0 keeps its random value.
    for word, i in word_index.items():
        if i >= nb_words:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is None:
            embedding_vector = embeddings_index.get(word.capitalize())
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    return embedding_matrix

In [None]:
# Single characters (ASCII punctuation, typographic symbols, box-drawing glyphs,
# a few accented letters) that clean_text surrounds with spaces.
puncts = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£', 
 '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', 
 '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', 
 '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', 
 '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', ]

def clean_text(x):
    """Return ``str(x)`` with each character listed in ``puncts`` padded by a space on both sides."""
    text = str(x)
    for mark in puncts:
        # Splitting on the mark and re-joining with the padded mark is the same
        # as replacing every occurrence of it.
        text = f' {mark} '.join(text.split(mark))
    return text

def clean_numbers(x):
    """Mask digit runs with '#': 5+ digits -> '#####', 4 -> '####', 3 -> '###', 2 -> '##'.

    Single digits are left untouched. The substitutions are applied from the
    longest pattern to the shortest.
    """
    digit_masks = (
        ('[0-9]{5,}', '#####'),
        ('[0-9]{4}', '####'),
        ('[0-9]{3}', '###'),
        ('[0-9]{2}', '##'),
    )
    for pattern, mask in digit_masks:
        x = re.sub(pattern, mask, x)
    return x

# Replacement table used by replace_typical_misspell: contraction expansions,
# British -> American spellings, and common typos / run-together words.
# NOTE(review): load_and_prec lower-cases the text and clean_text pads "'" with
# spaces before this table is applied, so keys containing a capital letter or an
# apostrophe look unreachable in that pipeline -- confirm.
mispell_dict = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",  "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": 
"we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",  "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have","you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have", 'colour': 'color', 'centre': 'center', 'favourite': 'favorite', 'travelling': 'traveling', 'counselling': 'counseling', 'theatre': 'theater', 'cancelled': 'canceled', 'labour': 'labor', 'organisation': 'organization', 'wwii': 'world war 2', 'citicise': 'criticize', 'youtu ': 'youtube ', 'Qoura': 'Quora', 'sallary': 'salary', 'Whta': 'What', 'narcisist': 'narcissist', 'howdo': 'how do', 'whatare': 'what are', 'howcan': 'how can', 'howmuch': 'how much', 'howmany': 'how many', 'whydo': 'why do', 'doI': 'do I', 'theBest': 'the best', 'howdoes': 'how does', 'mastrubation': 'masturbation', 'mastrubate': 'masturbate', "mastrubating": 'masturbating', 'pennis': 'penis', 'Etherium': 'Ethereum', 'narcissit': 'narcissist', 'bigdata': 'big data', '2k17': '2017', '2k18': '2018', 'qouta': 'quota', 'exboyfriend': 'ex boyfriend', 'airhostess': 'air hostess', "whst": 'what', 'watsapp': 'whatsapp', 'demonitisation': 'demonetization', 'demonitization': 'demonetization', 'demonetisation': 'demonetization'}

def _get_mispell(mispell_dict):
    mispell_re = re.compile('(%s)' % '|'.join(mispell_dict.keys()))
    return mispell_dict, mispell_re

# Build the replacement table and its compiled pattern once, when the cell runs.
mispellings, mispellings_re = _get_mispell(mispell_dict)

def replace_typical_misspell(text):
    """Return ``text`` with every ``mispellings_re`` match swapped for its ``mispellings`` entry."""
    return mispellings_re.sub(lambda hit: mispellings[hit.group(0)], text)

In [None]:
def add_features(df):
    """Add hand-crafted text statistics for ``question_text`` to ``df`` (in place).

    Columns written: ``total_length``, ``capitals``, ``caps_vs_length``,
    ``num_words``, ``num_unique_words``, ``words_vs_unique`` and
    ``count_punctuations``. ``question_text`` itself is coerced to ``str``.

    Args:
        df: DataFrame with a ``question_text`` column.

    Returns:
        The same DataFrame object, with the extra columns.
    """
    text = df['question_text'].apply(str)
    df['question_text'] = text

    df['total_length'] = text.apply(len)
    df['capitals'] = text.apply(lambda comment: sum(1 for c in comment if c.isupper()))
    # Column-wise division; an empty question (length 0) gets ratio 0.0 instead
    # of raising ZeroDivisionError.
    nonzero_length = df['total_length'].where(df['total_length'] > 0)
    df['caps_vs_length'] = (df['capitals'] / nonzero_length).fillna(0.0)

    df['num_words'] = text.str.count(r'\S+')
    df['num_unique_words'] = text.apply(lambda comment: len(set(comment.split())))
    # 0 / 0 yields NaN here for empty questions; load_and_prec fills it with 0.
    df['words_vs_unique'] = df['num_unique_words'] / df['num_words']

    punctuation_chars = set(string.punctuation)
    df["count_punctuations"] = text.apply(lambda x: sum(1 for c in x if c in punctuation_chars))

    return df

def _clean_question_column(series):
    """Lower-case and normalise every non-missing question; missing ones become '_##_'."""
    def normalise(text):
        # Same order as before: lower -> pad punctuation -> mask numbers -> fix spellings.
        # NOTE(review): clean_text pads "'" with spaces and the text is already
        # lower-cased, so contraction / mixed-case keys of mispell_dict cannot
        # match at the last step. Left unchanged because fixing it alters the
        # tokenisation -- confirm intent.
        return replace_typical_misspell(clean_numbers(clean_text(text.lower())))

    # na_action='ignore' keeps NaN away from normalise(); previously a missing
    # question crashed on ``.lower()`` before fillna was ever reached.
    return series.map(normalise, na_action='ignore').fillna("_##_")


def load_and_prec():
    """Load train/test CSVs and turn them into model-ready arrays.

    Steps: clean the question text, build a word-reversed copy of every
    question, compute scaled hand-crafted features, tokenise and pad both text
    variants to ``maxlen``, then shuffle the training rows with ``SEED``.

    Returns:
        ``(train_X, test_X, train_X_rev, test_X_rev, train_y, features,
        test_features, word_index)`` where the ``*_X`` arrays are padded token-id
        sequences, ``features`` / ``test_features`` are the standardised
        hand-crafted features and ``word_index`` is the tokenizer vocabulary.
        Only the training arrays are shuffled.
    """
    train_df = pd.read_csv("../input/train.csv")
    test_df = pd.read_csv("../input/test.csv")
    print("Train shape : ",train_df.shape)
    print("Test shape : ",test_df.shape)

    for frame in (train_df, test_df):
        frame["question_text"] = _clean_question_column(frame["question_text"])
        # Same question with its whitespace-separated words in reverse order.
        frame["question_text_rev"] = frame["question_text"].apply(lambda x: ' '.join(reversed(x.split())))

    train_X = train_df["question_text"].values
    test_X = test_df["question_text"].values
    train_X_rev = train_df["question_text_rev"].values
    test_X_rev = test_df["question_text_rev"].values

    ###################### Add Features ###############################
    #  https://github.com/wongchunghang/toxic-comment-challenge-lstm/blob/master/toxic_comment_9872_model.ipynb
    # NOTE(review): the features are computed on the lower-cased, cleaned text, so
    # 'capitals' and 'caps_vs_length' are effectively always zero -- confirm intent.
    train = add_features(train_df)
    test = add_features(test_df)

    feature_cols = ['total_length','capitals','caps_vs_length','num_words','num_unique_words','words_vs_unique','count_punctuations']
    features = train[feature_cols].fillna(0)
    test_features = test[feature_cols].fillna(0)

    # The scaler is fitted on train and test together, then applied to each.
    ss = StandardScaler()
    ss.fit(np.vstack((features, test_features)))
    features = ss.transform(features)
    test_features = ss.transform(test_features)
    ###########################################################################

    ## Tokenize the sentences (vocabulary is fitted on the forward training text only)
    tokenizer = Tokenizer(num_words=max_features,filters = '')
    tokenizer.fit_on_texts(list(train_X))
    train_X = tokenizer.texts_to_sequences(train_X)
    train_X_rev = tokenizer.texts_to_sequences(train_X_rev)
    test_X = tokenizer.texts_to_sequences(test_X)
    test_X_rev = tokenizer.texts_to_sequences(test_X_rev)

    ## Pad the sentences
    train_X = pad_sequences(train_X, maxlen=maxlen)
    test_X = pad_sequences(test_X, maxlen=maxlen)

    train_X_rev = pad_sequences(train_X_rev, maxlen=maxlen)
    test_X_rev = pad_sequences(test_X_rev, maxlen=maxlen)

    ## Get the target values
    train_y = train_df['target'].values

    # Shuffle all training arrays with one shared permutation so rows stay aligned.
    np.random.seed(SEED)
    trn_idx = np.random.permutation(len(train_X))

    train_X = train_X[trn_idx]
    train_X_rev = train_X_rev[trn_idx]
    train_y = train_y[trn_idx]
    features = features[trn_idx]

    return train_X, test_X,train_X_rev, test_X_rev, train_y, features, test_features, tokenizer.word_index

In [None]:
# Run the preprocessing pipeline; the last expression shows the elapsed time in minutes.
start  = time.time()
x_train, x_test,x_train_rev, x_test_rev, y_train, features, test_features, word_index = load_and_prec() 
(time.time() - start)/60

In [None]:
# Re-seed first: load() fills rows of missing words with np.random.normal draws.
seed_everything()
start  = time.time()

glove_embeddings = load('glove',word_index)
# paragram_embeddings = load('para',word_index)
fasttext_embeddings = load('fasttext',word_index)

# Two ways of combining the matrices: element-wise mean (same width) and
# column-wise concatenation (widths add up).
embedding_matrix_mean = np.mean([glove_embeddings,fasttext_embeddings], axis=0)
embedding_matrix_concat = np.concatenate([glove_embeddings,fasttext_embeddings], axis=1)

# Elapsed time in minutes.
(time.time() - start)/60

In [None]:
# Vector width of each combined embedding matrix.
embed_size_mean = embedding_matrix_mean.shape[1]
embed_size_concat = embedding_matrix_concat.shape[1]
# Embedding names handed to ProjSumEmbedder1 via get_embedder().
embeds = ['glove','fasttext']
# Displayed as the cell output.
embed_size_mean,embed_size_concat

In [None]:
use_pretrained_embedding = True  # not referenced in the visible part of the notebook -- TODO confirm usage

hidden_size = 60  # hidden size of the LSTM/GRU layers in the NeuralNet_* models
gru_len = hidden_size  # Caps_Layer's default input_dim_capsule is gru_len * 2

# Defaults for Caps_Layer (bound when the class is defined).
Routings = 4 #5
Num_capsule = 5
Dim_capsule = 5#16
dropout_p = 0.25  # not referenced in the visible part of the notebook -- TODO confirm usage
rate_drop_dense = 0.28  # not referenced in the visible part of the notebook -- TODO confirm usage
LR = 0.001  # presumably the optimizer learning rate -- TODO confirm usage
T_epsilon = 1e-7  # added under the square root in Caps_Layer.squash
num_classes = 30  # not referenced in the visible part of the notebook -- TODO confirm usage


class CyclicLR(object):
    """Cyclical learning-rate schedule that is advanced once per batch.

    The rate of every parameter group moves linearly from its ``base_lr`` up to
    its ``max_lr`` over ``step_size`` iterations and back down again; the height
    of each triangle is multiplied by ``scale_fn``.

    Args:
        optimizer: wrapped ``torch`` optimizer.
        base_lr: lower bound, a scalar or one value per parameter group.
        max_lr: upper bound, a scalar or one value per parameter group.
        step_size: number of iterations in half a cycle.
        mode: ``'triangular'``, ``'triangular2'`` or ``'exp_range'``; ignored
            when ``scale_fn`` is given.
        gamma: base of the exponential decay used by ``'exp_range'``.
        scale_fn: optional custom scaling function of one argument.
        scale_mode: ``'cycle'`` to feed ``scale_fn`` the cycle number, anything
            else to feed it the iteration count (custom ``scale_fn`` only).
        last_batch_iteration: index of the last processed batch (-1 = fresh start).

    Raises:
        TypeError: if ``optimizer`` is not an ``Optimizer``.
        ValueError: on a per-group list of the wrong length, or an unknown
            ``mode`` without a ``scale_fn``.
    """

    def __init__(self, optimizer, base_lr=1e-3, max_lr=6e-3,
                 step_size=2000, mode='triangular', gamma=1.,
                 scale_fn=None, scale_mode='cycle', last_batch_iteration=-1):
        if not isinstance(optimizer, Optimizer):
            raise TypeError('{} is not an Optimizer'.format(
                type(optimizer).__name__))
        self.optimizer = optimizer

        n_groups = len(optimizer.param_groups)

        def per_group(value, mismatch_msg):
            # Broadcast a scalar to every group, or validate an explicit sequence.
            if not isinstance(value, (list, tuple)):
                return [value] * n_groups
            if len(value) != n_groups:
                raise ValueError(mismatch_msg.format(n_groups, len(value)))
            return list(value)

        self.base_lrs = per_group(base_lr, "expected {} base_lr, got {}")
        self.max_lrs = per_group(max_lr, "expected {} max_lr, got {}")
        self.step_size = step_size

        if scale_fn is None and mode not in ('triangular', 'triangular2', 'exp_range'):
            raise ValueError('mode is invalid and scale_fn is None')
        self.mode = mode
        self.gamma = gamma

        if scale_fn is not None:
            self.scale_fn, self.scale_mode = scale_fn, scale_mode
        else:
            presets = {
                'triangular': (self._triangular_scale_fn, 'cycle'),
                'triangular2': (self._triangular2_scale_fn, 'cycle'),
                'exp_range': (self._exp_range_scale_fn, 'iterations'),
            }
            self.scale_fn, self.scale_mode = presets[self.mode]

        # Apply the rate for the upcoming iteration, then rewind the counter so
        # the first batch_step() call lands on that same iteration.
        self.batch_step(last_batch_iteration + 1)
        self.last_batch_iteration = last_batch_iteration

    def batch_step(self, batch_iteration=None):
        """Move to ``batch_iteration`` (default: the next one) and update the optimizer's rates."""
        self.last_batch_iteration = (
            self.last_batch_iteration + 1 if batch_iteration is None else batch_iteration)
        for group, new_lr in zip(self.optimizer.param_groups, self.get_lr()):
            group['lr'] = new_lr

    def _triangular_scale_fn(self, x):
        """Constant amplitude."""
        return 1.

    def _triangular2_scale_fn(self, x):
        """Amplitude halves with each cycle ``x``."""
        return 1 / (2. ** (x - 1))

    def _exp_range_scale_fn(self, x):
        """Amplitude decays as ``gamma ** x``."""
        return self.gamma**(x)

    def get_lr(self):
        """Return the learning rate of every parameter group at the current iteration."""
        half_cycle = float(self.step_size)
        iteration = self.last_batch_iteration
        cycle = np.floor(1 + iteration / (2 * half_cycle))
        x = np.abs(iteration / half_cycle - 2 * cycle + 1)
        # 0 at the ends of a cycle, 1 at its peak.
        triangle = np.maximum(0, (1 - x))
        scale_arg = cycle if self.scale_mode == 'cycle' else iteration

        groups = zip(self.optimizer.param_groups, self.base_lrs, self.max_lrs)
        return [low + (high - low) * triangle * self.scale_fn(scale_arg)
                for _, low, high in groups]

    
# core caps_layer with squash func
class Caps_Layer(nn.Module):
    """Capsule layer with routing-by-agreement and a shared transformation matrix.

    Args:
        input_dim_capsule: size of the last axis of the input.
        num_capsule: number of output capsules.
        dim_capsule: size of every output capsule.
        routings: number of routing iterations (must be >= 1).
        kernel_size: stored but not used by this implementation.
        share_weights: must be ``True``; the per-sample weight variant was never
            implemented.
        activation: ``'default'`` for :meth:`squash`, anything else for ReLU.

    Raises:
        NotImplementedError: if ``share_weights`` is false.
        ValueError: if ``routings`` is smaller than 1.
    """

    def __init__(self, input_dim_capsule=gru_len * 2, num_capsule=Num_capsule, dim_capsule=Dim_capsule, \
                 routings=Routings, kernel_size=(9, 1), share_weights=True,
                 activation='default', **kwargs):
        super(Caps_Layer, self).__init__(**kwargs)

        if not share_weights:
            # This branch used to reference an undefined BATCH_SIZE, and forward()
            # only printed 'add later' before failing; fail early and clearly.
            raise NotImplementedError('Caps_Layer only supports share_weights=True')
        if routings < 1:
            # With zero iterations forward() would have no output to return.
            raise ValueError('routings must be >= 1, got {}'.format(routings))

        self.num_capsule = num_capsule
        self.dim_capsule = dim_capsule
        self.routings = routings
        self.kernel_size = kernel_size  # not used for now
        self.share_weights = share_weights
        if activation == 'default':
            self.activation = self.squash
        else:
            self.activation = nn.ReLU(inplace=True)

        # One projection shared by every input position:
        # (1, input_dim_capsule, num_capsule * dim_capsule).
        self.W = nn.Parameter(
            nn.init.xavier_normal_(t.empty(1, input_dim_capsule, self.num_capsule * self.dim_capsule)))

    def forward(self, x):
        """Route the input capsules ``x`` into ``num_capsule`` output capsules.

        Args:
            x: tensor whose first axis is the batch, second axis the input
                capsules and last axis ``input_dim_capsule``.

        Returns:
            Tensor of shape ``(batch_size, num_capsule, dim_capsule)``.
        """
        u_hat_vecs = t.matmul(x, self.W)

        batch_size = x.size(0)
        input_num_capsule = x.size(1)
        u_hat_vecs = u_hat_vecs.view((batch_size, input_num_capsule,
                                      self.num_capsule, self.dim_capsule))
        # -> (batch_size, num_capsule, input_num_capsule, dim_capsule)
        u_hat_vecs = u_hat_vecs.permute(0, 2, 1, 3)
        # Routing logits: (batch_size, num_capsule, input_num_capsule)
        b = t.zeros_like(u_hat_vecs[:, :, :, 0])

        for i in range(self.routings):
            # Coupling coefficients are normalised across the output capsules
            # (same as the former permute -> softmax(dim=2) -> permute).
            c = F.softmax(b, dim=1)
            # outputs: (batch_size, num_capsule, dim_capsule)
            outputs = self.activation(t.einsum('bij,bijk->bik', (c, u_hat_vecs)))
            if i < self.routings - 1:
                # Agreement between each output capsule and its predictions.
                b = t.einsum('bik,bijk->bij', (outputs, u_hat_vecs))
        return outputs

    # text version of squash, slight different from original one
    def squash(self, x, axis=-1):
        """Scale ``x`` to (almost) unit L2 norm along ``axis``; ``T_epsilon`` guards against a zero norm."""
        s_squared_norm = (x ** 2).sum(axis, keepdim=True)
        scale = t.sqrt(s_squared_norm + T_epsilon)
        return x / scale


In [None]:
class Attention(nn.Module):
    """Additive attention pooling over the step axis of a ``(batch, step_dim, feature_dim)`` input.

    Args:
        feature_dim: size of the last axis of the input.
        step_dim: number of steps (size of axis 1) the layer is built for.
        bias: whether to learn one additive bias per step.
    """

    def __init__(self, feature_dim, step_dim, bias=True, **kwargs):
        super(Attention, self).__init__(**kwargs)
        
        self.supports_masking = True

        self.bias = bias
        self.feature_dim = feature_dim
        self.step_dim = step_dim
        self.features_dim = 0
        
        # Scoring vector of shape (feature_dim, 1).
        weight = torch.zeros(feature_dim, 1)
        nn.init.xavier_uniform_(weight)
        self.weight = nn.Parameter(weight)
        
        if bias:
            self.b = nn.Parameter(torch.zeros(step_dim))
        
    def forward(self, x, mask=None):
        """Return the attention-weighted sum of ``x`` over axis 1.

        Args:
            x: tensor that can be viewed as ``(-1, feature_dim)`` with ``step_dim``
                steps per sample.
            mask: optional tensor multiplied into the un-normalised weights
                (0 removes a step).

        Returns:
            Tensor with axis 1 summed out.
        """
        feature_dim = self.feature_dim
        step_dim = self.step_dim

        # One scalar score per (sample, step).
        eij = torch.mm(
            x.contiguous().view(-1, feature_dim), 
            self.weight
        ).view(-1, step_dim)
        
        if self.bias:
            eij = eij + self.b
            
        eij = torch.tanh(eij)
        a = torch.exp(eij)
        
        if mask is not None:
            a = a * mask

        # The epsilon belongs in the denominator: it protects against a row whose
        # weights are all masked out (previously 0 / 0 -> NaN).
        a = a / (torch.sum(a, 1, keepdim=True) + 1e-10)

        weighted_input = x * torch.unsqueeze(a, -1)
        return torch.sum(weighted_input, 1)
    
    
class MyDataset(Dataset):
    """Wrap an indexable dataset of ``(data, target)`` pairs so every item also carries its index."""

    def __init__(self,dataset):
        self.dataset = dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, index):
        # The wrapped item must unpack into exactly two values.
        sample, label = self.dataset[index]
        return sample, label, index

def sigmoid(x):
    """Logistic function ``1 / (1 + exp(-x))``, element-wise for NumPy inputs."""
    decay = np.exp(-x)
    return 1 / (1 + decay)

In [132]:
def nn_init(nn_module, method='xavier'):
    """Re-initialise the parameters of ``nn_module`` in place.

    Parameters whose name contains ``'weight'`` are initialised with
    ``init_weight(param, method)``; those whose name contains ``'bias'`` are
    drawn uniformly from ``[-1e-4, 1e-4]``. Other parameters are left untouched.

    Args:
        nn_module: any ``nn.Module``.
        method: initialisation scheme forwarded to ``init_weight``.
    """
    # named_parameters() already yields the Parameter objects, so there is no
    # need to resolve them again by name; this also handles nested sub-modules
    # whose dotted names a plain getattr cannot look up.
    for param_name, param in nn_module.named_parameters():
        if 'weight' in param_name:
            init_weight(param, method)
        elif 'bias' in param_name:
            nn.init.uniform_(param, -1e-4, 1e-4)

def init_weight(weight, method):
    """Initialise the tensor ``weight`` in place with the named scheme.

    Args:
        weight: tensor / parameter to initialise.
        method: ``'orthogonal'``, ``'xavier'`` (uniform), ``'kaiming'`` (uniform)
            or ``'none'`` to leave ``weight`` unchanged.

    Raises:
        ValueError: if ``method`` is not one of the names above. ``ValueError``
            is a subclass of ``Exception``, so existing handlers keep working.
    """
    if method == 'orthogonal':
        nn.init.orthogonal_(weight)
    elif method == 'xavier':
        nn.init.xavier_uniform_(weight)
    elif method == 'kaiming':
        nn.init.kaiming_uniform_(weight)
    elif method == 'none':
        pass
    else:
        raise ValueError('Unknown init method: {!r}'.format(method))

class SingleEmbedder1(nn.Module):
    """Frozen embedding lookup backed by one of the preloaded matrices.

    Args:
        name: ``'glove'`` or ``'fasttext'``; selects ``glove_embeddings`` or
            ``fasttext_embeddings``.

    Raises:
        ValueError: if ``name`` is not a known embedding.
    """

    def __init__(self,name):
        super(SingleEmbedder1, self).__init__()
        if name == 'glove':
            self.embedding_matrix = glove_embeddings
        elif name == 'fasttext':
            self.embedding_matrix = fasttext_embeddings
        else:
            # Previously an unknown name fell through and failed further down
            # with an AttributeError on self.embedding_matrix.
            raise ValueError("unknown embedding name {!r}; expected 'glove' or 'fasttext'".format(name))
        self.embedder = nn.Embedding(max_features, 300)
        # Replace the randomly initialised table with the pretrained vectors.
        self.embedder.weight = nn.Parameter(torch.tensor(self.embedding_matrix, dtype=torch.float32))
        # NOTE(review): requires a CUDA device at construction time.
        self.embedder.cuda()
        # Pretrained vectors are not fine-tuned.
        self.embedder.weight.requires_grad = False
        
    def forward(self, x):
        """Return the embedding vectors for the token ids in ``x``."""
        return self.embedder(x)

class ProjSumEmbedder1(nn.Module):
    """Combine several pretrained embeddings into one vector per token.

    Every embedding named in ``embeds`` is looked up with a ``SingleEmbedder1``
    and projected by its own ``nn.Linear(300, proj_embed_sz)``. The projections
    are then either summed (``attnnet == 'none'``) or summed after being weighted
    by learned attention scores.

    Args:
        attnnet: ``'none'``, or a string starting with ``'dep_'`` (scores from a
            bidirectional LSTM) or ``'no_dep_'`` (scores from two linear layers)
            and ending with ``'_gating'`` (sigmoid) or ``'_softmax'``.
        embeds: iterable of embedding names accepted by ``SingleEmbedder1``.
        proj_embed_sz: output size of the per-embedding projections.
    """
    def __init__(self,attnnet,embeds,proj_embed_sz):
        super(ProjSumEmbedder1, self).__init__()
#         assert attnnet in {'no_dep_softmax', 'dep_softmax', 'no_dep_gating', 'dep_gating'}
        self.attnnet = attnnet
        self.n_emb = len(embeds)
        self.emb_sz = proj_embed_sz
        # Sorted so forward() always visits the embeddings in the same order.
        self.emb_names = sorted([name for name in embeds])
        self.embedders = nn.ModuleDict()
        self.projectors = nn.ModuleDict()
        self.nonlin = 'relu'
        
        for name in embeds:
            self.embedders.update({name: SingleEmbedder1(name)})
            self.projectors.update({name: nn.Linear(300, proj_embed_sz).cuda()})
            nn_init(self.projectors[name], 'xavier')

        # Attention sub-networks are only created when there is more than one
        # embedding to weigh against the others.
        # NOTE(review): with a single embedding and attnnet != 'none' these stay
        # None and forward() would fail when calling them -- confirm callers.
        self.attn_0, self.attn_1 = None, None
        self.m_attn = None
        if self.n_emb > 1:
            if attnnet.startswith('dep_'):
                # Bidirectional LSTM with hidden size 2 -> 4 features -> 1 score.
                self.attn_0 = nn.LSTM(proj_embed_sz, 2, bidirectional=True)
                nn_init(self.attn_0, 'orthogonal')
                self.attn_1 = nn.Linear(2 * 2, 1)
                nn_init(self.attn_1, 'xavier')
            elif attnnet.startswith('no_dep_'):
                self.attn_0 = nn.Linear(proj_embed_sz, 2)
                nn_init(self.attn_0, 'xavier')
                self.attn_1 = nn.Linear(2, 1)
                nn_init(self.attn_1, 'xavier')

        self.dropout = nn.Dropout(p=emb_dropout)
        
    def forward(self, seq_nums):
        """Embed the token ids ``seq_nums`` and merge the per-embedding projections.

        Returns a tensor with the two leading axes of ``seq_nums`` followed by an
        axis of size ``proj_embed_sz``, after ReLU and (if ``emb_dropout > 0``)
        dropout. The last attention weights are kept in ``self.m_attn``.
        """
        projected = [self.projectors[name](self.embedders[name](seq_nums)) for name in self.emb_names]

        if self.attnnet == 'none':
            out = sum(projected)
        else:
            # Stack the projections on a new axis 2: (d0, d1, n_emb, emb_dim).
            projected_cat = torch.cat([p.unsqueeze(2) for p in projected], 2)
            # NOTE(review): the names suggest a (seq_len, batch) layout and attn_0
            # is not batch_first -- confirm that callers feed inputs in that layout.
            s_len, b_size, _, emb_dim = projected_cat.size()
            attn_input = projected_cat

            if self.attnnet.startswith('dep_'):
                # Fold the embedding axis into axis 1 so the LSTM scores each
                # embedding's sequence separately.
                attn_input = attn_input.view(s_len, b_size * self.n_emb, -1)
                self.m_attn = self.attn_1(self.attn_0(attn_input)[0])
                self.m_attn = self.m_attn.view(s_len, b_size, self.n_emb)
            elif self.attnnet.startswith('no_dep_'):
                self.m_attn = self.attn_1(self.attn_0(attn_input)).squeeze(3)

            if self.attnnet.endswith('_gating'):
                # Independent gate in (0, 1) per embedding.
                self.m_attn = torch.sigmoid(self.m_attn)
            elif self.attnnet.endswith('_softmax'):
                # Weights sum to 1 across the embeddings.
                self.m_attn = F.softmax(self.m_attn, dim=2)

            attended = projected_cat * self.m_attn.view(s_len, b_size, self.n_emb, 1).expand_as(projected_cat)
            out = attended.sum(2)

        if self.nonlin == 'relu':
            out = F.relu(out)
        # Reads the module-level emb_dropout setting, not self.dropout.p.
        if emb_dropout > 0.0:
            out = self.dropout(out)
        return out
    
def get_embedder():
    """Build a ``ProjSumEmbedder1`` from the module-level ``attnnet``, ``embeds`` and ``proj_embed_sz`` settings."""
    return ProjSumEmbedder1(attnnet=attnnet, embeds=embeds, proj_embed_sz=proj_embed_sz)

def bestThresshold(y_train,train_preds):
    """Scan decision thresholds and return the one with the highest F1 score.

    Thresholds 0.05, 0.06, ... up to about 0.80 are tried in increasing order;
    ties keep the lowest threshold. The winner is also printed.

    Args:
        y_train: tensor of true labels (moved to the CPU before scoring).
        train_preds: array-like of predicted scores.

    Returns:
        The best threshold, or 0 if no threshold reaches an F1 above 0.
    """
    y_train = y_train.cpu()
    scores = np.array(train_preds)
    delta = 0
    best_f1 = 0
    for threshold in np.arange(0.05, 0.801, 0.01):
        current_f1 = f1_score(y_train, scores > threshold)
        if current_f1 > best_f1:
            delta, best_f1 = threshold, current_f1
    print('best threshold is {:.4f} with F1 score: {:.4f}'.format(delta, best_f1))
    return delta

class FocalLoss(nn.Module):
    """Binary focal loss: ``alpha * (1 - p_t) ** gamma * BCE``.

    Args:
        alpha: constant weight applied to every element.
        gamma: focusing exponent; larger values down-weight easy examples.
        logits: ``True`` if the inputs are raw logits, ``False`` if they are
            probabilities.
        reduction: ``'elementwise_mean'`` or ``'mean'`` (average, the default),
            ``'sum'``, or ``'none'`` / ``None`` (per-element loss).

    Raises:
        ValueError: if ``reduction`` is not one of the values above.
    """

    _REDUCTIONS = (None, 'none', 'mean', 'elementwise_mean', 'sum')

    def __init__(self, alpha=0.5, gamma=1, logits=True, reduction='elementwise_mean'):
        super(FocalLoss, self).__init__()
        if reduction not in self._REDUCTIONS:
            raise ValueError('unsupported reduction: {!r}'.format(reduction))
        self.alpha = alpha
        self.gamma = gamma
        self.logits = logits
        self.reduction = reduction

    def forward(self, inputs, targets):
        """Return the focal loss of ``inputs`` against ``targets``, reduced as configured."""
        if self.logits:
            BCE_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction='none')
        else:
            BCE_loss = F.binary_cross_entropy(inputs, targets, reduction='none')
        # exp(-BCE) is the probability assigned to the true class.
        pt = torch.exp(-BCE_loss)
        F_loss = self.alpha * (1-pt)**self.gamma * BCE_loss

        # Previously every non-None value, including 'none' and 'sum', silently
        # produced the mean.
        if self.reduction is None or self.reduction == 'none':
            return F_loss
        if self.reduction == 'sum':
            return torch.sum(F_loss)
        return torch.mean(F_loss)

In [None]:
###define models here
# Define train_fn

In [None]:
class NeuralNet_merge_attn_concat(nn.Module):
    """BiLSTM -> BiGRU classifier on the concatenated GloVe + fastText embedding matrix.

    The GRU output is summarised by a capsule layer, attention pooling, mean
    pooling and max pooling; those summaries, the LSTM attention pooling and the
    hand-crafted features are concatenated and passed through a small dense head
    that emits one logit per sample.
    """

    def __init__(self):
        super(NeuralNet_merge_attn_concat, self).__init__()
        
        fc_layer = 16
        fc_layer1 = 16

        # Frozen pretrained table (embedding_matrix_concat, 600 columns).
        self.embedding = nn.Embedding(max_features, 600)
        self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix_concat, dtype=torch.float32))
        self.embedding.weight.requires_grad = False
        
        self.embedding_dropout = nn.Dropout2d(0.1)
        self.lstm = nn.LSTM(600, hidden_size, bidirectional=True, batch_first=True)
        self.gru = nn.GRU(hidden_size * 2, hidden_size, bidirectional=True, batch_first=True)

        # lstm2 and fc are not used in forward(); they are kept so that the set of
        # parameters and the order of random initialisation stay unchanged.
        self.lstm2 = nn.LSTM(hidden_size * 2, hidden_size, bidirectional=True, batch_first=True)

        self.lstm_attention = Attention(hidden_size * 2, maxlen)
        self.gru_attention = Attention(hidden_size * 2, maxlen)
        self.bn = nn.BatchNorm1d(fc_layer1, momentum=0.5)
        # Input width: 4 pooled vectors of size 2*hidden_size, plus 1 capsule
        # feature, plus the hand-crafted features (assumed to be 7 columns).
        self.linear = nn.Linear(hidden_size*8+8, fc_layer1) #643:80 - 483:60 - 323:40
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(fc_layer**2,fc_layer)
        self.out = nn.Linear(fc_layer, 1)
        self.lincaps = nn.Linear(Num_capsule * Dim_capsule, 1)
        self.caps_layer = Caps_Layer()
    
    def forward(self, x):
        """Compute one logit per sample.

        Args:
            x: indexable pair; ``x[0]`` holds the token ids (batch first, ``maxlen``
                steps) and ``x[1]`` the hand-crafted features, one row per sample.

        Returns:
            Tensor of shape ``(batch, 1)`` with raw (pre-sigmoid) scores.
        """
        h_embedding = self.embedding(x[0])
        # Dropout2d is applied to a (1, batch, seq, emb) view.
        # NOTE(review): in this layout Dropout2d's channel axis is the batch axis,
        # i.e. whole samples are zeroed -- confirm this is the intended dropout.
        # Only the dummy leading axis is squeezed, so a batch of size 1 keeps its
        # batch dimension (a bare torch.squeeze would drop it).
        h_embedding = self.embedding_dropout(torch.unsqueeze(h_embedding, 0)).squeeze(0)
        
        h_lstm, _ = self.lstm(h_embedding)
        h_gru, _ = self.gru(h_lstm)

        ##Capsule Layer        
        content3 = self.caps_layer(h_gru)
        content3 = self.dropout(content3)
        # Flatten the capsules and reduce them to a single feature.
        content3 = content3.view(content3.size(0), -1)
        content3 = self.relu(self.lincaps(content3))

        ##Attention Layer
        h_lstm_atten = self.lstm_attention(h_lstm)
        h_gru_atten = self.gru_attention(h_gru)
        
        # global average pooling
        avg_pool = torch.mean(h_gru, 1)
        # global max pooling
        max_pool, _ = torch.max(h_gru, 1)
        
        # Accepts arrays or tensors and follows the model's device instead of
        # assuming CUDA.
        f = torch.as_tensor(x[1], dtype=torch.float, device=h_gru.device)
        conc = torch.cat((h_lstm_atten, h_gru_atten,content3, avg_pool, max_pool,f), 1)
        conc = self.relu(self.linear(conc))
        conc = self.bn(conc)
        conc = self.dropout(conc)
        out = self.out(conc)
        
        return out
##################################################################################
class NeuralNet_merge_attn_mean(nn.Module):
    """BiLSTM -> BiGRU classifier on the element-wise mean of the GloVe and fastText matrices.

    The GRU output is summarised by a capsule layer, attention pooling, mean
    pooling and max pooling; those summaries, the LSTM attention pooling and the
    hand-crafted features are concatenated and passed through a small dense head
    that emits one logit per sample.
    """

    def __init__(self):
        super(NeuralNet_merge_attn_mean, self).__init__()
        
        fc_layer = 16
        fc_layer1 = 16

        # Frozen pretrained table (embedding_matrix_mean, 300 columns).
        self.embedding = nn.Embedding(max_features, 300)
        self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix_mean, dtype=torch.float32))
        self.embedding.weight.requires_grad = False
        
        self.embedding_dropout = nn.Dropout2d(0.1)
        self.lstm = nn.LSTM(300, hidden_size, bidirectional=True, batch_first=True)
        self.gru = nn.GRU(hidden_size * 2, hidden_size, bidirectional=True, batch_first=True)

        # lstm2 and fc are not used in forward(); they are kept so that the set of
        # parameters and the order of random initialisation stay unchanged.
        self.lstm2 = nn.LSTM(hidden_size * 2, hidden_size, bidirectional=True, batch_first=True)

        self.lstm_attention = Attention(hidden_size * 2, maxlen)
        self.gru_attention = Attention(hidden_size * 2, maxlen)
        self.bn = nn.BatchNorm1d(fc_layer1, momentum=0.5)
        # Input width: 4 pooled vectors of size 2*hidden_size, plus 1 capsule
        # feature, plus the hand-crafted features (assumed to be 7 columns).
        self.linear = nn.Linear(hidden_size*8+8, fc_layer1) #643:80 - 483:60 - 323:40
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(fc_layer**2,fc_layer)
        self.out = nn.Linear(fc_layer, 1)
        self.lincaps = nn.Linear(Num_capsule * Dim_capsule, 1)
        self.caps_layer = Caps_Layer()
    
    def forward(self, x):
        """Compute one logit per sample.

        Args:
            x: indexable pair; ``x[0]`` holds the token ids (batch first, ``maxlen``
                steps) and ``x[1]`` the hand-crafted features, one row per sample.

        Returns:
            Tensor of shape ``(batch, 1)`` with raw (pre-sigmoid) scores.
        """
        h_embedding = self.embedding(x[0])
        # Dropout2d is applied to a (1, batch, seq, emb) view.
        # NOTE(review): in this layout Dropout2d's channel axis is the batch axis,
        # i.e. whole samples are zeroed -- confirm this is the intended dropout.
        # Only the dummy leading axis is squeezed, so a batch of size 1 keeps its
        # batch dimension (a bare torch.squeeze would drop it).
        h_embedding = self.embedding_dropout(torch.unsqueeze(h_embedding, 0)).squeeze(0)
        
        h_lstm, _ = self.lstm(h_embedding)
        h_gru, _ = self.gru(h_lstm)

        ##Capsule Layer        
        content3 = self.caps_layer(h_gru)
        content3 = self.dropout(content3)
        # Flatten the capsules and reduce them to a single feature.
        content3 = content3.view(content3.size(0), -1)
        content3 = self.relu(self.lincaps(content3))

        ##Attention Layer
        h_lstm_atten = self.lstm_attention(h_lstm)
        h_gru_atten = self.gru_attention(h_gru)
        
        # global average pooling
        avg_pool = torch.mean(h_gru, 1)
        # global max pooling
        max_pool, _ = torch.max(h_gru, 1)
        
        # Accepts arrays or tensors and follows the model's device instead of
        # assuming CUDA.
        f = torch.as_tensor(x[1], dtype=torch.float, device=h_gru.device)

        conc = torch.cat((h_lstm_atten, h_gru_atten,content3, avg_pool, max_pool,f), 1)
        conc = self.relu(self.linear(conc))
        conc = self.bn(conc)
        conc = self.dropout(conc)

        out = self.out(conc)
        
        return out
    ######################################################################################3

class NeuralNet_DME_2(nn.Module):
    """Classifier on top of the embedder returned by ``get_embedder()``.

    The embedded sequence goes through a bidirectional LSTM followed by a
    bidirectional GRU. Five summaries are concatenated with the extra
    per-sample features and fed to a small dense head: attention over the
    LSTM output, attention over the GRU output, a capsule-layer scalar, and
    average / max pooling of the GRU output.
    """

    def __init__(self):
        super(NeuralNet_DME_2, self).__init__()
        self.embedding = get_embedder()

        fc_layer = 16
        fc_layer1 = 16

        # NOTE: layers are created in the same order as before so that seeded
        # parameter initialisation is unchanged.
        self.embedding_dropout = nn.Dropout2d(0.1)
        # 300 is the width of the embedder output this network expects.
        self.lstm = nn.LSTM(300, hidden_size, bidirectional=True, batch_first=True)
        self.gru = nn.GRU(hidden_size * 2, hidden_size, bidirectional=True, batch_first=True)

        # Not used by forward(); kept so parameter names and seeded initialisation stay the same.
        self.lstm2 = nn.LSTM(hidden_size * 2, hidden_size, bidirectional=True, batch_first=True)

        self.lstm_attention = Attention(hidden_size * 2, maxlen)
        self.gru_attention = Attention(hidden_size * 2, maxlen)
        self.bn = nn.BatchNorm1d(fc_layer1, momentum=0.5)
        # 8 * hidden_size covers the two attention vectors and the two pooled vectors
        # (assuming each attention head returns 2 * hidden_size values); the remaining
        # 8 inputs are the capsule scalar plus the extra features from x[1].
        self.linear = nn.Linear(hidden_size*8+8, fc_layer1)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.1)
        # Not used by forward(); kept for the same reason as lstm2.
        self.fc = nn.Linear(fc_layer**2,fc_layer)
        # Consumes the output of self.linear, hence fc_layer1 inputs.
        self.out = nn.Linear(fc_layer1, 1)
        self.lincaps = nn.Linear(Num_capsule * Dim_capsule, 1)
        self.caps_layer = Caps_Layer()

    def forward(self, x):
        """Score a batch.

        Args:
            x: Two-element sequence. ``x[0]`` is the token-id tensor passed to
                ``self.embedding``; ``x[1]`` is array-like and holds the extra
                per-sample features (assumed 2-D, one row per sample — TODO confirm).

        Returns:
            The output of ``self.out``: one unnormalised score per sample.
        """
        h_embedding = self.embedding(x[0])
        # Remove only the axis added for the dropout call: a bare squeeze() would
        # also drop the batch axis when a batch holds a single sample.
        h_embedding = self.embedding_dropout(h_embedding.unsqueeze(0)).squeeze(0)

        h_lstm, _ = self.lstm(h_embedding)
        h_gru, _ = self.gru(h_lstm)

        # Capsule head: flatten the capsule output per sample and reduce it to one value.
        content3 = self.caps_layer(h_gru)
        content3 = self.dropout(content3)
        content3 = content3.view(content3.size(0), -1)
        content3 = self.relu(self.lincaps(content3))

        # Attention heads over both recurrent outputs.
        h_lstm_atten = self.lstm_attention(h_lstm)
        h_gru_atten = self.gru_attention(h_gru)

        # Global average / max pooling over the sequence axis of the GRU output.
        avg_pool = torch.mean(h_gru, 1)
        max_pool, _ = torch.max(h_gru, 1)

        # Follow the device of the encoder output instead of assuming CUDA.
        f = torch.as_tensor(x[1], dtype=torch.float, device=h_gru.device)

        conc = torch.cat((h_lstm_atten, h_gru_atten, content3, avg_pool, max_pool, f), 1)
        conc = self.relu(self.linear(conc))
        conc = self.bn(conc)
        conc = self.dropout(conc)

        return self.out(conc)

In [None]:
# Re-seed the Python, NumPy and torch RNGs (seed_everything is defined near the top of the notebook)
# so that the random split produced with the helper below is repeatable.
seed_everything()
def train_test_split_by_idx(len_X,ratio):
    """Randomly split the indices ``0 .. len_X - 1`` into two arrays.

    Args:
        len_X: Number of rows to split.
        ratio: Fraction of the rows that goes into the first array; the count
            is ``int(len_X * ratio)``, i.e. truncated.

    Returns:
        ``(train_idx, test_idx)``: the leading and trailing parts of one random
        permutation drawn from NumPy's global RNG.
    """
    n_first = int(len_X * ratio)
    shuffled = np.random.permutation(len_X)
    first_part, second_part = np.split(shuffled, [n_first])
    return first_part, second_part

In [None]:
# Test token ids, in original and reversed order, as GPU-resident single-tensor datasets.
x_test_cuda = torch.tensor(x_test, dtype=torch.long).cuda()
x_test_cuda_rev = torch.tensor(x_test_rev, dtype=torch.long).cuda()
test = torch.utils.data.TensorDataset(x_test_cuda)
test_rev = torch.utils.data.TensorDataset(x_test_cuda_rev)

# Materialise the training inputs as NumPy arrays so they can be indexed with index arrays.
x_train, y_train, features = np.array(x_train), np.array(y_train), np.array(features)

# Random 90 % / 10 % train / validation split over row indices.
train_idx, valid_idx = train_test_split_by_idx(len(x_train), 0.9)
train_rows = train_idx.astype(int)
valid_rows = valid_idx.astype(int)

# Extra per-row features, aligned with the rows of each split.
kfold_X_features = features[train_rows]
kfold_X_valid_features = features[valid_rows]

# Token ids and (column-vector) labels for each split, moved to the GPU.
x_train_fold = torch.tensor(x_train[train_rows], dtype=torch.long).cuda()
x_val_fold = torch.tensor(x_train[valid_rows], dtype=torch.long).cuda()
y_train_fold = torch.tensor(y_train[train_rows, np.newaxis], dtype=torch.float32).cuda()
y_val_fold = torch.tensor(y_train[valid_rows, np.newaxis], dtype=torch.float32).cuda()

# MyDataset (defined elsewhere) wraps each TensorDataset; train_fn unpacks its
# items as (x, y, index).
train = MyDataset(torch.utils.data.TensorDataset(x_train_fold, y_train_fold))
valid = MyDataset(torch.utils.data.TensorDataset(x_val_fold, y_val_fold))
# y_val_fold

In [None]:
# REV
x_train_rev = np.array(x_train_rev)

# Same row split as the non-reversed data, applied to the reversed token ids.
x_train_rev_fold, x_val_rev_fold = (
    torch.tensor(x_train_rev[rows.astype(int)], dtype=torch.long).cuda()
    for rows in (train_idx, valid_idx)
)

# Labels are shared with the non-reversed datasets.
train_rev = MyDataset(torch.utils.data.TensorDataset(x_train_rev_fold, y_train_fold))
valid_rev = MyDataset(torch.utils.data.TensorDataset(x_val_rev_fold, y_val_fold))

In [None]:
# Loss candidates. focal_loss is the one handed to train_fn in the calls below;
# bce_loss sums (rather than averages) the per-element BCE-with-logits terms.
focal_loss = FocalLoss()
bce_loss = nn.BCEWithLogitsLoss(reduction="sum")

In [114]:
def train_fn(model,train,valid,features,x_test,batch_size,loss_function,ratio,n_epochs):
    """Train ``model`` and return its validation and test probabilities.

    Besides its arguments the function reads these notebook globals:
    ``kfold_X_features`` / ``kfold_X_valid_features`` (extra features aligned
    with the rows of ``train`` / ``valid``), ``test_features`` (extra features
    aligned with the rows of ``x_test``), ``CyclicLR`` and ``sigmoid``.

    Args:
        model: Network called as ``model([token_ids, extra_features])``. It is
            moved to the GPU here.
        train: Training dataset whose items unpack as ``(x, y, index)``.
        valid: Validation dataset with the same item layout.
        features: Unused; kept so existing calls keep working (the per-split
            feature arrays are taken from the globals listed above).
        x_test: Array-like test token ids. The test loader is built from this
            argument, so a model trained on reversed sequences must be given
            the reversed test ids.
        batch_size: Batch size for all three loaders.
        loss_function: Callable ``loss(y_pred, y_true)`` returning a scalar tensor.
        ratio: Unused; kept so existing calls keep working.
        n_epochs: Number of passes over ``train``.

    Returns:
        ``(valid_preds_fold, test_preds_fold)``: 1-D NumPy arrays with the
        sigmoid of the model output for every validation / test row, taken
        from the model state after the last epoch.
    """
    train_time = time.time()

    # shuffle=False keeps batch i aligned with rows [i*batch_size, (i+1)*batch_size),
    # which the prediction buffers below rely on.
    train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=False)
    valid_loader = torch.utils.data.DataLoader(valid, batch_size=batch_size, shuffle=False)
    # Build the test set from the x_test argument rather than from a notebook
    # global, so the caller decides which test ids are scored.
    test_dataset = torch.utils.data.TensorDataset(
        torch.as_tensor(x_test, dtype=torch.long).cuda())
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    model = model.cuda()
    loss_fn = loss_function

    step_size = 300
    base_lr, max_lr = 0.001, 0.003
    optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()),
                                 lr=max_lr)
    scheduler = CyclicLR(optimizer, base_lr=base_lr, max_lr=max_lr,
                         step_size=step_size, mode='exp_range',
                         gamma=0.99994)

    # Allocated up front so the function also returns cleanly when n_epochs == 0.
    valid_preds_fold = np.zeros((len(valid)))
    test_preds_fold = np.zeros((len(x_test)))
    avg_loss = 0.
    avg_val_loss = 0.

    for epoch in range(n_epochs):
        # Train mode enables training-only behaviour such as dropout.
        model.train()
        print('Start Epoch : '+ str(epoch+1) + '/'+str(n_epochs))
        start_time = time.time()
        avg_loss = 0.
        for x_batch, y_batch, index in train_loader:
            # index holds the dataset positions of the batch rows; use it to pick
            # the matching extra features.
            f = kfold_X_features[index]
            y_pred = model([x_batch,f])

            # The learning rate is advanced once per batch, before the optimizer step.
            scheduler.batch_step()

            loss = loss_fn(y_pred, y_batch)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            avg_loss += loss.item() / len(train_loader)

        model.eval()
        avg_val_loss = 0.
        # no_grad: inference only, so no autograd graph is kept around.
        with torch.no_grad():
            for i, (x_batch, y_batch, index) in enumerate(valid_loader):
                f = kfold_X_valid_features[index]
                y_pred = model([x_batch,f])
                avg_val_loss += loss_fn(y_pred, y_batch).item() / len(valid_loader)
                valid_preds_fold[i * batch_size:(i+1) * batch_size] = sigmoid(y_pred.cpu().numpy())[:, 0]

        elapsed_time = time.time() - start_time

        print('Epoch {}/{} \t loss={:.4f} \t val_loss={:.4f} \t time={:.2f}s'.format(
                epoch + 1, n_epochs, avg_loss, avg_val_loss, elapsed_time))

    # Score the test set once, with the final weights, in eval mode.
    model.eval()
    with torch.no_grad():
        for i, (x_batch,) in enumerate(test_loader):
            f = test_features[i * batch_size:(i+1) * batch_size]
            y_pred = model([x_batch,f])
            test_preds_fold[i * batch_size:(i+1) * batch_size] = sigmoid(y_pred.cpu().numpy())[:, 0]

    del model
    torch.cuda.empty_cache()
    # The summary line reports the losses of the last epoch.
    print('All \t loss={:.5f} \t val_loss={:.5f} \t '.format(avg_loss, avg_val_loss))
    print(time.time()-train_time)
    return valid_preds_fold,test_preds_fold

In [108]:
# batch_size = 256
# val_preds_1,test_preds_1 = train_fn(NeuralNet_merge_attn_mean(),train,valid,features,x_test,batch_size,focal_loss,0.9,4)

Start Epoch : 1/4
Epoch 1/4 	 loss=0.0333 	 val_loss=0.0268 	 time=323.45s
Start Epoch : 2/4
Epoch 2/4 	 loss=0.0287 	 val_loss=0.0259 	 time=323.79s
Start Epoch : 3/4
Epoch 3/4 	 loss=0.0270 	 val_loss=0.0251 	 time=322.71s
Start Epoch : 4/4
Epoch 4/4 	 loss=0.0257 	 val_loss=0.0258 	 time=322.00s
All 	 loss=0.02568 	 val_loss=0.02582 	 


In [110]:
batch_size = 256
# This model is trained on the reversed sequences (train_rev / valid_rev), so
# hand train_fn the matching reversed test ids as well.
val_preds_2,test_preds_2 = train_fn(NeuralNet_merge_attn_mean(),train_rev,valid_rev,features,x_test_rev,batch_size,focal_loss,0.9,3)

Start Epoch : 1/3
Epoch 1/3 	 loss=0.0349 	 val_loss=0.0266 	 time=320.86s
Start Epoch : 2/3
Epoch 2/3 	 loss=0.0287 	 val_loss=0.0255 	 time=319.92s
Start Epoch : 3/3
Epoch 3/3 	 loss=0.0271 	 val_loss=0.0254 	 time=322.40s
All 	 loss=0.02714 	 val_loss=0.02536 	 
968.594315290451


In [111]:
# Pick the cut-off that bestThresshold reports for the validation fold, then
# binarise the validation and test probabilities of model 2 with it.
delta2 = bestThresshold(y_val_fold, val_preds_2)
bin_val_pred_2, bin_test_pred_2 = (
    np.greater(scores, delta2).astype(int) for scores in (val_preds_2, test_preds_2)
)

best threshold is 0.3600 with F1 score: 0.6812


In [113]:
batch_size = 256
# Model 3: NeuralNet_merge_attn_concat on the non-reversed data, 3 epochs, focal loss.
val_preds_3, test_preds_3 = train_fn(
    model=NeuralNet_merge_attn_concat(),
    train=train,
    valid=valid,
    features=features,
    x_test=x_test,
    batch_size=batch_size,
    loss_function=focal_loss,
    ratio=0.9,
    n_epochs=3,
)

Start Epoch : 1/3
Epoch 1/3 	 loss=0.0338 	 val_loss=0.0261 	 time=362.57s
Start Epoch : 2/3
Epoch 2/3 	 loss=0.0282 	 val_loss=0.0252 	 time=361.97s
Start Epoch : 3/3
Epoch 3/3 	 loss=0.0263 	 val_loss=0.0254 	 time=362.11s
All 	 loss=0.02625 	 val_loss=0.02540 	 
1092.7241508960724


In [115]:
# Pick the cut-off that bestThresshold reports for the validation fold, then
# binarise the validation and test probabilities of model 3 with it.
delta3 = bestThresshold(y_val_fold, val_preds_3)
bin_val_pred_3, bin_test_pred_3 = (
    np.greater(scores, delta3).astype(int) for scores in (val_preds_3, test_preds_3)
)

best threshold is 0.3800 with F1 score: 0.6794


In [None]:
# batch_size = 512
# val_preds_4,test_preds_4 = train_fn(NeuralNet_merge_attn_concat(),train_rev,valid_rev,features,x_test,batch_size,focal_loss,0.9,1)

In [None]:
# batch_size = 512
# val_preds_5,test_preds_5 = train_fn(NeuralNet_DME_2(),train_rev,valid_rev,features,x_test,batch_size,focal_loss,0.9,5)

In [135]:
# delta5 = bestThresshold(y_val_fold,val_preds_5)
# bin_test_pred_5 = (test_preds_5 > delta5).astype(int)
# bin_val_pred_5 = (val_preds_5 > delta5).astype(int)

best threshold is 0.3900 with F1 score: 0.6606


In [None]:
batch_size = 512
# Model 6: NeuralNet_DME_2 on the non-reversed data, 5 epochs, focal loss.
val_preds_6, test_preds_6 = train_fn(
    model=NeuralNet_DME_2(),
    train=train,
    valid=valid,
    features=features,
    x_test=x_test,
    batch_size=batch_size,
    loss_function=focal_loss,
    ratio=0.9,
    n_epochs=5,
)

In [None]:
# delta1 = bestThresshold(y_val_fold,val_preds_1)
# bin_test_pred_1 = (test_preds_1 > delta1).astype(int)
# bin_val_pred_1 = (val_preds_1 > delta1).astype(int)

In [None]:
# delta4 = bestThresshold(y_val_fold,val_preds_4)
# bin_test_pred_4 = (test_preds_4 > delta4).astype(int)
# bin_val_pred_4 = (val_preds_4 > delta4).astype(int)

In [None]:
# Pick the cut-off that bestThresshold reports for the validation fold, then
# binarise the validation and test probabilities of model 6 with it.
delta6 = bestThresshold(y_val_fold, val_preds_6)
bin_val_pred_6, bin_test_pred_6 = (
    np.greater(scores, delta6).astype(int) for scores in (val_preds_6, test_preds_6)
)

In [None]:
from statistics import mode

In [None]:
# Majority vote of the three binary validation predictions: with three 0/1
# votes the most common value is 1 exactly when at least two models say 1.
# The result is a fresh array, so bin_val_pred_2 keeps model 2's own
# predictions instead of being overwritten through an alias.
final_val_pred = ((bin_val_pred_2 + bin_val_pred_3 + bin_val_pred_6) >= 2).astype(int)

In [133]:
# final_val_pred already holds 0/1 votes rather than probabilities, so this call mainly
# serves to report the F1 of the vote; delta_bin is not used in the cells shown here.
delta_bin = bestThresshold(y_val_fold,final_val_pred)

best threshold is 0.0500 with F1 score: 0.6737


In [125]:
# Majority vote of the three binary test predictions: with three 0/1 votes the
# most common value is 1 exactly when at least two models say 1.
# The result is a fresh array, so bin_test_pred_2 keeps model 2's own
# predictions instead of being overwritten through an alias.
final_pred_bin = ((bin_test_pred_2 + bin_test_pred_3 + bin_test_pred_6) >= 2).astype(int)

In [134]:
# Soft-vote alternative: average the three models' validation probabilities
# and search a threshold on the averaged scores.
final_val_pred_mean = np.stack((val_preds_2, val_preds_3, val_preds_6)).mean(axis=0)
delta_mean = bestThresshold(y_val_fold, final_val_pred_mean)
# final_test_pred_mean = np.mean([test_preds_2,test_preds_3,test_preds_5],axis = 0)

best threshold is 0.3800 with F1 score: 0.6868


In [129]:
# Write the majority-vote predictions next to the test question ids.
df_test = pd.read_csv("../input/test.csv")
submission = df_test.loc[:, ['qid']].assign(prediction=final_pred_bin)
submission.to_csv('submission.csv', index=False)

In [130]:
! head submission.csv

qid,prediction
00014894849d00ba98a9,0
000156468431f09b3cae,0
000227734433360e1aae,0
0005e06fbe3045bd2a92,0
00068a0f7f41f50fc399,0
000a2d30e3ffd70c070d,0
000b67672ec9622ff761,0
000b7fb1146d712c1105,0
000d665a8ddc426a1907,0
