In [1]:
FOLD = 0

import os
import time
import math
import requests
import glob

import ast

import numpy as np
import pandas as pd

import mlcrate as mlc

import os

import cv2

from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import fbeta_score, accuracy_score, precision_score, recall_score, f1_score

from skimage.transform import resize
from sklearn.preprocessing import StandardScaler
from PIL import Image, ImageDraw

from tqdm import tqdm, tqdm_notebook

import matplotlib.pyplot as plt
import re
import torch
import torch.nn as nn
import torch.nn.functional as F

from torch import optim
from torch.optim import Optimizer
from torch.utils.data import  DataLoader
from torchtext.data import Dataset
import torch.utils.checkpoint as checkpoint
from nltk.tokenize import word_tokenize, sent_tokenize, TweetTokenizer
import torchvision
from torchvision import transforms, utils
import random
import torchtext
import torchtext.data as data

from torch.nn.utils.rnn import pad_sequence

import spacy
from spacy.lang.en import English

SEED = 1337

NOTIFY_EACH_EPOCH = False

WORKERS = 0
BATCH_SIZE = 512

N_SPLITS = 10

np.random.seed(SEED)
# torch.manual_seed(SEED)
# torch.cuda.manual_seed(SEED)
# torch.backends.cudnn.deterministic = True

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

def get_n_params(model):
    pp=0
    for p in list(model.parameters()):
        nn=1
        for s in list(p.size()):
            nn = nn*s
        pp += nn
    return pp

# from https://github.com/floydhub/save-and-resume
def save_checkpoint(state):
    """Save checkpoint if a new best is achieved"""
    print (" Saving checkpoint")

    filename = f'./checkpoint-{state["epoch"]}.pt.tar'
    torch.save(state, filename)
    
def initialize(model, path=None, optimizer=None):   
    if path == None:
        checkpoints = glob.glob('./*.pt.tar')
        path = checkpoints[np.argmax([int(checkpoint.split('checkpoint-')[1].split('.')[0]) for checkpoint in checkpoints])]

    checkpoint = torch.load(path)

    model.load_state_dict(checkpoint['model'])

    print(f' Loaded checkpoint {path} | Trained for {checkpoint["epoch"] + 1} epochs')
    
    if optimizer:
        optimizer.load_state_dict(checkpoint['optimizer'])
          
        epoch = checkpoint['epoch'] + 1
        train_iteration = checkpoint['train_iteration']
        val_iteration = checkpoint['val_iteration']

        return model, optimizer, epoch, train_iteration, val_iteration
    else:
        return model

In [2]:

def seed_everything(seed=1029):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
seed_everything()

In [3]:

puncts = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£', 
 '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', 
 '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', 
 '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', 
 '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', ]

def clean_text(x):
    x = str(x)
    for punct in puncts:
        if punct in x:
            x = x.replace(punct, f' {punct} ')
    return x

def clean_numbers(x):
    x = re.sub('[0-9]{5,}', '#####', x)
    x = re.sub('[0-9]{4}', '####', x)
    x = re.sub('[0-9]{3}', '###', x)
    x = re.sub('[0-9]{2}', '##', x)
    return x

mispell_dict = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",  "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",  "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have","you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have", 'colour': 'color', 'centre': 'center', 'favourite': 'favorite', 'travelling': 'traveling', 'counselling': 'counseling', 'theatre': 'theater', 'cancelled': 'canceled', 'labour': 'labor', 'organisation': 'organization', 'wwii': 'world war 2', 'citicise': 'criticize', 'youtu ': 'youtube ', 'Qoura': 'Quora', 'sallary': 'salary', 'Whta': 'What', 'narcisist': 'narcissist', 'howdo': 'how do', 'whatare': 'what are', 'howcan': 'how can', 'howmuch': 'how much', 'howmany': 'how many', 'whydo': 'why do', 'doI': 'do I', 'theBest': 'the best', 'howdoes': 'how does', 'mastrubation': 'masturbation', 'mastrubate': 'masturbate', "mastrubating": 'masturbating', 'pennis': 'penis', 'Etherium': 'Ethereum', 'narcissit': 'narcissist', 'bigdata': 'big data', '2k17': '2017', '2k18': '2018', 'qouta': 'quota', 'exboyfriend': 'ex boyfriend', 'airhostess': 'air hostess', "whst": 'what', 'watsapp': 'whatsapp', 'demonitisation': 'demonetization', 'demonitization': 'demonetization', 'demonetisation': 'demonetization'}

def _get_mispell(mispell_dict):
    mispell_re = re.compile('(%s)' % '|'.join(mispell_dict.keys()))
    return mispell_dict, mispell_re

mispellings, mispellings_re = _get_mispell(mispell_dict)
def replace_typical_misspell(text):
    def replace(match):
        return mispellings[match.group(0)]
    return mispellings_re.sub(replace, text)


def add_features(x):
    
    df = x.copy()
    
    df['question_text'] = df['question_text'].apply(lambda x:str(x))
    df['total_length'] = df['question_text'].apply(len)
    df['capitals'] = df['question_text'].apply(lambda comment: sum(1 for c in comment if c.isupper()))
    df['caps_vs_length'] = df.apply(lambda row: float(row['capitals'])/float(row['total_length']), axis=1)
    df['num_words'] = df.question_text.str.count('\S+')
    df['num_unique_words'] = df['question_text'].apply(lambda comment: len(set(w for w in comment.split())))
    df['words_vs_unique'] = df['num_unique_words'] / df['num_words']  
    
    df = df[['total_length','caps_vs_length','words_vs_unique']].fillna(0)
    
    return df

def load_and_prec(train_df, test_df):
    features = add_features(train_df)
    test_features = add_features(test_df)
    ss = StandardScaler()
    ss.fit(np.vstack((features, test_features)))
    features = ss.transform(features)
    test_features = ss.transform(test_features)
    train_df["question_text"] = train_df["question_text"].apply(lambda x: x.lower())
    test_df["question_text"] = test_df["question_text"].apply(lambda x: x.lower())

    # Clean the text
    train_df["question_text"] = train_df["question_text"].apply(lambda x: clean_text(x))
    test_df["question_text"] = test_df["question_text"].apply(lambda x: clean_text(x))
    
    # Clean numbers
    train_df["question_text"] = train_df["question_text"].apply(lambda x: clean_numbers(x))
    test_df["question_text"] = test_df["question_text"].apply(lambda x: clean_numbers(x))
    
    # Clean speelings
    train_df["question_text"] = train_df["question_text"].apply(lambda x: replace_typical_misspell(x))
    test_df["question_text"] = test_df["question_text"].apply(lambda x: replace_typical_misspell(x))

    return train_df, test_df, features, test_features

In [4]:
sample_submission = pd.read_csv('../input/sample_submission.csv')
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')
all_train_preds = np.zeros((len(train)))
real_valid_preds = np.zeros((len(train)))
test_preds = np.zeros((len(test)))
y_train_label = train.target

In [5]:
train, test, train_features, test_features = load_and_prec(train, test)
enable_local_test = False
if enable_local_test:
    n_test = len(test) * 4
    train, test = (train.iloc[:-n_test].reset_index(drop=True), 
                               train.iloc[-n_test:].reset_index(drop=True))
    test_y = test['target'].values
    y_train_label = train.target
    test_preds = np.zeros((len(test)))
    all_train_preds = np.zeros((len(train)))
    real_valid_preds = np.zeros((len(train)))
else:
    pass
train.to_csv('../train.csv', index=False)
test.to_csv('../test.csv', index=False)



In [6]:
def get_glove():
    EMBEDDING_FILE = '../input/embeddings/glove.840B.300d/glove.840B.300d.txt'
    def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')[:300]
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(EMBEDDING_FILE))
    glo_stoi = {}
    for i, j in enumerate(list(embeddings_index.keys())):
        glo_stoi[j] = i
    glo_vectors = torch.tensor(list(embeddings_index.values()),dtype=torch.float)
    return glo_stoi, glo_vectors

def get_par():
    EMBEDDING_FILE = '../input/embeddings/paragram_300_sl999/paragram_300_sl999.txt'
    def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
    embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(EMBEDDING_FILE, encoding="utf8", errors='ignore') if len(o)>100)
    par_stoi = {}
    for i, j in enumerate(list(embeddings_index.keys())):
        par_stoi[j] = i
    par_vectors = torch.tensor(list(embeddings_index.values()),dtype=torch.float)
    return par_stoi, par_vectors
    

In [7]:
%%time
# from http://mlexplained.com/2018/02/08/a-comprehensive-tutorial-to-torchtext
question_field = data.Field(lower=True, batch_first=True, include_lengths=True)
target_field = data.Field(sequential=False, use_vocab=False, batch_first=True)

train_fields = [
    ('qid', None),
    ('question_text', question_field),
    ('target', target_field)
]

test_fields = [
    ('qid', None),
    ('question_text', question_field)
]

train_dataset = data.TabularDataset('../train.csv',  format='CSV', skip_header=True, fields=train_fields)
test_dataset = data.TabularDataset('../test.csv', format='CSV', skip_header=True, fields=test_fields)

# vectors = torchtext.vocab.Vectors('../input/embeddings/glove.840B.300d/glove.840B.300d.txt', max_vectors=495000)
# vectors2 = torchtext.vocab.Vectors('../input/embeddings/paragram_300_sl999/paragram_300_sl999.txt', max_vectors=495000)
glo_stoi, glo_vectors = get_glove()
question_field.build_vocab(train_dataset, max_size=95000)
question_field.vocab.set_vectors(glo_stoi, glo_vectors, 300)
glove = question_field.vocab.vectors
del glo_stoi, glo_vectors

par_stoi, par_vectors = get_par()
question_field.vocab.set_vectors(par_stoi, par_vectors, 300)
paragram = question_field.vocab.vectors
del par_stoi, par_vectors
pretrained_embedding = torch.from_numpy(np.mean([glove.cpu().numpy(),paragram.cpu().numpy()], axis=0))



print(f'Train Dataset: {len(train_dataset)}')
print(f'Test Dataset: {len(test_dataset)}')

Train Dataset: 1306122
Test Dataset: 56370
CPU times: user 8min 20s, sys: 14.7 s, total: 8min 35s
Wall time: 8min 34s


In [8]:
del glove, paragram
import gc
gc.collect()

408

In [9]:
import torch as t
Routings = 4 #5
Num_capsule = 5
Dim_capsule = 5#16
dropout_p = 0.25
rate_drop_dense = 0.28
gru_len = 128
dropout_p = 0.25
rate_drop_dense = 0.28
LR = 0.001
T_epsilon = 1e-7

class CyclicLR(object):
    def __init__(self, optimizer, base_lr=1e-3, max_lr=6e-3,
                 step_size=2000, mode='triangular', gamma=1.,
                 scale_fn=None, scale_mode='cycle', last_batch_iteration=-1):

        if not isinstance(optimizer, Optimizer):
            raise TypeError('{} is not an Optimizer'.format(
                type(optimizer).__name__))
        self.optimizer = optimizer

        if isinstance(base_lr, list) or isinstance(base_lr, tuple):
            if len(base_lr) != len(optimizer.param_groups):
                raise ValueError("expected {} base_lr, got {}".format(
                    len(optimizer.param_groups), len(base_lr)))
            self.base_lrs = list(base_lr)
        else:
            self.base_lrs = [base_lr] * len(optimizer.param_groups)

        if isinstance(max_lr, list) or isinstance(max_lr, tuple):
            if len(max_lr) != len(optimizer.param_groups):
                raise ValueError("expected {} max_lr, got {}".format(
                    len(optimizer.param_groups), len(max_lr)))
            self.max_lrs = list(max_lr)
        else:
            self.max_lrs = [max_lr] * len(optimizer.param_groups)

        self.step_size = step_size

        if mode not in ['triangular', 'triangular2', 'exp_range'] \
                and scale_fn is None:
            raise ValueError('mode is invalid and scale_fn is None')

        self.mode = mode
        self.gamma = gamma

        if scale_fn is None:
            if self.mode == 'triangular':
                self.scale_fn = self._triangular_scale_fn
                self.scale_mode = 'cycle'
            elif self.mode == 'triangular2':
                self.scale_fn = self._triangular2_scale_fn
                self.scale_mode = 'cycle'
            elif self.mode == 'exp_range':
                self.scale_fn = self._exp_range_scale_fn
                self.scale_mode = 'iterations'
        else:
            self.scale_fn = scale_fn
            self.scale_mode = scale_mode

        self.batch_step(last_batch_iteration + 1)
        self.last_batch_iteration = last_batch_iteration

    def batch_step(self, batch_iteration=None):
        if batch_iteration is None:
            batch_iteration = self.last_batch_iteration + 1
        self.last_batch_iteration = batch_iteration
        for param_group, lr in zip(self.optimizer.param_groups, self.get_lr()):
            param_group['lr'] = lr

    def _triangular_scale_fn(self, x):
        return 1.

    def _triangular2_scale_fn(self, x):
        return 1 / (2. ** (x - 1))

    def _exp_range_scale_fn(self, x):
        return self.gamma**(x)

    def get_lr(self):
        step_size = float(self.step_size)
        cycle = np.floor(1 + self.last_batch_iteration / (2 * step_size))
        x = np.abs(self.last_batch_iteration / step_size - 2 * cycle + 1)

        lrs = []
        param_lrs = zip(self.optimizer.param_groups, self.base_lrs, self.max_lrs)
        for param_group, base_lr, max_lr in param_lrs:
            base_height = (max_lr - base_lr) * np.maximum(0, (1 - x))
            if self.scale_mode == 'cycle':
                lr = base_lr + base_height * self.scale_fn(cycle)
            else:
                lr = base_lr + base_height * self.scale_fn(self.last_batch_iteration)
            lrs.append(lr)
        return lrs
    



class Embed_Layer(nn.Module):
    def __init__(self, embedding_matrix=None, vocab_size=None, embedding_dim=300):
        super(Embed_Layer, self).__init__()
        self.encoder = nn.Embedding(vocab_size + 1, embedding_dim)
        if use_pretrained_embedding:
            # self.encoder.weight.data.copy_(t.from_numpy(np.load(embedding_path))) # 方法一，加载np.save的npy文件
            self.encoder.weight.data.copy_(t.from_numpy(embedding_matrix))  # 方法二

    def forward(self, x, dropout_p=0.25):
        return nn.Dropout(p=dropout_p)(self.encoder(x))


class GRU_Layer(nn.Module):
    def __init__(self):
        super(GRU_Layer, self).__init__()
        self.gru = nn.GRU(input_size=300,
                          hidden_size=gru_len,
                          bidirectional=True)
        '''
        自己修改GRU里面的激活函数及加dropout和recurrent_dropout
        如果要使用，把rnn_revised import进来，但好像是使用cpu跑的，比较慢
       '''
        # # if you uncomment /*from rnn_revised import * */, uncomment following code aswell
        # self.gru = RNNHardSigmoid('GRU', input_size=300,
        #                           hidden_size=gru_len,
        #                           bidirectional=True)

    # 这步很关键，需要像keras一样用glorot_uniform和orthogonal_uniform初始化参数
    def init_weights(self):
        ih = (param.data for name, param in self.named_parameters() if 'weight_ih' in name)
        hh = (param.data for name, param in self.named_parameters() if 'weight_hh' in name)
        b = (param.data for name, param in self.named_parameters() if 'bias' in name)
        for k in ih:
            nn.init.xavier_uniform_(k)
        for k in hh:
            nn.init.orthogonal_(k)
        for k in b:
            nn.init.constant_(k, 0)

    def forward(self, x):
        return self.gru(x)


# core caps_layer with squash func
class Caps_Layer(nn.Module):
    def __init__(self, input_dim_capsule=gru_len * 2, num_capsule=Num_capsule, dim_capsule=Dim_capsule, \
                 routings=Routings, kernel_size=(9, 1), share_weights=True,
                 activation='default', **kwargs):
        super(Caps_Layer, self).__init__(**kwargs)

        self.num_capsule = num_capsule
        self.dim_capsule = dim_capsule
        self.routings = routings
        self.kernel_size = kernel_size  # 暂时没用到
        self.share_weights = share_weights
        if activation == 'default':
            self.activation = self.squash
        else:
            self.activation = nn.ReLU(inplace=True)

        if self.share_weights:
            self.W = nn.Parameter(
                nn.init.xavier_normal_(t.empty(1, input_dim_capsule, self.num_capsule * self.dim_capsule)))
        else:
            self.W = nn.Parameter(
                t.randn(BATCH_SIZE, input_dim_capsule, self.num_capsule * self.dim_capsule))  # 64即batch_size

    def forward(self, x):

        if self.share_weights:
            u_hat_vecs = t.matmul(x, self.W)
        else:
            print('add later')

        batch_size = x.size(0)
        input_num_capsule = x.size(1)
        u_hat_vecs = u_hat_vecs.view((batch_size, input_num_capsule,
                                      self.num_capsule, self.dim_capsule))
        u_hat_vecs = u_hat_vecs.permute(0, 2, 1, 3)  # 转成(batch_size,num_capsule,input_num_capsule,dim_capsule)
        b = t.zeros_like(u_hat_vecs[:, :, :, 0])  # (batch_size,num_capsule,input_num_capsule)

        for i in range(self.routings):
            b = b.permute(0, 2, 1)
            c = F.softmax(b, dim=2)
            c = c.permute(0, 2, 1)
            b = b.permute(0, 2, 1)
            outputs = self.activation(t.einsum('bij,bijk->bik', (c, u_hat_vecs)))  # batch matrix multiplication
            # outputs shape (batch_size, num_capsule, dim_capsule)
            if i < self.routings - 1:
                b = t.einsum('bik,bijk->bij', (outputs, u_hat_vecs))  # batch matrix multiplication
        return outputs  # (batch_size, num_capsule, dim_capsule)

    # text version of squash, slight different from original one
    def squash(self, x, axis=-1):
        s_squared_norm = (x ** 2).sum(axis, keepdim=True)
        scale = t.sqrt(s_squared_norm + T_epsilon)
        return x / scale
    
class Capsule_Main(nn.Module):
    def __init__(self, embedding_matrix=None, vocab_size=None):
        super(Capsule_Main, self).__init__()
        self.embed_layer = Embed_Layer(embedding_matrix, vocab_size)
        self.gru_layer = GRU_Layer()
        # 【重要】初始化GRU权重操作，这一步非常关键，acc上升到0.98，如果用默认的uniform初始化则acc一直在0.5左右
        self.gru_layer.init_weights()
        self.caps_layer = Caps_Layer()
        self.dense_layer = Dense_Layer()

    def forward(self, content):
        content1 = self.embed_layer(content)
        content2, _ = self.gru_layer(
            content1)  # 这个输出是个tuple，一个output(seq_len, batch_size, num_directions * hidden_size)，一个hn
        content3 = self.caps_layer(content2)
        output = self.dense_layer(content3)
        return output


In [10]:
# from https://discuss.pytorch.org/t/self-attention-on-words-and-masking/5671/4
class SelfAttention(nn.Module):
    def __init__(self, hidden_size, batch_first=False):
        super(SelfAttention, self).__init__()

        self.hidden_size = hidden_size
        self.batch_first = batch_first

        self.att_weights = nn.Parameter(torch.Tensor(1, hidden_size), requires_grad=True)

        stdv = 1.0 / math.sqrt(self.hidden_size)
        for weight in self.att_weights:
            nn.init.uniform_(weight, -stdv, stdv)

    def get_mask(self):
        pass

    def forward(self, inputs, lengths):
        if self.batch_first:
            batch_size, max_len = inputs.size()[:2]
        else:
            max_len, batch_size = inputs.size()[:2]
            
        # apply attention layer
        weights = torch.bmm(inputs,
                            self.att_weights  # (1, hidden_size)
                            .permute(1, 0)  # (hidden_size, 1)
                            .unsqueeze(0)  # (1, hidden_size, 1)
                            .repeat(batch_size, 1, 1) # (batch_size, hidden_size, 1)
                            )
    
        attentions = torch.softmax(F.relu(weights.squeeze()), dim=-1)

        # create mask based on the sentence lengths
        mask = torch.ones(attentions.size(), requires_grad=True).cuda()
        for i, l in enumerate(lengths):  # skip the first sentence
            if l < max_len:
                mask[i, l:] = 0

        # apply mask and renormalize attention scores (weights)
        masked = attentions * mask
        _sums = masked.sum(-1).unsqueeze(-1)  # sums per row
        
        attentions = masked.div(_sums)

        # apply attention weights
        weighted = torch.mul(inputs, attentions.unsqueeze(-1).expand_as(inputs))

        # get the final fixed vector representations of the sentences
        representations = weighted.sum(1).squeeze()

        return representations, attentions

class BaselineLSTM(nn.Module):
    def __init__(self, embedding):
        super(BaselineLSTM, self).__init__()
                
        self.embedding = nn.Embedding.from_pretrained(embedding)
        self.embedding_dropout = nn.Dropout2d(0.1)
        
        self.lstm = nn.LSTM(input_size=300, hidden_size=128, num_layers=2, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(0.1)
        self.lincaps = nn.Linear(Num_capsule * Dim_capsule, 1)
        self.relu = nn.ReLU()
        self.attention = SelfAttention(128*2, batch_first=True)
        self.caps_layer = Caps_Layer()
        self.fc = nn.Linear(128*2+1, 1)
        self.logit = nn.Linear(1, 1)
    
    @staticmethod
    def split_directions(outputs):
        direction_size = int(outputs.size(-1) / 2)
        forward = outputs[:, :, :direction_size]
        backward = outputs[:, :, direction_size:]
        return forward, backward
    
    @staticmethod
    def last_by_index(outputs, lengths):
        # Index of the last output for each sequence.
        idx = (lengths - 1).view(-1, 1).expand(outputs.size(0),
                                               outputs.size(2)).unsqueeze(1).cuda()
        return outputs.gather(1, idx).squeeze()
    
    
    def last_timestep(self, outputs, lengths, bi=False):
        if bi:
            forward, backward = self.split_directions(outputs)
            last_forward = self.last_by_index(forward, lengths)
            last_backward = backward[:, 0, :]
            return torch.cat((last_forward, last_backward), dim=-1)

        else:
            return self.last_by_index(outputs, lengths)
        
    def forward(self,x, x_len):
        x = self.embedding(x)
        x = self.embedding_dropout(x)
        
        x = nn.utils.rnn.pack_padded_sequence(x, x_len, batch_first=True)

        out, (hidden, _) = self.lstm(x)
        
        x, lengths = nn.utils.rnn.pad_packed_sequence(out, batch_first=True)
        
        
        content3 = self.caps_layer(x)
        content3 = self.dropout(content3)
        batch_size = content3.size(0)
        content3 = content3.view(batch_size, -1)
        content3 = self.relu(self.lincaps(content3))
        
        x, _ = self.attention(x, lengths)
        x = torch.cat([x, content3,],1)
        x = self.fc(x)
        x = self.logit(x).view(-1)
        
        return x

In [11]:
start_epoch = 0
epochs = 6
early_stopping = 10

train_iteration = 0
val_iteration = 0

threshold = 0.35


In [12]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

In [13]:
seed_everything()
weight = [3, 3, 3, 3, 1]
best_train_loss = 1e10
best_val_loss = 1e10

best_train_f1 = 0
best_val_f1 = 0

best_epoch = 0

timer = mlc.time.Timer()
logger = mlc.LinewiseCSVWriter('train_log.csv', header=['epoch', 'lr', 'train_loss', 'val_loss', 'train_f1', 'val_f1'])
splits = list(StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED).split(train, y_train_label))
test_preds_local = np.zeros((len(test), len(splits)))
for num_fold, (train_idx, val_idx) in enumerate(splits):
    
    model = BaselineLSTM(pretrained_embedding)
    model.cuda()
    optimizer = optim.Adam(model.parameters())
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=10, verbose=True)

    criterion = nn.BCEWithLogitsLoss()
    get_n_params(model)
    exasample =  np.asarray(train_dataset.examples)
    folds_train_exm = Dataset(fields=train_fields, examples = exasample[train_idx])
    folds_val_exm = Dataset(fields=train_fields, examples = exasample[val_idx])
    train_dataloader, val_dataloader = data.BucketIterator.splits((folds_train_exm, folds_val_exm), (BATCH_SIZE, BATCH_SIZE), sort_key=lambda x: len(x.question_text), sort_within_batch=True, device=torch.device('cuda'))
    test_dataloader = data.BucketIterator(test_dataset, 512, sort=False, shuffle=False, device=torch.device('cuda'))
    for epoch in range(start_epoch, epochs):
        print(f'\n Starting Epoch {epoch} | LR: {optimizer.param_groups[0]["lr"]}')

        train_loss = 0
        val_loss = 0

        y_train = []
        train_preds = []

        timer.add(epoch)

        model.train()
        for i, batch in enumerate(train_dataloader):
            (question, length), label = batch.question_text, batch.target.float()
            question = question

            out = model(question, length)

            loss = criterion(out, label)

            train_loss += loss.item()

            optimizer.zero_grad()

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0)

            optimizer.step()

            y_train.append(label.detach())
            train_preds.append(out.detach())

            train_iteration += 1

        model.eval()
        valid_preds_fold = np.zeros((len(val_idx)))
        with torch.no_grad():

            y_val = []
            val_preds = []

            for j, batch in enumerate(val_dataloader):
                (question, length), label = batch.question_text, batch.target.float()
                question = question

                out = model(question, length)

                loss = criterion(out, label)

                val_loss += loss.item()

                optimizer.zero_grad()

                y_val.append(label.detach())
                val_preds.append(out.detach())
    
                real_pred_y = sigmoid(out.detach().cpu().numpy())
                valid_preds_fold[j * BATCH_SIZE:(j+1) * BATCH_SIZE] = real_pred_y
                val_iteration += 1

        train_loss /= (i + 1)
        val_loss /= (j + 1)

        y_train = torch.cat(y_train, dim=0).reshape(-1, 1)
        y_val = torch.cat(y_val, dim=0).reshape(-1, 1)

        train_preds = torch.cat(train_preds, dim=0).reshape(-1, 1)
        val_preds = torch.cat(val_preds, dim=0).reshape(-1, 1)

        train_f1 = f1_score(y_train.cpu().numpy(), (train_preds.cpu().numpy() > threshold))
        val_f1 = f1_score(y_val.cpu().numpy(), (val_preds.cpu().numpy() > threshold))

        logger.write([epoch, optimizer.param_groups[0]['lr'], train_loss, val_loss, train_f1, val_f1])

        print(f' {timer.fsince(epoch)} | End of Epoch {epoch} | Train Loss: {train_loss} | Val Loss: {val_loss} | Train F1: {round(train_f1, 4)} | Val F1: {round(val_f1, 4)}')

        scheduler.step(val_loss)

        if val_loss < best_val_loss:
            best_epoch = epoch

            best_train_loss = train_loss
            best_val_loss = val_loss

            best_train_f1 = train_f1
            best_val_f1 = val_f1

            save_checkpoint({
                'model': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'epoch': epoch,
                'train_iteration': train_iteration,
                'val_iteration': val_iteration
            })

        elif epoch - best_epoch > early_stopping:
            print(f' Val loss has not decreased for {early_stopping} epochs, stopping training')
            break
#     model = BaselineLSTM(pretrained_embedding)
#     model = initialize(model)
#     model.cuda()
    preds = []

#     model.eval()
# #     with torch.no_grad():
    for i, batch in enumerate(test_dataloader):
        (question, length) = batch.question_text
        k = pd.DataFrame(length.cpu().numpy(),columns=["length"]).reset_index()
        k = k.sort_values(by="length",ascending=0)
        out = model(question[k['index'].values], length[k['index'].values])
        out = torch.sigmoid(out)
        pred = out.detach().cpu().numpy()
        mid_pred = np.zeros((pred.shape[0]))
        for pred_of,mid_of in enumerate(k['index'].values):
            mid_pred[mid_of] = pred[pred_of]
        preds.extend(mid_pred)
    all_train_preds[val_idx] = val_preds.cpu().numpy()[:,0]
    real_valid_preds[val_idx] = y_val.cpu().numpy()[:,0]
    test_preds_local[:, num_fold] = preds
    preds = np.array(preds) * weight[num_fold] / sum(weight)
    test_preds += preds


 Starting Epoch 0 | LR: 0.001
 3m10s | End of Epoch 0 | Train Loss: 0.12371740045207555 | Val Loss: 0.10681805572310306 | Train F1: 0.4808 | Val F1: 0.4775
 Saving checkpoint

 Starting Epoch 1 | LR: 0.001
 3m09s | End of Epoch 1 | Train Loss: 0.10660146894677196 | Val Loss: 0.10080975141019037 | Train F1: 0.5739 | Val F1: 0.6171
 Saving checkpoint

 Starting Epoch 2 | LR: 0.001
 3m10s | End of Epoch 2 | Train Loss: 0.10094865910537333 | Val Loss: 0.09940137528612072 | Train F1: 0.6065 | Val F1: 0.5559
 Saving checkpoint

 Starting Epoch 3 | LR: 0.001
 3m10s | End of Epoch 3 | Train Loss: 0.09587397286371821 | Val Loss: 0.09614817840420803 | Train F1: 0.6306 | Val F1: 0.6402
 Saving checkpoint

 Starting Epoch 4 | LR: 0.001
 3m10s | End of Epoch 4 | Train Loss: 0.09088067978525617 | Val Loss: 0.09684395751462058 | Train F1: 0.6556 | Val F1: 0.6285

 Starting Epoch 5 | LR: 0.001
 3m10s | End of Epoch 5 | Train Loss: 0.08564619204629029 | Val Loss: 0.09742063982321092 | Train F1: 0.6807

In [14]:
pd.DataFrame(test_preds_local).corr()

Unnamed: 0,0,1,2,3,4
0,1.0,0.937299,0.939289,0.932785,0.935587
1,0.937299,1.0,0.941644,0.935862,0.94041
2,0.939289,0.941644,1.0,0.94012,0.942462
3,0.932785,0.935862,0.94012,1.0,0.93951
4,0.935587,0.94041,0.942462,0.93951,1.0


In [15]:
def bestThresshold(y_train,train_preds):
    tmp = [0,0,0] # idx, cur, max
    delta = 0
    for tmp[0] in np.arange(0.1, 0.901, 0.01):
        tmp[1] = f1_score(y_train, np.array(train_preds)>tmp[0])
        if tmp[1] > tmp[2]:
            delta = tmp[0]
            tmp[2] = tmp[1]
    print('best threshold is {:.4f} with F1 score: {:.4f}'.format(delta, tmp[2]))
    return delta

In [16]:
thres = bestThresshold(real_valid_preds, sigmoid(all_train_preds))
# y_train_label

best threshold is 0.3700 with F1 score: 0.6835


In [17]:
submission = test[['qid']].copy()
submission['prediction'] = (test_preds > thres).astype(int)
submission.to_csv('submission.csv', index=False)

In [18]:
sample_submission.head()

Unnamed: 0,qid,prediction
0,00014894849d00ba98a9,0
1,000156468431f09b3cae,0
2,000227734433360e1aae,0
3,0005e06fbe3045bd2a92,0
4,00068a0f7f41f50fc399,0
