# Imports

In [1]:
import gc
import operator
import os
import random
import re
import string
import time

import nltk
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.utils.data
from IPython.display import display
from scipy import sparse
from sklearn.metrics import f1_score
from sklearn.model_selection import RepeatedStratifiedKFold, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from torch.optim.optimizer import Optimizer
from torch.utils.data import DataLoader, Dataset
from tqdm.auto import tqdm

from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

notebook_start = time.time()

tqdm.pandas()

Using TensorFlow backend.


# Loading the data

In [2]:
train_df = pd.read_csv("../input/train.csv")
test_df = pd.read_csv("../input/test.csv")

# Get the target values
y_train = train_df["target"].values

print("Train data dimension: ", train_df.shape)
print("Test data dimension: ", test_df.shape)

since = time.time() - notebook_start
print("\nLoading data: {:.0f} min {:.0f} sec".format(since // 60, since % 60))

Train data dimension:  (1306122, 3)
Test data dimension:  (56370, 2)

Loading data: 0 min 6 sec


# Utility functions

`seed_torch` sets the seed for numpy and torch to make sure functions with a random component behave deterministically. `torch.backends.cudnn.deterministic = true` sets the CuDNN to deterministic mode.<br>
This function allows us to run experiments 100% deterministically.

In [3]:
SEED = 1209


def seed_everything(seed=SEED):
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True


seed_everything()

Sigmoid function in plain numpy.

In [4]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

Function to search for best threshold regarding the F1 score given labels and predictions from the network.

In [5]:
def threshold_search(y_true, y_proba):
    from sklearn.metrics import roc_curve, precision_recall_curve, f1_score

    precision, recall, thresholds = precision_recall_curve(y_true, y_proba)
    thresholds = np.append(thresholds, 1.001)
    F = 2 / (1 / precision + 1 / recall)
    best_score = np.max(F)
    best_th = thresholds[np.argmax(F)]
    search_result = {"threshold": best_th, "f1": best_score}
    return search_result

# Analyze Vocabulary

In [6]:
start = time.time()

In [7]:
# Load embedding index
def get_coefs(word, *arr):
    return word, np.asarray(arr, dtype="float32")[:300]


# Glove
EMBEDDING_FILE = "../input/embeddings/glove.840B.300d/glove.840B.300d.txt"
glove_index = dict(get_coefs(*o.split(" ")) for o in open(EMBEDDING_FILE))

# Fasttext
EMBEDDING_FILE = "../input/embeddings/wiki-news-300d-1M/wiki-news-300d-1M.vec"
fast_index = dict(
    get_coefs(*o.split(" ")) for o in open(EMBEDDING_FILE) if len(o) > 100
)

since = time.time() - start
print("Loading Embeddings: {:.0f} min {:.0f} sec".format(since // 60, since % 60))

Loading Embeddings: 4 min 19 sec


In [8]:
start = time.time()

In [9]:
# Utility function to build vocabulary
def build_vocab(texts):
    sentences = texts.apply(lambda x: x.split()).values
    vocab = {}
    for sentence in sentences:
        for word in sentence:
            try:
                vocab[word] += 1
            except KeyError:
                vocab[word] = 1
    return vocab

In [10]:
# Utility function to check coverage by embedding
def check_coverage(vocab, embeddings_index):
    known_words = {}
    unknown_words = {}
    nb_known_words = 0
    nb_unknown_words = 0
    for word in vocab.keys():
        try:
            known_words[word] = embeddings_index[word]
            nb_known_words += vocab[word]
        except:
            unknown_words[word] = vocab[word]
            nb_unknown_words += vocab[word]
            pass

    print("Found embeddings for {:.2%} of vocab".format(len(known_words) / len(vocab)))
    print(
        "Found embeddings for  {:.2%} of all text".format(
            nb_known_words / (nb_known_words + nb_unknown_words)
        )
    )
    unknown_words = sorted(unknown_words.items(), key=operator.itemgetter(1))[::-1]

    return unknown_words[:20]

In [11]:
vocab = build_vocab(train_df["question_text"])

In [12]:
# function to add lowercase words to embedding index
def add_lower(embedding, vocab):
    count = 0
    for word in vocab:
        if word in embedding and word.lower() not in embedding:
            embedding[word.lower()] = embedding[word]
            count += 1
    print(f"Added {count} words to embedding")

In [13]:
# function to add lowercase words to embedding index
def add_higher(embedding, vocab):
    count = 0
    for word in vocab:
        if word.lower() in embedding and word not in embedding:
            embedding[word] = embedding[word.lower()]
            count += 1
    print(f"Added {count} words to embedding")

In [14]:
add_lower(glove_index, vocab)
add_lower(fast_index, vocab)

Added 15044 words to embedding
Added 27176 words to embedding


In [15]:
add_higher(glove_index, vocab)
add_higher(fast_index, vocab)

Added 2462 words to embedding
Added 3703 words to embedding


In [16]:
# function to add misspelled words to embedding index
def add_misspells(embedding, vocab):
    count = 0
    misspells = {}
    for token in vocab:
        new_token = token
        if new_token not in embedding:
            new_token = re.sub(r"(.)\1{2,}", r"\1", new_token)
            if new_token not in embedding:
                new_token = re.sub(r"(.)\1{1,}", r"\1", new_token)
        if new_token not in embedding:
            new_token = nltk.stem.WordNetLemmatizer().lemmatize(new_token)

        if new_token in embedding and token not in embedding:
            embedding[token] = embedding[new_token]
            count += 1
    print(f"Added {count} words to embedding")

In [17]:
add_misspells(glove_index, vocab)
add_misspells(fast_index, vocab)

Added 2331 words to embedding
Added 3008 words to embedding


In [18]:
since = time.time() - start
print("Correcting vocabulary: {:.0f} min {:.0f} sec".format(since // 60, since % 60))

Correcting vocabulary: 0 min 21 sec


# Processing input

In [19]:
start = time.time()

In [20]:
embed_size = 300  # how big is each word vector
max_features = 120000  # how many unique words to use (i.e num rows in embedding vector)
maxlen = 70  # max number of words in a question to use

In [21]:
def spacing_misspell(text):
    """
    'deadbody' -> 'dead body'
    """
    misspell_list = ["(S|s)hit", "(F|f)uck"]  # ,'Trump'
    misspell_re = re.compile("(%s)" % "|".join(misspell_list))
    return misspell_re.sub(r" \1 ", text)


def clean_latex(text):
    """
    replace latex math with 'mathematical formula' tag
    """
    corr_t = []
    for t in text.split(" "):
        t = t.strip()
        if t != "":
            corr_t.append(t)
    text = " ".join(corr_t)
    text = re.sub(r"\[math].+?\[/math]", "mathematical formula", text)
    return text


def normalize_unicode(text):
    """
    unicode string normalization
    """
    return unicodedata.normalize("NFKD", text)


def remove_newline(text):
    """
    remove \n and  \t
    """
    text = re.sub("\n", " ", text)
    text = re.sub("\t", " ", text)
    text = re.sub("\b", " ", text)
    text = re.sub("\r", " ", text)
    return text


def decontracted(text):
    """
    de-contract the contraction
    """
    # specific
    text = re.sub(r"(W|w)on(\'|\’)t", "will not", text)
    text = re.sub(r"(C|c)an(\'|\’)t", "can not", text)
    text = re.sub(r"(Y|y)(\'|\’)all", "you all", text)
    text = re.sub(r"(Y|y)a(\'|\’)ll", "you all", text)

    # general
    text = re.sub(r"(I|i)(\'|\’)m", "i am", text)
    text = re.sub(r"(A|a)in(\'|\’)t", "is not", text)
    text = re.sub(r"n(\'|\’)t", " not", text)
    text = re.sub(r"(\'|\’)re", " are", text)
    text = re.sub(r"(\'|\’)s", " is", text)
    text = re.sub(r"(\'|\’)d", " would", text)
    text = re.sub(r"(\'|\’)ll", " will", text)
    text = re.sub(r"(\'|\’)t", " not", text)
    text = re.sub(r"(\'|\’)ve", " have", text)

    # quora
    text = re.sub(r"(Q|q)uoran", "quora contributor", text)
    text = re.sub(r"(Q|q)uorans", "quora contributors", text)

    return text


def spacing_punctuation(text):
    """
    add space before and after punctuation and symbols
    """
    regular_punct = list(string.punctuation)
    extra_punct = [
        ',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&',
        '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£',
        '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',
        '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', '“', '★', '”',
        '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾',
        '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', '▒', '：', '¼', '⊕', '▼',
        '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲',
        'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', '∙', '）', '↓', '、', '│', '（', '»',
        '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø',
        '¹', '≤', '‡', '√', '«', '»', '´', 'º', '¾', '¡', '§', '£', '₤']
    all_punct = "".join(sorted(list(set(regular_punct + extra_punct))))
    re_tok = re.compile(f"([{all_punct}])")
    return re_tok.sub(r" \1 ", text)


def spacing_digit(text):
    """
    add space before and after digits
    """
    re_tok = re.compile("([0-9])")
    return re_tok.sub(r" \1 ", text)


def spacing_number(text):
    """
    add space before and after numbers
    """
    re_tok = re.compile("([0-9]{1,})")
    return re_tok.sub(r" \1 ", text)


def remove_number(text):
    """
    numbers are not toxic
    """
    return re.sub("\d+", " ", text)


def remove_space(text):
    """
    remove extra spaces and ending space if any
    """
    text = re.sub("\s+", " ", text)
    text = re.sub("\s+$", "", text)
    return text


def substitute(text):
    """
    substitute some words after de-contraction
    """
    # text = re.sub(r" e g ", " eg ", text)
    # text = re.sub(r" b g ", " bg ", text)
    # text = re.sub(r" u s ", " US ", text)
    # text = re.sub(r" u s a ", " USA ", text)
    text = re.sub(r"e - mail", "email", text)

    return text

In [22]:
"""
additional features from 
https://github.com/thinline72/toxic/blob/master/skolbachev/toxic/tokenizers/glove_twitter_tokenizer.py
"""
FLAGS = re.MULTILINE | re.DOTALL


def hashtag(text):
    text = text.group()
    hashtag_body = text[1:]
    result = "<hashtag> " + hashtag_body.lower()
    return result


def allcaps(text):
    text = text.group()
    return text.lower() + " <allcaps>"


def add_feats(
    text,
    numbers=False,
    smiley=False,
    twitter=False,
    allcaps=True,
    replong=False,
    FLAGS=FLAGS,
):

    # Different regex parts for smiley faces
    if smiley:
        eyes = r"[8:=;]"
        nose = r"['`\-]?"
        text = re.sub(
            r"{}{}[)dD]+|[)dD]+{}{}".format(eyes, nose, nose, eyes),
            "<smile>",
            text,
            flags=FLAGS,
        )
        text = re.sub(r"{}{}p+".format(eyes, nose), "<lolface>", text, flags=FLAGS)
        text = re.sub(
            r"{}{}\(+|\)+{}{}".format(eyes, nose, nose, eyes),
            "<sadface>",
            text,
            flags=FLAGS,
        )
        text = re.sub(
            r"{}{}[\/|l*]".format(eyes, nose), "<neutralface>", text, flags=FLAGS
        )
        text = re.sub(r"<3", "<heart>", text, flags=FLAGS)

    if numbers:
        text = re.sub(r"[-+]?[.\d]*[\d]+[:,.\d]*", "<number>", text, flags=FLAGS)

    if twitter:
        text = re.sub(r"#(\w+)", hashtag, text, flags=FLAGS)
        text = re.sub(r"https?:\/\/\S+\b|www\.(\w+\.)+\S*", "<url>", text, flags=FLAGS)
        text = re.sub(r"@\w+", "<user>", text, flags=FLAGS)

    if allcaps:
        try:
            text = re.sub(r"([A-Z]){2,}", allcaps, text, flags=FLAGS)
        except TypeError:
            print(text)

    if replong:
        text = re.sub(r"([!?.]){2,}", r"\1 <repeat>", text, flags=FLAGS)
        text = re.sub(r"\b(\S*?)(.)\2{2,}\b", r"\1\2 <elong>", text, flags=FLAGS)

    return text

In [23]:
def preprocess(text, add_features=False, sub=False, remove_num=True):
    """
    preprocess text into clean text for tokenization
    NOTE:
        1. glove supports uppper case words
        2. glove supports digit
        3. glove supports punctuation
        5. glove supports domains e.g. www.apple.com
        6. glove supports misspelled words e.g. FUCKKK
    """
    ## add tags
    if add_features:
        text = add_feats(text)
    ## remove new line
    text = remove_newline(text)
    ## de-contract
    text = decontracted(text)
    ## space misspell
    text = spacing_misspell(text)
    ## clean_latex
    text = clean_latex(text)
    ## space
    text = spacing_punctuation(text)
    ## substitute after decontract
    if sub:
        text = substitute(text)
    ## handle numbers
    if remove_num:
        text = remove_number(text)
    else:
        text = spacing_number(text)
        text = spacing_digit(text)
    # 9. remove space
    text = remove_space(text)

    return text

# Additional Features

In [24]:
def add_features(df):

    df["question_text"] = df["question_text"].apply(lambda x: str(x))
    df["total_length"] = df["question_text"].apply(len)
    df["capitals"] = df["question_text"].apply(
        lambda comment: sum(1 for c in comment if c.isupper())
    )
    df["caps_vs_length"] = df.apply(
        lambda row: float(row["capitals"]) / float(row["total_length"]), axis=1
    )
    df["num_words"] = df.question_text.str.count("\S+")
    df["num_unique_words"] = df["question_text"].apply(
        lambda comment: len(set(w for w in comment.split()))
    )
    df["words_vs_unique"] = df["num_unique_words"] / df["num_words"]

    return df


train = add_features(train_df)
test = add_features(test_df)


features = train[["caps_vs_length", "words_vs_unique"]].fillna(0)
test_features = test[["caps_vs_length", "words_vs_unique"]].fillna(0)


ss = StandardScaler()
ss.fit(np.vstack((features, test_features)))
features = ss.transform(features)
test_features = ss.transform(test_features)


del train, test
gc.collect()

42

In [25]:
since = time.time() - start
print("Adding meta features: {:.0f} min {:.0f} sec".format(since // 60, since % 60))

Adding meta features: 0 min 48 sec


# Preprocessing

In [26]:
start_preprocessing = time.time()
start = time.time()

In [27]:
x_train = train_df["question_text"].progress_apply(preprocess)
x_test = test_df["question_text"].progress_apply(preprocess)

HBox(children=(IntProgress(value=0, max=1306122), HTML(value='')))




HBox(children=(IntProgress(value=0, max=56370), HTML(value='')))




In [28]:
vocab = build_vocab(x_train)

print("Add lower after preprocessing:")
add_lower(glove_index, vocab)
add_lower(fast_index, vocab)

print("\nAdd misspells after preprocessing:")
add_misspells(glove_index, vocab)
add_misspells(fast_index, vocab)

Add lower after preprocessing:
Added 5819 words to embedding
Added 9474 words to embedding

Add misspells after preprocessing:
Added 809 words to embedding
Added 953 words to embedding


In [29]:
add_higher(glove_index, vocab)
add_higher(fast_index, vocab)

Added 794 words to embedding
Added 1289 words to embedding


In [30]:
del vocab
gc.collect()

0

In [31]:
since = time.time() - start
print("Text preprocessing: {:.0f} min {:.0f} sec".format(since // 60, since % 60))

Text preprocessing: 2 min 53 sec


# Tokenization

In [32]:
start = time.time()

In [33]:
# tokenizer
tokenizer = Tokenizer(num_words=max_features, filters="", lower=False, split=" ")
# fit to data
tokenizer.fit_on_texts(list(x_train))
# tokenize the texts into sequences
train_sequences = tokenizer.texts_to_sequences(x_train)
test_sequences = tokenizer.texts_to_sequences(x_test)
# Pad the sentences
x_train = pad_sequences(train_sequences, maxlen=maxlen)
x_test = pad_sequences(test_sequences, maxlen=maxlen)

# shuffling the data
np.random.seed(SEED)
trn_idx = np.random.permutation(len(x_train))
x_train = x_train[trn_idx]
y_train = y_train[trn_idx]
features = features[trn_idx]

In [34]:
since = time.time() - start
print("Tokenization: {:.0f} min {:.0f} sec".format(since // 60, since % 60))

Tokenization: 0 min 50 sec


In [35]:
since = time.time() - start_preprocessing
print("Total preprocessing time: {:.0f} min {:.0f} sec".format(since // 60, since % 60))

Total preprocessing time: 3 min 43 sec


# Creating the embeddings matrix

In [36]:
start = time.time()

In [37]:
def load_glove(word_index, embedding_index):

    all_embs = np.stack(embedding_index.values())
    emb_mean, emb_std = -0.005838499, 0.48782197
    embed_size = all_embs.shape[1]

    # word_index = tokenizer.word_index
    nb_words = min(max_features, len(word_index))
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    for word, i in word_index.items():
        if i >= max_features:
            continue
        embedding_vector = embedding_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    return embedding_matrix

In [38]:
def load_fast(word_index, embeddings_index):
    all_embs = np.stack(embeddings_index.values())
    emb_mean, emb_std = all_embs.mean(), all_embs.std()
    embed_size = all_embs.shape[1]

    # word_index = tokenizer.word_index
    nb_words = min(max_features, len(word_index))
    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
    for word, i in word_index.items():
        if i >= max_features:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    return embedding_matrix

In [39]:
# missing entries in the embedding are set using np.random.normal so we have to seed here too
seed_everything()

glove_matrix = load_glove(tokenizer.word_index, glove_index)
fast_matrix = load_fast(tokenizer.word_index, fast_index)

embedding_matrix = np.concatenate(([glove_matrix, fast_matrix]), axis=1)

del glove_index
del glove_matrix
del fast_index
del fast_matrix
gc.collect()

print(np.shape(embedding_matrix))

since = time.time() - start
print("Embedding matrices time: {:.0f} min {:.0f} sec".format(since // 60, since % 60))

  This is separate from the ipykernel package so we can avoid doing imports until
  


(120000, 600)
Embedding matrices time: 0 min 18 sec


# Defining the model

In [40]:
start = time.time()

In [41]:
splits = list(
    StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED).split(x_train, y_train)
)

In [42]:
class MyDataset(Dataset):
    def __init__(self, dataset):
        self.dataset = dataset

    def __getitem__(self, index):
        data, target = self.dataset[index]

        return data, target, index

    def __len__(self):
        return len(self.dataset)

In [43]:
class Attention(nn.Module):
    def __init__(self, feature_dim, step_dim, bias=True, **kwargs):
        super(Attention, self).__init__(**kwargs)

        self.supports_masking = True

        self.bias = bias
        self.feature_dim = feature_dim
        self.step_dim = step_dim
        self.features_dim = 0

        weight = torch.zeros(feature_dim, 1)
        nn.init.xavier_uniform_(weight)
        self.weight = nn.Parameter(weight)

        if bias:
            self.b = nn.Parameter(torch.zeros(step_dim))

    def forward(self, x, mask=None):
        feature_dim = self.feature_dim
        step_dim = self.step_dim

        eij = torch.mm(x.contiguous().view(-1, feature_dim), self.weight).view(
            -1, step_dim
        )

        if self.bias:
            eij = eij + self.b

        eij = torch.tanh(eij)
        a = torch.exp(eij)

        if mask is not None:
            a = a * mask

        a = a / torch.sum(a, 1, keepdim=True) + 1e-10

        weighted_input = x * torch.unsqueeze(a, -1)
        return torch.sum(weighted_input, 1)

In [44]:
# code inspired from: https://github.com/anandsaha/pytorch.cyclic.learning.rate/blob/master/cls.py
class CyclicLR(object):
    def __init__(
        self,
        optimizer,
        base_lr=1e-3,
        max_lr=6e-3,
        step_size=2000,
        factor=0.6,
        min_lr=1e-4,
        mode="triangular",
        gamma=1.0,
        scale_fn=None,
        scale_mode="cycle",
        last_batch_iteration=-1,
    ):

        if not isinstance(optimizer, torch.optim.Optimizer):
            raise TypeError("{} is not an Optimizer".format(type(optimizer).__name__))
        self.optimizer = optimizer

        if isinstance(base_lr, list) or isinstance(base_lr, tuple):
            if len(base_lr) != len(optimizer.param_groups):
                raise ValueError(
                    "expected {} base_lr, got {}".format(
                        len(optimizer.param_groups), len(base_lr)
                    )
                )
            self.base_lrs = list(base_lr)
        else:
            self.base_lrs = [base_lr] * len(optimizer.param_groups)

        if isinstance(max_lr, list) or isinstance(max_lr, tuple):
            if len(max_lr) != len(optimizer.param_groups):
                raise ValueError(
                    "expected {} max_lr, got {}".format(
                        len(optimizer.param_groups), len(max_lr)
                    )
                )
            self.max_lrs = list(max_lr)
        else:
            self.max_lrs = [max_lr] * len(optimizer.param_groups)

        self.step_size = step_size

        if mode not in ["triangular", "triangular2", "exp_range"] and scale_fn is None:
            raise ValueError("mode is invalid and scale_fn is None")

        self.mode = mode
        self.gamma = gamma

        if scale_fn is None:
            if self.mode == "triangular":
                self.scale_fn = self._triangular_scale_fn
                self.scale_mode = "cycle"
            elif self.mode == "triangular2":
                self.scale_fn = self._triangular2_scale_fn
                self.scale_mode = "cycle"
            elif self.mode == "exp_range":
                self.scale_fn = self._exp_range_scale_fn
                self.scale_mode = "iterations"
        else:
            self.scale_fn = scale_fn
            self.scale_mode = scale_mode

        self.batch_step(last_batch_iteration + 1)
        self.last_batch_iteration = last_batch_iteration

        self.last_loss = np.inf
        self.min_lr = min_lr
        self.factor = factor

    def batch_step(self, batch_iteration=None):
        if batch_iteration is None:
            batch_iteration = self.last_batch_iteration + 1
        self.last_batch_iteration = batch_iteration
        for param_group, lr in zip(self.optimizer.param_groups, self.get_lr()):
            param_group["lr"] = lr

    def step(self, loss):
        if loss > self.last_loss:
            self.base_lrs = [max(lr * self.factor, self.min_lr) for lr in self.base_lrs]
            self.max_lrs = [max(lr * self.factor, self.min_lr) for lr in self.max_lrs]

    def _triangular_scale_fn(self, x):
        return 1.0

    def _triangular2_scale_fn(self, x):
        return 1 / (2.0 ** (x - 1))

    def _exp_range_scale_fn(self, x):
        return self.gamma ** (x)

    def get_lr(self):
        step_size = float(self.step_size)
        cycle = np.floor(1 + self.last_batch_iteration / (2 * step_size))
        x = np.abs(self.last_batch_iteration / step_size - 2 * cycle + 1)

        lrs = []
        param_lrs = zip(self.optimizer.param_groups, self.base_lrs, self.max_lrs)
        for param_group, base_lr, max_lr in param_lrs:
            base_height = (max_lr - base_lr) * np.maximum(0, (1 - x))
            if self.scale_mode == "cycle":
                lr = base_lr + base_height * self.scale_fn(cycle)
            else:
                lr = base_lr + base_height * self.scale_fn(self.last_batch_iteration)
            lrs.append(lr)
        return lrs

In [45]:
class GaussianNoise(nn.Module):
    """Gaussian noise regularizer.

    Args:
        sigma (float, optional): relative standard deviation used to generate the
            noise. Relative means that it will be multiplied by the magnitude of
            the value your are adding the noise to. This means that sigma can be
            the same regardless of the scale of the vector.
        is_relative_detach (bool, optional): whether to detach the variable before
            computing the scale of the noise. If `False` then the scale of the noise
            won't be seen as a constant but something to optimize: this will bias the
            network to generate vectors with smaller values.
    """

    def __init__(self, sigma=0.07, is_relative_detach=True):
        super().__init__()
        self.sigma = sigma
        self.is_relative_detach = is_relative_detach
        self.noise = torch.tensor(0).to(torch.device("cuda")).float()

    def forward(self, x):
        if self.training and self.sigma != 0:
            scale = (
                self.sigma * x.detach() if self.is_relative_detach else self.sigma * x
            )
            sampled_noise = self.noise.repeat(*x.size()).normal_() * scale
            x = x + sampled_noise
        return x

Now define the neural network. Defining a neural network in PyTorch is done by defining a class. This is almost as intuitive as Keras. The main difference is that you have one function (`__init__`) where it is defined which layers there are in the network and another function (`forward`) which defines the flow of data through the net.

I replicated the architecture used in [@Shujian Liu's kernel](https://www.kaggle.com/shujian/single-rnn-with-4-folds-clr) in the network.

# Neural Net Parameters

In [46]:
h_size = 64
dense_out = False
embedding_dropout = 0.3

In [47]:
class NeuralNet(nn.Module):
    def __init__(self):
        super(NeuralNet, self).__init__()

        hidden_size = h_size
        emb_dropout = embedding_dropout

        self.embedding = nn.Embedding(max_features, embed_size * 2)
        self.embedding.weight = nn.Parameter(
            torch.tensor(embedding_matrix, dtype=torch.float32)
        )
        self.embedding.weight.requires_grad = False
        self.gaussian_noise = GaussianNoise()
        self.embedding_dropout = nn.Dropout(emb_dropout)
        self.lstm = nn.LSTM(
            embed_size * 2, hidden_size, bidirectional=True, batch_first=True
        )
        self.gru = nn.GRU(
            hidden_size * 2, hidden_size, bidirectional=True, batch_first=True
        )

        self.lstm_attention = Attention(hidden_size * 2, maxlen)
        self.gru_attention = Attention(hidden_size * 2, maxlen)

        self.linear = nn.Linear(hidden_size * 8 + 2, 16)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.1)
        self.out = nn.Linear(16, 1)
        self.out2 = nn.Linear(hidden_size * 8 + 2, 1)

    def forward(self, x):

        dense = dense_out

        h_embedding = self.embedding(x[0])
        h_embedding = self.gaussian_noise(h_embedding)
        h_embedding = torch.squeeze(
            self.embedding_dropout(torch.unsqueeze(h_embedding, 0))
        )

        h_lstm, _ = self.lstm(h_embedding)
        h_gru, _ = self.gru(h_lstm)

        h_lstm_atten = self.lstm_attention(h_lstm)
        h_gru_atten = self.gru_attention(h_gru)

        # global average pooling
        avg_pool = torch.mean(h_gru, 1)
        # global max pooling
        max_pool, _ = torch.max(h_gru, 1)

        f = torch.tensor(x[1], dtype=torch.float).cuda()

        conc = torch.cat((h_lstm_atten, h_gru_atten, avg_pool, max_pool, f), 1)

        if dense:
            conc = self.relu(self.linear(conc))
            conc = self.dropout(conc)
            out = self.out(conc)
        else:
            out = self.out2(conc)

        return out

# Training

In [48]:
batch_size = 512  # how many samples to process at once
n_epochs = 5  # how many times to iterate over all samples

# scheduler parameters
step_size = 1000
base_lr, max_lr = 0.001, 0.003

Now we can already train the network. Unfortunately, we do not have an API as high-level as keras's `.fit` in PyTorch. However, the code is still not too complicated and I have added comments where necessary.

In [49]:
# matrix for the out-of-fold predictions
train_preds = np.zeros((len(x_train)))
# matrix for the predictions on the test set
test_preds = np.zeros((len(x_test)))

avg_losses_f = []
avg_val_losses_f = []

# always call this before training for deterministic results
seed_everything()

x_test_cuda = torch.tensor(x_test, dtype=torch.long).cuda()
test = torch.utils.data.TensorDataset(x_test_cuda)
test_loader = torch.utils.data.DataLoader(test, batch_size=batch_size, shuffle=False)

x_train = np.array(x_train)
y_train = np.array(y_train)
features = np.array(features)

for i, (train_idx, valid_idx) in enumerate(splits):
    x_train_fold = torch.tensor(x_train[train_idx.astype(int)], dtype=torch.long).cuda()
    y_train_fold = torch.tensor(
        y_train[train_idx.astype(int), np.newaxis], dtype=torch.float32
    ).cuda()

    kfold_X_features = features[train_idx.astype(int)]
    kfold_X_valid_features = features[valid_idx.astype(int)]
    x_val_fold = torch.tensor(x_train[valid_idx.astype(int)], dtype=torch.long).cuda()
    y_val_fold = torch.tensor(
        y_train[valid_idx.astype(int), np.newaxis], dtype=torch.float32
    ).cuda()

    model = NeuralNet()
    # make sure everything in the model is running on the GPU
    model.cuda()

    # define binary cross entropy loss
    # note that the model returns logit to take advantage of the log-sum-exp trick
    # for numerical stability in the loss
    loss_fn = torch.nn.BCEWithLogitsLoss(reduction="mean")
    optimizer = torch.optim.Adam(
        filter(lambda p: p.requires_grad, model.parameters()), lr=base_lr
    )

    scheduler = CyclicLR(
        optimizer,
        base_lr=base_lr,
        max_lr=max_lr,
        step_size=step_size,
        mode="exp_range",
        gamma=0.99994,
    )

    train = torch.utils.data.TensorDataset(x_train_fold, y_train_fold)
    valid = torch.utils.data.TensorDataset(x_val_fold, y_val_fold)

    train = MyDataset(train)
    valid = MyDataset(valid)

    train_loader = torch.utils.data.DataLoader(
        train, batch_size=batch_size, shuffle=True
    )
    valid_loader = torch.utils.data.DataLoader(
        valid, batch_size=batch_size, shuffle=False
    )

    print(f"Fold {i + 1}")

    for epoch in range(n_epochs):
        # set train mode of the model. This enables operations which are only applied during training like dropout
        start_time = time.time()
        model.train()
        avg_loss = 0.0
        for i, (x_batch, y_batch, index) in enumerate(train_loader):
            # Forward pass: compute predicted y by passing x to the model.
            f = kfold_X_features[index]
            y_pred = model([x_batch, f])

            scheduler.batch_step()

            # Compute and print loss.
            loss = loss_fn(y_pred, y_batch)

            # Before the backward pass, use the optimizer object to zero all of the
            # gradients for the Tensors it will update (which are the learnable weights
            # of the model)
            optimizer.zero_grad()

            # Backward pass: compute gradient of the loss with respect to model parameters
            loss.backward()

            # Calling the step function on an Optimizer makes an update to its parameters
            optimizer.step()
            avg_loss += loss.item() / len(train_loader)

        # set evaluation mode of the model. This disabled operations which are only applied during training like dropout
        model.eval()

        # predict all the samples in y_val_fold batch per batch
        valid_preds_fold = np.zeros((x_val_fold.size(0)))
        test_preds_fold = np.zeros((len(x_test)))

        avg_val_loss = 0.0
        for i, (x_batch, y_batch, index) in enumerate(valid_loader):
            f = kfold_X_valid_features[index]
            y_pred = model([x_batch, f]).detach()

            avg_val_loss += loss_fn(y_pred, y_batch).item() / len(valid_loader)
            valid_preds_fold[i * batch_size : (i + 1) * batch_size] = sigmoid(
                y_pred.cpu().numpy()
            )[:, 0]

        elapsed_time = time.time() - start_time
        print(
            "Epoch {}/{} \t loss={:.4f} \t val_loss={:.4f} \t time={:.2f}s".format(
                epoch + 1, n_epochs, avg_loss, avg_val_loss, elapsed_time
            )
        )

    avg_losses_f.append(avg_loss)
    avg_val_losses_f.append(avg_val_loss)

    # predict all samples in the test set batch per batch
    for i, (x_batch,) in enumerate(test_loader):
        f = test_features[i * batch_size : (i + 1) * batch_size]
        y_pred = model([x_batch, f]).detach()

        test_preds_fold[i * batch_size : (i + 1) * batch_size] = sigmoid(
            y_pred.cpu().numpy()
        )[:, 0]

    train_preds[valid_idx] = valid_preds_fold
    test_preds += test_preds_fold / len(splits)

print(
    "\nTotal loss = {:.4f} \t val_loss={:.4f}".format(
        np.average(avg_losses_f), np.average(avg_val_losses_f)
    )
)

Fold 1
Epoch 1/5 	 loss=0.1154 	 val_loss=0.1024 	 time=229.36s
Epoch 2/5 	 loss=0.1000 	 val_loss=0.0959 	 time=232.44s
Epoch 3/5 	 loss=0.0950 	 val_loss=0.0949 	 time=233.03s
Epoch 4/5 	 loss=0.0911 	 val_loss=0.0947 	 time=233.05s
Epoch 5/5 	 loss=0.0879 	 val_loss=0.0949 	 time=232.56s
Fold 2
Epoch 1/5 	 loss=0.1155 	 val_loss=0.1003 	 time=234.02s
Epoch 2/5 	 loss=0.1002 	 val_loss=0.0967 	 time=233.27s
Epoch 3/5 	 loss=0.0950 	 val_loss=0.0957 	 time=232.96s
Epoch 4/5 	 loss=0.0914 	 val_loss=0.0955 	 time=232.74s
Epoch 5/5 	 loss=0.0881 	 val_loss=0.0952 	 time=232.82s
Fold 3
Epoch 1/5 	 loss=0.1158 	 val_loss=0.1032 	 time=232.77s
Epoch 2/5 	 loss=0.1000 	 val_loss=0.0978 	 time=233.86s
Epoch 3/5 	 loss=0.0947 	 val_loss=0.0961 	 time=234.83s
Epoch 4/5 	 loss=0.0908 	 val_loss=0.0958 	 time=232.84s
Epoch 5/5 	 loss=0.0874 	 val_loss=0.0963 	 time=232.31s
Fold 4
Epoch 1/5 	 loss=0.1163 	 val_loss=0.0996 	 time=231.18s
Epoch 2/5 	 loss=0.1004 	 val_loss=0.0968 	 time=229.75s
Epo

In [50]:
del x_train, x_test
gc.collect()

0

In [51]:
since = time.time() - start
print(
    "Training the Neural Network: {:.0f} min {:.0f} sec".format(since // 60, since % 60)
)

Training the Neural Network: 97 min 1 sec


# Submission

In [52]:
search_result = threshold_search(y_train, train_preds)
print(search_result)

{'threshold': 0.36109811067581177, 'f1': 0.6938563299731376}


  """


Finally submit the predictions with the threshold we have just found.

In [53]:
submission = test_df[["qid"]].copy()
submission["prediction"] = test_preds > search_result["threshold"]
submission.to_csv("submission.csv", index=False)

In [54]:
since = time.time() - notebook_start
print("Total notebook runtime: {:.0f} min {:.0f} sec".format(since // 60, since % 60))

Total notebook runtime: 106 min 37 sec
