In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
%%writefile single_model.py
""" build a TFAutoModel and load its data from npz or tfrec dataset """
import random
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.models import Model

from transformers import TFAutoModel


def build_model(model_id='jplu/tf-xlm-roberta-large',
                max_len=192, dropout=0.2,
                **_):
    """ build a TFAutoModel """
    transformer = TFAutoModel.from_pretrained(model_id)

    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    sequence_output = transformer(input_word_ids)[0]
    cls_token = sequence_output[:, 0, :]
    if dropout > 0:
        cls_token = Dropout(dropout)(cls_token)
    out = Dense(1, activation='sigmoid')(cls_token)
    model = Model(inputs=input_word_ids, outputs=out)

    return model



def np_dataset(dataset, batch_size, seed):
    """ load npz datasets """
    array = np.load(dataset)
    x_train, x_valid, x_test, y_train, y_valid = [array[k] for k in list(array)]
    # Shuffle
    x_train = pd.DataFrame(np.concatenate([x_train.T, [y_train]]).T
                          ).sample(frac=1, random_state=seed).values
    assert abs(x_train[..., :-1] - x_train[..., :-1].astype('int32')).max() == 0
    x_train, y_train = x_train[..., :-1].astype('int32'), x_train[..., -1].astype('float32')
    print(x_train.shape, x_valid.shape, x_test.shape, y_train.shape, y_valid.shape)

    ## Set Datasets
    auto_tune = tf.data.experimental.AUTOTUNE
    train_dataset = (
        tf.data.Dataset
        .from_tensor_slices((x_train, y_train))
        .repeat()
        .shuffle(2048)
        .batch(batch_size)
        .prefetch(auto_tune)
    )

    valid_dataset = (
        tf.data.Dataset
        .from_tensor_slices((x_valid, y_valid))
        .batch(batch_size)
        .cache()
        .prefetch(auto_tune)
    )

    test_dataset = (
        tf.data.Dataset
        .from_tensor_slices(x_test)
        .batch(batch_size)
        .prefetch(auto_tune)
    )

    return train_dataset, valid_dataset, test_dataset


def val_np_dataset(dataset='../input/jigsaw20-val-test-ds/jigsaw20_val_ds.npz', batch_size=128):
    """ load npz datasets """
    array = np.load(dataset)
    x_valid, x_test, y_valid = [array[k] for k in list(array)]
    print(x_valid.shape, x_test.shape, y_valid.shape)

    ## Set Datasets
    auto_tune = tf.data.experimental.AUTOTUNE
    valid_dataset = (
        tf.data.Dataset
        .from_tensor_slices((x_valid, y_valid))
        .batch(batch_size)
        .cache()
        .prefetch(auto_tune)
    )

    test_dataset = (
        tf.data.Dataset
        .from_tensor_slices(x_test)
        .batch(batch_size)
        .prefetch(auto_tune)
    )

    return valid_dataset, test_dataset


def tf_dataset(dataset, batch_size, max_len, seed):
    """ load tfrec datasets """
    auto_tune = tf.data.experimental.AUTOTUNE

    train_dataset = (
        load_tf_dataset(dataset+'train*.tfrec', max_len, seed)
        .repeat()
        .shuffle(2048)
        .batch(batch_size)
        .prefetch(auto_tune)
    )

    valid_dataset, test_dataset = val_np_dataset(batch_size=batch_size)

    return train_dataset, valid_dataset, test_dataset

def load_tf_dataset(filenames, max_len, seed, ordered=False):
    """ load a tfrec dataset """
    # Read from TFRecords. For optimal performance, reading from multiple files at once and
    # disregarding data order. Order does not matter since we will be shuffling the data anyway.
    auto_tune = tf.data.experimental.AUTOTUNE

    def read_labeled_tfrecord(example, max_len=max_len):
        """ decode a tfrec """
        tf_format = {
            "data": tf.io.FixedLenFeature(max_len, tf.int64),
            "label": tf.io.FixedLenFeature([], tf.float32),  # shape [] means single element
        }
        example = tf.io.parse_single_example(example, tf_format)
        return example['data'], example['label']

    ignore_order = tf.data.Options()
    if not ordered:
        ignore_order.experimental_deterministic = False # disable order, increase speed

    # expand and shuffle files
    filenames = tf.io.gfile.glob(filenames)
    random.Random(seed).shuffle(filenames)
    # automatically interleaves reads from multiple files
    dataset = tf.data.TFRecordDataset(filenames, num_parallel_reads=auto_tune)
    # uses data as soon as it streams in, rather than in its original order
    dataset = dataset.with_options(ignore_order)
    dataset = dataset.map(read_labeled_tfrecord, num_parallel_calls=auto_tune)
    return dataset


Overwriting single_model.py


In [3]:
!pylint3 single_model.py --ignored-modules=tensorflow.keras

No config file found, using default configuration

--------------------------------------------------------------------
Your code has been rated at 10.00/10 (previous run: 10.00/10, +0.00)



In [4]:
%%writefile dual_model.py
""" build a dual TFAutoModel and load its data from npz or tfrec dataset """
import glob
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.models import Model

from transformers import TFAutoModel


def build_model(model_id1='bert-base-multilingual-cased',
                model_id2='bert-base-multilingual-uncased',
                max_len=192, dropout=0.2,
                **_):
    """ build a dual TFAutoModel """
    print(model_id1, model_id2)

    transformer1 = TFAutoModel.from_pretrained(model_id1)
    transformer2 = TFAutoModel.from_pretrained(model_id2)

    input_word_ids1 = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids1")
    out1 = transformer1(input_word_ids1)

    input_word_ids2 = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids2")
    out2 = transformer2(input_word_ids2)

    sequence_output1 = out1[0]
    sequence_output2 = out2[0]
    cls_token1 = sequence_output1[:, 0, :]
    cls_token2 = sequence_output2[:, 0, :]

    x = Dropout(dropout)(cls_token1) + Dropout(dropout)(cls_token2)
    out = Dense(1, activation='sigmoid')(x)

    model = Model(inputs=[input_word_ids1, input_word_ids2], outputs=out)

    return model



def np_dataset(datasets, batch_size, seed):
    """ load npz datasets """
    datasets = sorted(glob.glob(datasets))
    print(datasets)

    ## x_train1
    array = np.load(datasets[0])
    x_train, x_valid, x_test, y_train, y_valid = [array[k] for k in list(array)]
    # Shuffle
    x_train = pd.DataFrame(np.concatenate([x_train.T, [y_train]]).T
                          ).sample(frac=1, random_state=seed).values
    assert abs(x_train[..., :-1] - x_train[..., :-1].astype('int32')).max() == 0
    x_train, y_train = x_train[..., :-1].astype('int32'), x_train[..., -1].astype('float32')
    print(x_train.shape, x_valid.shape, x_test.shape, y_train.shape, y_valid.shape)

    x_train1, x_valid1, x_test1 = x_train, x_valid, x_test

    ## x_train2
    array = np.load(datasets[1])
    x_train, x_valid, x_test, y_train, y_valid = [array[k] for k in list(array)]
    # Shuffle
    x_train = pd.DataFrame(np.concatenate([x_train.T, [y_train]]).T
                          ).sample(frac=1, random_state=seed).values
    assert abs(x_train[..., :-1] - x_train[..., :-1].astype('int32')).max() == 0
    x_train, y_train = x_train[..., :-1].astype('int32'), x_train[..., -1].astype('float32')
    print(x_train.shape, x_valid.shape, x_test.shape, y_train.shape, y_valid.shape)

    x_train2, x_valid2, x_test2 = x_train, x_valid, x_test

    ## Set Datasets
    auto_tune = tf.data.experimental.AUTOTUNE
    train_dataset = (
        tf.data.Dataset.zip((
            tf.data.Dataset.from_tensor_slices((x_train1, x_train2)),
            tf.data.Dataset.from_tensor_slices(y_train)
        ))
        .repeat()
        .shuffle(2048)
        .batch(batch_size)
        .prefetch(auto_tune)
    )

    valid_dataset = (
        tf.data.Dataset.zip((
            tf.data.Dataset.from_tensor_slices((x_valid1, x_valid2)),
            tf.data.Dataset.from_tensor_slices(y_valid)
        ))
        .batch(batch_size)
        .cache()
        .prefetch(auto_tune)
    )

    test_dataset = (
        tf.data.Dataset.zip((
            tf.data.Dataset.from_tensor_slices((x_test1, x_test2)),
            tf.data.Dataset.from_tensor_slices(np.ones(len(x_test1), dtype='float32'))
        ))
        .batch(batch_size)
        .prefetch(auto_tune)
    )

    return train_dataset, valid_dataset, test_dataset


def tf_dataset(*_):
    """ load tfrec datasets """
    raise "Not Implemented"


Overwriting dual_model.py


In [5]:
!pylint3 dual_model.py --ignored-modules=tensorflow.keras

No config file found, using default configuration
************* Module dual_model
R: 13, 0: Too many local variables (18/15) (too-many-locals)
C: 34, 4: Variable name "x" doesn't conform to snake_case naming style (invalid-name)
R: 43, 0: Too many local variables (19/15) (too-many-locals)
E: 98,75: Module 'numpy' has no 'float32' member (no-member)

------------------------------------------------------------------
Your code has been rated at 8.30/10 (previous run: 9.36/10, -1.06)



In [6]:
%%writefile train.py
""" build and train a TFAutoModel from npz or tfrec dataset """
import os
import gc
import time
import random

import logging
import numpy as np
import pandas as pd
# from tqdm.notebook import tqdm
from matplotlib import pyplot as plt
from sklearn.metrics import roc_auc_score

import tensorflow_addons as tfa
# from tensorflow_addons.optimizers.utils import fit_bn

import tensorflow as tf

from one_cycle_scheduler import OneCycleScheduler
from visual import save_fig, plot_history
from focal_loss import focal_loss

logging.getLogger('tensorflow').setLevel(logging.ERROR)


def compile_model(model,
                  optimizer='LAMB', lr=2e-5, weight_decay=1e-6,
                  loss_fn='bce', label_smoothing=0.01,
                  pos_weight=5, gamma=2.0,  ## focal loss
                  amp=False,
                  **_):
    """ compile the model with a loss function and an optimizer """
    if loss_fn == 'focal':
        loss = focal_loss(pos_weight=pos_weight, gamma=gamma, label_smoothing=label_smoothing)
    elif loss_fn == 'bce':
        loss = tf.keras.losses.BinaryCrossentropy(label_smoothing=label_smoothing)

    if optimizer == 'LAMB':
        opt = tfa.optimizers.LAMB(lr=lr, weight_decay_rate=weight_decay)
    elif optimizer == 'AdamW':
        opt = tfa.optimizers.AdamW(lr=lr, weight_decay=weight_decay)
    print(opt)

    if amp:
        print('Using auto_mixed_precision.')
        tf.config.optimizer.set_jit(True)
        tf.config.optimizer.set_experimental_options({"auto_mixed_precision": True})
        opt = tf.keras.mixed_precision.experimental.LossScaleOptimizer(opt, 'dynamic')

    model.compile(
        optimizer=opt,
        loss=loss,
        metrics=[tf.keras.metrics.AUC(name='auc')]
    )

    return model



def train_model(model, strategy, checkpoint_path, datasets,
                epochs=30, steps_per_epoch=250,
                lr=2e-5, one_cycle=True, warm_up=1,
                mom_min=0.85, mom_max=0.95,
                div_factor=100, final_div_factor=250,
                callback=None,
                **_):
    """ train the given model """
    train_dataset, valid_dataset, test_dataset = datasets

    ## Train
    callbacks = [] if callback is None else [callback]
    callbacks.append(tf.keras.callbacks.EarlyStopping(monitor='val_auc', min_delta=1e-4,
                                                      mode='max', patience=epochs//5, verbose=1,
                                                      restore_best_weights=False)) # restore later

    if one_cycle:
        callbacks.append(OneCycleScheduler(lr_max=lr, steps=steps_per_epoch*epochs,
                                           mom_min=mom_min, mom_max=mom_max,
                                           phase_1_pct=warm_up/epochs,
                                           div_factor=div_factor,
                                           final_div_factor=final_div_factor))
    else:
        callbacks.append(tf.keras.callbacks.ReduceLROnPlateau(monitor='val_auc', factor=0.31,
                                                              patience=2, cooldown=1, mode='max',
                                                              verbose=1, min_delta=1e-4))

    callbacks.append(tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                        monitor='val_auc',
                                                        verbose=1, mode='max',
                                                        save_best_only=True,
                                                        save_weights_only=True))
    print(callbacks)

    model.fit(
        train_dataset,
        steps_per_epoch=steps_per_epoch,
        validation_data=valid_dataset,
        epochs=epochs,
        callbacks=callbacks,
    )

    # load best
    if epochs > 1:
        # latest = tf.train.latest_checkpoint(checkpoint_dir)
        with strategy.scope():
            model.load_weights(checkpoint_path)

    return (model,
            model.predict(valid_dataset, verbose=1),
            model.predict(test_dataset, verbose=1))



def setup_tpu(tpu_id):
    """ resolve a tpu cluster """
    if tpu_id is None:
        with open('tpu', 'r') as content_file:
            tpu_id = content_file.read()
            print(dict(tpu_id=tpu_id))

    ## Detect hardware, return appropriate distribution strategy
    try:
        # TPU detection. No parameters necessary if TPU_NAME environment variable is
        # set: this is always the case on Kaggle.
        tpu = tf.distribute.cluster_resolver.TPUClusterResolver(tpu=tpu_id)
        print('Running on TPU ', tpu.master())
    except ValueError:
        tpu = None

    if tpu:
        tf.config.experimental_connect_to_cluster(tpu)
        tf.tpu.experimental.initialize_tpu_system(tpu)
        strategy = tf.distribute.experimental.TPUStrategy(tpu)
    else:
        # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
        strategy = tf.distribute.get_strategy()
    print("REPLICAS: ", strategy.num_replicas_in_sync)
    return strategy



##################
###### MAIN ######
##################

def train(dataset, gcs='hm-eu-w4', path='jigsaw/test',
          seed=0, max_len=192, batch_size=28,
          tpu_id=None, dual=False,
          **kwargs):
    """ build and train a TFAutoModel from npz or tfrec dataset """
    params = dict(locals())
    params.update(kwargs)
    params = pd.DataFrame(params, index=[0])
    del params['kwargs']
    if params.loc[0, 'loss_fn'] != 'focal':
        del params['gamma']
        del params['pos_weight']
    kw_params = params.T[0].to_dict()
    print(params.T)
    gc.collect()

    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    os.environ['TF_DETERMINISTIC_OPS'] = '1'

    strategy = setup_tpu(tpu_id)

    ## Configuration
    path = f'{path}/{time.strftime("%Y%m%d_%H%M%S")}_{tpu_id}'
    gcs_path = f'gs://{gcs}/{path}'
    checkpoint_path = f"{gcs_path}/best_model.tf"
    print('gcs_path:', gcs_path)
    params['gcs_path'] = gcs_path
    batch_size = batch_size * strategy.num_replicas_in_sync
    print('batch_size:', batch_size)

    if dual: ## HACK: dynamic import :/
        from dual_model import build_model, tf_dataset, np_dataset
    else:
        from single_model import build_model, tf_dataset, np_dataset

    if dataset.startswith('gs://'):
        datasets = tf_dataset(dataset, batch_size, max_len, seed)
    else:
        datasets = np_dataset(dataset, batch_size, seed)

    ## Load and Train
    with strategy.scope():
        model = build_model(**kw_params)
        model = compile_model(model, **kw_params)
    model, preds, sub_y = train_model(model, strategy, checkpoint_path, datasets, **kw_params)

    ## Save results
    plot_history(model.history, path, gcs)
    history = pd.DataFrame(model.history.history)
    print(history)
    history.to_csv(f'{gcs_path}/history.csv', index=False)

    ## Load Dataset
    comp_ds = '../input/jigsaw-multilingual-toxic-comment-classification'
    valid = pd.read_csv(f'{comp_ds}/validation.csv')
#     test = pd.read_csv(f'{comp_ds}/test.csv')
    sub = pd.read_csv(f'{comp_ds}/sample_submission.csv')

    valid['pred'] = preds
    valid.to_csv(f'{gcs_path}/valid_oof.csv', index=False)

    valid.groupby('toxic').pred.hist(bins=100, log=True, alpha=0.5)
    plt.legend([0, 1])
    save_fig('valid_hist.png', path, gcs)

    valid[valid.toxic == 1].groupby('lang').pred.hist(bins=50, log=True, alpha=0.34)
    plt.legend(valid.lang.unique())
    save_fig('valid_toxic_hist.png', path, gcs)

    valid_auc = roc_auc_score(valid.toxic, valid.pred)
    print('AUC:', valid_auc,
          'toxic:', valid.toxic.mean(),
          'pred:', valid.pred.mean(),
          'ratio:', (valid.pred > 0.5).mean())

    ## Submission
    sub['toxic'] = sub_y
    sub.to_csv(f'{gcs_path}/submission.csv', index=False)

    sub.toxic.hist(bins=100, log=True)
    save_fig('sub_hist.png', path, gcs)
    print('mean:', sub.toxic.mean(), 'ratio:', (sub.toxic > 0.5).mean())

    ## Save params
    params['auc'] = valid_auc
    params.to_csv(f'{gcs_path}/params{valid_auc:04f}.csv', index=False)
    print(params.T)

    return valid_auc

Overwriting train.py


In [7]:
!pylint3 train.py --ignored-modules=tensorflow.keras

No config file found, using default configuration
************* Module train
C: 26, 0: Argument name "lr" doesn't conform to snake_case naming style (invalid-name)
R: 26, 0: Too many arguments (9/5) (too-many-arguments)
C: 60, 0: Argument name "lr" doesn't conform to snake_case naming style (invalid-name)
R: 60, 0: Too many arguments (14/5) (too-many-arguments)
R: 60, 0: Too many local variables (19/15) (too-many-locals)
R:146, 0: Too many arguments (8/5) (too-many-arguments)
R:146, 0: Too many local variables (26/15) (too-many-locals)
R:146, 0: Too many statements (61/50) (too-many-statements)

------------------------------------------------------------------
Your code has been rated at 9.35/10 (previous run: 9.35/10, +0.00)



In [8]:
import numpy as np
from train import train

best = (0, None)

for i in range(1):
    params = dict(
        optimizer=np.random.choice(['LAMB', 'AdamW']),
        lr=10**np.random.uniform(low=-5.5, high=-4),
        weight_decay=10**np.random.uniform(low=-6.5, high=-4.5),
        loss_fn='focal',
        label_smoothing=np.random.uniform(low=0.01, high=0.04),
        pos_weight=np.random.uniform(low=1.5, high=5),
        gamma=np.random.uniform(low=1.0, high=2.5),
        warm_up=np.random.uniform(low=1, high=5),
        epochs=1, #np.random.randint(low=28, high=38),
        steps_per_epoch=5,
        batch_size=8,
#         dataset='gs://hm-eu-w4/jigsaw/translated-distilled-ds4/',
        dataset='../input/jigsaw-translated-distilled-ds5/jigsaw20_ds985588s5_ml_bert-base-multilingual-*.npz',
        dual=True,
        path=f'jigsaw/test',
        tpu_id='t8a',
        seed=np.random.randint(999),
    )

    auc = train(**params)
    if auc > best[0]:
        best = (auc, i, params)
        print('Best params:', best)

print('### Grid Search Done:')
print(best)

                                                                 0
dual                                                          True
tpu_id                                                         t8a
batch_size                                                       8
max_len                                                        192
seed                                                            78
path                                                   jigsaw/test
gcs                                                       hm-eu-w4
dataset          ../input/jigsaw-translated-distilled-ds5/jigsa...
optimizer                                                    AdamW
lr                                                     7.77939e-05
weight_decay                                           4.46321e-07
loss_fn                                                      focal
label_smoothing                                          0.0215838
pos_weight                                                 4.3

In [10]:
# import pdb; pdb.pm()

In [None]:
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
img = mpimg.imread('valid_hist.png')
_ = plt.imshow(img)
plt.show()
img = mpimg.imread('valid_toxic_hist.png')
_ = plt.imshow(img)