In [1]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/translated-train-bias-all-langs/All languages/train-bias-toxic-google-api-ru-cleaned.csv
/kaggle/input/translated-train-bias-all-langs/All languages/train-bias-toxic-google-api-pt-cleaned.csv
/kaggle/input/translated-train-bias-all-langs/All languages/train-bias-toxic-google-api-tr-cleaned.csv
/kaggle/input/translated-train-bias-all-langs/All languages/train-bias-toxic-google-api-fr-cleaned.csv
/kaggle/input/translated-train-bias-all-langs/All languages/train-bias-toxic-google-api-it-cleaned.csv
/kaggle/input/translated-train-bias-all-langs/All languages/train-bias-toxic-google-api-es-cleaned.csv
/kaggle/input/epoch2-turkish/__results__.html
/kaggle/input/epoch2-turkish/model_val_untrained.h5
/kaggle/input/epoch2-turkish/__output__.json
/kaggle/input/epoch2-turkish/custom.css
/kaggle/input/epoch2-turkish/__notebook__.ipynb
/kaggle/input/epoch2-turkish/model.h5
/kaggle/input/epoch2-turkish/submission.csv
/kaggle/input/epoch2-turkish/submission_all.csv
/kaggle/input/sub9481

In [2]:
MAX_LEN = 192 
DROPOUT = 0.5 # using aggressive dropout to avoid overfitting
BATCH_SIZE = 16 # per TPU core
TOTAL_STEPS_STAGE1 = 2000
VALIDATE_EVERY_STAGE1 = 200
TOTAL_STEPS_STAGE2 = 200
VALIDATE_EVERY_STAGE2 = 10

### Different learning rate for transformer and head ###
LR_TRANSFORMER = 5e-6
LR_HEAD = 1e-3

PRETRAINED_TOKENIZER=  'jplu/tf-xlm-roberta-large'
PRETRAINED_MODEL = '/kaggle/input/jigsaw-mlm-finetuned-xlm-r-large'
D = '/kaggle/input/jigsaw-multilingual-toxic-comment-classification/'
D_TRANS = '/kaggle/input/translated-train-bias-all-langs/All languages/train-bias-toxic-google-api-'


import os
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.model_selection import train_test_split
import tensorflow as tf
print(tf.__version__)
from tensorflow.keras.layers import Dense, Input, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
import transformers
from transformers import *
import logging
# no extensive logging 
logging.getLogger().setLevel(logging.NOTSET)

AUTO = tf.data.experimental.AUTOTUNE

2.2.0




In [3]:
def connect_to_TPU():
    """Detect hardware, return appropriate distribution strategy"""
    try:
        tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
        print('Running on TPU ', tpu.master())
    except ValueError:
        tpu = None

    if tpu:
        tf.config.experimental_connect_to_cluster(tpu)
        tf.tpu.experimental.initialize_tpu_system(tpu)
        strategy = tf.distribute.experimental.TPUStrategy(tpu)
    else:
        strategy = tf.distribute.get_strategy()

    global_batch_size = BATCH_SIZE * strategy.num_replicas_in_sync

    return tpu, strategy, global_batch_size


tpu, strategy, global_batch_size = connect_to_TPU()
print("REPLICAS: ", strategy.num_replicas_in_sync)

Running on TPU  grpc://10.0.0.2:8470
REPLICAS:  8


In [4]:
train = pd.read_csv(D+'test.csv',usecols=['content']).rename(columns={'content':'comment_text'})
val_df = pd.read_csv(D+'validation.csv')
train = pd.concat([train,val_df],ignore_index=True)
test_df = pd.read_csv('/kaggle/input/jigsaw-data/test_all.csv')
sub_df = pd.read_csv(D+'sample_submission.csv')

In [5]:
%%time
def regular_encode(texts, tokenizer, maxlen=512):
    enc_di = tokenizer.batch_encode_plus(
        texts, 
        return_attention_masks=False, 
        return_token_type_ids=False,
        pad_to_max_length=True,
        max_length=maxlen
    )
    
    return np.array(enc_di['input_ids'])
    

tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_TOKENIZER)
X_train = regular_encode(train.comment_text.values, tokenizer, maxlen=MAX_LEN)
X_val = regular_encode(val_df.comment_text.values, tokenizer, maxlen=MAX_LEN)
X_test = regular_encode(test_df.content.values, tokenizer, maxlen=MAX_LEN)

X_test_google = regular_encode(test_df.google.values, tokenizer, maxlen=MAX_LEN)
X_test_yandex = regular_encode(test_df.yandex.values, tokenizer, maxlen=MAX_LEN)
X_test_en_blob = regular_encode(test_df.en_blob.values, tokenizer, maxlen=MAX_LEN)

y_train = np.concatenate((pd.read_csv('/kaggle/input/sub9481/submission.csv')['toxic'].values,
                    pd.read_csv('/kaggle/input/val-correct/valid-pseudo.csv')['pseudo'].values)).reshape(-1,1)
y_val = val_df.toxic.values.reshape(-1,1)

CPU times: user 3min 10s, sys: 1.03 s, total: 3min 11s
Wall time: 3min 14s


In [6]:
def create_dist_dataset(X, y=None, training=False):
    dataset = tf.data.Dataset.from_tensor_slices(X)

    ### Add y if present ###
    if y is not None:
        dataset_y = tf.data.Dataset.from_tensor_slices(y)
        dataset = tf.data.Dataset.zip((dataset, dataset_y))
        
    ### Repeat if training ###
    if training:
        dataset = dataset.shuffle(len(X)).repeat()

    dataset = dataset.batch(global_batch_size).prefetch(AUTO)

    ### make it distributed  ###
    dist_dataset = strategy.experimental_distribute_dataset(dataset)

    return dist_dataset
    
    
train_dist_dataset = create_dist_dataset(X_train, y_train, True)
val_dist_dataset   = create_dist_dataset(X_val)
test_dist_dataset  = create_dist_dataset(X_test)

test_dist_dataset_google  = create_dist_dataset(X_test_google)
test_dist_dataset_yandex  = create_dist_dataset(X_test_yandex)
test_dist_dataset_en_blob  = create_dist_dataset(X_test_en_blob)

In [7]:
%%time

def create_model_and_optimizer():
    with strategy.scope():
        transformer_layer = TFRobertaModel.from_pretrained(PRETRAINED_MODEL)                
        model = build_model(transformer_layer)
        optimizer_transformer = Adam(learning_rate=LR_TRANSFORMER)
        optimizer_head = Adam(learning_rate=LR_HEAD)
        model.load_weights('/kaggle/input/mlm947/model_val_untrained.h5')
    return model, optimizer_transformer, optimizer_head


def build_model(transformer):
    inp = Input(shape=(MAX_LEN,), dtype=tf.int32, name="input_word_ids")
    # Huggingface transformers have multiple outputs, embeddings are the first one
    # let's slice out the first position, the paper says its not worse than pooling
    x = transformer(inp)[0][:, 0, :]  
    x = Dropout(DROPOUT)(x)
    ### note, adding the name to later identify these weights for different LR
    out = Dense(1, activation='sigmoid', name='custom_head')(x)
    model = Model(inputs=[inp], outputs=[out])
    
    return model


model, optimizer_transformer, optimizer_head = create_model_and_optimizer()
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_word_ids (InputLayer)  [(None, 192)]             0         
_________________________________________________________________
tf_roberta_model (TFRobertaM ((None, 192, 1024), (None 559890432 
_________________________________________________________________
tf_op_layer_strided_slice (T [(None, 1024)]            0         
_________________________________________________________________
dropout_74 (Dropout)         (None, 1024)              0         
_________________________________________________________________
custom_head (Dense)          (None, 1)                 1025      
Total params: 559,891,457
Trainable params: 559,891,457
Non-trainable params: 0
_________________________________________________________________
CPU times: user 48.6 s, sys: 54.3 s, total: 1min 42s
Wall time: 1min 28s


In [8]:
def define_losses_and_metrics():
    with strategy.scope():
        loss_object = tf.keras.losses.BinaryCrossentropy(
            reduction=tf.keras.losses.Reduction.NONE, from_logits=False)

        def compute_loss(labels, predictions):
            per_example_loss = loss_object(labels, predictions)
            loss = tf.nn.compute_average_loss(
                per_example_loss, global_batch_size = global_batch_size)
            return loss

        train_accuracy_metric = tf.keras.metrics.AUC(name='training_AUC')

    return compute_loss, train_accuracy_metric


def train(train_dist_dataset, val_dist_dataset=None, y_val=None,
          total_steps=2000, validate_every=200):
    best_weights, history = None, []
    step = 0
    ### Training lopp ###
    for tensor in train_dist_dataset:
        distributed_train_step(tensor) 
        step+=1

        if (step % validate_every == 0):   
            ### Print train metrics ###  
            train_metric = train_accuracy_metric.result().numpy()
            print("Step %d, train AUC: %.5f" % (step, train_metric))   
            
            ### Test loop with exact AUC ###
            if val_dist_dataset:
                val_metric = roc_auc_score(y_val, predict(val_dist_dataset))
                print("Step %d,   val AUC: %.5f" %  (step,val_metric))   
                
                # save weights if it is the best yet
                history.append(val_metric)
                if history[-1] == max(history):
                    best_weights = model.get_weights()

            ### Reset (train) metrics ###
            train_accuracy_metric.reset_states()
            
        if step  == total_steps:
            break
    
    ### Restore best weighths ###
    model.set_weights(best_weights)



@tf.function
def distributed_train_step(data):
    strategy.experimental_run_v2(train_step, args=(data,))

def train_step(inputs):
    features, labels = inputs
    
    ### get transformer and head separate vars
    # get rid of pooler head with None gradients
    transformer_trainable_variables = [ v for v in model.trainable_variables 
                                       if (('pooler' not in v.name)  and 
                                           ('custom' not in v.name))]
    head_trainable_variables = [ v for v in model.trainable_variables 
                                if 'custom'  in v.name]

    # calculate the 2 gradients ( note persistent, and del)
    with tf.GradientTape(persistent=True) as tape:
        predictions = model(features, training=True)
        loss = compute_loss(labels, predictions)
    gradients_transformer = tape.gradient(loss, transformer_trainable_variables)
    gradients_head = tape.gradient(loss, head_trainable_variables)
    del tape
        
    ### make the 2 gradients steps
    optimizer_transformer.apply_gradients(zip(gradients_transformer, 
                                              transformer_trainable_variables))
    optimizer_head.apply_gradients(zip(gradients_head, 
                                       head_trainable_variables))

    train_accuracy_metric.update_state(labels, predictions)



def predict(dataset):  
    predictions = []
    for tensor in dataset:
        predictions.append(distributed_prediction_step(tensor))
    ### stack replicas and batches
    predictions = np.vstack(list(map(np.vstack,predictions)))
    return predictions

@tf.function
def distributed_prediction_step(data):
    predictions = strategy.experimental_run_v2(prediction_step, args=(data,))
    return strategy.experimental_local_results(predictions)

def prediction_step(inputs):
    features = inputs  # note datasets used in prediction do not have labels
    predictions = model(features, training=False)
    return predictions


compute_loss, train_accuracy_metric = define_losses_and_metrics()

In [9]:
%%time
train(train_dist_dataset, val_dist_dataset, y_val,
      TOTAL_STEPS_STAGE1, VALIDATE_EVERY_STAGE1)

  num_elements)


Step 200, train AUC: 0.00000
Step 200,   val AUC: 0.96127
Step 400, train AUC: 0.00000
Step 400,   val AUC: 0.96688
Step 600, train AUC: 0.00000
Step 600,   val AUC: 0.97168
Step 800, train AUC: 0.00000
Step 800,   val AUC: 0.97344
Step 1000, train AUC: 0.00000
Step 1000,   val AUC: 0.97646
Step 1200, train AUC: 0.00000
Step 1200,   val AUC: 0.97941
Step 1400, train AUC: 0.00000
Step 1400,   val AUC: 0.98068
Step 1600, train AUC: 0.00000
Step 1600,   val AUC: 0.98218
Step 1800, train AUC: 0.00000
Step 1800,   val AUC: 0.98340
Step 2000, train AUC: 0.00000
Step 2000,   val AUC: 0.98481
CPU times: user 3min 33s, sys: 1min 6s, total: 4min 39s
Wall time: 18min 7s


In [10]:
model.save_weights('model_val_pseudo.h5')

In [11]:
val_df['pseudo'] = predict(val_dist_dataset)[:,0]

In [12]:
val_df.to_csv('val_pseudo.csv',index=False)

In [13]:
%%time
sub_fin = sub_df.copy()
sub_df['toxic_o'] = predict(test_dist_dataset)[:,0]
sub_df['toxic_google'] = predict(test_dist_dataset_google)[:,0]
sub_df['toxic_yandex'] = predict(test_dist_dataset_yandex)[:,0]
sub_df['toxic_en_blob'] = predict(test_dist_dataset_en_blob)[:,0]

CPU times: user 1min 8s, sys: 17.4 s, total: 1min 26s
Wall time: 4min 43s


In [14]:
pass1 = test_df[test_df.en_blob.str.find("pass") == 0]
for i in pass1.id:
    if i in sub_df.id:
        sub_df.toxic_en_blob.loc[i] = (sub_df.toxic_google.loc[i] + sub_df.toxic_yandex.loc[i] + sub_df.toxic_o.loc[i]) / 3

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [15]:
sub_fin['toxic'] = (0.6*sub_df['toxic_o']) + (0.1*sub_df['toxic_google']) + (0.15*sub_df['toxic_yandex']) + (0.15*sub_df['toxic_en_blob'])

In [17]:
sub_tarek = pd.read_csv('/kaggle/input/howling-with-wolf-on-l-genpresse/submission.csv')
sub_shonenkov = pd.read_csv('/kaggle/input/tpu-inference-super-fast-xlmroberta/submission.csv')
sub = pd.read_csv('/kaggle/input/jigsaw-multilingual-toxic-comment-classification/sample_submission.csv')

In [19]:
sub['toxic'] = (0.4*sub_tarek['toxic']) + (0.2*sub_shonenkov['toxic']) + (0.4*sub_fin['toxic'])

In [21]:
sub.to_csv('submission.csv', index=False)