In [1]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
from kaggle_datasets import KaggleDatasets
import transformers
from transformers import TFAutoModel, AutoTokenizer
from tqdm.notebook import tqdm
from tokenizers import Tokenizer, models, pre_tokenizers, decoders, processors

In [2]:
def fast_encode(texts, tokenizer, chunk_size=256, maxlen=512):
    """
    https://www.kaggle.com/xhlulu/jigsaw-tpu-distilbert-with-huggingface-and-keras
    """
    tokenizer.enable_truncation(max_length=maxlen)
    tokenizer.enable_padding(max_length=maxlen)
    all_ids = []
    
    for i in tqdm(range(0, len(texts), chunk_size)):
        text_chunk = texts[i:i+chunk_size].tolist()
        encs = tokenizer.encode_batch(text_chunk)
        all_ids.extend([enc.ids for enc in encs])
    
    return np.array(all_ids)

In [3]:
def regular_encode(texts, tokenizer, maxlen=512):
    enc_di = tokenizer.batch_encode_plus(
        texts, 
        return_attention_masks=False, 
        return_token_type_ids=False,
        pad_to_max_length=True,
        max_length=maxlen
    )
    
    return np.array(enc_di['input_ids'])

In [4]:
def build_model(transformer, max_len=512):
    """
    https://www.kaggle.com/xhlulu/jigsaw-tpu-distilbert-with-huggingface-and-keras
    """
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    sequence_output = transformer(input_word_ids)[0]
    cls_token = sequence_output[:, 0, :]
    out = Dense(1, activation='sigmoid')(cls_token)
    
    model = Model(inputs=input_word_ids, outputs=out)
    model.compile(Adam(lr=1e-5), loss='binary_crossentropy', metrics=['accuracy'])
    
    return model

In [5]:
# Detect hardware, return appropriate distribution strategy
try:
    # TPU detection. No parameters necessary if TPU_NAME environment variable is
    # set: this is always the case on Kaggle.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None

if tpu:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()

print("REPLICAS: ", strategy.num_replicas_in_sync)

Running on TPU  grpc://10.0.0.2:8470
REPLICAS:  8


In [6]:
AUTO = tf.data.experimental.AUTOTUNE

# Data access
GCS_DS_PATH = KaggleDatasets().get_gcs_path('jigsaw-multilingual-toxic-comment-classification')

# Configuration
EPOCHS = 2
BATCH_SIZE = 16 * strategy.num_replicas_in_sync
MAX_LEN = 192
MODEL = 'jplu/tf-xlm-roberta-large'

In [7]:
# First load the real tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=738.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=5069051.0, style=ProgressStyle(descript…




In [8]:
os.listdir('../input/jigsaw-train-multilingual-coments-google-api')
l=['fr','tr','it','pt','es']
train_multi = pd.read_csv('../input/jigsaw-train-multilingual-coments-google-api/jigsaw-toxic-comment-train-google-ru-cleaned.csv')
for i in l : 
    a = pd.read_csv(f'../input/jigsaw-train-multilingual-coments-google-api/jigsaw-toxic-comment-train-google-{i}-cleaned.csv')
    train_multi =train_multi.append(a,ignore_index=True)

In [9]:
train_multi['comment_text']=train_multi['comment_text'].astype(str)

In [10]:
df_valid = pd.read_csv('../input/jigsaw-multilingual-toxic-test-translated/jigsaw_miltilingual_valid_translated.csv')
df_valid_tran = df_valid.copy() 
df_valid_tran['comment_text']=df_valid_tran['translated']
df_valid_tran = df_valid_tran[['comment_text','toxic']]

df_valid = df_valid[['comment_text','toxic']]
train1 = pd.read_csv("/kaggle/input/jigsaw-multilingual-toxic-comment-classification/jigsaw-toxic-comment-train.csv")
train2 = pd.read_csv("/kaggle/input/jigsaw-multilingual-toxic-comment-classification/jigsaw-unintended-bias-train.csv")
train2.toxic = train2.toxic.round().astype(int)
train_multi.toxic = train_multi.toxic.round().astype(int)

df_train= pd.concat([
    train1[['comment_text', 'toxic']],
    train2[['comment_text', 'toxic']].query('toxic==1'),
    train2[['comment_text', 'toxic']].query('toxic==0').sample(n=160000, random_state=0),
    train_multi[['comment_text', 'toxic']].query('toxic==1'),
    train_multi[['comment_text', 'toxic']].query('toxic==0').sample(n=160000, random_state=0)  
    
])
df_train = df_train.sample(frac=1)
del train1, train2 ,train_multi
import gc; gc.collect();

df_train.shape, df_valid.shape

((784023, 2), (8000, 2))

In [11]:
df_test = pd.read_csv('../input/jigsaw-multilingual-toxic-test-translated/jigsaw_miltilingual_test_translated.csv')
df_test['comment_text'] = df_test['translated']
df_test=df_test[['id','comment_text']]
sub = pd.DataFrame(columns=['id','toxic'])
sub['id'] =df_test['id']

In [12]:
df_test_tran = pd.read_csv('../input/jigsaw-multilingual-toxic-test-translated/jigsaw_miltilingual_test_translated.csv')
df_test_tran['comment_text'] = df_test_tran['translated']
df_test_tran=df_test_tran[['id','comment_text']]
sub_tran = pd.DataFrame(columns=['id','toxic'])
sub_tran['id'] =df_test_tran['id']

In [13]:
%%time 

x_train = regular_encode(df_train.comment_text.values, tokenizer, maxlen=MAX_LEN)
x_valid = regular_encode(df_valid.comment_text.values, tokenizer, maxlen=MAX_LEN)
x_test = regular_encode(df_test.comment_text.values, tokenizer, maxlen=MAX_LEN)
x_valid_tran = regular_encode(df_valid_tran.comment_text.values, tokenizer, maxlen=MAX_LEN)
x_test_tran = regular_encode(df_test_tran.comment_text.values, tokenizer, maxlen=MAX_LEN)

y_train = df_train.toxic.values
y_valid = df_valid.toxic.values
y_valid_tran = df_valid_tran.toxic.values

CPU times: user 16min 12s, sys: 3.58 s, total: 16min 16s
Wall time: 16min 15s


In [14]:
train_dataset = (
    tf.data.Dataset
    .from_tensor_slices((x_train, y_train))
    .repeat() 
    .shuffle(2048)
    .batch(BATCH_SIZE)
    .prefetch(AUTO)
)

valid_dataset = (
    tf.data.Dataset
    .from_tensor_slices((x_valid, y_valid))
    .batch(BATCH_SIZE)
    .cache()
    .prefetch(AUTO)
)

valid_dataset_tran = (
    tf.data.Dataset
    .from_tensor_slices((x_valid_tran, y_valid_tran))
    .batch(BATCH_SIZE)
    .cache()
    .prefetch(AUTO)
)


test_dataset = (
    tf.data.Dataset
    .from_tensor_slices(x_test)
    .batch(BATCH_SIZE)
)
test_dataset_tran = (
    tf.data.Dataset
    .from_tensor_slices(x_test_tran)
    .batch(BATCH_SIZE)
)

In [15]:
%%time
with strategy.scope():
    transformer_layer = TFAutoModel.from_pretrained(MODEL)
    model = build_model(transformer_layer, max_len=MAX_LEN)
model.summary()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=3271420488.0, style=ProgressStyle(descr…


Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_word_ids (InputLayer)  [(None, 192)]             0         
_________________________________________________________________
tf_roberta_model (TFRobertaM ((None, 192, 1024), (None 559890432 
_________________________________________________________________
tf_op_layer_strided_slice (T [(None, 1024)]            0         
_________________________________________________________________
dense (Dense)                (None, 1)                 1025      
Total params: 559,891,457
Trainable params: 559,891,457
Non-trainable params: 0
_________________________________________________________________
CPU times: user 1min 51s, sys: 40.7 s, total: 2min 32s
Wall time: 2min 30s


In [16]:
n_steps = x_train.shape[0] // BATCH_SIZE
train_history = model.fit(
    train_dataset,
    steps_per_epoch=n_steps,
    validation_data=valid_dataset,
    epochs=EPOCHS
)

Train for 6125 steps, validate for 63 steps
Epoch 1/2


  num_elements)
  num_elements)


Epoch 2/2


In [17]:
EPOCHS = 4
n_steps = x_valid.shape[0] // BATCH_SIZE
train_history_2 = model.fit(
    valid_dataset.repeat(),
    steps_per_epoch=n_steps,
    epochs=EPOCHS
)

Train for 62 steps
Epoch 1/4


  num_elements)


Epoch 2/4
Epoch 3/4
Epoch 4/4


In [18]:
sub['toxic'] = model.predict(test_dataset, verbose=1)
sub.to_csv('submission_no_translated.csv', index=False)



In [19]:
n_steps = x_valid_tran.shape[0] // BATCH_SIZE
train_history_2 = model.fit(
    valid_dataset_tran.repeat(),
    steps_per_epoch=n_steps,
    epochs=EPOCHS
)

Train for 62 steps
Epoch 1/4


  num_elements)


Epoch 2/4
Epoch 3/4
Epoch 4/4


In [20]:
sub_tran['toxic'] = model.predict(test_dataset_tran, verbose=1)
sub_tran.to_csv('submission_translated.csv', index=False)



In [21]:
# Save the model
!mkdir models
model.save('models/Tensor_flow_model.h5')

NotImplementedError: 