In [1]:
import os
import warnings

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
from kaggle_datasets import KaggleDatasets
import transformers
import traitlets
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
from tokenizers import BertWordPieceTokenizer
from sklearn.metrics import roc_auc_score
#from transformers import GPT2Tokenizer, TFGPT2Model


warnings.simplefilter("ignore")

## Helper Functions

In [2]:
def fast_encode(texts, tokenizer, chunk_size=256, maxlen=512):
    tokenizer.enable_truncation(max_length=maxlen)
    tokenizer.enable_padding(max_length=maxlen)
    all_ids = []
    
    for i in tqdm(range(0, len(texts), chunk_size)):
        text_chunk = texts[i:i+chunk_size].tolist()
        encs = tokenizer.encode_batch(text_chunk)
        all_ids.extend([enc.ids for enc in encs])
    
    return np.array(all_ids)

In [3]:
def regular_encode(texts, tokenizer, maxlen=512):
    enc_di = tokenizer.batch_encode_plus(
        texts, 
        return_attention_masks=False, 
        return_token_type_ids=False,
        pad_to_max_length=True,
        max_length=maxlen
    )
    
    return np.array(enc_di['input_ids'])

In [4]:
def regular_encode1(texts, tokenizer, maxlen=512):
    #enc_di = tokenizer.batch_encode_plus(
    #    texts, 
    #    return_attention_masks=False, 
    #    return_token_type_ids=False,
    #    pad_to_max_length=True,
    #    add_special_tokens=True,
    #    max_length=maxlen
    #)
    all_ids = []
    
    for text in texts:
        enc = tokenizer.encode_plus(text,pad_to_max_length=True,add_special_tokens=True,max_length=512)
        all_ids.append(enc['input_ids'])
    
    return np.array(all_ids)

In [5]:
#tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]
#tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)

In [6]:
def build_model(transformer,max_len=512):
    input_word_ids = Input(shape=(max_len, ),dtype=tf.int32, name="input_word_ids")
    sequence_output = transformer(input_word_ids)[0]
    cls_token = sequence_output[:, 0, :]
    out = Dense(1, activation='sigmoid')(cls_token)
    
    model = Model(inputs=input_word_ids, outputs=out)
    model.compile(Adam(lr=1e-5), loss='binary_crossentropy', metrics=['accuracy'])
    
    return model

Cosine similarity calculates similarity by measuring the cosine of angle between two vectors. This is calculated as:
![](https://miro.medium.com/max/426/1*hub04IikybZIBkSEcEOtGA.png)

Cosine Similarity calculation for two vectors A and B [source]
With cosine similarity, we need to convert sentences into vectors. One way to do that is to use bag of words with either TF (term frequency) or TF-IDF (term frequency- inverse document frequency). The choice of TF or TF-IDF depends on application and is immaterial to how cosine similarity is actually performed — which just needs vectors. TF is good for text similarity in general, but TF-IDF is good for search query relevance.

## TPU Configs

In [7]:
# Detect hardware, return appropriate distribution strategy
try:
    # TPU detection. No parameters necessary if TPU_NAME environment variable is
    # set: this is always the case on Kaggle.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None

if tpu:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()

print("REPLICAS: ", strategy.num_replicas_in_sync)

Running on TPU  grpc://10.0.0.2:8470
REPLICAS:  8


In [8]:
#tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
#model = TFGPT2Model.from_pretrained('gpt2')

In [9]:
AUTO = tf.data.experimental.AUTOTUNE


# Configuration
EPOCHS = 3
BATCH_SIZE = 16 * strategy.num_replicas_in_sync
MAX_LEN = 512

## Create fast tokenizer

In [10]:
#albert_path = '../input/albertlargev2huggingface/'
#tokenizer = AlbertTokenizer.from_pretrained(albert_path, do_lower_case=True)
#albert_model = TFAlbertModel.from_pretrained(albert_path)



In [11]:
#tokenizer = transformers.GPT2Tokenizer.from_pretrained('gpt2-medium',pad_token="<PAD>",add_special_tokens=True)
tokenizer = transformers.AlbertTokenizer.from_pretrained('albert-base-v2')

# Save the loaded tokenizer locally
save_path = '/kaggle/working/distilbert_base_uncased/'
if not os.path.exists(save_path):
    os.makedirs(save_path)
tokenizer.save_pretrained(save_path)

#special_tokens_dict = {'pad_token': '<PAD>'}
#num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
#fast_tokenizer = transformers.GPT2Tokenizer('distilbert_base_uncased/vocab.json','distilbert_base_uncased/merges.txt',pad_token="<PAD>",add_special_tokens=True)
#fast_tokenizer_gpt = tokenizer('distilbert_base_uncased/vocab.json','distilbert_base_uncased/merges.txt')
#fast_tokenizer_gpt

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=760289.0, style=ProgressStyle(descripti…




('/kaggle/working/distilbert_base_uncased/spiece.model',
 '/kaggle/working/distilbert_base_uncased/special_tokens_map.json',
 '/kaggle/working/distilbert_base_uncased/added_tokens.json')

## Load text data into memory

In [12]:
train1 = pd.read_csv("/kaggle/input/jigsaw-multilingual-toxic-comment-classification/jigsaw-toxic-comment-train.csv")
#train2 = pd.read_csv("/kaggle/input/jigsaw-multilingual-toxic-comment-classification/jigsaw-unintended-bias-train.csv")

#valid = pd.read_csv('../input/jigsaw-multilingual-toxic-comment-classification/validation.csv')
valid = pd.read_csv('/kaggle/input/val-en-df/validation_en.csv')

#test = pd.read_csv("../input/jigsaw-multilingual-toxic-comment-classification/test.csv")
test1 = pd.read_csv('/kaggle/input/test-en-df/test_en.csv')
test2 = pd.read_csv('/kaggle/input/jigsaw-multilingual-toxic-test-translated/jigsaw_miltilingual_test_translated.csv')
sub = pd.read_csv('/kaggle/input/jigsaw-multilingual-toxic-comment-classification/sample_submission.csv')

## Fast encode

In [13]:
x_train = regular_encode(train1.comment_text.astype(str), tokenizer, maxlen=MAX_LEN)
x_valid = regular_encode(valid.comment_text_en.astype(str), tokenizer, maxlen=MAX_LEN)
x_test1 = regular_encode(test1.content_en.astype(str), tokenizer, maxlen=MAX_LEN)
x_test2 = regular_encode(test2.translated.astype(str), tokenizer, maxlen=MAX_LEN)

y_train = train1.toxic.values
y_valid = valid.toxic.values

In [14]:
train_dataset = (
    tf.data.Dataset
    .from_tensor_slices((x_train, y_train))
    .repeat()
    .shuffle(2048)
    .batch(64)
    .prefetch(AUTO)
)

valid_dataset = (
    tf.data.Dataset
    .from_tensor_slices((x_valid, y_valid))
    .batch(64)
    .cache()
    .prefetch(AUTO)
)

test_dataset = [(
    tf.data.Dataset
    .from_tensor_slices(x_test1)
    .batch(64)
),
    (
    tf.data.Dataset
    .from_tensor_slices(x_test2)
    .batch(64)
)]

In [15]:
%%time
with strategy.scope():
    transformer_layer = transformers.TFAlbertModel.from_pretrained('albert-base-v2')
    model = build_model(transformer_layer, max_len=512)
model.summary()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=684.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=63048440.0, style=ProgressStyle(descrip…


Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_word_ids (InputLayer)  [(None, 512)]             0         
_________________________________________________________________
tf_albert_model (TFAlbertMod ((None, 512, 768), (None, 11683584  
_________________________________________________________________
tf_op_layer_strided_slice (T [(None, 768)]             0         
_________________________________________________________________
dense (Dense)                (None, 1)                 769       
Total params: 11,684,353
Trainable params: 11,684,353
Non-trainable params: 0
_________________________________________________________________
CPU times: user 8.69 s, sys: 1.2 s, total: 9.89 s
Wall time: 12.7 s


In [16]:
n_steps = x_train.shape[0] // BATCH_SIZE
train_history = model.fit(
    train_dataset,
    steps_per_epoch=n_steps,
    validation_data=valid_dataset,
    epochs=3
)

Train for 1746 steps, validate for 125 steps
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [17]:
n_steps = x_valid.shape[0] // BATCH_SIZE
train_history_2 = model.fit(
    valid_dataset.repeat(),
    steps_per_epoch=n_steps,
    epochs=3
)

Train for 62 steps
Epoch 1/3
Epoch 2/3
Epoch 3/3


## Submission

In [18]:
test_pred = []
for td in test_dataset:
    test_pred.append(model.predict(td))    

In [19]:
#sub['toxic'] = model.predict(test_dataset, verbose=1)
sub['toxic'] = np.mean(test_pred,axis=0)
sub.to_csv('submission.csv', index=False)