In [1]:
# Load Huggingface transformers
from transformers import TFBertModel,  BertConfig, BertTokenizerFast, TFAutoModel

# Then what you need from tensorflow.keras
from tensorflow.keras.layers import Input, Dropout, Dense, GlobalAveragePooling1D
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras.utils import to_categorical

# And pandas for data import + sklearn because you allways need sklearn
import pandas as pd
import tensorflow as tf
import re
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv.zip
/kaggle/input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv.zip
/kaggle/input/jigsaw-toxic-comment-classification-challenge/test_labels.csv.zip
/kaggle/input/jigsaw-toxic-comment-classification-challenge/test.csv.zip


In [3]:
df=pd.read_csv('/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv.zip')

#df = df.sample(frac=1)
#159571, 8

In [4]:
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [5]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"\'scuse", " excuse ", text)
    text = re.sub('\W', ' ', text)
    text = re.sub('\s+', ' ', text)
    text = text.strip(' ')
    return text

In [6]:
df['comment_text'] = df['comment_text'].map(lambda x : clean_text(x))

In [7]:
train_sentences = df["comment_text"].fillna("CVxTz").values
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
train_y = df[list_classes].values

In [8]:
train_sentences.shape

(159571,)

In [9]:
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,explanation why the edits made under my userna...,0,0,0,0,0,0
1,000103f0d9cfb60f,d aww he matches this background colour i am s...,0,0,0,0,0,0
2,000113f07ec002fd,hey man i am really not trying to edit war it ...,0,0,0,0,0,0
3,0001b41b1c6bb37e,more i cannot make any real suggestions on imp...,0,0,0,0,0,0
4,0001d958c54c6e35,you sir are my hero any chance you remember wh...,0,0,0,0,0,0


In [10]:
# Name of the BERT model to use
model_name = 'bert-base-uncased'

# Max length of tokens
max_length = 128 # max 512

# Load transformers config and set output_hidden_states to False
config = BertConfig.from_pretrained(model_name)
#config.output_hidden_states = False

# Load BERT tokenizer
tokenizer = BertTokenizerFast.from_pretrained(pretrained_model_name_or_path = model_name, config = config)
bert = TFAutoModel.from_pretrained(model_name)

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/536M [00:00<?, ?B/s]

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [11]:
input_ids = Input(shape=(max_length,), name='input_ids', dtype='int32')
attention_mask = Input(shape=(max_length,), name='attention_mask', dtype='int32') 
inputs = {'input_ids': input_ids, 'attention_mask': attention_mask}
x = bert.bert(inputs)

In [12]:
input_ids

<KerasTensor: shape=(None, 128) dtype=int32 (created by layer 'input_ids')>

In [13]:
attention_mask

<KerasTensor: shape=(None, 128) dtype=int32 (created by layer 'attention_mask')>

In [14]:
x

TFBaseModelOutputWithPooling(last_hidden_state=<KerasTensor: shape=(None, 128, 768) dtype=float32 (created by layer 'bert')>, pooler_output=<KerasTensor: shape=(None, 768) dtype=float32 (created by layer 'bert')>, hidden_states=None, attentions=None)

In [15]:
#x2 =Dense(512, activation='relu')(x[1])
x2 = GlobalAveragePooling1D()(x[0])
#x3 = Dropout(0.5)(x2)
y =Dense(len(list_classes), activation='sigmoid', name='outputs')(x2)

model = Model(inputs=inputs, outputs=y)
#model.layers[2].trainable = False

# Take a look at the model
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
attention_mask (InputLayer)     [(None, 128)]        0                                            
__________________________________________________________________________________________________
input_ids (InputLayer)          [(None, 128)]        0                                            
__________________________________________________________________________________________________
bert (TFBertMainLayer)          TFBaseModelOutputWit 109482240   attention_mask[0][0]             
                                                                 input_ids[0][0]                  
__________________________________________________________________________________________________
global_average_pooling1d (Globa (None, 768)          0           bert[0][0]                   

In [16]:
optimizer = Adam(lr=1e-5, decay=1e-6)
model.compile(loss='binary_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])

In [17]:
# Tokenize the input 
x = tokenizer(
    text=list(train_sentences),
    add_special_tokens=True,
    max_length=max_length,
    truncation=True,
    padding='max_length', # padding=True initial value,
    return_tensors='tf',
    return_token_type_ids = False,
    return_attention_mask = True,
    verbose = True)


In [18]:
history = model.fit(
    x={'input_ids': x['input_ids'], 'attention_mask': x['attention_mask']},
    #x={'input_ids': x['input_ids']},
    y={'outputs': train_y},
    validation_split=0.1,
    batch_size=32,
    epochs=1)



In [19]:
test_df=pd.read_csv('/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv.zip')
test_df['comment_text']=test_df['comment_text'].map(lambda x : clean_text(x))
test_sentences = test_df["comment_text"].fillna("CVxTz").values

In [20]:
test_x = tokenizer(
    text=list(test_sentences),
    add_special_tokens=True,
    max_length=max_length,
    truncation=True,
    padding='max_length',  
    return_tensors='tf',
    return_token_type_ids = False,
    return_attention_mask = True,
    verbose = True)

In [21]:
del test_sentences
del x
del df

In [22]:
import gc
gc.collect()

40

In [23]:
test_x

{'input_ids': <tf.Tensor: shape=(159571, 128), dtype=int32, numpy=
array([[  101,  7526,  2339, ...,     0,     0,     0],
       [  101,  1040, 22091, ...,     0,     0,     0],
       [  101,  4931,  2158, ...,     0,     0,     0],
       ...,
       [  101, 13183,  6290, ...,     0,     0,     0],
       [  101,  1998,  2009, ...,     0,     0,     0],
       [  101,  1998,  1045, ...,     0,     0,     0]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(159571, 128), dtype=int32, numpy=
array([[1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]], dtype=int32)>}

In [24]:
predictions=model.predict(x={'input_ids': test_x['input_ids'], 'attention_mask': test_x['attention_mask']},batch_size=32)

In [25]:
predictions

array([[9.9149789e-04, 2.3875693e-05, 1.5620681e-04, 3.3332170e-05,
        1.2906051e-04, 7.3448733e-05],
       [2.6284985e-04, 4.6903431e-05, 1.1291747e-04, 2.0804227e-05,
        6.4100699e-05, 4.9708087e-05],
       [1.5763283e-03, 2.2878798e-05, 1.9805254e-04, 3.8970338e-05,
        2.2685186e-04, 6.6173765e-05],
       ...,
       [1.4240363e-01, 3.6619487e-04, 2.8548745e-02, 5.8443483e-04,
        8.0316411e-03, 5.3925766e-03],
       [6.9780560e-04, 2.4109782e-05, 9.1678623e-05, 3.4081160e-05,
        1.8030388e-04, 6.8993555e-05],
       [8.3428442e-02, 8.1349412e-05, 8.0972124e-04, 5.2279065e-04,
        1.7353789e-03, 4.4069570e-04]], dtype=float32)

In [26]:
submission=pd.DataFrame(predictions,columns=list_classes)
submission['id'] = test_df['id']
submission=submission[['id']+(list_classes)]
submission.to_csv("submission.csv", index=False)

In [27]:
submission

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,0.000991,0.000024,0.000156,0.000033,0.000129,0.000073
1,000103f0d9cfb60f,0.000263,0.000047,0.000113,0.000021,0.000064,0.000050
2,000113f07ec002fd,0.001576,0.000023,0.000198,0.000039,0.000227,0.000066
3,0001b41b1c6bb37e,0.000211,0.000048,0.000193,0.000033,0.000115,0.000066
4,0001d958c54c6e35,0.014668,0.000070,0.000322,0.000546,0.001902,0.000188
...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,0.000683,0.000030,0.000144,0.000026,0.000110,0.000084
159567,ffea4adeee384e90,0.596419,0.000376,0.005690,0.003234,0.038713,0.003663
159568,ffee36eab5c267c9,0.142404,0.000366,0.028549,0.000584,0.008032,0.005393
159569,fff125370e4aaaf3,0.000698,0.000024,0.000092,0.000034,0.000180,0.000069


In [28]:
!ls

__notebook__.ipynb  submission.csv


In [29]:
model.save('Bert_Dcnn_model',save_format='tf')

# Tests


In [30]:
#input_ids = Input(shape=(3,), name='input_ids', dtype='int32')
#attention_mask = Input(shape=(3,), name='attention_mask', dtype='int32') 
#inputs = {'input_ids': input_ids, 'attention_mask': attention_mask}
#x = bert.bert(inputs)
# padding='max_length

In [31]:
raw_text = ["you are a sore loser and you deserve nothing"]
test_token= tokenizer(
    text=list(raw_text),
    add_special_tokens=True,
    max_length=max_length,
    truncation=True,
    padding='max_length', 
    return_tensors='tf',
    return_token_type_ids = False,
    return_attention_mask = True,
    verbose = True)

In [32]:
test_token

{'input_ids': <tf.Tensor: shape=(1, 128), dtype=int32, numpy=
array([[  101,  2017,  2024,  1037, 14699, 10916,  1998,  2017, 10107,
         2498,   102,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0, 

In [33]:
results = model.predict(x={'input_ids': test_token['input_ids'], 'attention_mask': test_token['attention_mask']},batch_size=32)

In [34]:
print(results)

[[0.9910919  0.03494258 0.2981257  0.03010858 0.79706556 0.00459326]]
