In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv.zip
/kaggle/input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv.zip
/kaggle/input/jigsaw-toxic-comment-classification-challenge/test_labels.csv.zip
/kaggle/input/jigsaw-toxic-comment-classification-challenge/test.csv.zip


In [2]:
!pip install transformers clean-text

Collecting clean-text
  Downloading clean_text-0.3.0-py3-none-any.whl (9.6 kB)
Collecting ftfy<6.0,>=5.8
  Downloading ftfy-5.8.tar.gz (64 kB)
[K     |████████████████████████████████| 64 kB 1.2 MB/s 
Building wheels for collected packages: ftfy
  Building wheel for ftfy (setup.py) ... [?25l- \ | done
[?25h  Created wheel for ftfy: filename=ftfy-5.8-py3-none-any.whl size=45612 sha256=3daf31568c2f44c62ae8a21d2e51d54c18182a8536c24e3011d3a95e32add666
  Stored in directory: /root/.cache/pip/wheels/49/1c/fc/8b19700f939810cd8fd9495ae34934b246279791288eda1c31
Successfully built ftfy
Installing collected packages: ftfy, clean-text
Successfully installed clean-text-0.3.0 ftfy-5.8


In [3]:
train_path = '../input/jigsaw-toxic-comment-classification-challenge/train.csv.zip'

data = pd.read_csv(train_path)

In [4]:
print('Number of Records: {}, Number of features/columns: {}'.format(data.shape[0], data.shape[1]))

Number of Records: 159571, Number of features/columns: 8


In [5]:
print('Null values: {}'.format(data.isnull().values.sum()))

Null values: 0


In [6]:
target_columns = list(data.columns)[2:]
y_labels = data[target_columns].values

In [7]:
from transformers import BertTokenizer, BertConfig, TFBertModel
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tqdm import tqdm
from cleantext import clean

In [8]:
bert = 'bert-base-uncased'

tokenizer = BertTokenizer.from_pretrained(bert, do_lower_case=True, add_special_tokens=True,
                                                max_length=128, pad_to_max_length=True)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [9]:
def cleaning(text):
    return clean(text, no_line_breaks=True, no_urls=True, no_punct=True)

def tokenize(sentences, tokenizer):
    
    input_ids = []
    input_masks = []
    #input_segments = []
    
    for sentence in tqdm(sentences):
        inputs = tokenizer.encode_plus(sentence, add_special_tokens=True, 
                                       max_length=128, pad_to_max_length=True, 
                                       return_attention_mask=True, return_token_type_ids=True)
        
        input_ids.append(inputs['input_ids'])
        input_masks.append(inputs['attention_mask'])
        #input_segments.append(inputs['token_type_ids'])        
        
    return np.asarray(input_ids, dtype='int32'),np.asarray(input_masks, dtype='int32')

In [10]:
data['comment_text'] = data['comment_text'].apply(cleaning)
input_ids, input_masks = tokenize(data['comment_text'], tokenizer)

  0%|          | 0/159571 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████| 159571/159571 [08:17<00:00, 320.63it/s]


In [11]:
config = BertConfig(dropout=0.2, attention_dropout=0.2)

config.output_hidden_states = False

transformer_model = TFBertModel.from_pretrained(bert, config=config)

input_ids_in = tf.keras.layers.Input(shape=(128,), name='input_token', dtype='int32')
input_masks_in = tf.keras.layers.Input(shape=(128,), name='masked_token', dtype='int32')

embedding_layer = transformer_model(input_ids_in, attention_mask=input_masks_in)[0]
X = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(50, 
                                                       return_sequences=True, 
                                                       dropout=0.1, 
                                                       recurrent_dropout=0.1))(embedding_layer)
X = tf.keras.layers.GlobalMaxPool1D()(X)
X = tf.keras.layers.Dense(50, activation='relu')(X)
X = tf.keras.layers.Dropout(0.2)(X)
X = tf.keras.layers.Dense(6, activation='sigmoid')(X)

model = tf.keras.models.Model(inputs=[input_ids_in, input_masks_in], outputs=X)

for layer in model.layers[:3]:
    layer.trainable = False

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=536063208.0, style=ProgressStyle(descri…




Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [12]:
model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_token (InputLayer)        [(None, 128)]        0                                            
__________________________________________________________________________________________________
masked_token (InputLayer)       [(None, 128)]        0                                            
__________________________________________________________________________________________________
tf_bert_model (TFBertModel)     ((None, 128, 768), ( 109482240   input_token[0][0]                
                                                                 masked_token[0][0]               
__________________________________________________________________________________________________
bidirectional (Bidirectional)   (None, 128, 100)     327600      tf_bert_model[0][0]   

In [13]:
# model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [14]:
X_train_id, X_test_id, X_train_mask, X_test_mask, y_train, y_test = train_test_split(input_ids, 
                                                                                     input_masks, 
                                                                                     y_labels,
                                                                                     test_size=0.2, 
                                                                                     random_state=42)

In [15]:
hist = model.fit([X_train_id, X_train_mask], 
                 y_train, 
                 validation_data=([X_test_id, X_test_mask], y_test),
                 epochs=2,
                 batch_size=64)

Epoch 1/2
Epoch 2/2


In [16]:
model.save_weights('bert_2_epochs_BCE.h5')

In [17]:
sample_text = 'I hate you, you idiot!'
clean_txt = cleaning(sample_text)
input_ids_test, input_masks_test = tokenize(clean_txt, tokenizer)

100%|██████████| 20/20 [00:00<00:00, 4199.34it/s]


In [18]:
preds = model.predict([input_ids_test, input_masks_test])[0]
prediction = target_columns[np.argmax(preds, axis=0)]
print(prediction)

toxic


In [19]:
sample_submission_path = '../input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv.zip'
sample_submission = pd.read_csv(sample_submission_path)
sample_submission.head()


test_path = '../input/jigsaw-toxic-comment-classification-challenge/test.csv.zip'
df_test = pd.read_csv(test_path)

df_test.head()


ids = df_test['id']
X_t = df_test['comment_text'].apply(cleaning)
sub_input_ids, sub_input_masks = tokenize(X_t, tokenizer)


predictions = model.predict([sub_input_ids, sub_input_masks])

ids = pd.Series(ids)
y_preds = pd.DataFrame(predictions, columns=target_columns)

final_submission = pd.concat([ids, y_preds], axis=1)

final_submission.head()

final_submission.to_csv('submission_2_epochs.csv', index=False)

100%|██████████| 153164/153164 [07:12<00:00, 353.83it/s]
