In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import sklearn.metrics

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv.zip
/kaggle/input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv.zip
/kaggle/input/jigsaw-toxic-comment-classification-challenge/test_labels.csv.zip
/kaggle/input/jigsaw-toxic-comment-classification-challenge/test.csv.zip
/kaggle/input/transformers/electra_5_epochs.h5
/kaggle/input/transformers/__results__.html
/kaggle/input/transformers/submission_5_epochs.csv
/kaggle/input/transformers/__notebook__.ipynb
/kaggle/input/transformers/__output__.json
/kaggle/input/transformers/custom.css


In [2]:
!pip install transformers clean-text

Collecting clean-text
  Downloading clean_text-0.3.0-py3-none-any.whl (9.6 kB)
Collecting ftfy<6.0,>=5.8
  Downloading ftfy-5.8.tar.gz (64 kB)
[K     |████████████████████████████████| 64 kB 371 kB/s 
Building wheels for collected packages: ftfy
  Building wheel for ftfy (setup.py) ... [?25l- \ done
[?25h  Created wheel for ftfy: filename=ftfy-5.8-py3-none-any.whl size=45612 sha256=8a3a39280dcdaaa8986341089c191f9b3f20bfeca7afaa4d61a25f88d731d82a
  Stored in directory: /root/.cache/pip/wheels/49/1c/fc/8b19700f939810cd8fd9495ae34934b246279791288eda1c31
Successfully built ftfy
Installing collected packages: ftfy, clean-text
Successfully installed clean-text-0.3.0 ftfy-5.8
You should consider upgrading via the '/opt/conda/bin/python3.7 -m pip install --upgrade pip' command.[0m


In [3]:
train_path = '../input/jigsaw-toxic-comment-classification-challenge/train.csv.zip'

data = pd.read_csv(train_path)

In [4]:
print('Number of Records: {}, Number of features/columns: {}'.format(data.shape[0], data.shape[1]))

Number of Records: 159571, Number of features/columns: 8


In [5]:
print('Null values: {}'.format(data.isnull().values.sum()))

Null values: 0


In [6]:
target_columns = list(data.columns)[2:]
y_labels = data[target_columns].values

In [7]:
from transformers import BertTokenizer, ElectraConfig, TFElectraModel
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tqdm import tqdm
from cleantext import clean



In [8]:
electra = 'google/electra-small-discriminator'

tokenizer = BertTokenizer.from_pretrained(electra, do_lower_case=True, add_special_tokens=True,
                                                max_length=128, pad_to_max_length=True)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




In [9]:
def cleaning(text):
    return clean(text, no_line_breaks=True, no_urls=True, no_punct=True)

def tokenize(sentences, tokenizer):
    
    input_ids = []
    input_masks = []
    #input_segments = []
    
    for sentence in tqdm(sentences):
        inputs = tokenizer.encode_plus(sentence, add_special_tokens=True, 
                                       max_length=128, pad_to_max_length=True, 
                                       return_attention_mask=True, return_token_type_ids=True)
        
        input_ids.append(inputs['input_ids'])
        input_masks.append(inputs['attention_mask'])
        #input_segments.append(inputs['token_type_ids'])        
        
    return np.asarray(input_ids, dtype='int32'),np.asarray(input_masks, dtype='int32')

In [10]:
tokenize(['heun nsinm ub'], tokenizer)

100%|██████████| 1/1 [00:00<00:00, 690.65it/s]


(array([[  101,  2002,  4609, 24978,  2378,  2213,  1057,  2497,   102,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,

In [11]:
data['comment_text'] = data['comment_text'].apply(cleaning)
input_ids, input_masks = tokenize(data['comment_text'], tokenizer)

100%|██████████| 159571/159571 [06:24<00:00, 415.41it/s]


In [12]:
config = ElectraConfig(dropout=0.2, attention_dropout=0.2)

config.output_hidden_states = False

transformer_model = TFElectraModel.from_pretrained(electra, config=config)

input_ids_in = tf.keras.layers.Input(shape=(128,), name='input_token', dtype='int32')
input_masks_in = tf.keras.layers.Input(shape=(128,), name='masked_token', dtype='int32')

embedding_layer = transformer_model(input_ids_in, attention_mask=input_masks_in)[0]
X = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(50, 
                                                       return_sequences=True, 
                                                       dropout=0.1, 
                                                       recurrent_dropout=0.1))(embedding_layer)
X = tf.keras.layers.GlobalMaxPool1D()(X)
X = tf.keras.layers.Dense(50, activation='relu')(X)
X = tf.keras.layers.Dropout(0.2)(X)
X = tf.keras.layers.Dense(6, activation='sigmoid')(X)

model = tf.keras.models.Model(inputs=[input_ids_in, input_masks_in], outputs=X)

for layer in model.layers[:3]:
    layer.trainable = False

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=54466044.0, style=ProgressStyle(descrip…




In [13]:
model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_token (InputLayer)        [(None, 128)]        0                                            
__________________________________________________________________________________________________
masked_token (InputLayer)       [(None, 128)]        0                                            
__________________________________________________________________________________________________
tf_electra_model (TFElectraMode ((None, 128, 256),)  13483008    input_token[0][0]                
                                                                 masked_token[0][0]               
__________________________________________________________________________________________________
bidirectional (Bidirectional)   (None, 128, 100)     122800      tf_electra_model[0][0]

In [14]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [15]:
X_train_id, X_test_id, X_train_mask, X_test_mask, y_train, y_test = train_test_split(input_ids, 
                                                                                     input_masks, 
                                                                                     y_labels,
                                                                                     test_size=0.2, 
                                                                                     random_state=42)

In [16]:
hist = model.fit([X_train_id, X_train_mask], 
                 y_train, 
                 validation_data=([X_test_id, X_test_mask], y_test),
                 epochs=1,
                 batch_size=64)



In [17]:
model.save_weights('electra_1_bce_epochs.h5')

In [18]:
data.sample(20)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
9996,1a7a4868968e2b9e,those two love to disagree dont they 206170111187,0,0,0,0,0,0
126388,a3f3efcd86915222,romanesque architecture hi wlu thanks much for...,0,0,0,0,0,0
114034,6209fca1e88f03c7,if the author of the image is ok with the idea...,0,0,0,0,0,0
151187,79035e14bff6f1c1,on a personal note thanks for cleaning up the ...,0,0,0,0,0,0
96264,02dbaad7cba51d2a,toolserver is useless and the last dump hasnt ...,0,0,0,0,0,0
92922,f87289a4b59ad39b,before returning home and being disbanded in june,0,0,0,0,0,0
150822,729f3083ffc7e0c3,you personally may not have edited the list of...,0,0,0,0,0,0
61524,a4aafd043b7f12a4,corgan has said himself that the band is the s...,0,0,0,0,0,0
94573,fcda5caccc204861,copyediting ive gone through the article most ...,0,0,0,0,0,0
96287,02fdc006d720e5e7,in their marketing publications,0,0,0,0,0,0


In [19]:
sample_text = 'Buck Winston \n\nFuck you, I wont do'
clean_txt = cleaning(sample_text)
input_ids_test, input_masks_test = tokenize(clean_txt, tokenizer)

100%|██████████| 31/31 [00:00<00:00, 2740.16it/s]


In [20]:
preds = model.predict([input_ids_test, input_masks_test])[0]
prediction = target_columns[np.argmax(preds, axis=0)]
print(preds)

[0.02709789 0.0006781  0.00950498 0.00074795 0.00628975 0.00124687]


In [21]:
print(preds)

[0.02709789 0.0006781  0.00950498 0.00074795 0.00628975 0.00124687]


In [22]:
# test_path = '../input/jigsaw-toxic-comment-classification-challenge/test.csv.zip'
# test_label_path = '../input/jigsaw-toxic-comment-classification-challenge/test_labels.csv.zip'

# test = pd.read_csv(test_path)
# test_labels = pd.read_csv(test_label_path)

# test_labels.head()

In [23]:
# test_set = test.merge(test_labels, left_index=True, right_index=True)
# test_set = test_set[test_set.toxic!=-1]
# test_set = test_set[["id_x", "comment_text", "toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]]
# test_set = test_set.reset_index(drop=True)
# test_set = test_set.rename(columns={"id_x": "id"})
# ids = test_set['id']
# X_t = test_set['comment_text'].apply(cleaning)
# gold = test_set[target_columns]
# sub_input_ids, sub_input_masks = tokenize(X_t, tokenizer)
# test_set.head()

In [24]:

# predictions = model.predict([sub_input_ids, sub_input_masks])



In [25]:
# print(gold[:5], predictions[:5])

In [26]:
# print(sklearn.metrics.classification_report(np.matrix(gold), (predictions>0.8)*1.0))

In [27]:
sample_submission_path = '../input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv.zip'
sample_submission = pd.read_csv(sample_submission_path)
sample_submission.head()


test_path = '../input/jigsaw-toxic-comment-classification-challenge/test.csv.zip'
df_test = pd.read_csv(test_path)

df_test.head()


ids = df_test['id']
X_t = df_test['comment_text'].apply(cleaning)
sub_input_ids, sub_input_masks = tokenize(X_t, tokenizer)


predictions = model.predict([sub_input_ids, sub_input_masks])

ids = pd.Series(ids)
y_preds = pd.DataFrame(predictions, columns=target_columns)

final_submission = pd.concat([ids, y_preds], axis=1)

final_submission.head()

final_submission.to_csv('submission_1_bce_epochs.csv', index=False)



100%|██████████| 153164/153164 [06:10<00:00, 412.99it/s]
