In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import sklearn.metrics

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv.zip
/kaggle/input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv.zip
/kaggle/input/jigsaw-toxic-comment-classification-challenge/test_labels.csv.zip
/kaggle/input/jigsaw-toxic-comment-classification-challenge/test.csv.zip
/kaggle/input/transformers/electra_5_epochs.h5
/kaggle/input/transformers/__results__.html
/kaggle/input/transformers/submission_5_epochs.csv
/kaggle/input/transformers/__notebook__.ipynb
/kaggle/input/transformers/__output__.json
/kaggle/input/transformers/custom.css
/kaggle/input/trans-gpt2/__results__.html
/kaggle/input/trans-gpt2/__notebook_source__.ipynb
/kaggle/input/trans-gpt2/submission_5_epochs.csv
/kaggle/input/trans-gpt2/__notebook__.ipynb
/kaggle/input/trans-gpt2/__output__.json
/kaggle/input/trans-gpt2/custom.css


In [2]:
!pip install transformers clean-text

Collecting clean-text
  Downloading clean_text-0.3.0-py3-none-any.whl (9.6 kB)
Collecting ftfy<6.0,>=5.8
  Downloading ftfy-5.8.tar.gz (64 kB)
[K     |████████████████████████████████| 64 kB 296 kB/s 
Building wheels for collected packages: ftfy
  Building wheel for ftfy (setup.py) ... [?25l- \ done
[?25h  Created wheel for ftfy: filename=ftfy-5.8-py3-none-any.whl size=45612 sha256=c0d9a108cc7204a8693466b2ce805004fb66a6b172d8f7754df1176302f62cc6
  Stored in directory: /root/.cache/pip/wheels/49/1c/fc/8b19700f939810cd8fd9495ae34934b246279791288eda1c31
Successfully built ftfy
Installing collected packages: ftfy, clean-text
Successfully installed clean-text-0.3.0 ftfy-5.8
You should consider upgrading via the '/opt/conda/bin/python3.7 -m pip install --upgrade pip' command.[0m


In [3]:
train_path = '../input/jigsaw-toxic-comment-classification-challenge/train.csv.zip'

data = pd.read_csv(train_path)

In [4]:
print('Number of Records: {}, Number of features/columns: {}'.format(data.shape[0], data.shape[1]))

Number of Records: 159571, Number of features/columns: 8


In [5]:
print('Null values: {}'.format(data.isnull().values.sum()))

Null values: 0


In [6]:
target_columns = list(data.columns)[2:]
y_labels = data[target_columns].values

In [7]:
from transformers import GPT2Tokenizer, GPT2Config, TFGPT2Model
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tqdm import tqdm
from cleantext import clean



In [8]:
electra = 'gpt2'

tokenizer = GPT2Tokenizer.from_pretrained(electra, do_lower_case=True, add_special_tokens=True,
                                                max_length=128, pad_to_max_length=True)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1042301.0, style=ProgressStyle(descript…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=456318.0, style=ProgressStyle(descripti…




1

In [9]:
def cleaning(text):
    return clean(text, no_line_breaks=True, no_urls=True, no_punct=True)

def tokenize(sentences, tokenizer):
    
    input_ids = []
    input_masks = []
    #input_segments = []
    for sentence in tqdm(sentences):
        inp = "sentence" if sentence=='' else sentence
        inputs = tokenizer.encode_plus(inp, add_special_tokens=True, 
                       max_length=128, pad_to_max_length=True, 
                       return_attention_mask=True, return_token_type_ids=True)
        input_ids.append(inputs['input_ids'])
        input_masks.append(inputs['attention_mask'])
        #input_segments.append(inputs['token_type_ids'])        
        
    return np.asarray(input_ids, dtype='int32'),np.asarray(input_masks, dtype='int32')

In [10]:
data['comment_text'] = data['comment_text'].apply(cleaning)
input_ids, input_masks = tokenize(data['comment_text'], tokenizer)

100%|██████████| 159571/159571 [03:06<00:00, 855.55it/s]


In [11]:
config = GPT2Config(dropout=0.2, attention_dropout=0.2)

config.output_hidden_states = False

transformer_model = TFGPT2Model.from_pretrained(electra, config=config)

input_ids_in = tf.keras.layers.Input(shape=(128,), name='input_token', dtype='int32')
input_masks_in = tf.keras.layers.Input(shape=(128,), name='masked_token', dtype='int32')

embedding_layer = transformer_model(input_ids_in, attention_mask=input_masks_in)[0]
X = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(50, 
                                                       return_sequences=True, 
                                                       dropout=0.1, 
                                                       recurrent_dropout=0.1))(embedding_layer)
X = tf.keras.layers.GlobalMaxPool1D()(X)
X = tf.keras.layers.Dense(50, activation='relu')(X)
X = tf.keras.layers.Dropout(0.2)(X)
X = tf.keras.layers.Dense(6, activation='sigmoid')(X)

model = tf.keras.models.Model(inputs=[input_ids_in, input_masks_in], outputs=X)

for layer in model.layers[:3]:
    layer.trainable = False

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=497933648.0, style=ProgressStyle(descri…




In [12]:
model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_token (InputLayer)        [(None, 128)]        0                                            
__________________________________________________________________________________________________
masked_token (InputLayer)       [(None, 128)]        0                                            
__________________________________________________________________________________________________
tfgp_t2model (TFGPT2Model)      ((None, 128, 768), ( 124439808   input_token[0][0]                
                                                                 masked_token[0][0]               
__________________________________________________________________________________________________
bidirectional (Bidirectional)   (None, 128, 100)     327600      tfgp_t2model[0][0]    

In [13]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [14]:
X_train_id, X_test_id, X_train_mask, X_test_mask, y_train, y_test = train_test_split(input_ids, 
                                                                                     input_masks, 
                                                                                     y_labels,
                                                                                     test_size=0.2, 
                                                                                     random_state=42)

In [15]:
hist = model.fit([X_train_id, X_train_mask], 
                 y_train, 
                 validation_data=([X_test_id, X_test_mask], y_test),
                 epochs=1,
                 batch_size=64)



In [16]:
model.save_weights('gpt2_bce_1_epochs.h5')

In [17]:
data.sample(20)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
65891,b02f06855b8c687c,y0u l0se at life 0h n0 my last chance what d0 ...,1,0,0,0,0,0
110147,4d1d0a20c6333bb0,replaceable fair use imagefashid moussavijpg t...,0,0,0,0,0,0
8115,159887dee3689d6d,i never said all of them now did i i still don...,0,0,0,0,0,0
42166,7084c3eadd7b4b59,question i have uploaded the correct logo unde...,0,0,0,0,0,0
104385,2e8065c2aa531f4f,million award the million award for your contr...,0,0,0,0,0,0
61686,a514f853d48dcf3a,i agree with kingjeff that there some other wa...,0,0,0,0,0,0
155765,c350b56646851989,if you knew any better you would know that onl...,0,0,0,0,0,0
7355,138cbc77177fc03a,i am from canada and i have never witnessed ma...,0,0,0,0,0,0
93240,f949a4b4b197be1a,just discovered you have clearing up your talk...,0,0,0,0,0,0
30040,4fc4355c2639d1ef,when people do not piss me off i will act civi...,1,0,0,0,0,0


In [18]:
# sample_text = 'Buck Winston \n\nFuck you, I wont do'
# clean_txt = cleaning(sample_text)
# input_ids_test, input_masks_test = tokenize([clean_txt], tokenizer)

In [19]:
# preds = model.predict([input_ids_test, input_masks_test])[0]
# prediction = target_columns[np.argmax(preds, axis=0)]
# print(preds)

In [20]:
# print(preds)

In [21]:
# test_path = '../input/jigsaw-toxic-comment-classification-challenge/test.csv.zip'
# test_label_path = '../input/jigsaw-toxic-comment-classification-challenge/test_labels.csv.zip'

# test = pd.read_csv(test_path)
# test_labels = pd.read_csv(test_label_path)

# test_labels.head()

In [22]:
# test_set = test.merge(test_labels, left_index=True, right_index=True)
# test_set = test_set[test_set.toxic!=-1]
# test_set = test_set[["id_x", "comment_text", "toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]]
# test_set = test_set.reset_index(drop=True)
# test_set = test_set.rename(columns={"id_x": "id"})
# ids = test_set['id']
# X_t = test_set['comment_text'].apply(cleaning)
# gold = test_set[target_columns]
# sub_input_ids, sub_input_masks = tokenize(X_t, tokenizer)
# test_set.head()

In [23]:

# predictions = model.predict([sub_input_ids, sub_input_masks])



In [24]:
# print(gold[:5], predictions[:5])

In [25]:
# print(sklearn.metrics.classification_report(np.matrix(gold), (predictions>0.8)*1.0))

In [26]:
sample_submission_path = '../input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv.zip'
sample_submission = pd.read_csv(sample_submission_path)
sample_submission.head()


test_path = '../input/jigsaw-toxic-comment-classification-challenge/test.csv.zip'
df_test = pd.read_csv(test_path)

df_test.head()


ids = df_test['id']
X_t = df_test['comment_text'].apply(cleaning)

sub_input_ids, sub_input_masks = tokenize(X_t.to_numpy(), tokenizer)

predictions = model.predict([sub_input_ids, sub_input_masks])

ids = pd.Series(ids)
y_preds = pd.DataFrame(predictions, columns=target_columns)

final_submission = pd.concat([ids, y_preds], axis=1)



final_submission.head()

final_submission.to_csv('submission_1_bce_epochs.csv', index=False)

final_submission.head()


100%|██████████| 153164/153164 [02:47<00:00, 913.42it/s]


Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.958908,0.342784,0.827695,0.122103,0.868561,0.268531
1,0000247867823ef7,0.116088,0.003074,0.028823,0.024465,0.056758,0.022502
2,00013b17ad220c46,0.102799,0.002539,0.024178,0.019776,0.051933,0.020836
3,00017563c3f7919a,0.103355,0.002203,0.021004,0.01775,0.052656,0.01885
4,00017695ad8997eb,0.121792,0.002978,0.028365,0.023032,0.061283,0.023682


In [27]:
# test_labels = '../input/jigsaw-toxic-comment-classification-challenge/test_labels.csv.zip'
# df_test_labels = pd.read_csv(test_labels)

# df_test_labels.head()

In [28]:
# df_test_labels = df_test_labels[df_test_labels.toxic!=-1].reset_index(drop=True)
# df_ids = df_test_labels['id']
# df_preds = df_ids.merge(final_submission, left_index=True, right_index=True)