In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv.zip
/kaggle/input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv.zip
/kaggle/input/jigsaw-toxic-comment-classification-challenge/test_labels.csv.zip
/kaggle/input/jigsaw-toxic-comment-classification-challenge/test.csv.zip


In [2]:
!pip install transformers clean-text

Collecting clean-text
  Downloading clean_text-0.3.0-py3-none-any.whl (9.6 kB)
Collecting ftfy<6.0,>=5.8
  Downloading ftfy-5.8.tar.gz (64 kB)
[K     |████████████████████████████████| 64 kB 763 kB/s 
Building wheels for collected packages: ftfy
  Building wheel for ftfy (setup.py) ... [?25l- \ done
[?25h  Created wheel for ftfy: filename=ftfy-5.8-py3-none-any.whl size=45612 sha256=129766179adebfef6ba7e24f091dfeef419b9e81a442a0e5334e93d8b28b4e3e
  Stored in directory: /root/.cache/pip/wheels/49/1c/fc/8b19700f939810cd8fd9495ae34934b246279791288eda1c31
Successfully built ftfy
Installing collected packages: ftfy, clean-text
Successfully installed clean-text-0.3.0 ftfy-5.8
You should consider upgrading via the '/opt/conda/bin/python3.7 -m pip install --upgrade pip' command.[0m


In [3]:
train_path = '../input/jigsaw-toxic-comment-classification-challenge/train.csv.zip'

data = pd.read_csv(train_path)

In [4]:
print('Number of Records: {}, Number of features/columns: {}'.format(data.shape[0], data.shape[1]))

Number of Records: 159571, Number of features/columns: 8


In [5]:
print('Null values: {}'.format(data.isnull().values.sum()))

Null values: 0


In [6]:
target_columns = list(data.columns)[2:]
y_labels = data[target_columns].values

In [7]:
from transformers import BertTokenizer, BertConfig, TFBertModel, ElectraConfig, TFElectraModel, ElectraTokenizer
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tqdm import tqdm
from cleantext import clean



In [8]:
bert = 'bert-base-uncased'#'google/electra-small-discriminator'

tokenizer = BertTokenizer.from_pretrained(bert, do_lower_case=True, add_special_tokens=True,
                                                max_length=128, pad_to_max_length=True)


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [9]:
def cleaning(text):
    return clean(text, no_line_breaks=True, no_urls=True, no_punct=True)

def tokenize(sentences, tokenizer):
    
    input_ids = []
    input_masks = []
    #input_segments = []
    
    for sentence in tqdm(sentences):
        inputs = tokenizer.encode_plus(sentence, add_special_tokens=True, 
                                       max_length=128, pad_to_max_length=True, 
                                       return_attention_mask=True, return_token_type_ids=True)
        
        input_ids.append(inputs['input_ids'])
        input_masks.append(inputs['attention_mask'])
        #input_segments.append(inputs['token_type_ids'])        
        
    return np.asarray(input_ids, dtype='int32'),np.asarray(input_masks, dtype='int32')

In [10]:
data['comment_text'] = data['comment_text'].apply(cleaning)
input_ids, input_masks = tokenize(data['comment_text'], tokenizer)

100%|██████████| 159571/159571 [06:27<00:00, 411.95it/s]


In [11]:
config = BertConfig(dropout=0.2, attention_dropout=0.2)

config.output_hidden_states = False

transformer_model = TFBertModel.from_pretrained(bert, config=config)
# transformer_model = tf.keras.layers.Embedding(len(tokenizer), 256, input_length=128)

input_ids_in = tf.keras.layers.Input(shape=(128,), name='input_token', dtype='int32')
input_masks_in = tf.keras.layers.Input(shape=(128,), name='masked_token', dtype='int32')

embedding_layer = transformer_model(input_ids_in, attention_mask=input_masks_in)[0]
# embedding_layer = transformer_model(input_ids_in)
X = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(50, 
                                                       return_sequences=True, 
                                                       dropout=0.1, 
                                                       recurrent_dropout=0.1))(embedding_layer)
X = tf.keras.layers.GlobalMaxPool1D()(X)
X = tf.keras.layers.Dense(50, activation='relu')(X)
X = tf.keras.layers.Dropout(0.2)(X)
X = tf.keras.layers.Dense(6, activation='sigmoid')(X)

model = tf.keras.models.Model(inputs=[input_ids_in, input_masks_in], outputs=X)

# for layer in model.layers[:3]:
#     layer.trainable = False

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=536063208.0, style=ProgressStyle(descri…




In [12]:
model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_token (InputLayer)        [(None, 128)]        0                                            
__________________________________________________________________________________________________
masked_token (InputLayer)       [(None, 128)]        0                                            
__________________________________________________________________________________________________
tf_bert_model (TFBertModel)     ((None, 128, 768), ( 109482240   input_token[0][0]                
                                                                 masked_token[0][0]               
__________________________________________________________________________________________________
bidirectional (Bidirectional)   (None, 128, 100)     327600      tf_bert_model[0][0]   

In [13]:
# model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [14]:
X_train_id, X_test_id, X_train_mask, X_test_mask, y_train, y_test = train_test_split(input_ids, 
                                                                                     input_masks, 
                                                                                     y_labels,
                                                                                     test_size=0.2, 
                                                                                     random_state=42)

In [15]:
hist = model.fit([X_train_id, X_train_mask], 
                 y_train, 
                 validation_data=([X_test_id, X_test_mask], y_test),
                 epochs=1,
                 batch_size=64)



In [16]:
model.save_weights('bert_complete_1_epochs_bce.h5')

In [17]:
buf = data[data.toxic!=0]

In [18]:
buf.sample(1).to_numpy()

array([['055d4741028724de',
        'fuck off please isnt my fault he is a leech on society with no job friends or scholastic ambition is it really vandalism if its the truth i think not kthnxbai',
        1, 0, 1, 0, 0, 0]], dtype=object)

In [19]:
# sample_text = 'wtf fuck you what the hell do you think your doing because your an admin on a power hungry strike doesnt mean you should tell people what to do you dumb faget you fucking jap bitch'
# sample_text = 'order of each section varies while reading this article i noticed that depending on the section it varied between islamjudaismchristian and islamchristianjudaism for the sake of neatness and uniformity i suggest we reorganize each to be alphabeticalneutral order so as not to lend bias towards any thus ordering each section christianislamjudaism 2418719109'
# sample_text = "I eat an apple today"
sample_text = "I am not taking your crap"
sample_text = "Please can you change your attitude"
sample_text = "Please get out of here"
sample_text = "Please get the fuck out of here"
clean_txt = cleaning(sample_text)
print(clean_txt)
input_ids_test, input_masks_test = tokenize([clean_txt], tokenizer)

100%|██████████| 1/1 [00:00<00:00, 566.42it/s]

please get the fuck out of here





In [20]:
preds = model.predict([input_ids_test, input_masks_test])[0]
print(preds)
print(preds > 0.85)
# prediction = target_columns[np.argmax(preds, axis=0)]
# print(prediction)

[0.10550249 0.01092458 0.0533733  0.00365109 0.04972906 0.00856117]
[False False False False False False]


In [21]:
sample_submission_path = '../input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv.zip'
sample_submission = pd.read_csv(sample_submission_path)
sample_submission.head()


test_path = '../input/jigsaw-toxic-comment-classification-challenge/test.csv.zip'
df_test = pd.read_csv(test_path)

df_test.head()


ids = df_test['id']
X_t = df_test['comment_text'].apply(cleaning)
sub_input_ids, sub_input_masks = tokenize(X_t, tokenizer)


predictions = model.predict([sub_input_ids, sub_input_masks])

ids = pd.Series(ids)
y_preds = pd.DataFrame(predictions, columns=target_columns)

final_submission = pd.concat([ids, y_preds], axis=1)

final_submission.head()

final_submission.to_csv('submission_bert_complete_1_epochs.csv', index=False)

100%|██████████| 153164/153164 [05:38<00:00, 452.99it/s]


In [22]:
final_submission.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.105502,0.010925,0.053373,0.003651,0.049729,0.008561
1,0000247867823ef7,0.105502,0.010925,0.053373,0.003651,0.049729,0.008561
2,00013b17ad220c46,0.105502,0.010925,0.053373,0.003651,0.049729,0.008561
3,00017563c3f7919a,0.105502,0.010925,0.053373,0.003651,0.049729,0.008561
4,00017695ad8997eb,0.105502,0.010925,0.053373,0.003651,0.049729,0.008561


In [23]:
df_ans = pd.read_csv('../input/jigsaw-toxic-comment-classification-challenge/test_labels.csv.zip')
df_ans.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,-1,-1,-1,-1,-1,-1
1,0000247867823ef7,-1,-1,-1,-1,-1,-1
2,00013b17ad220c46,-1,-1,-1,-1,-1,-1
3,00017563c3f7919a,-1,-1,-1,-1,-1,-1
4,00017695ad8997eb,-1,-1,-1,-1,-1,-1


In [24]:
df = final_submission.copy()

In [25]:
df = df.sort_values('id') 

In [26]:
df_ans = df_ans.sort_values('id')
df_ans = df_ans.replace(to_replace=-1,value=0)

In [27]:
df_ans = df_ans.drop(['id'], axis=1)
df = df.drop(['id'], axis=1)

In [28]:
gold = df_ans.to_numpy()
preds = df.to_numpy()

In [29]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
accuracy_score(gold, (preds>0.96))

0.9592397691363506

In [30]:
count_when_atleast_one = 0
total_when_atleast_one = 0
count_both_zero =0
total_both_zero = 0
tp, fp, tn, fn = 0,0,0,0
for i in range(len(gold)):
#     if (gold[i]==(preds[i] > 0.96)).all():
#         count+=1
    if (np.sum(gold[i])!=0 and np.sum(preds[i] > 0.8)!=0):
        tp+=1
        count_when_atleast_one +=1
        total_when_atleast_one +=1
    elif (np.sum(gold[i])!=0):
        fn+=1
        total_when_atleast_one +=1
    elif np.sum(gold[i])==0 and np.sum(preds[i] > 0.80)==0:
        tn+=1
        count_both_zero +=1
        total_both_zero +=1
    elif (np.sum(gold[i]))==0:
        fp+=1
        total_both_zero +=1
        
print(count_when_atleast_one/total_when_atleast_one, count_when_atleast_one, total_when_atleast_one)
print(count_both_zero/ total_both_zero, count_both_zero, total_both_zero)
print((count_when_atleast_one+count_both_zero)/ (total_when_atleast_one+total_both_zero))
print([[tp, fn],[fp, tn]])

0.0 0 6243
1.0 146921 146921
0.9592397691363506
[[0, 6243], [0, 146921]]


In [31]:
print("Precision: ", tp/(tp+fp) )
print("Recall:", tp/(tp+fn))
p = tp/(tp+fp)
r = tp/(tp+fn)
print("F1: ", 2/((1/r) + (1/p)))

ZeroDivisionError: division by zero

In [32]:
f1_score(gold, preds)

ValueError: Classification metrics can't handle a mix of multilabel-indicator and continuous-multioutput targets

In [33]:
preds_int = np.zeros_like(preds)
preds_int[preds>0.96] = 1
print(accuracy_score(gold, preds_int))

0.9592397691363506


In [34]:
df = final_submission > 0.5

TypeError: '>' not supported between instances of 'str' and 'float'

In [35]:
df['test'] = np.where(df_ans.id==final_submission.id & df_ans.toxic=)

SyntaxError: invalid syntax (<ipython-input-35-9b78b55cb820>, line 1)

In [36]:
from IPython.display import HTML

def create_download_link(title = "Download CSV file", filename = "data.csv"):  
    html = '<a href={filename}>{title}</a>'
    html = html.format(title=title,filename=filename)
    return HTML(html)

# create a link to download the dataframe which was saved with .to_csv method
# create_download_link(filename='submission_5_epochs.csv')
create_download_link(filename='./submission_electra_1_epochs.csv')