In [1]:
! unzip ../input/jigsaw-toxic-comment-classification-challenge/train.csv.zip 
! unzip ../input/jigsaw-toxic-comment-classification-challenge/test_labels.csv.zip 
! unzip ../input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv.zip 
! unzip ../input/jigsaw-toxic-comment-classification-challenge/test.csv.zip

Archive:  ../input/jigsaw-toxic-comment-classification-challenge/train.csv.zip
  inflating: train.csv               
Archive:  ../input/jigsaw-toxic-comment-classification-challenge/test_labels.csv.zip
  inflating: test_labels.csv         
Archive:  ../input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv.zip
  inflating: sample_submission.csv   
Archive:  ../input/jigsaw-toxic-comment-classification-challenge/test.csv.zip
  inflating: test.csv                


In [None]:
! pip install transformers
# filter the data, take the same portion of each sense
# save the best weight ,means less loss valued trained batch 
# do some warm up step initially to make it more effiecient

In [2]:
import numpy as np
import pandas as pd
import re
import tensorflow as tf
import transformers
from tqdm.notebook import tqdm

In [3]:
# Load the pretrained "bert-base-uncased" encoder as a TensorFlow model.
bert_model = transformers.TFBertModel.from_pretrained("bert-base-uncased") 
# Mark the encoder as non-trainable (Keras `trainable` flag).
bert_model.trainable = False

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/511M [00:00<?, ?B/s]

2022-01-22 10:43:44.106039: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-01-22 10:43:44.106994: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-01-22 10:43:44.107698: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-01-22 10:43:44.108540: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compil

In [4]:
# Shared settings for the notebook.
batch_size=32  # default batch size of BertSemanticDataGenerator
max_len=128  # token sequence length; the model input below is 128 steps wide
EPOCHS=2  # number of epochs run by train_model's loop

In [5]:
# Load the labelled training comments and the test-set labels.
data=pd.read_csv("./train.csv")
test_labels=pd.read_csv("./test_labels.csv")
# Only the last expression of a cell is rendered, so the first preview has to be
# displayed explicitly; a bare `data.head()` in mid-cell is silently discarded.
display(data.head())
test_labels.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,-1,-1,-1,-1,-1,-1
1,0000247867823ef7,-1,-1,-1,-1,-1,-1
2,00013b17ad220c46,-1,-1,-1,-1,-1,-1
3,00017563c3f7919a,-1,-1,-1,-1,-1,-1
4,00017695ad8997eb,-1,-1,-1,-1,-1,-1


In [14]:
# Split the comments by whether any of the six labels is set, then rebalance.
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
labels_per_comment = data[label_cols].sum(axis=1)
positive_comment = data[labels_per_comment == 0]  # no label set
toxic_comment = data[labels_per_comment > 0]      # at least one label set

# Sizes of both groups (original author's note: 143346 and 16225).
for subset in (positive_comment, toxic_comment):
    print(len(subset))

# Keep every toxic comment and a fixed-seed random sample of 14,000 unlabelled ones.
sampled_positive = positive_comment.sample(n=14_000, random_state=1)
data = pd.concat([toxic_comment, sampled_positive])

data.shape

58715
16225


(30225, 8)

In [None]:
# Load the test comments and show the frame's (rows, columns).
test_df=pd.read_csv("./test.csv")
test_df.shape

In [None]:
# Strip noise from the comments before tokenisation: @mentions, every character that
# is not an ASCII letter, digit, space or tab, scheme://... URLs, a leading "rt",
# and "http" stubs.
# The mention alternative used to read `@\[A-Za-z0-9]+`; the escaped bracket made it
# match only a literal "@[", so user names were left in the text.
_noise_pattern = re.compile(
    r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?"
)
data["comment_text"]=data["comment_text"].map(lambda text: _noise_pattern.sub("", text))

In [None]:
# Count how often each distinct combination of the six labels occurs.
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
combination_counts = data[label_cols].value_counts()
sense_count_pd = combination_counts.to_frame()
sense_count_pd

In [None]:
# Label columns as a plain NumPy array, one row per comment.
labels = data[label_cols].to_numpy()

In [None]:
# Disabled scratch check: cast the first 12 label rows to int32.
# indexes = np.arange(12)
# np.array(labels[indexes], dtype="int32")
# Show the label array.
labels

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the comments for validation, using a fixed seed.
input_sen = data["comment_text"].values
(
    train_inputs,
    validation_inputs,
    train_labels,
    validation_labels,
) = train_test_split(input_sen, labels, test_size=0.2, random_state=0)

# Shapes in order: train inputs, train labels, validation inputs, validation labels.
for split in (train_inputs, train_labels, validation_inputs, validation_labels):
    print(split.shape)
 

In [None]:
class BertSemanticDataGenerator(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that serves batches of BERT token embeddings.

    Each batch of raw strings is tokenised with the ``bert-base-uncased``
    tokenizer, passed through the module-level ``bert_model`` and returned as
    that model's ``last_hidden_state``, optionally paired with its labels.

    Args:
        sentence_pairs: NumPy array of input strings (indexed with an index array).
        labels: array of label rows aligned with ``sentence_pairs``; may be
            ``None`` when ``include_targets`` is False.
        batch_size: number of samples per batch.
        shuffle: whether ``on_epoch_end`` reshuffles the sample order.
        include_targets: if True batches are ``(features, labels)``, otherwise
            features only.
        max_length: number of tokens every sequence is padded / truncated to.
    """

    def __init__(
        self,
        sentence_pairs,
        labels,
        batch_size=batch_size,
        shuffle=True,
        include_targets=True,
        max_length=max_len,
    ):
        self.sentence_pairs = sentence_pairs
        self.labels = labels
        self.shuffle = shuffle
        self.batch_size = batch_size
        self.include_targets = include_targets
        self.max_length = max_length

        self.tokenizer = transformers.BertTokenizer.from_pretrained(
            "bert-base-uncased", do_lower_case=True
        )
        self.indexes = np.arange(len(self.sentence_pairs))
        self.on_epoch_end()

    def __len__(self):
        """Return the number of batches per epoch.

        With targets, only full batches are served (a trailing partial batch is
        dropped). Without targets the count is rounded up, so every input
        receives a prediction.
        """
        n_samples = len(self.sentence_pairs)
        if self.include_targets:
            return n_samples // self.batch_size
        # Ceiling division without importing math.
        return -(-n_samples // self.batch_size)

    def __getitem__(self, idx):
        """Return batch ``idx``: BERT features, plus int32 labels if requested."""
        indexes = self.indexes[idx * self.batch_size : (idx + 1) * self.batch_size]
        sentence_pairs = self.sentence_pairs[indexes]

        # Tokenise the whole batch at once. Padding and truncation are requested
        # explicitly (`pad_to_max_length` is deprecated, and truncation should not
        # be left to the tokenizer's fallback) so every row has `max_length` tokens.
        encoded = self.tokenizer.batch_encode_plus(
            sentence_pairs.tolist(),
            add_special_tokens=True,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_token_type_ids=False,
            return_tensors="tf",
        )

        # Per-token embeddings from the BERT encoder.
        bert_output = bert_model(**encoded)
        sequence_output = bert_output.last_hidden_state

        if self.include_targets:
            labels = np.array(self.labels[indexes], dtype="int32")
            return sequence_output, labels
        else:
            return sequence_output

    def on_epoch_end(self):
        """Reshuffle the sample order when shuffling is enabled."""
        if self.shuffle:
            np.random.RandomState(42).shuffle(self.indexes)

In [None]:
# Wrap both splits in batch generators; only the training split is shuffled.
train_dataset = BertSemanticDataGenerator(
    sentence_pairs=train_inputs,
    labels=train_labels,
    shuffle=True,
)
validation_dataset = BertSemanticDataGenerator(
    sentence_pairs=validation_inputs,
    labels=validation_labels,
    shuffle=False,
)

In [None]:
# for d in tqdm(train_dataset):
#     print(d)

In [None]:
# Classification head on the BERT features: the (128, 768) token embeddings are
# flattened and mapped to one score per label.
input_layer = tf.keras.layers.Input(shape=(128, 768), name=None)
flat = tf.keras.layers.Flatten()(input_layer)
# Sigmoid rather than softmax: a comment may carry several labels at once, or none,
# so each of the 6 outputs has to be an independent probability instead of sharing
# a total of 1 with the others.
output = tf.keras.layers.Dense(6, activation="sigmoid")(flat)
model = tf.keras.models.Model(inputs=input_layer, outputs=output)

model.summary()

In [None]:
# Binary cross-entropy scores each of the six labels independently, which matches the
# multi-hot label rows. Categorical cross-entropy assumes exactly one true class per
# row and yields zero loss for comments that have no label set.
model.compile(
        optimizer=tf.keras.optimizers.Adam(),
        loss="binary_crossentropy",
        metrics=["acc"],
)

In [None]:
# Train the classification head with Keras' built-in loop. The epoch count comes from
# the shared EPOCHS setting instead of a second hard-coded literal.
history = model.fit(
    train_dataset,
    validation_data=validation_dataset,
    epochs=EPOCHS,
)

In [None]:
# Sanity-check the flattened label shape on the first batch only. Iterating the whole
# generator would run every comment through BERT just to print shapes, and using
# `labels` as the loop variable would overwrite the full label array.
_, batch_labels = train_dataset[0]
print(batch_labels.flatten().shape)

In [None]:
# `label` is never defined at top level (it is only a function parameter), so this cell
# raised NameError on a fresh kernel; show the `labels` array instead.
labels

In [None]:
# Custom training loop: shared optimizer, loss and metric objects.
# The metric objects are updated on every step by the step functions.

import time

# Optimizer used by train_step.
optimizer=tf.keras.optimizers.SGD(learning_rate=1e-3)

# Loss function. The model's output layer already applies an activation, so the model
# returns probabilities rather than logits (from_logits=False). Binary cross-entropy
# treats the six labels as independent yes/no targets, matching the multi-hot rows.
loss_fn=tf.keras.losses.BinaryCrossentropy(from_logits=False)

# Running means of the per-batch losses.
train_loss=tf.keras.metrics.Mean(name="train_loss")
validation_loss=tf.keras.metrics.Mean(name="validation_loss")

# One ROC-AUC metric per label column; the loop reports these as "roc_auc".
# CategoricalAccuracy compares argmaxes over the last axis, which is meaningless for
# the 1-D per-label slices these metrics are fed.
train_acc_metric=[tf.keras.metrics.AUC() for _ in label_cols]

val_acc_metric=[tf.keras.metrics.AUC() for _ in label_cols]

batch_size=32
EPOCH=2  # NOTE(review): train_model reads EPOCHS, not this name
# NOTE(review): the two sizes below are not referenced by the code in this cell.
train_dataset_size=60000
validation_dataset_size=15000

@tf.function
def train_step(model,x_train,label):
    """Run one optimisation step on a single batch.

    Computes the loss of ``model`` on ``x_train`` against ``label``, applies the
    gradients with the module-level ``optimizer`` and updates ``train_loss`` and
    the per-label entries of ``train_acc_metric``.

    Returns:
        The loss value of this batch.
    """
    # The tape records the forward pass so the loss can be differentiated.
    with tf.GradientTape() as tape:
        predictions = model(x_train, training=True)
        batch_loss = loss_fn(label, predictions)

    # Gradient-descent update of the model's trainable weights.
    weights = model.trainable_weights
    optimizer.apply_gradients(zip(tape.gradient(batch_loss, weights), weights))

    # Fold this batch into the running loss and into each label's metric.
    train_loss(batch_loss)
    for column, metric in enumerate(train_acc_metric):
        metric.update_state(label[:, column], predictions[:, column])
    return batch_loss

@tf.function
def validation_step(model,x_validation,label):
    """Evaluate one validation batch and update the validation metrics.

    Runs ``model`` in inference mode on ``x_validation``, adds the batch loss to
    ``validation_loss`` and updates each entry of ``val_acc_metric`` with the
    matching label column. Returns nothing.
    """
    # No GradientTape here: nothing is differentiated during validation.
    validation_logit_prob = model(x_validation, training=False)
    validation_loss(loss_fn(label, validation_logit_prob))
    for i, metric in enumerate(val_acc_metric):
        metric.update_state(label[:, i], validation_logit_prob[:, i])

def train_model(model,train_dataset,validation_dataset):
    """Train and validate ``model`` for ``EPOCHS`` epochs, printing progress.

    Args:
        model: model passed on to ``train_step`` / ``validation_step``.
        train_dataset: iterable of ``(features, labels)`` training batches.
        validation_dataset: iterable of ``(features, labels)`` validation batches.

    The running loss and per-label metrics are printed every 200 training steps
    and after each validation pass, and are reset at the end of every epoch so
    that one epoch's numbers do not leak into the next.
    """
    for epoch in range(EPOCHS):
        print('\n Epoch No %d\n' % (epoch,))

        ### training part ###
        for step,(x_batch_train,labels) in enumerate(tqdm(train_dataset)):
            train_step(model,x_batch_train,labels)

            # log the epoch-to-date results every 200 batches
            if step%200==0:
                print(f'\nTrain Epoch: {epoch}, Step: {step}, Loss: {train_loss.result()}')
                for i, label_name in enumerate(label_cols):
                    print(f"{label_name} roc_auc {train_acc_metric[i].result()}")

        # reset the training statistics after every epoch
        train_loss.reset_states()
        for metric in train_acc_metric:
            metric.reset_states()

        # model.fit() calls on_epoch_end() on a Sequence itself; a hand-written loop
        # has to do it, otherwise the generator never reshuffles between epochs.
        on_epoch_end = getattr(train_dataset, "on_epoch_end", None)
        if callable(on_epoch_end):
            on_epoch_end()

        ### validation part ###
        for step,(x_batch_val,labels) in enumerate(tqdm(validation_dataset)):
            validation_step(model,x_batch_val,labels)
        print(f'\n Validation Step: {epoch}, Loss: {validation_loss.result()}')

        for i, label_name in enumerate(label_cols):
            print(f"{label_name} roc_auc {val_acc_metric[i].result()}") 
            val_acc_metric[i].reset_states()
        # reset the running validation loss as well, so each epoch is reported alone
        validation_loss.reset_states()

# Run the custom training loop, then save the model to an .h5 file.
train_model(model,train_dataset,validation_dataset)
model.save("my_custom_train_model.h5")

In [None]:
# from keras.models import load_model
# model=load_model('../input/mymodel/my_model.h5')

In [None]:
# Smoke test: score a single sentence with the model.
s4 = 'Hello'
sentence_pairs = np.array([s4])
test_data = BertSemanticDataGenerator(
        sentence_pairs, labels=None, batch_size=1, shuffle=False, include_targets=False,
    )

pro=model.predict(test_data)
print(pro)
# Report every label whose score reaches 0.5 instead of the argmax: the argmax always
# names one of the toxicity labels, even for a harmless sentence.
flagged=[name for name,score in zip(label_cols,pro[0]) if score>=0.5]
print(flagged if flagged else "no label scored >= 0.5")

In [None]:
# Mark the BERT encoder as trainable. Keras reads the lowercase `trainable` attribute;
# the previous `Trainable` spelling only created an unused attribute.
# NOTE(review): BERT is called inside BertSemanticDataGenerator, not inside `model`,
# so model.fit() below still only updates the dense head - confirm whether real
# fine-tuning of BERT was intended.
bert_model.trainable=True

train_dataset=BertSemanticDataGenerator(train_inputs,train_labels,shuffle=True)
validation_dataset=BertSemanticDataGenerator(validation_inputs,validation_labels,shuffle=False)

# Epoch count comes from the shared EPOCHS setting.
trained_history = model.fit(
    train_dataset,
    validation_data=validation_dataset,
    epochs=EPOCHS,
)

In [None]:
# List the trainable variables by name and shape instead of dumping their full contents.
[(variable.name, variable.shape) for variable in model.trainable_variables]

In [None]:
# submission_df=pd.read_csv("./sample_submission.csv",index_col='id')
# test_df=pd.read_csv("./test.csv")
# label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# print(submission_df.head())
# print(test_df.head())


# test_bert_op=BertSemanticDataGenerator(test_df['comment_text'],None,include_targets=False,shuffle=True)

In [None]:
# for i,sen in enumerate(tqdm(test_bert_op)):
#     sample_ids = test_df.iloc[i*32:(i+1)*32]['id'] 
#     pred=model.predict(sen)
#     submission_df.loc[sample_ids, label_cols] = pred
#     print(pred)
#     print(sen.shape)

In [None]:
# submission_df.to_csv('submission.csv')

In [None]:
# sample_ids = test_df.iloc[0*32:(0+1)*32]['id'] 
# print(sample_ids)
# submission_df.loc[sample_ids]

In [None]:
# Save the model to "my_model.h5" (a different file from the custom-loop save).
model.save("my_model.h5")

In [None]:
# from keras.preprocessing.sequence import pad_sequences
# # to convert the iput ids array in same size(column no=max(column no))
# tokenizer=transformers.BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True)
# max_length=128
# bert_model=transformers.TFBertModel.from_pretrained("bert-base-uncased")
# bert_model.trainable=False

# def tokenize(data,tokenizer=tokenizer,max_length=max_length):
# #     input_ids=[]
# #     attention_masks=[]
#     bert_outputs=[]
#     for sentence in tqdm(data):
        
#         encoded_data=tokenizer.batch_encode_plus(
#                         sentence,
#                         add_special_tokens=True,
#                         max_length=max_length,
#                         truncation=True,
#                         return_attention_mask=True,
#                         return_token_type_ids=True,
#                         pad_to_max_length=True,
#                         return_tensors="tf",
#                     )

# #         input_id=np.array(encoded_data["input_ids"],dtype="int32")
# #         attention_mask=np.array(encoded_data["attention_mask"],dtype="int32") 
        
#         bert_output=bert_model(**encoded_data)
#         sequence_output = bert_output.last_hidden_state
#         bert_outputs.append(sequence_output)
#     return bert_outputs
# #         input_ids.append(input_id)
# #         attention_masks.append(attention_mask)
        
# #     return [input_ids,attention_masks]

# bert_op=tokenize(data['comment_text'])
# # input_ids=pad_sequences(bert_op[0],maxlen=max_length,dtype='long',value=0,truncating="post",padding="post")
# # attention_masks=bert_op[1]
# # bert_op.shape

In [None]:
# len(bert_op)

In [None]:
# attention_masks=np.array(attention_masks)
# attention_masks=pad_sequences(attention_masks,maxlen=max_length,dtype='long',value=0,truncating="post",padding="post")
# input_ids.shape
# attention_masks.shape

In [None]:
# # creating batched dataset
# epochs=2
# def create_batch_dataset(data,epochs=epochs,batch_size=batch_size,buffer_size=1000,train=True):
#     dataset=tf.data.Dataset.from_tensor_slices(data)
# #     print(dataset.as_numpy_iterator())
#     if train:
#         dataset=dataset.shuffle(buffer_size=buffer_size)
#         # uses for shuffling the dataset.Select the first buffer_size element from dataset
#     dataset=dataset.repeat(epochs)
#     # just repeat the whole dataset
#     dataset=dataset.batch(batch_size=batch_size)
#     # devide the whole dataset into batch size and create an array of array
#     if train:
#         dataset=dataset.prefetch(1)
#     #     It has no concept of examples vs. batches. examples.prefetch(2) will prefetch two 
#     # elements (2 examples), while examples.batch(20).prefetch(2) will prefetch 2 elements (2 
#     # batches, of 20 examples each).
#     return dataset
# train_dataset=create_batch_dataset((train_inputs,train_masks,train_labels),train=True)
# validation_dataset=create_batch_dataset((validation_inputs,validation_masks,validation_labels),train=True)

In [None]:
# print(train_dataset.as_numpy_iterator())
# print(validation_dataset.as_numpy_iterator())

In [None]:
# list(train_dataset.as_numpy_iterator())[0][0].shape

In [None]:
# for i, (token_ids, masks, labels) in enumerate(tqdm(train_dataset)):
#             print(token_ids.shape)

In [None]:
# https://www.kaggle.com/nkaenzig/bert-tensorflow-2-huggingface-transformers
# https://www.kaggle.com/satyamkryadav/bert-model-96-77/notebook