In [None]:
! unzip ../input/jigsaw-toxic-comment-classification-challenge/train.csv.zip 
! unzip ../input/jigsaw-toxic-comment-classification-challenge/test_labels.csv.zip 
! unzip ../input/jigsaw-toxic-comment-classification-challenge/sample_submission.csv.zip 
! unzip ../input/jigsaw-toxic-comment-classification-challenge/test.csv.zip

In [None]:
! pip install transformers 
# save the best weight ,means less loss valued trained batch 
# do some warm up step initially to make it more effiecient

In [None]:
import numpy as np
import pandas as pd
import re
import tensorflow as tf
import transformers
from tqdm.notebook import tqdm

In [None]:
bert_model = transformers.TFBertModel.from_pretrained("bert-base-uncased") 
bert_model.trainable = False

In [4]:
batch_size=32
max_len=128
EPOCHS=2

In [9]:
# count the individual data
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'] 
positive_comment=data[data[label_cols].sum(axis=1)==0]
toxic_comment=data[data[label_cols].sum(axis=1)>0]
no_value=data[data[label_cols].sum(axis=1)=='']

print(len(positive_comment)) #143346
print(len(toxic_comment))# 16225
print(len(no_value))# 16225

# random samling from positve commnet to tkae out of 14000 comment

data=pd.concat([
    toxic_comment,
    positive_comment.sample(n=30_000)
])

data.shape

143346
16225
0


(46225, 8)

In [10]:
# add up a column positive
conditions = [
    (data['toxic']==0)&
    (data['severe_toxic']==0) & (data['obscene']==0)&
    (data['threat']==0) & (data['insult']==0)&
    (data['identity_hate']==0)
    ]
values = [1]

# data['positive']=np.where(data['toxic']==0 and data['severe_toxic']==0 and  and  and  and ,1,0)
data['positive']=np.select(conditions,values)

label_cols.append("positive")

print(data.shape)
positive_comment=data[data[label_cols].sum(axis=1)==1]
print(len(positive_comment))

(46225, 9)
36360


In [7]:
data["comment_text"]=data["comment_text"].map(lambda x:re.sub(r"(@\[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", x))

In [None]:
# sense_count_pd=pd.DataFrame(data[label_cols].value_counts()) 
# sense_count_pd

In [11]:
labels =  data[label_cols].values

In [12]:
from sklearn.model_selection import train_test_split 
input_sen=data["comment_text"].values
# print(input_sen)
train_inputs,validation_inputs,train_labels,validation_labels=train_test_split(input_sen,labels,random_state=0,test_size=0.2) 


print(train_inputs.shape)
print(train_labels.shape)

print(validation_inputs.shape)
print(validation_labels.shape)
 

(36980,)
(36980, 7)
(9245,)
(9245, 7)


In [13]:
class BertSemanticDataGenerator(tf.keras.utils.Sequence): 
    def __init__(
        self,
        sentence_pairs,
        labels,
        batch_size=batch_size,
        shuffle=True,
        include_targets=True,
    ):
        self.sentence_pairs = sentence_pairs
        self.labels = labels
        self.shuffle = shuffle
        self.batch_size = batch_size
        self.include_targets = include_targets
         
        
        self.tokenizer = transformers.BertTokenizer.from_pretrained(
            "bert-base-uncased", do_lower_case=True
        )
        self.indexes = np.arange(len(self.sentence_pairs))
        self.on_epoch_end()

    def __len__(self):
        # Denotes the number of batches per epoch.
        return len(self.sentence_pairs) // self.batch_size

    def __getitem__(self, idx):
        # Retrieves the batch of index.
        indexes = self.indexes[idx * self.batch_size : (idx + 1) * self.batch_size]
        sentence_pairs = self.sentence_pairs[indexes]

        # With BERT tokenizer's batch_encode_plus batch of both the sentences are
        # encoded together and separated by [SEP] token.
        encoded = self.tokenizer.batch_encode_plus(
            sentence_pairs.tolist(),
            add_special_tokens=True,
            max_length=128,
            return_attention_mask=True,
            return_token_type_ids=False,
            pad_to_max_length=True,
            return_tensors="tf",
        )   

        bert_output = bert_model(**encoded)
        
        sequence_output = bert_output.last_hidden_state 
         
        if self.include_targets:
            labels = np.array(self.labels[indexes], dtype="int32")
            return sequence_output, labels
        else:
            return sequence_output

    def on_epoch_end(self): 
        if self.shuffle:
            np.random.RandomState(42).shuffle(self.indexes)

In [14]:
train_dataset=BertSemanticDataGenerator(train_inputs,train_labels,shuffle=True)
validation_dataset=BertSemanticDataGenerator(validation_inputs,validation_labels,shuffle=False)

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [15]:
input_layer = tf.keras.layers.Input(shape=(128, 768), name=None)  
flat=tf.keras.layers.Flatten()(input_layer) 
output = tf.keras.layers.Dense(7, activation="softmax")(flat)
model = tf.keras.models.Model(inputs=input_layer, outputs=output)
    
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 128, 768)]        0         
_________________________________________________________________
flatten (Flatten)            (None, 98304)             0         
_________________________________________________________________
dense (Dense)                (None, 7)                 688135    
Total params: 688,135
Trainable params: 688,135
Non-trainable params: 0
_________________________________________________________________


In [16]:
model.compile()

In [None]:
# history = model.fit(
#     train_dataset,
#     validation_data=validation_dataset,
#     epochs=2
# )

In [17]:
# custom training loop 
## update at each train step
## reset at the end of each batch

import time
## defining a optimizer 
optimizer= tf.keras.optimizers.Adam(lr=2e-5)

## defining loss function 
loss_fn=tf.keras.losses.CategoricalCrossentropy(from_logits=True)
# ## mean loss define
train_loss=tf.keras.metrics.Mean(name="train_loss")
validation_loss=tf.keras.metrics.Mean(name="validation_loss")

# Metric
## dfining the accuracy metric to track our model accuracy.Here for 6 class we 
## have to declare 2d darray of row 6
train_acc_metric=[tf.keras.metrics.CategoricalAccuracy() for i in range(len(label_cols))]

val_acc_metric=[tf.keras.metrics.CategoricalAccuracy() for i in range(len(label_cols))]

# actually from logits denoting the probability from our custom model layed for each label.It is being fetched before the softmax layer to calculate loss between actual and predicted

batch_size=32
EPOCH=2
train_dataset_size=60000
validation_dataset_size=15000

@tf.function
def train_step(model,x_train,label):
    # Gradiane tape actually records the operation run in forward step
    with tf.GradientTape() as tape:
        #caluculate logits for comparison
        logits_prob=model(x_train,training=True)
        # calculate loss value 
        loss_value=loss_fn(label,logits_prob)
    #calculate gradient of trainable variables against the loss
    gradients=tape.gradient(loss_value,model.trainable_weights)
    # update the gradient according to gradient descent
    optimizer.apply_gradients(zip(gradients,model.trainable_weights))
    # update the mean train ing loss
    train_loss(loss_value)
    # update accuracy metric for each of the 6 classes 
    for i,auc in enumerate(train_acc_metric):
        auc.update_state(label[:,i],logits_prob[:,i])
    return loss_value

@tf.function
def validation_step(model,x_validation,label):
    with tf.GradientTape() as tape:
        validation_logit_prob=model(x_validation,training=False)
        valid_loss=loss_fn(label,validation_logit_prob)
        validation_loss(valid_loss)
        for i,auc in enumerate(val_acc_metric):
            auc.update_state(label[:,i],validation_logit_prob[:,i]) 

def train_model(model,train_dataset,validation_dataset):
    for epoch in range(EPOCHS):
        print('\n Epoch No %d\n' % (epoch,))

        ### training part ###
        for step,(x_batch_train,labels) in enumerate(tqdm(train_dataset)):
            training_loss=train_step(model,x_batch_train,labels)
            
            #log result at every 200 batches
            if step%200==0:
                print(f'\nTrain Step: {epoch}, Loss: {train_loss.result()}')
#                 print("Trainng loss at %d batch of data: %.4f"%(step,float(training_loss)))
                # training accuracy metric at end
                for i, label_name in enumerate(label_cols):
                    print(f"{label_name} roc_auc {train_acc_metric[i].result()}")
                    # reset the accuracy metric after every epoch
                    train_acc_metric[i].reset_states()
            
#         training_accuracy=train_acc_metric.result()
#         print("\nTraining accuracy after %d epoch : %.4f"%(epoch,training_accuracy))
#         train_acc_metric.reset_states()
        
        
        ### validation part ###
        for step,(x_batch_val,labels) in enumerate(tqdm(validation_dataset)):
            validation_step(model,x_batch_val,labels)
        print(f'\n Validation Step: {epoch}, Loss: {validation_loss.result()}')
        
        for i, label_name in enumerate(label_cols):
            print(f"{label_name} roc_auc {val_acc_metric[i].result()}") 
            val_acc_metric[i].reset_states()
#         validation_acc=val_acc_metric.result()
#         val_acc_metric.reset_states()
#         print("\n validation accuracy : %4.f"%(validation_acc))

train_model(model,train_dataset,validation_dataset)
model.save("my_custom_train_model.h5")


 Epoch No 0



  "The `lr` argument is deprecated, use `learning_rate` instead.")


  0%|          | 0/1155 [00:00<?, ?it/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
  '"`categorical_crossentropy` received `from_logits=True`, but '
2022-01-22 13:25:35.438799: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)



Train Step: 0, Loss: 2.996401786804199
toxic roc_auc 0.0
severe_toxic roc_auc 0.0
obscene roc_auc 0.0
threat roc_auc 0.0
insult roc_auc 0.0
identity_hate roc_auc 0.0
positive roc_auc 0.0

Train Step: 0, Loss: 1.9293111562728882
toxic roc_auc 0.05000000074505806
severe_toxic roc_auc 0.054999999701976776
obscene roc_auc 0.10999999940395355
threat roc_auc 0.02500000037252903
insult roc_auc 0.0949999988079071
identity_hate roc_auc 0.009999999776482582
positive roc_auc 0.03500000014901161

Train Step: 0, Loss: 2.0662219524383545
toxic roc_auc 0.11500000208616257
severe_toxic roc_auc 0.06499999761581421
obscene roc_auc 0.11500000208616257
threat roc_auc 0.02500000037252903
insult roc_auc 0.10000000149011612
identity_hate roc_auc 0.03999999910593033
positive roc_auc 0.04500000178813934

Train Step: 0, Loss: 2.2395973205566406
toxic roc_auc 0.04500000178813934
severe_toxic roc_auc 0.05999999865889549
obscene roc_auc 0.13500000536441803
threat roc_auc 0.04500000178813934
insult roc_auc 0.10000

  0%|          | 0/288 [00:00<?, ?it/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.



 Validation Step: 0, Loss: 4.216444969177246
toxic roc_auc 0.0868055522441864
severe_toxic roc_auc 0.0555555559694767
obscene roc_auc 0.09375
threat roc_auc 0.0416666679084301
insult roc_auc 0.0868055522441864
identity_hate roc_auc 0.0416666679084301
positive roc_auc 0.0451388880610466

 Epoch No 1



  0%|          | 0/1155 [00:00<?, ?it/s]


Train Step: 1, Loss: 2.8637492656707764
toxic roc_auc 0.1225806474685669
severe_toxic roc_auc 0.07096774131059647
obscene roc_auc 0.13548387587070465
threat roc_auc 0.032258063554763794
insult roc_auc 0.13548387587070465
identity_hate roc_auc 0.05161290243268013
positive roc_auc 0.032258063554763794

Train Step: 1, Loss: 3.102321147918701
toxic roc_auc 0.07500000298023224
severe_toxic roc_auc 0.07999999821186066
obscene roc_auc 0.14499999582767487
threat roc_auc 0.014999999664723873
insult roc_auc 0.125
identity_hate roc_auc 0.014999999664723873
positive roc_auc 0.029999999329447746

Train Step: 1, Loss: 3.3704206943511963
toxic roc_auc 0.07999999821186066
severe_toxic roc_auc 0.054999999701976776
obscene roc_auc 0.14000000059604645
threat roc_auc 0.10000000149011612
insult roc_auc 0.08500000089406967
identity_hate roc_auc 0.03999999910593033
positive roc_auc 0.05000000074505806

Train Step: 1, Loss: 3.6256754398345947
toxic roc_auc 0.04500000178813934
severe_toxic roc_auc 0.070000000

  0%|          | 0/288 [00:00<?, ?it/s]


 Validation Step: 1, Loss: 5.925820350646973
toxic roc_auc 0.0833333358168602
severe_toxic roc_auc 0.0520833320915699
obscene roc_auc 0.1111111119389534
threat roc_auc 0.7326388955116272
insult roc_auc 0.1006944477558136
identity_hate roc_auc 0.0381944440305233
positive roc_auc 0.1215277761220932


In [None]:
# from keras.models import load_model
# model=load_model('../input/mymodel/my_model.h5')

In [20]:
s4 = 'excellent work'
sentence_pairs = np.array([s4])
test_data = BertSemanticDataGenerator(
        sentence_pairs, labels=None, batch_size=1, shuffle=False, include_targets=False,
    )

pro=model.predict(test_data)
print(np.asarray(pro))
print(label_cols[np.argmax(pro)])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


[[1.0298121e-06 4.7074219e-21 1.3265938e-07 0.0000000e+00 1.6367255e-07
  3.1848161e-26 9.9999869e-01]]
positive


In [None]:
# loading the pre-defined bert model weights
# bert_model.Trainable=True

# train_dataset=BertSemanticDataGenerator(train_inputs,train_labels,shuffle=True)
# validation_dataset=BertSemanticDataGenerator(validation_inputs,validation_labels,shuffle=False)

# trained_history = model.fit(
#     train_dataset,
#     validation_data=validation_dataset,
#     epochs=2
# )

In [None]:
# submission_df=pd.read_csv("./sample_submission.csv",index_col='id')
# test_df=pd.read_csv("./test.csv")
# label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# print(submission_df.head())
# print(test_df.head())


# test_bert_op=BertSemanticDataGenerator(test_df['comment_text'],None,include_targets=False,shuffle=True)

In [None]:
# for i,sen in enumerate(tqdm(test_bert_op)):
#     sample_ids = test_df.iloc[i*32:(i+1)*32]['id'] 
#     pred=model.predict(sen)
#     submission_df.loc[sample_ids, label_cols] = pred
#     print(pred)
#     print(sen.shape)

In [None]:
# submission_df.to_csv('submission.csv')

In [None]:
# sample_ids = test_df.iloc[0*32:(0+1)*32]['id'] 
# print(sample_ids)
# submission_df.loc[sample_ids]

In [None]:
# len(bert_op)

In [None]:
# # creating batched dataset
# epochs=2
# def create_batch_dataset(data,epochs=epochs,batch_size=batch_size,buffer_size=1000,train=True):
#     dataset=tf.data.Dataset.from_tensor_slices(data)
# #     print(dataset.as_numpy_iterator())
#     if train:
#         dataset=dataset.shuffle(buffer_size=buffer_size)
#         # uses for shuffling the dataset.Select the first buffer_size element from dataset
#     dataset=dataset.repeat(epochs)
#     # just repeat the whole dataset
#     dataset=dataset.batch(batch_size=batch_size)
#     # devide the whole dataset into batch size and create an array of array
#     if train:
#         dataset=dataset.prefetch(1)
#     #     It has no concept of examples vs. batches. examples.prefetch(2) will prefetch two 
#     # elements (2 examples), while examples.batch(20).prefetch(2) will prefetch 2 elements (2 
#     # batches, of 20 examples each).
#     return dataset
# train_dataset=create_batch_dataset((train_inputs,train_masks,train_labels),train=True)
# validation_dataset=create_batch_dataset((validation_inputs,validation_masks,validation_labels),train=True)

In [None]:
# https://www.kaggle.com/nkaenzig/bert-tensorflow-2-huggingface-transformers
# https://www.kaggle.com/satyamkryadav/bert-model-96-77/notebook