In [None]:
# Mount Google Drive into the Colab runtime at /content/drive so that files
# stored on Drive can be accessed through the local filesystem.
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Make the project folder on Drive the working directory; the relative
# 'data/...' paths used later in the notebook are resolved against it.
%cd '/content/drive/MyDrive/Colab Notebooks/NLP'

/content/drive/MyDrive/Colab Notebooks/NLP


In [None]:
!pip install transformers
!pip install tensorflow_addons



In [None]:
import pickle

import numpy as np
import pandas as pd
import tensorflow as tf

from tensorflow_addons.metrics import F1Score

from transformers import DefaultDataCollator
from transformers import AutoTokenizer, TFAutoModel

from keras.metrics import Precision, AUC, Recall
from keras import layers
from keras.models import Model

from tqdm.notebook import tqdm


In [None]:
# Locations of the input CSV files, relative to the current working directory.
train_path = 'data/train.csv'                # training data, read into df_train
test_path = 'data/test.csv'                  # test data, read into df_test
test_labels_path = 'data/test_labels.csv'    # test labels, read into df_test_labels
subm_path = 'data/sample_submission.csv'     # sample submission file

In [None]:
# Names of the target columns the classifier predicts (one output per column).
label_cols = [
    'toxic', 'severe_toxic', 'obscene',
    'threat', 'insult', 'identity_hate',
]

# Read the raw CSV files into DataFrames.
df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)
# The test labels are indexed by the 'id' column.
df_test_labels = pd.read_csv(test_labels_path).set_index('id')

# Preview the first rows of the training data.
df_train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


Tokenizer and data preprocessing

In [None]:
# Fixed sequence length: `tokenize` truncates/pads every comment to this many
# tokens, and the model's input layers are declared with the same length.
MAX_LEN = 128
# Tokenizer matching the "distilbert-base-uncased" checkpoint that the model
# definition loads later in the notebook.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


# define function to handle tokenization
def tokenize(sentence):
    """Encode a single comment into fixed-length model inputs.

    The text is truncated or padded to exactly ``MAX_LEN`` tokens (special
    tokens included) using the module-level ``tokenizer``.

    Args:
        sentence: raw text of one comment.

    Returns:
        A tuple ``(input_ids, attention_mask)`` holding the two entries of the
        tokenizer output, returned as TensorFlow tensors
        (``return_tensors='tf'``).
    """
    # Call the tokenizer object directly: `encode_plus` is deprecated in
    # favour of `__call__`, which accepts the same keyword arguments.
    tokens = tokenizer(sentence, max_length=MAX_LEN, truncation=True,
                       padding='max_length', add_special_tokens=True,
                       return_attention_mask=True, return_token_type_ids=False,
                       return_tensors='tf')

    return tokens['input_ids'], tokens['attention_mask']


# Pre-allocated buffers, one row per training comment, filled by the
# tokenization loop. They are int32 to match the dtype declared on the model's
# `input_ids` / `attention_mask` inputs; the np.zeros default (float64) would
# double the memory footprint and force a cast when the data is fed to the model.
Xids = np.zeros((len(df_train), MAX_LEN), dtype='int32')
Xmask = np.zeros((len(df_train), MAX_LEN), dtype='int32')

In [None]:
# Tokenize every training comment and store the result in row `row` of the
# pre-allocated id / mask buffers; tqdm renders a progress bar.
comments = df_train['comment_text']
for row, text in enumerate(tqdm(comments, total=len(df_train))):
    ids, attention = tokenize(text)
    Xids[row, :] = ids
    Xmask[row, :] = attention

  0%|          | 0/159571 [00:00<?, ?it/s]

dataset creation

In [None]:
# Number of examples per batch.
BATCH_SIZE = 32

# Target matrix: one row per training comment, one column per entry of label_cols.
labels = df_train[label_cols].to_numpy()

# Slice the three arrays row-wise into (ids, mask, labels) examples.
dataset = tf.data.Dataset.from_tensor_slices((Xids, Xmask, labels))

# restructure dataset format for BERT
def map_func(input_ids, masks, labels):
    """Repack one example into the ``(features, labels)`` pair used for training.

    Args:
        input_ids: token ids of the example.
        masks: attention mask of the example.
        labels: target values of the example.

    Returns:
        ``(features, labels)`` where ``features`` is a dict keyed by
        ``'input_ids'`` and ``'attention_mask'`` (the names of the model's
        input layers) and ``labels`` is passed through unchanged.
    """
    features = dict(input_ids=input_ids, attention_mask=masks)
    return features, labels

# Repack every example into ({'input_ids', 'attention_mask'}, labels).
dataset = dataset.map(map_func)

# Shuffle and batch the dataset.
# The train/validation split further down is positional (take/skip), so the
# order must be identical on every iteration. With the default
# reshuffle_each_iteration=True each pass would draw a new order, and examples
# would move between the "train" and "validation" portions (data leakage).
# A fixed seed additionally makes the split reproducible across runs.
# Trade-off: the training order is now the same in every epoch.
# NOTE: the 10000-element buffer is smaller than the dataset, so this is only
# a local shuffle, not a uniform permutation.
dataset = (
    dataset
    .shuffle(10000, seed=42, reshuffle_each_iteration=False)
    .batch(BATCH_SIZE)
)


Train/validation split

In [None]:

# Create training-validation sets: the first 90% of the batches are used for
# training, the remaining batches for validation.
# Ask tf.data for the number of batches instead of `len(list(dataset))`, which
# materialised every batch in memory (and did so twice).
n_batches = int(dataset.cardinality().numpy())
if n_batches == tf.data.UNKNOWN_CARDINALITY:
    # Cardinality could not be determined statically: count with a single pass.
    n_batches = sum(1 for _ in dataset)

n_train_batches = round(n_batches * 0.9)

# take/skip partition by position, so the two subsets are only disjoint when
# `dataset` yields its elements in the same order on every iteration.
train = dataset.take(n_train_batches)
val = dataset.skip(n_train_batches)

# Only the two subsets are needed from here on.
del dataset


Model definition

In [None]:
# Pre-trained DistilBERT encoder; only the layers stacked on top of it are trained.
bert = TFAutoModel.from_pretrained("distilbert-base-uncased")
# Freeze the encoder through its own handle rather than `model.layers[2]`,
# which silently targets the wrong layer if the graph layout ever changes.
bert.trainable = False

# Two integer inputs of length MAX_LEN; their names match the keys of the
# feature dict produced by `map_func`.
input_ids = tf.keras.layers.Input(shape=(MAX_LEN,), name='input_ids', dtype='int32')
mask = tf.keras.layers.Input(shape=(MAX_LEN,), name='attention_mask', dtype='int32')

# First element of the encoder output: the per-token hidden states.
embeddings = bert(input_ids, attention_mask=mask)[0]

# Classification head: max-pool over the token axis, then two dense layers.
# NOTE(review): the pooling layer does not receive the attention mask, so
# padded positions take part in the max as well - confirm this is intended.
x = tf.keras.layers.GlobalMaxPool1D()(embeddings)
x = tf.keras.layers.BatchNormalization()(x)
x = tf.keras.layers.Dense(256, activation='relu')(x)
x = tf.keras.layers.Dropout(0.1)(x)
x = tf.keras.layers.BatchNormalization()(x)
x = tf.keras.layers.Dense(128, activation='relu')(x)
# Six independent sigmoid outputs, one per label (multi-label classification).
x = tf.keras.layers.Dense(6, activation='sigmoid')(x)

model = tf.keras.Model(inputs=[input_ids, mask], outputs=x)


Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertModel: ['vocab_projector', 'vocab_layer_norm', 'activation_13', 'vocab_transform']
- This IS expected if you are initializing TFDistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFDistilBertModel were initialized from the model checkpoint at distilbert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


In [None]:
# Print the layer-by-layer overview (output shapes and parameter counts).
model.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 128)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 128)]        0           []                               
                                                                                                  
 tf_distil_bert_model_2 (TFDist  TFBaseModelOutput(l  66362880   ['input_ids[0][0]',              
 ilBertModel)                   ast_hidden_state=(N               'attention_mask[0][0]']         
                                one, 128, 768),                                                   
                                 hidden_states=None                                         

Compilation

In [None]:

# Configure training: Adam optimizer, binary cross-entropy loss on the sigmoid
# outputs, and precision / recall / AUC reported as metrics.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=[Precision(), Recall(), AUC()],
)


Training

In [None]:

# Number of passes over the training set.
epochs = 10

# Fit on `train`, evaluating on `val` after every epoch; the returned History
# object keeps the per-epoch loss and metric values.
history = model.fit(
    x=train,
    validation_data=val,
    epochs=epochs,
    verbose=1,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Persist the trained model under 'fitted3.model' in the working directory.
# (`pickle` is imported in the notebook's import cell.)
model.save('fitted3.model')

# Store the per-epoch loss/metric values next to it. Only the plain
# `history.history` dict is pickled, not the History object itself.
with open('fitted3.history', 'wb') as history_file:
    pickle.dump(history.history, history_file)

























INFO:tensorflow:Assets written to: fitted3.model/assets


INFO:tensorflow:Assets written to: fitted3.model/assets
