In [1]:
!pip install evaluate --quiet
!pip install emoji --quiet

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
from emoji import demojize
import tensorflow_hub as hub
import matplotlib.pyplot as plt
import os, re, random, datasets, evaluate
from sklearn.model_selection import train_test_split

pd.set_option('display.max_colwidth', None)
from transformers import AutoTokenizer, TFAutoModel, DataCollatorWithPadding, TrainingArguments, Trainer, EarlyStoppingCallback

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [3]:
# Directory holding the competition CSV files.
DATA_DIR = "/kaggle/input/nlp-getting-started"

# train.csv carries the `target` column used below; test.csv is what the
# submission at the end of the notebook is predicted for.
train = pd.read_csv(DATA_DIR + "/train.csv")
test_df = pd.read_csv(DATA_DIR + "/test.csv")

# Preprocess the Data Sets

In the past few notebooks, I established a pre-processing pipeline in which I: (1) identify mislabeled tweets (duplicate tweets whose labels are not identical), (2) append the contents of the location column to the text column, and (3) clean the tweets following the pre-processing steps that VinAI used prior to training the BERTweet model. You can read more about these steps in one of my previous notebooks [here](https://www.kaggle.com/code/l048596/disaster-tweets-bertweet-pytorch-ii-82-62?kernelSessionId=139348416).

In [4]:
# Rows whose text already appeared earlier in the training set, and the distinct
# texts among them (computed once rather than on every loop iteration).
duplicates = train[train.duplicated('text')]
duplicate_texts = duplicates.text.unique()

# Positions (into duplicate_texts) of the texts whose copies disagree on the target.
problematic_duplicates = []

for i, duplicate_text in enumerate(duplicate_texts):
    duplicate_subset = train[train.text == duplicate_text]
    if len(duplicate_subset) > 1 and duplicate_subset.target.nunique() == 2:
        problematic_duplicates.append(i)

# Replacement labels, one per conflicting text, in the order collected above
# (presumably assigned by hand after reading each tweet - verify against the data).
target_list = [0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,1,0,0]

# Overwrite the target of every copy of each conflicting text.
# The stored position must be used for the lookup: indexing duplicate_texts with a
# plain 0..N-1 counter would relabel the first N duplicated texts instead of the
# conflicting ones. zip stops at the shorter of the two sequences.
for problematic_index, corrected_target in zip(problematic_duplicates, target_list):
    train.target = np.where(train.text == duplicate_texts[problematic_index],
                            corrected_target, train.target)

In [5]:
def clean_tweets(text):
    """Lower-case a tweet and normalise contractions, hashtags, mentions and URLs.

    The text is lower-cased, contraction suffixes are set off with spaces,
    newlines become spaces and spaced-out "a . m ." / "p . m ." forms are
    rejoined. The result is split on single spaces; every token then has '#'
    removed, '@...' runs replaced by '@USER', and 'http...' / 'www...' runs
    replaced by 'HTTPURL'. A token that ends up exactly one character long is
    passed through demojize. Tokens are re-joined with single spaces.
    """
    # Literal substitutions, applied strictly in this order: later entries rely
    # on the spacing introduced by earlier ones.
    literal_swaps = (
        ("n't", " n't "),
        ("n 't", " n't "),
        ("ca n't", "can't"),
        ("ai n't", "ain't"),
        ("'m", " 'm "),
        ("'re", " 're "),
        ("'s", " 's "),
        ("'ll", " 'll "),
        ("'d", " 'd "),
        ("'ve", " 've "),
        ("\n", " "),
        (" p . m .", " p.m."),
        (" p . m ", " p.m "),
        (" a . m .", " a.m."),
        (" a . m ", " a.m "),
    )
    # Regex substitutions run on each token, in order.
    token_patterns = (
        ('#', ''),
        (r'@\S+', '@USER'),
        (r'http\S+', 'HTTPURL'),
        (r'www\S+', 'HTTPURL'),
    )

    cleaned = text.lower()
    for old, new in literal_swaps:
        cleaned = cleaned.replace(old, new)

    pieces = []
    for piece in cleaned.split(' '):
        for pattern, replacement in token_patterns:
            piece = re.sub(pattern, replacement, piece)
        # Only single-character tokens are handed to demojize.
        if len(piece) == 1:
            piece = demojize(piece)
        pieces.append(piece)

    return " ".join(pieces)

In [6]:
# Missing locations become empty strings so the concatenation below never yields NaN.
train.location = train.location.fillna('')
test_df.location = test_df.location.fillna('')

# Append the location to the tweet as a trailing sentence: "<text>. <location>."
# NOTE: this rewrites the text column in place, so re-running the cell appends
# the location a second time.
train.text = train.text + ". " + train.location + "."
test_df.text = test_df.text + ". " + test_df.location + "."

# Normalise every combined tweet with clean_tweets.
train.text = train.text.apply(clean_tweets)
test_df.text = test_df.text.apply(clean_tweets)

In [7]:
# One seed for both the undersampling and the shuffle, so the split is reproducible.
SEED = 1048596

# Undersample so that each target class keeps as many rows as the rarer class has.
minority_count = train.target.value_counts().min()
train = train.groupby('target').sample(minority_count, random_state = SEED)

# Shuffle, then cut 85% / 15% into training and validation frames.
# Plain positional slicing replaces np.split, which goes through a deprecated
# DataFrame code path in recent pandas releases.
shuffled = train.sample(frac = 1, random_state = SEED)
split_at = int(0.85 * len(shuffled))
train_df, val_df = shuffled.iloc[:split_at], shuffled.iloc[split_at:]

# Load Pre-trained Model for Tokenization

Initially, I had planned to fine-tune the BERTweet model and add one or two Dense and Dropout layers so that we can enhance the expressive power of the model that we had in the previous [notebook](https://www.kaggle.com/code/l048596/disaster-tweets-bertweet-pytorch-ii-82-62?kernelSessionId=139348416). However, instead of doing that, I decided to learn to fine-tune another Hugging Face model just using TensorFlow, in this case DistilBERT, and figure out a standard procedure that I can use to add additional Keras layers on top of the fine-tuned models on Hugging Face for text classification. 

In [8]:
model_name = 'distilbert-base-uncased'

# Keyword options forwarded verbatim to from_pretrained.
# NOTE(review): `normalization` looks specific to BERTweet-style tokenizers, and the
# special-token / padding / attention-mask flags are normally supplied when the
# tokenizer is called - confirm they have any effect when passed at load time.
tokenizer_options = dict(
    normalization = True,
    use_fast = False,
    add_special_tokens = True,
    pad_to_max_length = True,
    return_attention_mask = True,
)

tokenizer = AutoTokenizer.from_pretrained(model_name, **tokenizer_options)

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [9]:
# Both splits are encoded identically: padded to the maximum length and truncated.
encode_options = {"padding": "max_length", "truncation": True}

# `.data` is kept so later cells can index the encodings by input name.
train_tokens = tokenizer(train_df.text.to_list(), **encode_options).data
val_tokens = tokenizer(val_df.text.to_list(), **encode_options).data

In [10]:
def extract_features(tokens, labels, batch_size = 16, shuffle = True):
    """Build a batched tf.data pipeline of (model inputs, label) pairs.

    Args:
        tokens: mapping from tokenizer input name to the encoded values; only the
            keys listed in tokenizer.model_input_names are used.
        labels: targets aligned row-for-row with the encoded inputs.
        batch_size: rows per batch. The original author notes that a batch size
            of 64 results in a GPU OOM error.
        shuffle: when True (the default) the rows are shuffled with a buffer
            covering the whole data set; pass False for evaluation data, where
            ordering does not matter.

    Returns:
        A tf.data.Dataset that is batched and prefetched with tf.data.AUTOTUNE.
    """
    features = {x: tokens[x] for x in tokenizer.model_input_names}
    dataset = tf.data.Dataset.from_tensor_slices((features, labels))
    if shuffle:
        dataset = dataset.shuffle(len(labels))
    return dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)

train_features = extract_features(train_tokens, train_df.target)
# Validation data is only scored, never trained on, so it is left unshuffled.
val_features = extract_features(val_tokens, val_df.target, shuffle = False)

I tried using a number of different pre-trained models on Hugging Face and came to realize that one has to modify certain parts of the code to make things work (if you simply change the *model_name* to some other model on Hugging Face, you will most likely get errors). So, before we proceed to using the DistilBERT model here, let's take a look at the input and output of the DistilBERT model to determine which parts of the code have to be modified so that we can further fine-tune Hugging Face models:

In [11]:
# Two throw-away sentences, used only to look at what the bare encoder returns.
text = ["Replace me by any text you'd like.", "My name is Messi Lee"]
bert_model = TFAutoModel.from_pretrained(model_name)

encoded_input = tokenizer(text, padding = "max_length", truncation = True, return_tensors = 'tf')

# The model is called with a positional list: token ids first, attention mask second.
model_inputs = [encoded_input[key] for key in ('input_ids', 'attention_mask')]
output = bert_model(model_inputs)
output

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertModel: ['vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing TFDistilBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


TFBaseModelOutput(last_hidden_state=<tf.Tensor: shape=(2, 512, 768), dtype=float32, numpy=
array([[[ 4.41138633e-04, -2.62405723e-01, -1.01915449e-01, ...,
         -6.27640188e-02,  2.75840908e-01,  3.70140642e-01],
        [ 7.22330213e-01,  1.64490327e-01,  4.00247574e-01, ...,
          1.91608697e-01,  4.04579461e-01, -5.80942333e-02],
        [ 2.81979889e-01, -1.74299002e-01,  3.90757024e-02, ...,
          2.76808701e-02,  1.18860215e-01,  9.14387286e-01],
        ...,
        [ 1.63950697e-01,  4.58683260e-02,  9.72631425e-02, ...,
         -1.33402888e-02, -1.13538861e-01, -3.34521234e-02],
        [ 2.25677252e-01,  5.23837283e-02,  2.04405099e-01, ...,
         -1.12724975e-02, -1.48728728e-01, -5.23389578e-02],
        [ 1.87630862e-01, -1.89079866e-01,  3.37699950e-02, ...,
          1.79747820e-01, -2.19924212e-01, -1.39652103e-01]],

       [[-6.15856461e-02, -1.84155684e-02,  6.84209168e-02, ...,
          3.34893987e-02,  2.31654853e-01,  2.43140161e-01],
        [ 2.

It is important to note that the last hidden state of the DistilBERT model is of shape (2, 512, 768). Here, 2 corresponds to the number of texts, 512 corresponds to the length of the tokenized input, and 768 corresponds to the dimension of the DistilBERT embedding. The value 512 is going to be used for the shape of the input layers in our model. Depending on the BERT model configuration, this value can vary (e.g., the BERTweet model had a value of 128). Furthermore, we are going to take the very first (0th) element of the BERT output (instead of the second element, as for the BERTweet model) and feed it into the Dropout layer. Also, I've come to notice that using a relatively large dropout value (we are going to try 0.7 here) helps the model trained here generalize:

In [12]:
SEQ_LEN = 512  # length of every padded input sequence fed to the model

bert_model = TFAutoModel.from_pretrained(model_name)

input_ids = tf.keras.Input(shape=(SEQ_LEN,), dtype = 'int32', name = 'input_ids')
attention_masks = tf.keras.Input(shape=(SEQ_LEN,), dtype ='int32', name = 'attention_mask')

# Element 0 of the encoder output goes through dropout -> flatten -> one sigmoid unit.
sequence_output = bert_model([input_ids, attention_masks])[0]
regularised = tf.keras.layers.Dropout(0.7)(sequence_output)
flattened = tf.keras.layers.Flatten()(regularised)
output = tf.keras.layers.Dense(1, activation = 'sigmoid')(flattened)

model = tf.keras.models.Model(inputs = [input_ids, attention_masks], outputs = output)

# Binary cross-entropy on the sigmoid output, tracked alongside accuracy.
optimizer = tf.keras.optimizers.legacy.Adam(learning_rate = 1e-5)
model.compile(optimizer = optimizer,
              loss = tf.keras.losses.BinaryCrossentropy(),
              metrics = ['accuracy'])

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertModel: ['vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing TFDistilBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


In [14]:
# Print the layer-by-layer table (output shapes, parameter counts, connections) of the assembled model.
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 512)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 512)]        0           []                               
                                                                                                  
 tf_distil_bert_model_1 (TFDist  TFBaseModelOutput(l  66362880   ['input_ids[0][0]',              
 ilBertModel)                   ast_hidden_state=(N               'attention_mask[0][0]']         
                                one, 512, 768),                                                   
                                 hidden_states=None                                           

Finally, it helps to inspect the model using the **summary()** function, as it shows the output shapes for all layers. As the output of the DistilBERT model is of shape (None, 512, 768), the output stays three-dimensional even after it is fed into the Dropout layer and the Dense layer (a Dense layer with one unit applied directly would produce shape (None, 512, 1)). However, for us to calculate the metric specified in this model ("accuracy"), the output of the final layer of this model has to be of shape (None, 1). To achieve that, we introduce a Flatten layer right before the Dense layer.

In [15]:
# Where the best weights are written during training.
CHECKPOINT_PATH = 'model/best_performed_model'

# Stop training once val_loss has gone two epochs without improving.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor = 'val_loss', patience = 2)

# Save weights only, and only when val_loss improves on the best value so far.
checkpoint_options = dict(
    filepath = CHECKPOINT_PATH,
    save_weights_only = True,
    save_best_only = True,
    monitor = 'val_loss',
    verbose = 1,
)
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(**checkpoint_options)

In [16]:
# Train for at most 30 epochs; the callbacks may end the run earlier and
# write the best weights to disk along the way.
training_callbacks = [early_stopping, model_checkpoint_callback]
model.fit(train_features,
          validation_data = val_features,
          epochs = 30,
          callbacks = training_callbacks)

Epoch 1/30
Epoch 1: val_loss improved from inf to 0.37805, saving model to model/best_performed_model
Epoch 2/30
Epoch 2: val_loss improved from 0.37805 to 0.37747, saving model to model/best_performed_model
Epoch 3/30
Epoch 3: val_loss did not improve from 0.37747
Epoch 4/30
Epoch 4: val_loss did not improve from 0.37747


<keras.callbacks.History at 0x7f7fa64010f0>

# Prepare for Submission

I use the **model_checkpoint_callback** with *save_best_only* set to True. At the end of each epoch, when the validation loss is calculated, the callback checks whether that validation loss is the smallest seen so far, and if it is, it saves the model weights to the designated path. This way, if the model overfits, the weights that performed best with respect to validation loss can be loaded for test set prediction. However, in my past notebooks, despite using the callback, I had not loaded the best performing weights before using the model to predict labels for the test data set. Moving forward, the following line of code will be called so that we use the best performing model for prediction.

Let's load the model using the **load_weights()** function and then evaluate the model using the validation data set. If the best performing model was restored, validation accuracy should be 84.34% as it was at the second epoch: 

In [18]:
# Restore the checkpointed weights, then score them on the validation set.
best_weights_path = 'model/best_performed_model'
model.load_weights(best_weights_path)
model.evaluate(val_features)



[0.3774731457233429, 0.8433981537818909]

In [19]:
# Encode the test tweets with the same padding/truncation settings as the
# training data, returned as TensorFlow tensors.
test_encoding = tokenizer(test_df.text.tolist(),
                          padding = "max_length",
                          truncation = True,
                          return_tensors = 'tf')
test_token = test_encoding.data

In [20]:
# Sigmoid outputs for the test tweets (one single-element row per tweet, as
# produced by the final one-unit Dense layer).
predictions = model.predict(test_token)
# Threshold at 0.5 in one vectorised step instead of a per-row Python loop.
pred = (predictions[:, 0] > 0.5).astype(int).tolist()



In [21]:
# Pair every test id with its predicted label and write the two-column CSV.
submission = pd.DataFrame({"id": test_df.id.to_list(), "target": list(pred)})
submission.to_csv("submission.csv", index = False)

The next notebook is going to be a slightly modified version of this notebook where I fine-tune DeBERTa, in light of the discussions that have been taking place around many NLP competitions on Kaggle. Let's see if an equivalent model that fine-tunes DeBERTa performs significantly better than the one that fine-tunes DistilBERT and/or BERTweet.