# Import Packages, Modules, and Data Sets

In [1]:
!pip install transformers --quiet
!pip install evaluate --quiet

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
from emoji import demojize
import matplotlib.pyplot as plt
import os, re, random, datasets, evaluate

pd.set_option('display.max_colwidth', None)
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, TrainingArguments, Trainer, EarlyStoppingCallback

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [3]:
train = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")
test_df = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")

# Pre-process Training and Test Data Sets

In [4]:
duplicates = train[train.duplicated('text')]
problematic_duplicates = []

for i in range(duplicates.text.nunique()):
    duplicate_subset = train[train.text == duplicates.text.unique()[i]]
    if len(duplicate_subset) > 1 and duplicate_subset.target.nunique() == 2:
        problematic_duplicates.append(i)
        
target_list = [0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,1,0,0]

for problematic_index in range(len(problematic_duplicates)): 
    train.target = np.where(train.text == duplicates.text.unique()[problematic_index], 
                            target_list[problematic_index], train.target)

The biggest change we are going to implement in this notebook is to perform additional pre-processing steps prior to fine-tuning the BERTweet model on Hugging Face. We are going to perform the same pre-processing steps that were performed prior to pre-training the BERTweet model as can be found in the [TweetNormalizer](https://github.com/VinAIResearch/BERTweet/blob/master/TweetNormalizer.py) module. Following are the steps that we are going to perform:

- Lower case all characters;
- Expand a few contractions (surprisingly few if you ask me. I would like to find out how they chose these contractions.)
- Replace all account usernames (following '@') with "USER";
- Replace all urls (following 'http' and/or 'www') with "HTTPURL";
- Demojize (remove all emojis)

In [5]:
def clean_tweets(text):
    
    text = text.lower()
    
    text = text.replace("n't", " n't ")
    text = text.replace("n 't", " n't ")
    text = text.replace("ca n't", "can't")
    text = text.replace("ai n't", "ain't")
    
    text = text.replace("'m", " 'm ")
    text = text.replace("'re", " 're ")
    text = text.replace("'s", " 's ")
    text = text.replace("'ll", " 'll ")
    text = text.replace("'d", " 'd ")
    text = text.replace("'ve", " 've ")
    text = text.replace("\n", " ")
    
    text = text.replace(" p . m .", " p.m.")
    text = text.replace(" p . m ", " p.m ")
    text = text.replace(" a . m .", " a.m.")
    text = text.replace(" a . m ", " a.m ")
    
    token_list = text.split(' ')
    
    token_list = [re.sub('#', '', x) for x in token_list]
    token_list = [re.sub(r'@\S+', '@USER', x) for x in token_list]
    token_list = [re.sub(r'http\S+', 'HTTPURL', x) for x in token_list]
    token_list = [re.sub(r'www\S+', 'HTTPURL', x) for x in token_list]
    token_list = [demojize(x) if len(x) == 1 else x for x in token_list]
    
    return(" ".join(token_list))

In case the "location" column provides additional information with regards to whether or not a tweet is about an actual disaster or not, let's append the substance of the location column to the text column. 

In [6]:
train.location = train.location.replace(np.nan, '', regex = True)
test_df.location = test_df.location.replace(np.nan, '', regex = True)

In [7]:
train.text = train.text + ". " + train.location + "."
test_df.text = test_df.text + ". " + test_df.location + "."

In [8]:
train.text = train.text.apply(lambda x: clean_tweets(x))
test_df.text = test_df.text.apply(lambda x: clean_tweets(x))

In [9]:
train[41:50]

Unnamed: 0,id,keyword,location,text,target
41,61,ablaze,,on the outside you 're ablaze and alive but you 're dead inside. .,0
42,62,ablaze,milky way,had an awesome time visiting the cfc head office the ancop site and ablaze. thanks to tita vida for taking care of us ??. milky way.,0
43,63,ablaze,,soooo pumped for ablaze ???? @USER .,0
44,64,ablaze,,i wanted to set chicago ablaze with my preaching... but not my hotel! HTTPURL .,0
45,65,ablaze,,i gained 3 followers in the last week. you? know your stats and grow with HTTPURL .,0
46,66,ablaze,"GREENSBORO,NORTH CAROLINA","how the west was burned: thousands of wildfires ablaze in california alone HTTPURL greensboro,north carolina.",1
47,67,ablaze,,building the perfect tracklist to life leave the streets ablaze. .,0
48,68,ablaze,Live On Webcam,check these out: HTTPURL HTTPURL HTTPURL HTTPURL nsfw. live on webcam.,0
49,71,ablaze,England.,first night with retainers in. it 's quite weird. better get used to it; i have to wear them every single night for the next year at least.. england..,0


In [10]:
train = train.groupby('target').sample(np.min(train.target.value_counts().to_list()), random_state = 1048597)
train_df, val_df = np.split(train.sample(frac = 1), [int(0.8 * len(train))])

We are going to retain the columns that we are going to need for training and evaluation: *id* for test data set evaluation, *text* and *target* for both. Moving forward, I am going to stick to a pre-processing pipeline where we store the training, validation, and test data sets (if applicable) as Datasets inside one **DatasetDict**. 

In [11]:
train_df = train_df[['id', 'text', 'target']]
val_df = val_df[['id', 'text', 'target']]
test_df = test_df[['id', 'text']]

In [12]:
train_dict = datasets.Dataset.from_dict(train_df.to_dict(orient="list"))
val_dict = datasets.Dataset.from_dict(val_df.to_dict(orient="list"))
test_dict = datasets.Dataset.from_dict(test_df.to_dict(orient="list"))

In [13]:
tweets_ds = datasets.DatasetDict({"train": train_dict, "val": val_dict, "test": test_dict})

In [14]:
tweets_ds

DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'target'],
        num_rows: 5209
    })
    val: Dataset({
        features: ['id', 'text', 'target'],
        num_rows: 1303
    })
    test: Dataset({
        features: ['id', 'text'],
        num_rows: 3263
    })
})

# Finetune Hugging Face Model

We are going to use the **BERTweet-base** model on Hugging Face. The justification for this is the following: 
1. BERTweet has been trained on 850 million English Tweets. As we are trying to classify tweets, this model will capture the subtletie that only Tweets have;
2. BERTweet has been trained based on the RoBERTa pre-training procedure. RoBERTa is generally a good model to fine-tune for classifcation purposes.

In [15]:
model_name = 'vinai/bertweet-base'
tokenizer = AutoTokenizer.from_pretrained(model_name)
data_collator = DataCollatorWithPadding(tokenizer = tokenizer)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels = 2)

Downloading (…)lve/main/config.json:   0%|          | 0.00/558 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/843k [00:00<?, ?B/s]

Downloading (…)solve/main/bpe.codes:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading pytorch_model.bin:   0%|          | 0.00/543M [00:00<?, ?B/s]

Some weights of the model checkpoint at vinai/bertweet-base were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.weight', 'lm_head.decoder.weight', 'lm_head.decoder.bias', 'lm_head.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: 

In [16]:
def tokenize_function(dataset):
    return(tokenizer(dataset['text'], truncation = True))

tokenized_data = tweets_ds.map(tokenize_function, batched = True)

  0%|          | 0/6 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

In [17]:
tokenized_data

DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'target', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 5209
    })
    val: Dataset({
        features: ['id', 'text', 'target', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1303
    })
    test: Dataset({
        features: ['id', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3263
    })
})

In [18]:
tokenized_data['train'] = tokenized_data['train'].rename_column('target', 'labels')
tokenized_data['val'] = tokenized_data['val'].rename_column('target', 'labels')
tokenized_data.with_format('pt')

DatasetDict({
    train: Dataset({
        features: ['id', 'text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 5209
    })
    val: Dataset({
        features: ['id', 'text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1303
    })
    test: Dataset({
        features: ['id', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3263
    })
})

I came to learn that the column containing the labels (predictions) have to be named "labels". If not, **trainer.train()** will return an error. As the test dataset inside the DatasetDict object does not contain the labels yet, we are going to rename the "target" columns inside the train and validation datasets as "labels". 

In [21]:
training_args = TrainingArguments(model_name,  
                                  evaluation_strategy = 'epoch',
                                  num_train_epochs = 5,
                                  learning_rate = 5e-5,
                                  weight_decay = 0.005,
                                  per_device_train_batch_size = 16,
                                  per_device_eval_batch_size = 16,
                                  report_to = 'none',
                                  load_best_model_at_end = True,
                                  save_strategy = 'epoch')

def compute_metrics(eval_pred):
    metric = evaluate.load("accuracy")
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis = -1)
    return metric.compute(predictions=predictions, references=labels)

early_stop = EarlyStoppingCallback(2, 0.01)

trainer = Trainer(
    model,
    training_args,
    train_dataset = tokenized_data["train"],
    eval_dataset = tokenized_data["val"],
    data_collator = data_collator,
    tokenizer = tokenizer,
    compute_metrics = compute_metrics,
    callbacks = [early_stop]
)

In [22]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.481905,0.816577
2,0.357600,0.492348,0.802763
3,0.357600,0.635452,0.810437


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

TrainOutput(global_step=978, training_loss=0.31767982841756937, metrics={'train_runtime': 3964.7303, 'train_samples_per_second': 6.569, 'train_steps_per_second': 0.411, 'total_flos': 340291497339420.0, 'train_loss': 0.31767982841756937, 'epoch': 3.0})

# Prepare for Submission

In [23]:
test_predictions = trainer.predict(tokenized_data["test"])
preds = np.argmax(test_predictions.predictions, axis = 1)

In [24]:
submission = pd.DataFrame(list(zip(test_df.id, preds)), 
                          columns = ["id", "target"])
submission.to_csv("submission.csv", index = False)

Despite introducing an additional pre-processing step, the fine-tuned model did not perform better than [my best attempt](https://www.kaggle.com/code/l048596/disaster-tweets-bertweet-tensorflow). In following notebooks, I am going to learn to add additional layers to fine-tuned BERT models so that we can increase the expressive power of our models. It is my understanding that this can be done in one of two ways: 1) we define a custom module where we add layers on top of a pre-trained BERT model; 2) use input layers in tensorflow such that the input of the model are outputs of the pre-trained BERT model (similar to the one we have in the aforementioned notebook). 