In [53]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [54]:
!pip install transformers[torch]

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [55]:
!pip install lion-pytorch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting lion-pytorch
  Downloading lion_pytorch-0.1.2-py3-none-any.whl (4.4 kB)
Installing collected packages: lion-pytorch
Successfully installed lion-pytorch-0.1.2


In [56]:
!pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [57]:
pip install ensemble-transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


### Tasks and goals:

Task from that Kaggle competition regarding to identification real posts about disaster based on only text post:

https://www.kaggle.com/competitions/nlp-getting-started/overview

Actually goals of these project were achieved: comparison of new optimizer LION with frequently used AdamW and completion transformer's ensemle. But with using only GPU T4.

For completion of that task were following steps done:
- data clear from hashtags, url etc,
- doubled info also was cleared: if doubled text has different label, than final label was recalculated based on average,
- choised 3 encoder-only transformers: BERT, Distilled RoBERTa, and Distilled RoBERTa, which was finetuned on fake-news detection,
- all these base models were finetuned on train dataset,
- and after that finally ensemble of transformers were prepared.

Final F1-score of transformers ensemble were achieved on level 83%.

Also new optimizer LION was tried, but on T4 it was not good idea, because LION is more effective than AdamW, when batch size bigger than 64. So with small batch size LION is not effective - please see details final section after Conclusion.

### Imports

In [68]:
import torch
from torch.utils.data import DataLoader
import random
import re
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel,  AutoModelForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
import datasets
from datasets import Dataset
from transformers import pipeline
from lion_pytorch import Lion
from ensemble_transformers import EnsembleModelForSequenceClassification
import time
from tqdm.auto import tqdm


In [59]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
torch.__version__

'2.0.1+cu118'

### Data preprocessing

In [60]:
import pandas as pd
dataset = pd.read_csv("/content/drive/MyDrive/Kaggle_tweet/train.csv")
dataset.head(3)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1


In [None]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [None]:
dataset['target'].value_counts()

0    4342
1    3271
Name: target, dtype: int64

In [61]:
# set of variables and base model choice
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model1 = "bert-base-cased"
model2 = "distilroberta-base"
model3 = "vikram71198/distilroberta-base-finetuned-fake-news-detection"
num_labels=2
label_column = 'target'
text_column = 'text'
random_seed = 42

In [62]:
# data clear from hashtags, url etc
# doubled info also was cleared:
# if doubled text has different label, than
# final label based on average:
# 1 if average > 0.51 and 0 if average < 0.49

urls_regex = r'http\S+'
mentions_regex = r'@\S+'
hashtags_regex = r'#\S+'
enter_regex = r'\n'

patterns = "|".join([urls_regex, mentions_regex, hashtags_regex, enter_regex])

clean_text = lambda t: re.sub(patterns, "", t)
clean_text_column = 'text_clean'

dataset[clean_text_column] = dataset[text_column].apply(clean_text)

dataset = dataset.groupby(clean_text_column).agg({
        'id': 'first',
        'keyword': 'first',
        'location': 'first',
        'text': 'first',
        'target': 'mean'
    }).reset_index()

dataset = dataset[(dataset['target'] < 0.49) | (dataset['target'] > 0.51)]
dataset['target'] = round(dataset['target'])
dataset['target']=dataset['target'].astype(int)


In [63]:
# keywords was added to text
# but location - not (because
# probably location has not influence)

text_train_column = 'text_train'
text_column = 'text_clean'
columns_to_merge = ['text_clean', 'keyword']

def get_value(row, key, default_value='unknown'):
    if key == text_train_column:
        return row[key]
    value = default_value if pd.isna(row[key]) else row[key]
    return f'{key.capitalize()}: {value}'

def preprocess_text(df):
    columns_to_merge = [text_train_column, 'keyword']
    df[text_train_column] = df[text_column].apply(lambda t: re.sub(urls_regex, "", t))
    df[text_train_column]= df.apply(lambda row: '. '.join([get_value(row, c) for c in columns_to_merge]), axis=1)
    return df

dataset = preprocess_text(dataset)
dataset.rename(columns={'target':'labels'}, inplace=True)

In [64]:
# train and test split
train_full, test = train_test_split(dataset, test_size=0.1, random_state=42, stratify=dataset['labels'])
print(f'Train_full size:\t{train_full.shape[0]}\nTest size:\t{test.shape[0]}')

Train_full size:	6214
Test size:	691


In [None]:
torch.cuda.empty_cache()

### Main functions

In [65]:
def tokenize_function(examples):
    """Tokenisation of text"""

    return tokenizer(examples["text_train"], padding="max_length", truncation=True)

In [66]:
def train_eval_dataloader(train_full):
    """Train and validation dataloaders"""

    train, val = train_test_split(train_full, test_size=0.15, random_state=42, stratify=train_full['labels'])
    train_data = Dataset.from_dict(train[['labels', 'text_train']])
    val_data = Dataset.from_dict(val[['labels', 'text_train']])
    my_dataset_dict = datasets.DatasetDict({"train":train_data, "test": val_data})

    tokenized_datasets = my_dataset_dict.map(tokenize_function, batched=True)
    tokenized_datasets = tokenized_datasets.remove_columns(["text_train"])
    tokenized_datasets.set_format("torch")

    train_dataloader = DataLoader(tokenized_datasets["train"], shuffle=True, batch_size=8)
    eval_dataloader = DataLoader(tokenized_datasets["test"], batch_size=8)

    return train_dataloader, eval_dataloader

In [67]:
def model_train(model, path2save):
    """Training and saving of model"""

    # train params
    num_epochs = 4
    num_training_steps = num_epochs * len(train_dataloader)
    optimizer = AdamW(model.parameters(), lr=5e-6, weight_decay=2e-3)
    lr_scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

    torch._dynamo.config.suppress_errors = True

    best_val_accuracy = 0.0
    progress_bar = tqdm(range(num_training_steps))
    val_acc = 0.0
    loss = 0.0
    for epoch in range(num_epochs):
        # train
        model.train()
        for batch in train_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}

            outputs = model(**batch)

            loss = outputs.loss
            loss.backward()

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

            progress_bar.update(1)

        # validation accuracy
        model.eval()

        accr = 0
        j = 0
        for batch in eval_dataloader:
            y_act = batch['labels']
            y_act = torch.Tensor(y_act).to('cpu').long()
            batch = {k: v.to(device) for k, v in batch.items()}
            with torch.no_grad():
                output = model(**batch)

            logits = output.logits
            y_pred = []
            for i in range(len(logits)):
              if logits[i][0]>= logits[i][1]:
                y_pred.append(0)
              else:
                y_pred.append(1)
            y_pred = torch.Tensor(y_pred).to('cpu').long()
            accr += accuracy_score(y_act, y_pred)
            j +=1
            val_acc = accr/j

        print(f"\rEpoch: {epoch + 1}, Validation accuracy: {val_acc}")

        # saving checkpoints with good metric
        if val_acc > best_val_accuracy:
            print("Saving checkpoint!")
        best_val_accuracy = val_acc
        checkpoint_path = f"{path2save}/checkpoints_epoch_{(epoch+1)}"
        model.save_pretrained(checkpoint_path)
        tokenizer.save_pretrained(checkpoint_path)

    # model saving
    model_path = f"{path2save}/fine-tune-version"
    model.save_pretrained(model_path)
    tokenizer.save_pretrained(model_path)

    print("training is finished")


In [69]:
def class_report_test(model_path):
    """Classification report for test dataset"""

    classifier = pipeline("sentiment-analysis", model=model_path, tokenizer=model_path, device =0)

    X = test["text_train"].to_list()
    y_act =  test[ "labels"].to_list()
    labels = [1, 0]

    y_pred = []
    for i in range(len(X)):
      if classifier(X[i])[0]['label'] == 'LABEL_1':
        y_pred.append(1)
      else:
        y_pred.append(0)

    print(classification_report(y_pred, y_act, labels=labels))

### Train of BERT

In [70]:
tokenizer = AutoTokenizer.from_pretrained(model1)
model = AutoModelForSequenceClassification.from_pretrained(model1, num_labels=num_labels)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initi

In [71]:
train_dataloader, eval_dataloader = train_eval_dataloader(train_full)

Map:   0%|          | 0/5281 [00:00<?, ? examples/s]

Map:   0%|          | 0/933 [00:00<?, ? examples/s]

In [72]:
model = torch.compile(model)
model.to(device)

OptimizedModule(
  (_orig_mod): BertForSequenceClassification(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(28996, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-11): 12 x BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features

In [None]:
model_train(model, model1)

  0%|          | 0/2644 [00:00<?, ?it/s]



Epoch: 1, Validation accuracy: 0.8049145299145299
Saving checkpoint!
Epoch: 2, Validation accuracy: 0.8326923076923076
Saving checkpoint!
Epoch: 3, Validation accuracy: 0.8316239316239316
Epoch: 4, Validation accuracy: 0.8273504273504273
training is finished


In [None]:
torch.cuda.empty_cache()

### Train of distilled RoBERTa

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model2)
model = AutoModelForSequenceClassification.from_pretrained(model2, num_labels=num_labels)

Downloading (…)lve/main/config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json: 0.00B [00:00, ?B/s]

Downloading (…)olve/main/merges.txt: 0.00B [00:00, ?B/s]

Downloading (…)/main/tokenizer.json: 0.00B [00:00, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/331M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilroberta-base were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.out_proj.weight', 'classifier.dense.bia

In [None]:
train_dataloader, eval_dataloader = train_eval_dataloader(train_full)

Map:   0%|          | 0/5281 [00:00<?, ? examples/s]

Map:   0%|          | 0/933 [00:00<?, ? examples/s]

In [None]:
model = torch.compile(model)
model.to(device)

OptimizedModule(
  (_orig_mod): RobertaForSequenceClassification(
    (roberta): RobertaModel(
      (embeddings): RobertaEmbeddings(
        (word_embeddings): Embedding(50265, 768, padding_idx=1)
        (position_embeddings): Embedding(514, 768, padding_idx=1)
        (token_type_embeddings): Embedding(1, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): RobertaEncoder(
        (layer): ModuleList(
          (0-5): 6 x RobertaLayer(
            (attention): RobertaAttention(
              (self): RobertaSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): RobertaSelfOutput(
                (dens

In [None]:
model_train(model, model2)

  0%|          | 0/2644 [00:00<?, ?it/s]



Epoch: 1, Validation accuracy: 0.8188034188034188
Saving checkpoint!
Epoch: 2, Validation accuracy: 0.8316239316239316
Saving checkpoint!
Epoch: 3, Validation accuracy: 0.8305555555555555
Epoch: 4, Validation accuracy: 0.8241452991452991
training is finished


In [None]:
torch.cuda.empty_cache()

### Train of distilled RoBERTa, which was finetuned on fake-news detection

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model3)
model = AutoModelForSequenceClassification.from_pretrained(model3, num_labels=num_labels)


Downloading (…)okenizer_config.json:   0%|          | 0.00/386 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json: 0.00B [00:00, ?B/s]

Downloading (…)olve/main/merges.txt: 0.00B [00:00, ?B/s]

Downloading (…)/main/tokenizer.json: 0.00B [00:00, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/740 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/329M [00:00<?, ?B/s]

In [None]:
train_dataloader, eval_dataloader = train_eval_dataloader(train_full)

Map:   0%|          | 0/5281 [00:00<?, ? examples/s]

Map:   0%|          | 0/933 [00:00<?, ? examples/s]

In [None]:
model = torch.compile(model)
model.to(device)

OptimizedModule(
  (_orig_mod): RobertaForSequenceClassification(
    (roberta): RobertaModel(
      (embeddings): RobertaEmbeddings(
        (word_embeddings): Embedding(50265, 768, padding_idx=1)
        (position_embeddings): Embedding(514, 768, padding_idx=1)
        (token_type_embeddings): Embedding(1, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): RobertaEncoder(
        (layer): ModuleList(
          (0-5): 6 x RobertaLayer(
            (attention): RobertaAttention(
              (self): RobertaSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): RobertaSelfOutput(
                (dens

In [None]:
model_train(model, model3)

  0%|          | 0/2644 [00:00<?, ?it/s]



Epoch: 1, Validation accuracy: 0.7803418803418803
Saving checkpoint!
Epoch: 2, Validation accuracy: 0.8134615384615385
Saving checkpoint!
Epoch: 3, Validation accuracy: 0.8198717948717948
Saving checkpoint!
Epoch: 4, Validation accuracy: 0.8145299145299145
training is finished


In [None]:
torch.cuda.empty_cache()

### Classification reports for transformers

In [None]:
# for finetuned BERT
class_report_test("/content/drive/MyDrive/bert-base-cased/fine-tune-version")

Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


              precision    recall  f1-score   support

           1       0.80      0.79      0.80       287
           0       0.85      0.86      0.86       404

    accuracy                           0.83       691
   macro avg       0.83      0.83      0.83       691
weighted avg       0.83      0.83      0.83       691



In [None]:
# for finetuned Distil-RoBERTa
class_report_test("/content/drive/MyDrive/distilroberta-base/fine-tune-version")

              precision    recall  f1-score   support

           1       0.73      0.81      0.76       254
           0       0.88      0.82      0.85       437

    accuracy                           0.82       691
   macro avg       0.80      0.82      0.81       691
weighted avg       0.82      0.82      0.82       691



In [None]:
# for finetuned Distil-RoBERTa, which previously was finetuned on fake-news detection
class_report_test("/content/drive/MyDrive/vikram71198/distilroberta-base-finetuned-fake-news-detection/fine-tune-version")

              precision    recall  f1-score   support

           1       0.73      0.81      0.77       253
           0       0.88      0.82      0.85       438

    accuracy                           0.82       691
   macro avg       0.80      0.82      0.81       691
weighted avg       0.83      0.82      0.82       691



### Transformers Ensemble

In [None]:
ensemble = EnsembleModelForSequenceClassification.from_multiple_pretrained(
    "/content/drive/MyDrive/bert-base-cased/fine-tune-version", "/content/drive/MyDrive/distilroberta-base/fine-tune-version",
    "/content/drive/MyDrive/vikram71198/distilroberta-base-finetuned-fake-news-detection/fine-tune-version"
)
X = test["text_train"].to_list()
y_pred = []
for i in range(len(X)):
  batch = X[i]
  output = ensemble(batch)
  stacked_output = ensemble(batch, mean_pool=True)
  result = stacked_output.logits
  if result[0][0] > result[0][1]:
    y_pred.append(0)
  else:
    y_pred.append(1)
y_act =  test[ "labels"].to_list()

print(classification_report(y_pred, y_act, labels= [1, 0] ))

              precision    recall  f1-score   support

           1       0.76      0.81      0.78       265
           0       0.88      0.84      0.86       426

    accuracy                           0.83       691
   macro avg       0.82      0.82      0.82       691
weighted avg       0.83      0.83      0.83       691



### Conclusion:

Actually goals of these project were achieved: comparison of new optimizer LION with frequently used AdamW and completion transformer's ensemle. But with using only GPU T4.

However there are the following ways for result impovement:

First of all, augmentation of train data based on translation on other languages, because current train dataset is not big.

Secondly, using more  performing GPU, it will be appropriate to use advantages new optimizer LION, increasing batch size (starting from 64 and bigger) and qauntity of epoch.

Also, using of full versions of bert-like transformers probably will make additional increase for metrics.

### Training with LION optimizer

In [74]:
def model_train_LION(model, path2save):
    """Training with LION and saving of model"""

    # train params
    num_epochs = 4
    num_training_steps = num_epochs * len(train_dataloader)
    optimizer = Lion(model.parameters(), lr=1e-4, weight_decay=1e-2)
    lr_scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

    torch._dynamo.config.suppress_errors = True

    best_val_accuracy = 0.0
    progress_bar = tqdm(range(num_training_steps))
    val_acc = 0.0
    loss = 0.0
    for epoch in range(num_epochs):
        # train
        model.train()
        for batch in train_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}

            outputs = model(**batch)

            loss = outputs.loss
            loss.backward()

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

            progress_bar.update(1)

        # validation accuracy
        model.eval()

        accr = 0
        j = 0
        for batch in eval_dataloader:
            y_act = batch['labels']
            y_act = torch.Tensor(y_act).to('cpu').long()
            batch = {k: v.to(device) for k, v in batch.items()}
            with torch.no_grad():
                output = model(**batch)

            logits = output.logits
            y_pred = []
            for i in range(len(logits)):
              if logits[i][0]>= logits[i][1]:
                y_pred.append(0)
              else:
                y_pred.append(1)
            y_pred = torch.Tensor(y_pred).to('cpu').long()
            accr += accuracy_score(y_act, y_pred)
            j +=1
            val_acc = accr/j

        print(f"\rEpoch: {epoch + 1}, Validation accuracy: {val_acc}")

        # saving checkpoints with good metric
        if val_acc > best_val_accuracy:
            print("Saving checkpoint!")
        best_val_accuracy = val_acc
        checkpoint_path = f"{path2save}/LION/checkpoints_epoch_{(epoch+1)}"
        model.save_pretrained(checkpoint_path)
        tokenizer.save_pretrained(checkpoint_path)

    # model saving
    model_path = f"{path2save}/LION/fine-tune-version"
    model.save_pretrained(model_path)
    tokenizer.save_pretrained(model_path)

    print("training is finished")

See below that accuracy on validation for BERT training with LION dramatically low than  with using AdamW

In [75]:
%%time

# BERT base model
model_train_LION(model, model1)

  0%|          | 0/2644 [00:00<?, ?it/s]



Epoch: 1, Validation accuracy: 0.5927350427350427
Saving checkpoint!
Epoch: 2, Validation accuracy: 0.5927350427350427
