In [1]:
!pip install -q transformers torch-summary

In [2]:
import pandas as pd
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, RobertaForSequenceClassification
import numpy as np

In [None]:
print(torch.cuda.memory_summary())

In [3]:
# tokenizer = AutoTokenizer.from_pretrained("pig4431/TweetEval_roBERTa_5E")
tokenizer = AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-large-2022-154m")

In [4]:
# read dataframe from folder data and save it to variable df
# test_df = pd.read_csv('/mnt/home/abhinavkumar2/Profiling-Cryptocurrency-Influencers-with-FSL/data/test.csv')
train_df = pd.read_csv('/content/train.csv')
validate_df = pd.read_csv('/content/validate.csv')

In [5]:
# group the df by twitter user id and aggregate the texts and keep other columns as it is
train_df = train_df.groupby('twitter user id').agg({'texts': ' '.join, 'class': 'first', 'count_mention': sum}).reset_index()

In [6]:
validate_df = validate_df.groupby('twitter user id').agg({'texts': ' '.join, 'class': 'first', 'count_mention': sum}).reset_index()

In [7]:
labels = train_df['class'].unique().tolist()
id2label = {idx:label for idx, label in enumerate(labels)}
label2id = {label:idx for idx, label in enumerate(labels)}
labels, label2id

(['nano', 'no influencer', 'macro', 'mega', 'micro'],
 {'nano': 0, 'no influencer': 1, 'macro': 2, 'mega': 3, 'micro': 4})

In [8]:
class TweetDataset(torch.utils.data.Dataset):
    def __init__(self, tokenizer, max_len, tweet_df):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.tweets_dataset = tweet_df
  
    def __len__(self):
        return len(self.tweets_dataset)
  
    def __getitem__(self, idx):
        tweet = self.tweets_dataset.iloc[idx]['texts']
        label = self.tweets_dataset.iloc[idx]['class']
        user_id = self.tweets_dataset.iloc[idx]['twitter user id']
        label = label2id[label]
        labels_matrix = np.zeros(5)
        labels_matrix[label] = 1
   
        encoding = self.tokenizer(
            text = tweet,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
            return_token_type_ids=True
        )

        return {
            'tweet': tweet,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(labels_matrix, dtype=torch.float),
            'token_type_ids': encoding['token_type_ids'].flatten(),
            'user_id': user_id
        }

In [9]:
train_dataset = TweetDataset(tokenizer, 256, train_df)

In [None]:
train_encoded = DataLoader(train_dataset, batch_size=10, shuffle=True, num_workers=3)

In [10]:
val_dataset = TweetDataset(tokenizer, 256, validate_df)
val_encoded = DataLoader(val_dataset, batch_size=10, shuffle=True, num_workers=3)



In [11]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-large-2022-154m", 
                                                           problem_type="multi_label_classification", 
                                                           num_labels=len(labels),
                                                           ignore_mismatched_sizes=True)
# model = RobertaForSequenceClassification.from_pretrained( "pig4431/TweetEval_roBERTa_5E",
#                                                          num_labels=len(labels),
#                                                          problem_type="multi_label_classification",
#                                                          ignore_mismatched_sizes=True)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-large-2022-154m were not used when initializing RobertaForSequenceClassification: ['lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-large-2022-154m and are newly initialized: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'clas

In [None]:
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

Using pad_token, but it is not set yet.


In [12]:
from torchsummary import summary
summary(model)

Layer (type:depth-idx)                   Param #
├─RobertaModel: 1-1                      --
|    └─RobertaEmbeddings: 2-1            --
|    |    └─Embedding: 3-1               51,471,360
|    |    └─Embedding: 3-2               526,336
|    |    └─Embedding: 3-3               1,024
|    |    └─LayerNorm: 3-4               2,048
|    |    └─Dropout: 3-5                 --
|    └─RobertaEncoder: 2-2               --
|    |    └─ModuleList: 3-6              302,309,376
├─RobertaClassificationHead: 1-2         --
|    └─Linear: 2-3                       1,049,600
|    └─Dropout: 2-4                      --
|    └─Linear: 2-5                       5,125
Total params: 355,364,869
Trainable params: 355,364,869
Non-trainable params: 0


Layer (type:depth-idx)                   Param #
├─RobertaModel: 1-1                      --
|    └─RobertaEmbeddings: 2-1            --
|    |    └─Embedding: 3-1               51,471,360
|    |    └─Embedding: 3-2               526,336
|    |    └─Embedding: 3-3               1,024
|    |    └─LayerNorm: 3-4               2,048
|    |    └─Dropout: 3-5                 --
|    └─RobertaEncoder: 2-2               --
|    |    └─ModuleList: 3-6              302,309,376
├─RobertaClassificationHead: 1-2         --
|    └─Linear: 2-3                       1,049,600
|    └─Dropout: 2-4                      --
|    └─Linear: 2-5                       5,125
Total params: 355,364,869
Trainable params: 355,364,869
Non-trainable params: 0

In [13]:
batch_size = 10
metric_name = "f1"

In [14]:
from transformers import TrainingArguments, Trainer

args = TrainingArguments(
    f"roberta-tweet-english",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=50,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    save_total_limit = 3,
    warmup_steps=100,
    logging_dir='./logs',
    logging_steps=10,
    #push_to_hub=True,
)

In [15]:
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch
    
# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # finally, compute metrics
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, 
            tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds, 
        labels=p.label_ids)
    print(result)
    return result

In [16]:
trainer = Trainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [17]:
trainer.train()

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,0.6566,0.649258,0.2,0.5,0.2
2,0.6525,0.559987,0.0,0.5,0.0
3,0.5437,0.485227,0.0,0.5,0.0
4,0.4902,0.490766,0.125,0.533333,0.066667
5,0.4854,0.482454,0.210526,0.55,0.133333
6,0.4528,0.462518,0.272727,0.566667,0.133333
7,0.3441,0.450201,0.333333,0.591667,0.266667
8,0.2987,0.498754,0.444444,0.65,0.333333
9,0.2898,0.44478,0.461538,0.658333,0.4
10,0.1726,0.549711,0.357143,0.6,0.333333


{'f1': 0.20000000000000004, 'roc_auc': 0.5, 'accuracy': 0.2}
{'f1': 0.0, 'roc_auc': 0.5, 'accuracy': 0.0}
{'f1': 0.0, 'roc_auc': 0.5, 'accuracy': 0.0}
{'f1': 0.125, 'roc_auc': 0.5333333333333333, 'accuracy': 0.06666666666666667}
{'f1': 0.2105263157894737, 'roc_auc': 0.55, 'accuracy': 0.13333333333333333}
{'f1': 0.27272727272727276, 'roc_auc': 0.5666666666666667, 'accuracy': 0.13333333333333333}
{'f1': 0.33333333333333337, 'roc_auc': 0.5916666666666666, 'accuracy': 0.26666666666666666}
{'f1': 0.4444444444444445, 'roc_auc': 0.65, 'accuracy': 0.3333333333333333}
{'f1': 0.4615384615384615, 'roc_auc': 0.6583333333333333, 'accuracy': 0.4}
{'f1': 0.3571428571428571, 'roc_auc': 0.6, 'accuracy': 0.3333333333333333}
{'f1': 0.5, 'roc_auc': 0.6833333333333333, 'accuracy': 0.4666666666666667}
{'f1': 0.5517241379310344, 'roc_auc': 0.7166666666666666, 'accuracy': 0.5333333333333333}
{'f1': 0.5, 'roc_auc': 0.6833333333333333, 'accuracy': 0.4666666666666667}
{'f1': 0.5, 'roc_auc': 0.6833333333333333, '

TrainOutput(global_step=650, training_loss=0.099953117490961, metrics={'train_runtime': 1745.3837, 'train_samples_per_second': 3.466, 'train_steps_per_second': 0.372, 'total_flos': 2819120949427200.0, 'train_loss': 0.099953117490961, 'epoch': 50.0})