In [1]:
!pip install -q transformers torch-summary

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m56.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m56.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
import pandas as pd
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, RobertaForSequenceClassification
import numpy as np

In [None]:
print(torch.cuda.memory_summary())

In [4]:
tokenizer = AutoTokenizer.from_pretrained("pig4431/TweetEval_roBERTa_5E")

Downloading (…)okenizer_config.json:   0%|          | 0.00/427 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

In [7]:
# read dataframe from folder data and save it to variable df
# test_df = pd.read_csv('/mnt/home/abhinavkumar2/Profiling-Cryptocurrency-Influencers-with-FSL/data/test.csv')
train_df = pd.read_csv('/content/train.csv')
validate_df = pd.read_csv('/content/validate.csv')

In [8]:
# group the df by twitter user id and aggregate the texts and keep other columns as it is
train_df = train_df.groupby('twitter user id').agg({'texts': ' '.join, 'class': 'first', 'count_mention': sum}).reset_index()

In [9]:
validate_df = validate_df.groupby('twitter user id').agg({'texts': ' '.join, 'class': 'first', 'count_mention': sum}).reset_index()

In [10]:
labels = train_df['class'].unique().tolist()
id2label = {idx:label for idx, label in enumerate(labels)}
label2id = {label:idx for idx, label in enumerate(labels)}
labels, label2id

(['nano', 'no influencer', 'macro', 'mega', 'micro'],
 {'nano': 0, 'no influencer': 1, 'macro': 2, 'mega': 3, 'micro': 4})

In [11]:
class TweetDataset(torch.utils.data.Dataset):
    def __init__(self, tokenizer, max_len, tweet_df):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.tweets_dataset = tweet_df
  
    def __len__(self):
        return len(self.tweets_dataset)
  
    def __getitem__(self, idx):
        tweet = self.tweets_dataset.iloc[idx]['texts']
        label = self.tweets_dataset.iloc[idx]['class']
        user_id = self.tweets_dataset.iloc[idx]['twitter user id']
        label = label2id[label]
        labels_matrix = np.zeros(5)
        labels_matrix[label] = 1
   
        encoding = self.tokenizer(
            text = tweet,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
            return_token_type_ids=True
        )

        return {
            'tweet': tweet,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(labels_matrix, dtype=torch.float),
            'token_type_ids': encoding['token_type_ids'].flatten(),
            'user_id': user_id
        }

In [12]:
train_dataset = TweetDataset(tokenizer, 256, train_df)

In [None]:
train_encoded = DataLoader(train_dataset, batch_size=10, shuffle=True, num_workers=3)

In [13]:
val_dataset = TweetDataset(tokenizer, 256, validate_df)
val_encoded = DataLoader(val_dataset, batch_size=10, shuffle=True, num_workers=3)



In [15]:
from transformers import AutoModelForSequenceClassification

# model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-large-2022-154m", 
#                                                            problem_type="multi_label_classification", 
#                                                            num_labels=len(labels),
#                                                            ignore_mismatched_sizes=True)
model = RobertaForSequenceClassification.from_pretrained( "pig4431/TweetEval_roBERTa_5E",
                                                         num_labels=len(labels),
                                                         problem_type="multi_label_classification",
                                                         ignore_mismatched_sizes=True)

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at pig4431/TweetEval_roBERTa_5E and are newly initialized because the shapes did not match:
- classifier.out_proj.weight: found shape torch.Size([2, 768]) in the checkpoint and torch.Size([5, 768]) in the model instantiated
- classifier.out_proj.bias: found shape torch.Size([2]) in the checkpoint and torch.Size([5]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

Using pad_token, but it is not set yet.


In [16]:
from torchsummary import summary
summary(model)

Layer (type:depth-idx)                   Param #
├─RobertaModel: 1-1                      --
|    └─RobertaEmbeddings: 2-1            --
|    |    └─Embedding: 3-1               38,603,520
|    |    └─Embedding: 3-2               394,752
|    |    └─Embedding: 3-3               768
|    |    └─LayerNorm: 3-4               1,536
|    |    └─Dropout: 3-5                 --
|    └─RobertaEncoder: 2-2               --
|    |    └─ModuleList: 3-6              85,054,464
├─RobertaClassificationHead: 1-2         --
|    └─Linear: 2-3                       590,592
|    └─Dropout: 2-4                      --
|    └─Linear: 2-5                       3,845
Total params: 124,649,477
Trainable params: 124,649,477
Non-trainable params: 0


Layer (type:depth-idx)                   Param #
├─RobertaModel: 1-1                      --
|    └─RobertaEmbeddings: 2-1            --
|    |    └─Embedding: 3-1               38,603,520
|    |    └─Embedding: 3-2               394,752
|    |    └─Embedding: 3-3               768
|    |    └─LayerNorm: 3-4               1,536
|    |    └─Dropout: 3-5                 --
|    └─RobertaEncoder: 2-2               --
|    |    └─ModuleList: 3-6              85,054,464
├─RobertaClassificationHead: 1-2         --
|    └─Linear: 2-3                       590,592
|    └─Dropout: 2-4                      --
|    └─Linear: 2-5                       3,845
Total params: 124,649,477
Trainable params: 124,649,477
Non-trainable params: 0

In [17]:
batch_size = 10
metric_name = "f1"

In [22]:
from transformers import TrainingArguments, Trainer

args = TrainingArguments(
    f"roberta-tweet-english",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=50,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    save_total_limit = 3,
    warmup_steps=100,
    logging_dir='./logs',
    logging_steps=10,
    #push_to_hub=True,
)

In [19]:
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch
    
# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # finally, compute metrics
    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    roc_auc = roc_auc_score(y_true, y_pred, average = 'micro')
    accuracy = accuracy_score(y_true, y_pred)
    # return as dictionary
    metrics = {'f1': f1_micro_average,
               'roc_auc': roc_auc,
               'accuracy': accuracy}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, 
            tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds, 
        labels=p.label_ids)
    print(result)
    return result

In [23]:
trainer = Trainer(
    model,
    args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [24]:
trainer.train()



Epoch,Training Loss,Validation Loss,F1,Roc Auc,Accuracy
1,0.308,0.475432,0.105263,0.508333,0.066667
2,0.3275,0.472675,0.285714,0.575,0.2
3,0.2842,0.476976,0.347826,0.6,0.266667
4,0.2495,0.482481,0.272727,0.566667,0.2
5,0.2225,0.477585,0.4,0.625,0.333333
6,0.1952,0.46497,0.347826,0.6,0.266667
7,0.1462,0.506723,0.181818,0.525,0.133333
8,0.1433,0.556399,0.4,0.625,0.333333
9,0.1129,0.52922,0.32,0.583333,0.266667
10,0.0863,0.554195,0.384615,0.616667,0.333333


{'f1': 0.10526315789473685, 'roc_auc': 0.5083333333333333, 'accuracy': 0.06666666666666667}
{'f1': 0.28571428571428575, 'roc_auc': 0.575, 'accuracy': 0.2}
{'f1': 0.3478260869565218, 'roc_auc': 0.6, 'accuracy': 0.26666666666666666}
{'f1': 0.27272727272727276, 'roc_auc': 0.5666666666666667, 'accuracy': 0.2}
{'f1': 0.4, 'roc_auc': 0.6249999999999999, 'accuracy': 0.3333333333333333}
{'f1': 0.3478260869565218, 'roc_auc': 0.6, 'accuracy': 0.26666666666666666}
{'f1': 0.18181818181818182, 'roc_auc': 0.5249999999999999, 'accuracy': 0.13333333333333333}
{'f1': 0.4, 'roc_auc': 0.6249999999999999, 'accuracy': 0.3333333333333333}
{'f1': 0.32, 'roc_auc': 0.5833333333333333, 'accuracy': 0.26666666666666666}
{'f1': 0.3846153846153846, 'roc_auc': 0.6166666666666667, 'accuracy': 0.3333333333333333}
{'f1': 0.4, 'roc_auc': 0.6249999999999999, 'accuracy': 0.3333333333333333}
{'f1': 0.5185185185185186, 'roc_auc': 0.6916666666666668, 'accuracy': 0.4666666666666667}
{'f1': 0.4, 'roc_auc': 0.6249999999999999, 

TrainOutput(global_step=650, training_loss=0.05433361717141592, metrics={'train_runtime': 576.3882, 'train_samples_per_second': 10.496, 'train_steps_per_second': 1.128, 'total_flos': 795932380953600.0, 'train_loss': 0.05433361717141592, 'epoch': 50.0})