In [None]:
train_texts, train_labels = [], []

# Each non-empty line of the corpus is "<integer label>\t<tweet text>".
# The context manager closes the file handle once the loop finishes.
with open('./data/tokens_bertweet.txt', encoding='utf-8') as corpus:
    for line in corpus:
        line = line.strip()
        if not line:
            # A blank (e.g. trailing) line carries no example; skip it instead
            # of failing on the two-name unpack below.
            continue
        # Split on the first tab only so tabs inside the tweet text are kept
        # as part of the text rather than breaking the unpack.
        label, text = line.split('\t', 1)
        train_texts.append(text)
        train_labels.append(int(label))

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the examples for validation; the fixed random_state makes the
# split repeatable.
# NOTE: this rebinds train_texts/train_labels, so re-running this cell without
# re-running the loading cell first splits the already-reduced training set.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts,
    train_labels,
    test_size=0.1,
    random_state=0,
)

In [None]:
from transformers import AutoModel, AutoTokenizer

# Load the tokenizer that matches the BERTweet checkpoint; use_fast=False asks
# for the slow (pure-Python) tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained(
    "vinai/bertweet-base",
    use_fast=False,
)

In [None]:
# BERTweet's model card lists a maximum sequence length of 128 tokens. Pass the
# limit explicitly: truncation=True without max_length falls back to the
# tokenizer's configured model_max_length, and if that is unset nothing is
# truncated, so an over-long tweet would overflow the position embeddings.
MAX_SEQ_LEN = 128

# padding=True pads every sequence to the longest one in the same call.
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=MAX_SEQ_LEN)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=MAX_SEQ_LEN)

In [None]:
import torch

class TweetDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer output with one label per example.

    Args:
        encodings: mapping from field name to a per-example indexable
            collection (anything exposing ``.items()``).
        labels: indexable collection of labels, one per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples, taken from the label list."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, with the label under 'labels'."""
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings and labels so the Trainer can index them.
train_dataset = TweetDataset(encodings=train_encodings, labels=train_labels)
val_dataset = TweetDataset(encodings=val_encodings, labels=val_labels)

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    """Compute accuracy, F1, precision and recall for an evaluation pass.

    Args:
        pred: object exposing ``label_ids`` (reference labels) and
            ``predictions`` (per-class scores; the argmax over the last axis
            is taken as the predicted class).

    Returns:
        dict with keys 'accuracy', 'f1', 'precision' and 'recall'.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    # average='binary' scores the positive class only; the fourth returned
    # value (support) is not needed here.
    prf = precision_recall_fscore_support(y_true, y_pred, average='binary')
    scores = {'accuracy': accuracy_score(y_true, y_pred)}
    scores['f1'] = prf[2]
    scores['precision'] = prf[0]
    scores['recall'] = prf[1]
    return scores

In [None]:
from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification

# Evaluate, checkpoint and log on one shared cadence. load_best_model_at_end
# compares evaluation results across saved checkpoints, so the save interval
# must line up with the evaluation interval; leaving save_steps at its default
# while evaluation follows logging_steps makes the two drift apart (and newer
# Transformers releases reject that combination outright).
EVAL_EVERY_STEPS = 10000

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    evaluation_strategy="steps",
    eval_steps=EVAL_EVERY_STEPS,
    save_steps=EVAL_EVERY_STEPS,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=EVAL_EVERY_STEPS,
    dataloader_num_workers=4,
    load_best_model_at_end=True,
    # 'accuracy' is one of the keys returned by compute_metrics.
    metric_for_best_model='accuracy',
)

# Load the BERTweet checkpoint into a sequence-classification model.
model = AutoModelForSequenceClassification.from_pretrained("vinai/bertweet-base")

# Wire the model, hyper-parameters, both data splits and the metric function
# into a single Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Run fine-tuning with the Trainer built in the previous cell. As the last
# expression in the cell, the value returned by train() is shown as output.
trainer.train()

In [None]:
model_path = './bert'
# Save through trainer.model rather than the outer `model` variable: with
# load_best_model_at_end=True some Transformers releases rebind trainer.model
# to a freshly loaded best checkpoint, in which case `model` would still hold
# the last-step weights. trainer.model is the right object in either case.
trainer.model.save_pretrained(model_path)
# Store the tokenizer files next to the weights so the directory can be
# reloaded on its own.
tokenizer.save_pretrained(model_path)