In [1]:
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import torch

In [6]:
import os 

# Get the home directory and expand '~' to the full path
home_dir = os.path.expanduser("~")
output_dir = os.path.join(home_dir, 'Documents/results')

In [11]:
import pandas as pd
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments

# Load tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Load pre-trained model
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=18)  # 18 emotion labels

# Load training dataset
train_df = pd.read_csv('training.csv')

# Define text and label columns
text_column = 'text'
label_columns = train_df.columns[1:]  # Exclude the text column

# Tokenize the text data
train_encodings = tokenizer(train_df[text_column].tolist(), truncation=True, padding=True)

# Convert labels to tensors
train_labels = torch.tensor(train_df[label_columns].values, dtype=torch.float32)

# Create a PyTorch dataset
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = MyDataset(train_encodings, train_labels)

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=100,
    evaluation_strategy="steps",
    eval_steps=500,
    save_steps=500,
    save_total_limit=2,
    load_best_model_at_end=True,
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
)

# Fine-tuning
trainer.train()

# Save the model
trainer.save_model("./fine_tuned_roberta")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/318 [00:00<?, ?it/s]

{'loss': 0.5343, 'grad_norm': 0.8666396141052246, 'learning_rate': 1e-05, 'epoch': 0.94}
{'loss': 0.2213, 'grad_norm': 0.46281716227531433, 'learning_rate': 2e-05, 'epoch': 1.89}
{'loss': 0.1878, 'grad_norm': 0.4075983464717865, 'learning_rate': 3e-05, 'epoch': 2.83}
{'train_runtime': 794.6377, 'train_samples_per_second': 3.201, 'train_steps_per_second': 0.4, 'train_loss': 0.3062333100996677, 'epoch': 3.0}
