In [4]:
from transformers import pipeline
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset
from transformers import AdamW, get_scheduler
import evaluate
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding
import torch.nn as nn

# Dataset Paths (replace with your actual file paths)
data_files = {"train": "/Users/heyodogo/Downloads/emobank_train.csv", "test": "/Users/heyodogo/Downloads/emobank_test.csv"}

# Load the Dataset
dataset = load_dataset('csv', data_files=data_files)

# Remove unnecessary columns and rename the label column
dataset = dataset.remove_columns(['id', 'A', 'D', 'split'])
dataset = dataset.rename_column("V", "labels")

# Tokenization Function
def tokenize_function(examples):
  return tokenizer(examples["text"], padding="max_length", truncation=True, return_tensors="pt")

# Load tokenizer (replace 'bert-base-uncased' with appropriate model if needed)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Tokenize the Dataset (batched)
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Remove "text" column and set format to torch
tokenized_datasets = tokenized_datasets.remove_columns(["text"])
tokenized_datasets.set_format("torch")

# Sample Set Size (consider increasing for better training)
SAMPLE_SET_SIZE = 300  # Increase this if possible

# Create Train and Eval Datasets (shuffled and selected)
small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(SAMPLE_SET_SIZE))
small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(SAMPLE_SET_SIZE))

# Data Collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Dataloaders
train_dataloader = DataLoader(small_train_dataset, shuffle=True, batch_size=8, collate_fn=data_collator)
eval_dataloader = DataLoader(small_eval_dataset, batch_size=8, collate_fn=data_collator)

# Load Pre-trained Model (consider DistilBERT for smaller datasets)
model_name = "bert-base-uncased"  # Or a pre-trained regression model
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1)

# Move Model to Device (GPU if available)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# Optimizer and Scheduler
optimizer = AdamW(model.parameters(), lr=2e-5)  # Experiment with learning rates
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

# Loss Function (Ensure labels are continuous valence values)
loss_fn = nn.MSELoss()

# Training Loop
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
  for batch in train_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    outputs = model(**batch)

    # Ensure labels are floats (continuous values)
    targets = batch['labels'].float().to(device)
    # Reshape targets if necessary (should already be same size as predictions)
    targets = targets.view(len(batch), 1)

    loss = loss_fn(outputs.logits, targets)
    loss.backward()
    optimizer.step()
    lr_scheduler.step()
    optimizer.zero_grad()
    progress_bar.update(1)

# Evaluation
metric = evaluate.load("mse")  # Mean Squared Error for regression
model.eval()
for batch in eval_dataloader:
  batch = {k: v.to(device) for k, v in batch.items()}
  with torch.no_no_grad():
        outputs = model(**batch)

        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/114 [00:00<?, ?it/s]

RuntimeError: shape '[4, 1]' is invalid for input of size 8

In [3]:
targets = targets.view(len(batch), 1)  # Reshape to match batch size
# Inside the training loop, before reshaping
print(targets.shape)


torch.Size([4, 1])


In [7]:
from transformers import pipeline
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset
from transformers import AdamW, get_scheduler
import evaluate
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding
import torch.nn as nn

# Dataset Paths (replace with your actual file paths)
data_files = {"train": "/Users/heyodogo/Downloads/emobank_train.csv", "test": "/Users/heyodogo/Downloads/emobank_test.csv"}

# Load the Dataset
dataset = load_dataset('csv', data_files=data_files)

# Remove unnecessary columns and rename the label column
dataset = dataset.remove_columns(['id', 'A', 'D', 'split'])
dataset = dataset.rename_column("V", "labels")

# Tokenization Function
def tokenize_function(examples):
  return tokenizer(examples["text"], padding="max_length", truncation=True, return_tensors="pt")

# Load tokenizer (replace 'bert-base-uncased' with appropriate model if needed)
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Tokenize the Dataset (batched)
tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Remove "text" column and set format to torch
tokenized_datasets = tokenized_datasets.remove_columns(["text"])
tokenized_datasets.set_format("torch")

# # Sample Set Size (consider increasing for better training)
# SAMPLE_SET_SIZE = 100  # Increase this if possible

# # Create Train and Eval Datasets (shuffled and selected)
# small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(SAMPLE_SET_SIZE))
# small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(SAMPLE_SET_SIZE))

# Data Collator
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Dataloaders
train_dataloader = DataLoader(small_train_dataset, shuffle=True, batch_size=8, collate_fn=data_collator)
eval_dataloader = DataLoader(small_eval_dataset, batch_size=8, collate_fn=data_collator)

# Load Pre-trained Model (consider DistilBERT for smaller datasets)
model_name = "bert-base-uncased"  # Or a pre-trained regression model
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1)

# Move Model to Device (GPU if available)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# Optimizer and Scheduler
optimizer = AdamW(model.parameters(), lr=2e-5)  # Experiment with learning rates
num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

# Loss Function (Ensure labels are continuous valence values)
loss_fn = nn.MSELoss()

# Training Loop
from tqdm.auto import tqdm

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
  for batch in train_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    outputs = model(**batch)

    # Ensure labels are floats (continuous values)
    targets = batch['labels'].float().to(device)

    # **Reshape targets with unsqueeze to add a dimension (if necessary)**
    targets = targets.unsqueeze(1)  # Add a dimension of size 1

    loss = loss_fn(outputs.logits, targets)
    loss.backward()
    optimizer.step()
    lr_scheduler.step()
    optimizer.zero_grad()
    progress_bar.update(1)

# Evaluation
metric = evaluate.load("mse")  # Mean Squared Error for regression
model.eval()
for batch in eval_dataloader:
  batch = {k: v.to(device) for k, v in batch.items()}
  with torch.no_grad():
        outputs = model(**batch)

        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/39 [00:00<?, ?it/s]



{'mse': 7.27}

In [8]:
metric = evaluate.load("mse")  # Mean Squared Error for regression
model.eval()
for batch in eval_dataloader:
  batch = {k: v.to(device) for k, v in batch.items()}
  with torch.no_grad():
        outputs = model(**batch)

        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()



{'mse': 7.27}