In [20]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, EvalPrediction
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [21]:
# Load data from CSV files
train_df = pd.read_csv('train_dataset.csv')
eval_df = pd.read_csv('eval_dataset.csv')

In [22]:
# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [23]:
# Tokenize the input
train_encodings = tokenizer(list(train_df['text']), truncation=True, padding=True)
eval_encodings = tokenizer(list(eval_df['text']), truncation=True, padding=True)

In [24]:
# Define a PyTorch dataset
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_labels = list(train_df['label'])
eval_labels = list(eval_df['label'])
train_dataset = CustomDataset(train_encodings, train_labels)
eval_dataset = CustomDataset(eval_encodings, eval_labels)

In [25]:
# Load the model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [26]:
# Set up training arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=1,              # number of training epochs
    per_device_train_batch_size=2,   # batch size for training
    per_device_eval_batch_size=2,    # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
)

In [27]:
# Initialize the Trainer
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=eval_dataset            # evaluation dataset
)

In [28]:
# Train the model
trainer.train()

Step,Training Loss
10,0.6059
20,0.7405
30,0.6734
40,0.6806
50,0.5605
60,0.4071
70,0.2585
80,0.1995
90,0.1422
100,0.0956


TrainOutput(global_step=120, training_loss=0.37283447856704394, metrics={'train_runtime': 17.5994, 'train_samples_per_second': 13.637, 'train_steps_per_second': 6.818, 'total_flos': 1233333072000.0, 'train_loss': 0.37283447856704394, 'epoch': 3.0})

In [36]:
# Evaluate the model
eval_result = trainer.evaluate()

print(f"Evaluation results: {eval_result}")

Evaluation results: {'eval_loss': 0.029392385855317116, 'eval_runtime': 1.1139, 'eval_samples_per_second': 17.956, 'eval_steps_per_second': 8.978, 'epoch': 3.0}


RuntimeError: Placeholder storage has not been allocated on MPS device!

In [40]:
# # Make predictions on the evaluation set
eval_predictions = trainer.predict(eval_dataset)

for idx, example in enumerate(eval_dataset):
    text = tokenizer.decode(example['input_ids'], skip_special_tokens=True)
    true_label = eval_dataset[idx]['labels']
    predicted_label = eval_predictions.predictions[idx].argmax().item()
    predicted_sentiment = "positive" if predicted_label == 1 else "negative"

    print(f"Text: {text}")
    print(f"True Label: {'positive' if true_label == 1 else 'negative'}")
    print(f"Predicted Sentiment: {predicted_sentiment}\n")


Text: i dislike bert.
True Label: negative
Predicted Sentiment: negative

Text: bert is amazing.
True Label: positive
Predicted Sentiment: positive

Text: bert is not good.
True Label: negative
Predicted Sentiment: negative

Text: i dislike bert.
True Label: negative
Predicted Sentiment: negative

Text: i love bert!
True Label: positive
Predicted Sentiment: positive

Text: i dislike bert.
True Label: negative
Predicted Sentiment: negative

Text: bert is not good.
True Label: negative
Predicted Sentiment: negative

Text: bert is not good.
True Label: negative
Predicted Sentiment: negative

Text: bert is amazing.
True Label: positive
Predicted Sentiment: positive

Text: i'm a fan of bert.
True Label: positive
Predicted Sentiment: positive

Text: bert is not good.
True Label: negative
Predicted Sentiment: negative

Text: i'm a fan of bert.
True Label: positive
Predicted Sentiment: positive

Text: bert is amazing.
True Label: positive
Predicted Sentiment: positive

Text: i'm not impressed 

In [31]:
# Calculate metrics
def compute_metrics(p: EvalPrediction):
    preds = p.predictions.argmax(-1)
    accuracy = accuracy_score(p.label_ids, preds)
    precision, recall, f1, _ = precision_recall_fscore_support(p.label_ids, preds, average='binary')
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

metrics = compute_metrics(eval_predictions)
print(f"Metrics: {metrics}")

Metrics: {'accuracy': 1.0, 'precision': 1.0, 'recall': 1.0, 'f1': 1.0}
