In [None]:
import pandas as pd
import numpy as np
import torch
import os
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Turn off Weights & Biases reporting via its environment switch
os.environ.update({"WANDB_DISABLED": "true"})

# Load dataset
df = pd.read_csv("Journal_Dataset.csv")

# Map emotions to numerical labels
label_map = {"Happy": 0, "Sad": 1, "Frustrated": 2, "Anxious": 3}
df["label"] = df["Emotion"].map(label_map)

# Series.map yields NaN for any emotion that is not a key of label_map, which
# silently turns the label column into floats. Such rows (and rows without
# journal text) cannot be used for training, so report and drop them, then
# guarantee integer class ids for the downstream split and model.
_unusable = df["label"].isna() | df["How do you feel today?"].isna()
if _unusable.any():
    print(
        f"Dropping {int(_unusable.sum())} row(s) with missing text or an "
        f"emotion outside {sorted(label_map)}"
    )
    df = df.loc[~_unusable].reset_index(drop=True)
df["label"] = df["label"].astype(int)

# Hold out 30% of the rows for evaluation; the split is seeded and
# stratified on the label so it is reproducible across runs
_all_texts = df["How do you feel today?"].tolist()
_all_labels = df["label"].tolist()
train_texts, test_texts, train_labels, test_labels = train_test_split(
    _all_texts,
    _all_labels,
    stratify=_all_labels,
    random_state=42,
    test_size=0.3,
)

# Tokenizer matching the "bert-base-uncased" checkpoint
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Wrap each split as a Hugging Face Dataset with "text" and "labels" columns
train_dataset, test_dataset = (
    Dataset.from_dict({"text": split_texts, "labels": split_labels})
    for split_texts, split_labels in (
        (train_texts, train_labels),
        (test_texts, test_labels),
    )
)

# Batch tokenizer used with Dataset.map
def tokenize_function(examples):
    """Tokenize the "text" entries of a batch with the module-level tokenizer.

    Every sequence is truncated and padded to exactly 128 tokens.

    Args:
        examples: Mapping that provides the raw texts under the "text" key.

    Returns:
        Whatever the tokenizer call returns for those texts.
    """
    raw_texts = examples["text"]
    encoded = tokenizer(
        raw_texts,
        max_length=128,
        padding="max_length",
        truncation=True,
    )
    return encoded

# Tokenize both splits in batched mode
train_dataset, test_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, test_dataset)
)

# BERT sequence classifier with 4 output labels; both dropout
# probabilities are set to 0.3 as regularization
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=4,
    hidden_dropout_prob=0.3,
    attention_probs_dropout_prob=0.3,
)

# Training arguments with adjustments to prevent overfitting
import inspect

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`
# and later dropped the old keyword. Use whichever name the installed
# TrainingArguments accepts so this cell runs on both old and new versions.
_eval_strategy_arg = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments.__init__).parameters
    else "evaluation_strategy"
)

training_args = TrainingArguments(
    output_dir="./results",
    save_strategy="epoch",  # Must match the evaluation strategy for load_best_model_at_end
    per_device_train_batch_size=16,  # Increased batch size
    per_device_eval_batch_size=16,
    num_train_epochs=1,  # Further reduced epochs
    weight_decay=0.1,  # Increased regularization
    learning_rate=3e-5,  # Slightly increased learning rate
    logging_dir="./logs",
    save_total_limit=2,  # Limit saved checkpoints
    load_best_model_at_end=True,  # Load best model based on validation loss
    **{_eval_strategy_arg: "epoch"},  # Evaluate once per epoch
)

# Define compute metrics
def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision/recall/F1 for one evaluation pass.

    Args:
        eval_pred: Pair that unpacks as ``(logits, labels)``. The logits are
            reduced with an argmax over the last axis to get class ids
            (assumes one score per class on that axis -- TODO confirm
            against the Trainer's output for this model).

    Returns:
        dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, predictions)
    # zero_division=0 yields the same scores as the default but without an
    # UndefinedMetricWarning when some class is never predicted.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average="weighted", zero_division=0
    )
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}

# Wire model, arguments, both datasets and the metric function into a Trainer
trainer = Trainer(
    compute_metrics=compute_metrics,
    eval_dataset=test_dataset,
    train_dataset=train_dataset,
    args=training_args,
    model=model,
)

# Run training. No early-stopping callback is passed to the Trainer, so the
# run length is governed by training_args alone.
trainer.train()

# Evaluate and show the resulting metrics
results = trainer.evaluate()
print(results)

# Persist the fine-tuned model and its tokenizer side by side
for _artifact in (model, tokenizer):
    _artifact.save_pretrained("./fine_tuned_bert")

In [None]:
from google.colab import files
import shutil

# Zip the saved model directory and hand the archive to the Colab download helper
shutil.make_archive(
    base_name="fine_tuned_bert",
    format='zip',
    root_dir="./fine_tuned_bert",
)
files.download("fine_tuned_bert.zip")
