In [None]:
from datasets import Dataset
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer
import torch
import torch.nn as nn
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
from sklearn.model_selection import KFold
from torch.utils.data import Subset
from sklearn.metrics import accuracy_score

In [None]:
# pandas is required by read_excel below but is not among the notebook's
# top-level imports, so it is imported here to keep Restart & Run All working.
import pandas as pd

# Load your dataset (an Excel sheet; later cells read the "texts" and "label"
# columns and drop "created_at", "likes", "retweets", "replies", "views").
data = pd.read_excel("Path to data")

# Quick sanity check of the first rows
print(data.head())

In [None]:
# Convert DataFrame to Hugging Face Dataset
dataset = Dataset.from_pandas(data)

# Split dataset into training (80%) and validation (20%) sets.
# The seed is fixed so that the split -- and everything derived from it
# (class weights, cross-validation folds, reported accuracy) -- is the same
# after every kernel restart. 42 matches the random_state used for KFold.
dataset = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = dataset['train']
val_dataset = dataset['test']

In [None]:
from sklearn.preprocessing import LabelEncoder

# The closed set of label names expected in the "label" column
label_names = ["complaint", "enquiry", "other", "praise", "promo", "reaction", "recommendation", "response"]

# Build the encoder and fit it on the label names
label_encoder = LabelEncoder()
label_encoder.fit(label_names)

# Replace each example's string label with its encoded integer id,
# first for the training split and then for the validation split
train_dataset, val_dataset = (
    split.map(lambda example: {"label": label_encoder.transform([example["label"]])[0]})
    for split in (train_dataset, val_dataset)
)


In [None]:
from transformers import AutoTokenizer

# BERTweet tokenizer, loaded from the pretrained "vinai/bertweet-base" checkpoint
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base")


def tokenize_function(examples):
    """Tokenize the "texts" field of a batch.

    Sequences are truncated and padded to exactly 128 tokens.
    """
    return tokenizer(
        examples["texts"],
        padding="max_length",
        truncation=True,
        max_length=128,
    )


# Tokenize both splits in batches (training first, then validation) and drop
# the listed metadata columns
train_dataset, val_dataset = (
    split.map(
        tokenize_function,
        batched=True,
        remove_columns=["created_at", "likes", "retweets", "replies", "views"],
    )
    for split in (train_dataset, val_dataset)
)

In [None]:
import torch

# Switch both splits to PyTorch output, restricted to the token ids,
# the attention mask and the label
for split in (train_dataset, val_dataset):
    split.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])


In [None]:
# Compute class weights to account for class imbalances
train_labels = np.array([example["label"] for example in train_dataset])

# Every encoded class must occur in the training split. Otherwise the weight
# vector below would have fewer entries than the classifier has outputs, and
# the mismatch would only surface later as an obscure error inside the loss.
expected_classes = np.arange(len(label_encoder.classes_))
missing_classes = np.setdiff1d(expected_classes, train_labels)
if missing_classes.size:
    raise ValueError(
        "Cannot compute class weights: no training examples for class(es) "
        f"{label_encoder.classes_[missing_classes].tolist()}"
    )

# "balanced" weights are computed by scikit-learn from the training labels
class_weights = compute_class_weight(
    class_weight="balanced",
    classes=np.unique(train_labels),
    y=train_labels
)
# Float tensor so it can be passed as the `weight` of nn.CrossEntropyLoss
class_weights = torch.tensor(class_weights, dtype=torch.float)

# Custom BERTweet classifier trained with a class-weighted loss
class WeightedBERTweet(nn.Module):
    """Sequence classifier whose loss is a class-weighted cross-entropy.

    Wraps the `AutoModelForSequenceClassification` checkpoint `model_name`
    and scores its logits with `nn.CrossEntropyLoss(weight=class_weights)`.
    """

    def __init__(self, model_name, num_labels, class_weights):
        """Load the pretrained classifier and build the weighted loss.

        Args:
            model_name: checkpoint name passed to `from_pretrained`.
            num_labels: number of output classes.
            class_weights: per-class weight tensor for the loss.
        """
        super().__init__()
        self.loss_fn = nn.CrossEntropyLoss(weight=class_weights)
        self.model = AutoModelForSequenceClassification.from_pretrained(
            model_name,
            num_labels=num_labels,
        )

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the wrapped model and, when labels are given, compute the loss.

        Returns:
            `{"logits": ...}` when `labels` is None, otherwise
            `{"loss": ..., "logits": ...}`.
        """
        logits = self.model(input_ids=input_ids, attention_mask=attention_mask).logits
        if labels is None:
            return {"logits": logits}
        return {"loss": self.loss_fn(logits, labels), "logits": logits}

In [None]:
# Pick the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Build the 8-class weighted BERTweet classifier; both the class weights and
# the model itself are moved to the selected device
model = WeightedBERTweet(
    "vinai/bertweet-base",
    num_labels=8,
    class_weights=class_weights.to(device),
)
model = model.to(device)

In [None]:
# Metric callback handed to the Trainer
def compute_metrics(p):
    """Return the accuracy of a prediction object.

    Args:
        p: object exposing `predictions` (per-class scores, argmax taken over
           axis 1) and `label_ids` (reference labels).

    Returns:
        A dict with the single key "accuracy".
    """
    predicted_ids = np.argmax(p.predictions, axis=1)
    return {"accuracy": accuracy_score(p.label_ids, predicted_ids)}

# Training arguments
training_args = TrainingArguments(
    # Output locations and logging cadence
    output_dir="./results",
    logging_dir="./logs",
    logging_steps=10,
    # Evaluate and checkpoint once per epoch.
    # NOTE(review): newer transformers releases call this `eval_strategy` --
    # confirm against the installed version.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # NOTE(review): presumably only relevant for best-checkpoint selection
    # (load_best_model_at_end is not set here) -- confirm intent.
    metric_for_best_model="accuracy",
    # Optimisation hyper-parameters
    num_train_epochs=5,
    learning_rate=1e-5,
    weight_decay=0.1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
)

# Materialise the text and label columns of the training split; they are
# handed to KFold.split, which yields positional train/validation indices.
texts = np.array(train_dataset["texts"])
labels = np.array(train_dataset["label"])

kf = KFold(n_splits=5, shuffle=True, random_state=42)

fold_accuracies = []  # validation accuracy of each fold

# Placeholders so the previous fold's objects can be released at loop start
trainer = fold_model = None

for fold, (train_idx, val_idx) in enumerate(kf.split(texts, labels)):
    print(f"Training fold {fold+1}...")

    # Drop references to the previous fold's model/optimizer before
    # allocating a new model, so their memory can be reclaimed.
    trainer = fold_model = None

    # Select this fold's rows using `.select()`
    train_subset = train_dataset.select(train_idx.tolist())
    val_subset = train_dataset.select(val_idx.tolist())

    # Every fold must start from the pretrained checkpoint. Reusing a single
    # model across folds would let it be evaluated on rows it was already
    # trained on in earlier folds, inflating the cross-validation accuracy.
    fold_model = WeightedBERTweet(
        "vinai/bertweet-base",
        num_labels=8,
        class_weights=class_weights.to(device),
    ).to(device)

    trainer = Trainer(
        model=fold_model,
        args=training_args,
        train_dataset=train_subset,
        eval_dataset=val_subset,
        compute_metrics=compute_metrics
    )

    trainer.train()
    eval_results = trainer.evaluate()
    fold_accuracies.append(eval_results["eval_accuracy"])

# Calculate the average accuracy across all folds.
# After the loop, `trainer` holds the model trained in the last fold.
avg_accuracy = np.mean(fold_accuracies)
print(f"Average accuracy across folds: {avg_accuracy}")


In [None]:
import os

import torch

save_directory = "bertweet_model_v1"

# torch.save writes to an existing location only; the directory would
# otherwise first be created by tokenizer.save_pretrained, which runs *after*
# the weights are written. Create it up front so a fresh run does not fail.
os.makedirs(save_directory, exist_ok=True)

# Persist the weights of the model held by the most recent Trainer, then the
# tokenizer files, into the same directory.
torch.save(trainer.model.state_dict(), f"{save_directory}/model.pth")
tokenizer.save_pretrained(save_directory)

print(f"Model and tokenizer saved to {save_directory}")