In [None]:
!pip install accelerate -U
!pip install transformers[torch] -U


Collecting accelerate
  Downloading accelerate-0.30.1-py3-none-any.whl (302 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/302.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m297.0/302.6 kB[0m [31m11.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.utils.class_weight import compute_class_weight
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset, WeightedRandomSampler

# Fix PyTorch's global RNG seed so repeated runs draw the same random numbers
SEED = 42
torch.manual_seed(SEED)

# Custom dataset class
class CustomDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with classification labels.

    Args:
        encodings: Mapping from feature name to an indexable collection holding
            one entry per example (a tensor or a nested list).
        labels: Indexable collection of labels aligned with ``encodings``.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _to_tensor(value):
        """Return ``value`` as an independent tensor copy."""
        if isinstance(value, torch.Tensor):
            # torch.tensor(<tensor>) emits a UserWarning on every access;
            # clone().detach() is the copy PyTorch recommends instead.
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors plus a ``labels`` entry."""
        item = {key: self._to_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._to_tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the label collection."""
        return len(self.labels)

# Load data
# `train_df` is the annotated ground truth that is split into train/test further down.
# NOTE(review): in the visible code `test_df` is only used for the null check below.
train_df = pd.read_csv('human_values_annotation_groundtruth.csv')
test_df = pd.read_csv('Anthropic_hh-rlhf_full_processed_1_18_24.csv')

# Check for missing values in labels
if train_df['Label'].isnull().any() or test_df['chosen'].isnull().any():
    raise ValueError("Missing values in label column. Handle missing values before proceeding.")

# The texts are passed straight to the tokenizer, which expects strings; a NaN in
# `Text` would only surface later inside the tokenizer call, so fail here instead.
if train_df['Text'].isnull().any():
    raise ValueError("Missing values in 'Text' column. Handle missing values before proceeding.")

# Replace the string labels with integer ids; the fitted encoder is kept so the
# ids can be mapped back to label names when reporting per-class scores
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(train_df['Label'])
train_df['Label'] = encoded_labels
num_labels = len(label_encoder.classes_)

# Hold out 20% of the annotated rows for evaluation (fixed random_state)
split_parts = train_test_split(
    train_df['Text'],
    train_df['Label'],
    test_size=0.2,
    random_state=42,
)
train_texts, test_texts, train_labels, test_labels = split_parts

# Tokenize both splits with the roberta-base tokenizer, with truncation and
# padding enabled and PyTorch tensors as the return type
max_length = 128  # Set your desired max length
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', max_length=max_length)
encode_options = {
    'truncation': True,
    'padding': True,
    'return_tensors': 'pt',
    'max_length': max_length,
}
train_encodings = tokenizer(train_texts.tolist(), **encode_options)
test_encodings = tokenizer(test_texts.tolist(), **encode_options)

# Turn both label collections into int64 tensors
train_labels, test_labels = (
    torch.tensor(split_labels.tolist(), dtype=torch.long)
    for split_labels in (train_labels, test_labels)
)

# Unique label ids seen across the two splits combined
all_classes = torch.cat((train_labels, test_labels)).unique().numpy()

# 'balanced' per-class weights computed from the training labels only
balanced_weights = compute_class_weight('balanced', classes=all_classes, y=train_labels.numpy())
class_weights = torch.tensor(balanced_weights, dtype=torch.float)

# Per-example weights and a sampler drawing with replacement from them.
# NOTE(review): nothing in the visible code passes `sampler` to the Trainer, so the
# weighted sampling does not appear to take effect — confirm before relying on it.
weights = class_weights[train_labels]
sampler = WeightedRandomSampler(weights, len(weights), replacement=True)

# Wrap encodings + labels for the Trainer
train_dataset = CustomDataset(train_encodings, train_labels)
test_dataset = CustomDataset(test_encodings, test_labels)

# Model factory handed to the Trainer: roberta-base with a new classification head
def model_init():
    """Load roberta-base as a sequence classifier with ``num_labels`` outputs.

    Regularization is the dropout defined in the model config. An earlier
    attempt to add dropout by assigning ``torch.nn.Dropout`` to a new
    ``output_layer_norm`` attribute on the last encoder layer only registered a
    module that the forward pass never calls, so it has been removed.

    Returns:
        A freshly loaded ``RobertaForSequenceClassification`` instance.
    """
    return RobertaForSequenceClassification.from_pretrained(
        'roberta-base',
        num_labels=num_labels,
        num_hidden_layers=12,  # depth of the roberta-base checkpoint: keeps every pretrained layer
        hidden_dropout_prob=0.1,  # explicit config override; same value the checkpoint ships with
    )

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=8,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    # Warm up over the first 10% of training. A fixed warmup_steps=1000 was longer
    # than the whole run (the recorded training log has a single entry at step 500
    # and none at 1000), so the learning rate never left its warmup ramp.
    warmup_ratio=0.1,
    weight_decay=0.01,
    logging_dir='./logs',
)

# Build the Trainer; it creates the model itself by calling model_init
trainer_config = dict(
    args=training_args,
    model_init=model_init,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,  # the held-out split doubles as the evaluation set
)
trainer = Trainer(**trainer_config)

# Run fine-tuning
trainer.train()

# Run inference on the held-out split
predictions = trainer.predict(test_dataset)

# The highest-scoring column of each row is the predicted class id
logits = torch.from_numpy(predictions.predictions)
predicted_labels = logits.argmax(dim=1).tolist()

# Overall metrics (AUC is not computed)
true_labels = test_labels.tolist()
accuracy = accuracy_score(true_labels, predicted_labels)
print(f"Accuracy: {accuracy}")
precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predicted_labels, average='weighted')
print(f"Weighted F1 Score: {f1}")

# Per-class F1, reported under the original (decoded) label names
class_f1_scores = precision_recall_fscore_support(true_labels, predicted_labels, average=None, labels=all_classes)
class_names = label_encoder.inverse_transform(all_classes)
per_class_f1 = class_f1_scores[2]
for name, score in zip(class_names, per_class_f1):
    print(f"{name} F1 Score: {score}")

# Write the predicted class ids next to their input texts
result_columns = {
    'Prediction': predicted_labels,
    'Text': test_texts.tolist(),
}
pred_df = pd.DataFrame(result_columns)
pred_df.to_csv('classification_results_roberta.csv', index=False)

print('Saved results to classification_results_roberta.csv')


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])


Step,Training Loss
500,0.8302


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])


Accuracy: 0.8001537279016141
Weighted F1 Score: 0.8022413801020528
Civility/Tolerance F1 Score: 0.8075117370892018
Duty/Accountability F1 Score: 0.8129496402877697
Empathy/Helpfulness F1 Score: 0.6292134831460674
Information Seeking F1 Score: 0.8311425682507584
Justice & Human/Animal Rights F1 Score: 0.8831168831168831
Well-being/Peace F1 Score: 0.6486486486486487
Wisdom/Knowledge F1 Score: 0.8150208623087621
Saved results to classification_results_roberta.csv


In [None]:
# We also experimented with BERT, but ultimately selected RoBERTa because it performed slightly better.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.utils.class_weight import compute_class_weight
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset, WeightedRandomSampler

# Fix PyTorch's global RNG seed so repeated runs draw the same random numbers
SEED = 42
torch.manual_seed(SEED)

# Custom dataset class
class CustomDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with classification labels.

    Args:
        encodings: Mapping from feature name to an indexable collection holding
            one entry per example (a tensor or a nested list).
        labels: Indexable collection of labels aligned with ``encodings``.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _to_tensor(value):
        """Return ``value`` as an independent tensor copy."""
        if isinstance(value, torch.Tensor):
            # torch.tensor(<tensor>) emits a UserWarning on every access;
            # clone().detach() is the copy PyTorch recommends instead.
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors plus a ``labels`` entry."""
        item = {key: self._to_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._to_tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the label collection."""
        return len(self.labels)

# Load data (malformed CSV rows are skipped rather than raising)
# `train_df` is the annotated ground truth that is split into train/test further down.
# NOTE(review): in the visible code `test_df` is only used for the null check below.
train_df = pd.read_csv('human_values_annotation_groundtruth.csv', on_bad_lines='skip')
test_df = pd.read_csv('Anthropic_hh-rlhf_full_processed_1_18_24.csv', on_bad_lines='skip')

# Check for missing values in labels
if train_df['Label'].isnull().any() or test_df['chosen'].isnull().any():
    raise ValueError("Missing values in label column. Handle missing values before proceeding.")

# The texts are passed straight to the tokenizer, which expects strings; a NaN in
# `Text` would only surface later inside the tokenizer call, so fail here instead.
if train_df['Text'].isnull().any():
    raise ValueError("Missing values in 'Text' column. Handle missing values before proceeding.")

# Replace the string labels with integer ids; the fitted encoder is kept so the
# ids can be mapped back to label names when reporting per-class scores
label_encoder = LabelEncoder()
encoded_labels = label_encoder.fit_transform(train_df['Label'])
train_df['Label'] = encoded_labels
num_labels = len(label_encoder.classes_)

# Hold out 20% of the annotated rows for evaluation (fixed random_state)
split_parts = train_test_split(
    train_df['Text'],
    train_df['Label'],
    test_size=0.2,
    random_state=42,
)
train_texts, test_texts, train_labels, test_labels = split_parts

# Tokenize both splits with the bert-large-uncased tokenizer, with truncation and
# padding enabled and PyTorch tensors as the return type
max_length = 128  # Set your desired max length
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased', max_length=max_length)
encode_options = {
    'truncation': True,
    'padding': True,
    'return_tensors': 'pt',
    'max_length': max_length,
}
train_encodings = tokenizer(train_texts.tolist(), **encode_options)
test_encodings = tokenizer(test_texts.tolist(), **encode_options)

# Turn both label collections into int64 tensors
train_labels, test_labels = (
    torch.tensor(split_labels.tolist(), dtype=torch.long)
    for split_labels in (train_labels, test_labels)
)

# Unique label ids seen across the two splits combined
all_classes = torch.cat((train_labels, test_labels)).unique().numpy()

# 'balanced' per-class weights computed from the training labels only
balanced_weights = compute_class_weight('balanced', classes=all_classes, y=train_labels.numpy())
class_weights = torch.tensor(balanced_weights, dtype=torch.float)

# Per-example weights and a sampler drawing with replacement from them.
# NOTE(review): nothing in the visible code passes `sampler` to the Trainer, so the
# weighted sampling does not appear to take effect — confirm before relying on it.
weights = class_weights[train_labels]
sampler = WeightedRandomSampler(weights, len(weights), replacement=True)

# Wrap encodings + labels for the Trainer
train_dataset = CustomDataset(train_encodings, train_labels)
test_dataset = CustomDataset(test_encodings, test_labels)

# Model factory handed to the Trainer: bert-large-uncased with a new classification head
def model_init():
    """Load bert-large-uncased as a sequence classifier with ``num_labels`` outputs.

    Regularization is the dropout defined in the model config. An earlier
    attempt to add dropout by assigning ``torch.nn.Dropout`` to a new
    ``layer_norm`` attribute on the last encoder layer's output block only
    registered a module that the forward pass never calls, so it has been removed.

    Returns:
        A freshly loaded ``BertForSequenceClassification`` instance.
    """
    return BertForSequenceClassification.from_pretrained(
        'bert-large-uncased',
        num_labels=num_labels,
        num_hidden_layers=24,  # depth of the bert-large checkpoint: keeps every pretrained layer
        hidden_dropout_prob=0.1,  # explicit config override; same value the checkpoint ships with
    )

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=8,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # Warm up over the first 10% of training. A fixed warmup_steps=1000 covered most
    # of the run (the recorded training log ends at step 1000), leaving little or
    # no training at the full learning rate.
    warmup_ratio=0.1,
    weight_decay=0.01,
    logging_dir='./logs',
)

# Build the Trainer; it creates the model itself by calling model_init
trainer_config = dict(
    args=training_args,
    model_init=model_init,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,  # the held-out split doubles as the evaluation set
)
trainer = Trainer(**trainer_config)

# Run fine-tuning
trainer.train()

# Run inference on the held-out split
predictions = trainer.predict(test_dataset)

# The highest-scoring column of each row is the predicted class id
logits = torch.from_numpy(predictions.predictions)
predicted_labels = logits.argmax(dim=1).tolist()

# Overall metrics (AUC is not computed)
true_labels = test_labels.tolist()
accuracy = accuracy_score(true_labels, predicted_labels)
print(f"Accuracy: {accuracy}")
precision, recall, f1, _ = precision_recall_fscore_support(true_labels, predicted_labels, average='weighted')
print(f"Weighted F1 Score: {f1}")

# Per-class F1, reported under the original (decoded) label names
class_f1_scores = precision_recall_fscore_support(true_labels, predicted_labels, average=None, labels=all_classes)
class_names = label_encoder.inverse_transform(all_classes)
per_class_f1 = class_f1_scores[2]
for name, score in zip(class_names, per_class_f1):
    print(f"{name} F1 Score: {score}")

# Write the predicted class ids next to their input texts
result_columns = {
    'Prediction': predicted_labels,
    'Text': test_texts.tolist(),
}
pred_df = pd.DataFrame(result_columns)
pred_df.to_csv('classification_results_bert_large.csv', index=False)

print('Saved results to classification_results_bert_large.csv')


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-large-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])


Step,Training Loss
500,0.9724
1000,0.2712


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])


Accuracy: 0.8009223674096848
Weighted F1 Score: 0.8017768255887486
Civility/Tolerance F1 Score: 0.7699530516431924
Duty/Accountability F1 Score: 0.7894736842105263
Empathy/Helpfulness F1 Score: 0.6013071895424835
Information Seeking F1 Score: 0.8340248962655601
Justice & Human/Animal Rights F1 Score: 0.8148148148148148
Well-being/Peace F1 Score: 0.6274509803921569
Wisdom/Knowledge F1 Score: 0.844559585492228
Saved results to classification_results_bert_large.csv
