Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

Load dataset

In [None]:
# Read the merged corpus and discard rows missing either the text or its orientation label
df = pd.read_csv("merged_dataset.csv").dropna(subset=["translated_text", "orientation"])

BERT

In [None]:
# Fine-tune bert-base-uncased to classify `orientation`.
# Uses `df` from the loading cell and the names imported at the top of the notebook.

# Encode orientation labels as integer class ids
le = LabelEncoder()
df["label"] = le.fit_transform(df["orientation"])

# Convert to Hugging Face dataset
dataset = Dataset.from_pandas(df[["translated_text", "label"]])

# Tokenization
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize_function(example):
    """Tokenize a batch of texts, truncating and padding each one to the maximum length."""
    return tokenizer(example["translated_text"], truncation=True, padding="max_length")

tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Convert to pandas for stratified split
tokenized_df = tokenized_dataset.to_pandas()
train_df, test_df = train_test_split(
    tokenized_df,
    test_size=0.2,
    stratify=tokenized_df["label"],
    random_state=42,
)

# Re-wrap as Hugging Face Datasets. preserve_index=False keeps the shuffled
# pandas index from being stored as an extra dataset column.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

# Load model; the classification head is sized from the classes the encoder
# actually found instead of a hard-coded count.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(le.classes_),
)

# Define compute metrics
def compute_metrics(eval_pred):
    """Return accuracy and weighted F1 for a (logits, labels) evaluation pair."""
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds, average="weighted")
    }

# Training arguments
training_args = TrainingArguments(
    output_dir="./bert_results",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=4,
    weight_decay=0.001,
    logging_dir="./logs",
    # Replaces the deprecated WANDB_DISABLED environment variable; "none"
    # switches off every logging integration.
    report_to="none",
)

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()

# Evaluate the model
trainer.evaluate()

Map:   0%|          | 0/775 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss


{'eval_loss': 1.1830779314041138,
 'eval_accuracy': 0.6580645161290323,
 'eval_f1': 0.6611854095725064,
 'eval_runtime': 4.5639,
 'eval_samples_per_second': 33.962,
 'eval_steps_per_second': 4.382,
 'epoch': 4.0}

RoBERTa Base

In [None]:
# Fine-tune roberta-base to classify `orientation`.
# Uses `df` from the loading cell and the names imported at the top of the notebook.

# Encode orientation labels as integer class ids
le = LabelEncoder()
df["label"] = le.fit_transform(df["orientation"])

# Convert to Hugging Face dataset
dataset = Dataset.from_pandas(df[["translated_text", "label"]])

# Tokenization
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

def tokenize_function(example):
    """Tokenize a batch of texts, truncating and padding each one to the maximum length."""
    return tokenizer(example["translated_text"], truncation=True, padding="max_length")

tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Convert to pandas for stratified split
tokenized_df = tokenized_dataset.to_pandas()
train_df, test_df = train_test_split(
    tokenized_df,
    test_size=0.2,
    stratify=tokenized_df["label"],
    random_state=42,
)

# Re-wrap as Hugging Face Datasets. preserve_index=False keeps the shuffled
# pandas index from being stored as an extra dataset column.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

# Load model; the classification head is sized from the classes the encoder
# actually found instead of a hard-coded count.
model = AutoModelForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=len(le.classes_),
)

# Define compute metrics
def compute_metrics(eval_pred):
    """Return accuracy and weighted F1 for a (logits, labels) evaluation pair."""
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds, average="weighted")
    }

# Training arguments
training_args = TrainingArguments(
    # Own directory so this run cannot overwrite the BERT run's outputs
    # (the original reused "./bert_results").
    output_dir="./roberta_results",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    # Replaces the deprecated WANDB_DISABLED environment variable; "none"
    # switches off every logging integration.
    report_to="none",
)

# Trainer setup
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()

# Evaluate the model
trainer.evaluate()

Map:   0%|          | 0/775 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss


{'eval_loss': 0.9645559191703796,
 'eval_accuracy': 0.5225806451612903,
 'eval_f1': 0.45339752283437357,
 'eval_runtime': 4.2771,
 'eval_samples_per_second': 36.24,
 'eval_steps_per_second': 4.676,
 'epoch': 3.0}

Binary BERT

In [None]:
# Fine-tune bert-base-uncased on the two-class orientation dataset.

# Load dataset
df = pd.read_csv("binary_merged_dataset.csv")
df = df[df["translated_text"].notna() & df["orientation"].notna()]

# Encode labels. LabelEncoder numbers the classes in sorted order of their
# values; inspect label_encoder.classes_ for the actual id -> name mapping.
label_encoder = LabelEncoder()
df["label"] = label_encoder.fit_transform(df["orientation"])

# Split data (stratified so both splits keep the same class balance)
train_df, test_df = train_test_split(df, test_size=0.2, stratify=df["label"], random_state=42)

# Convert to Hugging Face Dataset. preserve_index=False keeps the shuffled
# pandas index from being stored as an extra dataset column.
train_dataset = Dataset.from_pandas(train_df[["translated_text", "label"]], preserve_index=False)
test_dataset = Dataset.from_pandas(test_df[["translated_text", "label"]], preserve_index=False)

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize(example):
    """Tokenize a batch of texts, truncating and padding each one to the maximum length."""
    return tokenizer(example["translated_text"], padding="max_length", truncation=True)

train_dataset = train_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)

# Load model; the classification head is sized from the classes the encoder
# actually found instead of a hard-coded count.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label_encoder.classes_),
)

# Compute metrics
def compute_metrics(eval_pred):
    """Return accuracy and weighted F1 for a (logits, labels) evaluation pair."""
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1": f1_score(labels, preds, average="weighted")
    }

# Training arguments
training_args = TrainingArguments(
    output_dir="./binary_bert_results",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./binary_logs",
    # Replaces the deprecated WANDB_DISABLED environment variable; "none"
    # switches off every logging integration.
    report_to="none",
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

# Train and evaluate
trainer.train()
trainer.evaluate()


Map:   0%|          | 0/547 [00:00<?, ? examples/s]

Map:   0%|          | 0/137 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss


{'eval_loss': 0.6466796398162842,
 'eval_accuracy': 0.6131386861313869,
 'eval_f1': 0.6098912074207544,
 'eval_runtime': 4.1207,
 'eval_samples_per_second': 33.247,
 'eval_steps_per_second': 4.368,
 'epoch': 3.0}

BERT Regression

In [None]:
# Fine-tune bert-base-uncased as a single-output regressor of `extremity`.

# Load dataset
df = pd.read_csv("extremity_merged_dataset.csv")
df = df[df["translated_text"].notna() & df["extremity"].notna()]

# Split data
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Prepare datasets. The target is cast to float32 so the regression loss always
# receives floating-point labels, even if the CSV column parses as integers.
train_df = train_df.rename(columns={"extremity": "labels"}).astype({"labels": "float32"})
test_df = test_df.rename(columns={"extremity": "labels"}).astype({"labels": "float32"})
# preserve_index=False keeps the shuffled pandas index out of the datasets.
train_dataset = Dataset.from_pandas(train_df[["translated_text", "labels"]], preserve_index=False)
test_dataset = Dataset.from_pandas(test_df[["translated_text", "labels"]], preserve_index=False)

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize(example):
    """Tokenize a batch of texts, truncating and padding each one to the maximum length."""
    return tokenizer(example["translated_text"], padding="max_length", truncation=True)

train_dataset = train_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)

# Load model for regression.
# The dropout override is passed at load time: assigning to
# model.config.hidden_dropout_prob after construction does not reach the
# dropout layers, which are built from the config while the model is created.
# NOTE(review): metrics recorded from earlier runs predate this change, so
# re-run the cell to refresh them.
model = AutoModelForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=1,
    problem_type="regression",
    hidden_dropout_prob=0.3,
)

# Define evaluation metrics
def compute_metrics(eval_pred):
    """Return MSE and R2 of the predictions against the true scores."""
    preds, labels = eval_pred
    # Flatten the single-output predictions; unlike squeeze(), reshape(-1)
    # stays 1-D when the evaluation set has a single row.
    preds = preds.reshape(-1)
    return {
        "mse": mean_squared_error(labels, preds),
        "r2": r2_score(labels, preds)
    }

# Training arguments
training_args = TrainingArguments(
    output_dir="./spectrum_bert_results",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=6,
    weight_decay=0.01,
    logging_dir="./spectrum_logs",
    # Replaces the deprecated WANDB_DISABLED environment variable; "none"
    # switches off every logging integration.
    report_to="none",
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

# Train and evaluate
trainer.train()
trainer.evaluate()

Map:   0%|          | 0/299 [00:00<?, ? examples/s]

Map:   0%|          | 0/75 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss


{'eval_loss': 0.37652328610420227,
 'eval_mse': 0.3765232563018799,
 'eval_r2': 0.4788602590560913,
 'eval_runtime': 2.243,
 'eval_samples_per_second': 33.437,
 'eval_steps_per_second': 4.458,
 'epoch': 6.0}

RoBERTa Regression

In [None]:
# Fine-tune roberta-base as a single-output regressor of `extremity`.

# Load dataset
df = pd.read_csv("extremity_merged_dataset.csv")
df = df[df["translated_text"].notna() & df["extremity"].notna()]

# Split data
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Prepare datasets. The target is cast to float32 so the regression loss always
# receives floating-point labels, even if the CSV column parses as integers.
train_df = train_df.rename(columns={"extremity": "labels"}).astype({"labels": "float32"})
test_df = test_df.rename(columns={"extremity": "labels"}).astype({"labels": "float32"})
# preserve_index=False keeps the shuffled pandas index out of the datasets.
train_dataset = Dataset.from_pandas(train_df[["translated_text", "labels"]], preserve_index=False)
test_dataset = Dataset.from_pandas(test_df[["translated_text", "labels"]], preserve_index=False)

# Use RoBERTa for regression
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

def tokenize(example):
    """Tokenize a batch of texts, truncating and padding each one to the maximum length."""
    return tokenizer(example["translated_text"], padding="max_length", truncation=True)

train_dataset = train_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)

# Load model for regression.
# The dropout override (meant to reduce overfitting) is passed at load time:
# assigning to model.config.hidden_dropout_prob after construction does not
# reach the dropout layers, which are built from the config while the model is
# created.
# NOTE(review): metrics recorded from earlier runs predate this change, so
# re-run the cell to refresh them.
model = AutoModelForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=1,
    problem_type="regression",
    hidden_dropout_prob=0.3,
)

# Define evaluation metrics
def compute_metrics(eval_pred):
    """Return MSE and R2 of the predictions against the true scores."""
    preds, labels = eval_pred
    # Flatten the single-output predictions; unlike squeeze(), reshape(-1)
    # stays 1-D when the evaluation set has a single row.
    preds = preds.reshape(-1)
    return {
        "mse": mean_squared_error(labels, preds),
        "r2": r2_score(labels, preds)
    }

# Training arguments
training_args = TrainingArguments(
    # Own directory so this run cannot overwrite the BERT regression outputs
    # (the original reused "./spectrum_bert_results").
    output_dir="./spectrum_roberta_results",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=6,
    weight_decay=0.01,
    logging_dir="./spectrum_logs",
    # Replaces the deprecated WANDB_DISABLED environment variable; "none"
    # switches off every logging integration.
    report_to="none",
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

# Train and evaluate
trainer.train()
trainer.evaluate()

Map:   0%|          | 0/299 [00:00<?, ? examples/s]

Map:   0%|          | 0/75 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss


{'eval_loss': 0.2642052471637726,
 'eval_mse': 0.2642051875591278,
 'eval_r2': 0.6343178749084473,
 'eval_runtime': 2.1426,
 'eval_samples_per_second': 35.004,
 'eval_steps_per_second': 4.667,
 'epoch': 6.0}

RoBERTa Regression Fine-Tuning

In [None]:
# Fine-tune roberta-base as a regressor of `extremity` with a 3e-5 learning rate.

# Load dataset
df = pd.read_csv("extremity_merged_dataset.csv")
df = df[df["translated_text"].notna() & df["extremity"].notna()]

# Split data
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Prepare datasets. The target is cast to float32 so the regression loss always
# receives floating-point labels, even if the CSV column parses as integers.
train_df = train_df.rename(columns={"extremity": "labels"}).astype({"labels": "float32"})
test_df = test_df.rename(columns={"extremity": "labels"}).astype({"labels": "float32"})
# preserve_index=False keeps the shuffled pandas index out of the datasets.
train_dataset = Dataset.from_pandas(train_df[["translated_text", "labels"]], preserve_index=False)
test_dataset = Dataset.from_pandas(test_df[["translated_text", "labels"]], preserve_index=False)

# Use RoBERTa for regression
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

def tokenize(example):
    """Tokenize a batch of texts, truncating and padding each one to the maximum length."""
    return tokenizer(example["translated_text"], padding="max_length", truncation=True)

train_dataset = train_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)

# Load model for regression.
# The dropout override (meant to reduce overfitting) is passed at load time:
# assigning to model.config.hidden_dropout_prob after construction does not
# reach the dropout layers, which are built from the config while the model is
# created.
# NOTE(review): metrics recorded from earlier runs predate this change, so
# re-run the cell to refresh them.
model = AutoModelForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=1,
    problem_type="regression",
    hidden_dropout_prob=0.3,
)

# Define evaluation metrics
def compute_metrics(eval_pred):
    """Return MSE and R2 of the predictions against the true scores."""
    preds, labels = eval_pred
    # Flatten the single-output predictions; unlike squeeze(), reshape(-1)
    # stays 1-D when the evaluation set has a single row.
    preds = preds.reshape(-1)
    return {
        "mse": mean_squared_error(labels, preds),
        "r2": r2_score(labels, preds)
    }

# Training arguments
training_args = TrainingArguments(
    # Own directory so this run cannot overwrite the other regression
    # experiments (the original reused "./spectrum_bert_results").
    output_dir="./spectrum_roberta_lr3e-5_results",
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=6,
    weight_decay=0.01,
    logging_dir="./spectrum_logs",
    # Replaces the deprecated WANDB_DISABLED environment variable; "none"
    # switches off every logging integration.
    report_to="none",
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

# Train and evaluate
trainer.train()
trainer.evaluate()

Map:   0%|          | 0/299 [00:00<?, ? examples/s]

Map:   0%|          | 0/75 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss


{'eval_loss': 0.2271214872598648,
 'eval_mse': 0.227121502161026,
 'eval_r2': 0.685644805431366,
 'eval_runtime': 2.1184,
 'eval_samples_per_second': 35.404,
 'eval_steps_per_second': 4.721,
 'epoch': 6.0}

Further fine-tuning **(BEST MODEL)**

In [None]:
# Fine-tune roberta-base as a regressor of `extremity` (lr 3e-5, 8 epochs).
# The `model` and `tokenizer` left behind by this cell are what the save cell exports.

# Load dataset
df = pd.read_csv("extremity_merged_dataset.csv")
df = df[df["translated_text"].notna() & df["extremity"].notna()]

# Split data
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Prepare datasets. The target is cast to float32 so the regression loss always
# receives floating-point labels, even if the CSV column parses as integers.
train_df = train_df.rename(columns={"extremity": "labels"}).astype({"labels": "float32"})
test_df = test_df.rename(columns={"extremity": "labels"}).astype({"labels": "float32"})
# preserve_index=False keeps the shuffled pandas index out of the datasets.
train_dataset = Dataset.from_pandas(train_df[["translated_text", "labels"]], preserve_index=False)
test_dataset = Dataset.from_pandas(test_df[["translated_text", "labels"]], preserve_index=False)

# Use RoBERTa for regression
tokenizer = AutoTokenizer.from_pretrained("roberta-base")

def tokenize(example):
    """Tokenize a batch of texts, truncating and padding each one to the maximum length."""
    return tokenizer(example["translated_text"], padding="max_length", truncation=True)

train_dataset = train_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)

# Load model for regression.
# The dropout override (meant to reduce overfitting) is passed at load time:
# assigning to model.config.hidden_dropout_prob after construction does not
# reach the dropout layers, which are built from the config while the model is
# created.
# NOTE(review): metrics recorded from earlier runs predate this change, so
# re-run the cell to refresh them before treating this as the best model.
model = AutoModelForSequenceClassification.from_pretrained(
    "roberta-base",
    num_labels=1,
    problem_type="regression",
    hidden_dropout_prob=0.3,
)

# Define evaluation metrics
def compute_metrics(eval_pred):
    """Return MSE and R2 of the predictions against the true scores."""
    preds, labels = eval_pred
    # Flatten the single-output predictions; unlike squeeze(), reshape(-1)
    # stays 1-D when the evaluation set has a single row.
    preds = preds.reshape(-1)
    return {
        "mse": mean_squared_error(labels, preds),
        "r2": r2_score(labels, preds)
    }

# Training arguments
training_args = TrainingArguments(
    # Own directory so this run cannot overwrite the other regression
    # experiments (the original reused "./spectrum_bert_results").
    output_dir="./spectrum_roberta_lr3e-5_ep8_results",
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=8,
    weight_decay=0.01,
    logging_dir="./spectrum_logs",
    # Replaces the deprecated WANDB_DISABLED environment variable; "none"
    # switches off every logging integration.
    report_to="none",
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics
)

# Train and evaluate
trainer.train()
trainer.evaluate()

Map:   0%|          | 0/299 [00:00<?, ? examples/s]

Map:   0%|          | 0/75 [00:00<?, ? examples/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Step,Training Loss


{'eval_loss': 0.21875528991222382,
 'eval_mse': 0.2187553197145462,
 'eval_r2': 0.6972243189811707,
 'eval_runtime': 2.1163,
 'eval_samples_per_second': 35.44,
 'eval_steps_per_second': 4.725,
 'epoch': 8.0}

Save Model

In [None]:
# Write the current `model` weights/config and the `tokenizer` files into one
# directory so the pair can be reloaded together later.
model.save_pretrained(save_directory="extremity_roberta_best")
# Left as the cell's final expression so the list of written files is displayed.
tokenizer.save_pretrained(save_directory="extremity_roberta_best")

('extremity_roberta_best/tokenizer_config.json',
 'extremity_roberta_best/special_tokens_map.json',
 'extremity_roberta_best/vocab.json',
 'extremity_roberta_best/merges.txt',
 'extremity_roberta_best/added_tokens.json',
 'extremity_roberta_best/tokenizer.json')

Download Model

In [None]:
import shutil

# Zip the saved model directory; make_archive appends ".zip" to the base name
# and returns the path of the archive it created.
archive_path = shutil.make_archive("extremity_roberta_best", 'zip', "extremity_roberta_best")

try:
    # The download helper only exists inside Google Colab.
    from google.colab import files
except ImportError:
    # Outside Colab there is nothing to trigger: the archive is already on disk.
    print(f"Not running in Colab; archive written to {archive_path}")
else:
    files.download("extremity_roberta_best.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>