In [2]:
import pandas as pd
partial_train_df = pd.read_csv('train_set.csv')
internal_dev_df = pd.read_csv('internal_validation_set.csv')
val_df = pd.read_csv('dev_set.csv')

# Combine partial train with internal dev to make the train set
train_df = pd.concat([partial_train_df, internal_dev_df], ignore_index=True)

# Replace NaNs with empty strings
train_df['text'].fillna('', inplace=True)
val_df['text'].fillna('', inplace=True)

val_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2094 entries, 0 to 2093
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   par_id      2094 non-null   int64 
 1   community   2094 non-null   object
 2   text        2094 non-null   object
 3   label       2094 non-null   int64 
 4   orig_label  2094 non-null   int64 
dtypes: int64(3), object(2)
memory usage: 81.9+ KB


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train_df['text'].fillna('', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  val_df['text'].fillna('', inplace=True)


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn.utils import resample
from sklearn.metrics import classification_report
from simpletransformers.classification import ClassificationModel, ClassificationArgs

In [4]:
def create_folds(df, k=5, output_prefix="fold"):
    """
    Splits dataset into stratified K folds, relying on StratifiedKFold to maintain class balance.
    """
    assert "text" in df.columns and "label" in df.columns, "Dataset must contain 'text' and 'label' columns."

    skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)
    fold_idx = 1

    for train_index, val_index in skf.split(df["text"], df["label"]):
        train_df, val_df = df.iloc[train_index], df.iloc[val_index]

        train_df.to_csv(f"{output_prefix}_train_{fold_idx}.csv", index=False)
        val_df.to_csv(f"{output_prefix}_val_{fold_idx}.csv", index=False)
        print(f"Saved Fold {fold_idx}: {len(train_df)} training, {len(val_df)} validation")
        fold_idx += 1

# create_folds(train_df, k=5, output_prefix="fold")

In [11]:
# Returns the positive class F1-score
def f1(preds, labels):
    preds = np.array(preds)
    labels = np.array(labels)
    return classification_report(labels, preds, output_dict=True)["1"]["f1-score"]

def train_deberta(fold_idx, learning_rate, batch_size, num_epochs, weight_decay, num_layers_unfrozen, log_prefix="log_lr_"):
    # Load dataset
    train_file = f"fold_train_{fold_idx}.csv"
    val_file = f"fold_val_{fold_idx}.csv"

    train_df = pd.read_csv(train_file)
    val_df = pd.read_csv(val_file)

    # Upsample the minority class within the training set
    df_majority = train_df[train_df["label"] == 0]
    df_minority = train_df[train_df["label"] == 1]
    target_minority_size = int(len(df_majority) * (2 / 3)) # Ratio 2:3

    duplication_factor = target_minority_size // len(df_minority)
    remainder = target_minority_size % len(df_minority)
    df_minority_upsampled = pd.concat([df_minority] * duplication_factor, ignore_index=True)
    df_minority_upsampled = pd.concat([df_minority_upsampled, df_minority.sample(n=remainder, random_state=42)], ignore_index=True)

    train_df_upsampled = pd.concat([df_majority, df_minority_upsampled]).sample(frac=1, random_state=42).reset_index(drop=True)

    print(f"Fold {fold_idx}: Training set size after upsampling: {len(train_df_upsampled)} ({train_df_upsampled['label'].value_counts()}), Validation set size: {len(val_df)} ({val_df['label'].value_counts()})")

    # Model configuration with hyperparameters
    model_args = {
        "num_train_epochs": num_epochs,
        "train_batch_size": batch_size,
        "eval_batch_size": batch_size,
        "learning_rate": learning_rate,
        "weight_decay": weight_decay,
        "output_dir": f"outputs_lr_{learning_rate}_batch_size_{batch_size}_unfreeze_{num_layers_unfrozen}/fold_{fold_idx}",
        "overwrite_output_dir": True,
        "save_best_model": False,
        "save_eval_checkpoints": False,
        "save_model_every_epoch": False,
        "early_stopping_patience": 3,  # Stop if no improvement after 3 eval steps
        "evaluate_during_training": False,
        # "evaluate_during_training_steps": 2000,
        "use_early_stopping": False,
        "use_multiprocessing": False,
        "use_multiprocessing_for_evaluation": False,
        # "early_stopping_metric": "f1",
        # "early_stopping_metric_minimize": False, # Maximise the F1-score
        "classification_report": True,
        "reprocess_input_data": True,
        "save_steps": -1,
        "fp16": False  # Ensure FP16 is disabled
    }

    # Initialize DeBERTa model
    model = ClassificationModel(
        "deberta",  # Model type
        "microsoft/deberta-base",
        num_labels=2,
        args=model_args,
        use_cuda=True
    )

    # Unfreeze the last `num_layers_unfrozen` layers + classifier head
    model_layers = list(model.model.deberta.encoder.layer)
    num_total_layers = len(model_layers)
    layers_to_unfreeze = min(num_layers_unfrozen, num_total_layers)

    for name, param in model.model.named_parameters():
        param.requires_grad = False  # Freeze everything first

    for i in range(num_total_layers - layers_to_unfreeze, num_total_layers):
        for param in model_layers[i].parameters():
            param.requires_grad = True  # Unfreeze selected layers

    # Ensure classifier head is always trainable
    for name, param in model.model.named_parameters():
        if "classifier" in name:
            param.requires_grad = True

    # Train the model
    model.train_model(
        train_df_upsampled[["text", "label"]],
        # eval_df=val_df[["text", "label"]],
        f1=f1
    )

    # Evaluate on validation set
    # result, model_outputs, wrong_predictions = model.eval_model(val_df)
    # print("Validation Results:", result)

    texts = val_df['text'].tolist()

    # Get predictions
    preds, raw = model.predict(texts)

    # Generate classification report
    report = classification_report(val_df['label'], preds, digits=4, output_dict=True)
    print("Classification Report:")
    print(classification_report(val_df['label'], preds, digits=4))

    # Log results to a separate file per job
    base_prefix = "./"
    log_file = f"{base_prefix}{log_prefix}{learning_rate}.csv"
    log_data = {
        "learning_rate": learning_rate,
        "batch_size": batch_size,
        "num_epochs": num_epochs,
        "weight_decay": weight_decay,
        "num_layers_unfrozen": num_layers_unfrozen,
        "f1_class_0": report["0"]["f1-score"],
        "f1_class_1": report["1"]["f1-score"],
        "precision_class_0": report["0"]["precision"],
        "precision_class_1": report["1"]["precision"],
        "recall_class_0": report["0"]["recall"],
        "recall_class_1": report["1"]["recall"],
        "overall_f1": report["macro avg"]["f1-score"]
    }

    log_df = pd.DataFrame([log_data])
    log_df.to_csv(log_file, index=False)

    return report["1"]["f1-score"]


In [16]:
learning_rate = 0.00002
batch_size = 16
num_epochs = 5
weight_decay = 0.0
num_layers_unfrozen = 5

f1_scores = []

for i in range(1, 6):  # Assuming 5 folds
        f1_score = train_deberta(fold_idx=i, learning_rate=learning_rate, batch_size=batch_size, num_epochs=num_epochs, weight_decay=weight_decay, num_layers_unfrozen=num_layers_unfrozen)
        f1_scores.append(f1_score)

# Print the average F1-score for this combination
print(f"Average F1-score for lr={learning_rate}, batch_size={batch_size}, num_epochs={num_epochs}, weight_decay={weight_decay}, num_layers_unfrozen={num_layers_unfrozen}: {np.mean(f1_scores)}")
print(f1_scores)

Fold 1: Training set size after upsampling: 10106 (label
0    6064
1    4042
Name: count, dtype: int64), Validation set size: 1675 (label
0    1517
1     158
Name: count, dtype: int64)


Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Running Epoch 1 of 5:   0%|          | 0/632 [00:00<?, ?it/s]

KeyboardInterrupt: 