In [None]:
"""
!pip install transformers[torch]
!pip install accelerate
!pip install transformers_interpret
!pip install git+https://github.com/allenai/longformer.git
!pip install convokit
"""

Collecting accelerate>=0.21.0 (from transformers[torch])
  Downloading accelerate-0.30.1-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.6/302.6 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch->transformers[torch])
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch->transformers[torch])
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch->transformers[torch])
  Using cached nvidia_cublas_cu

In [None]:
import random
import torch
import json
import random
import torch
from torch.nn import L1Loss
from sklearn.metrics import mean_absolute_error
import numpy as np
from sklearn.metrics import mean_squared_error
from scipy import stats
from transformers import (
    Trainer,
    TrainingArguments,
    RobertaForSequenceClassification,
    RobertaTokenizerFast,
    LongformerForSequenceClassification,
    LongformerTokenizerFast,
    EarlyStoppingCallback)

class conv_data_loader(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with one label per example.

    Args:
        encodings: mapping from feature name to a sequence holding one entry
            per example (the training cells pass the tokenizer output).
        labels: sequence with one label per example.
        device: device the returned tensors are moved to. When omitted it is
            resolved to ``'cuda'`` if available, otherwise ``'cpu'`` -- the same
            rule the training cells use for their global ``device`` -- so the
            class no longer depends on a variable defined in a later cell.
    """

    def __init__(self, encodings, labels, device=None):
        self.encodings = encodings
        self.labels = labels
        if device is None:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.device = device

    def __getitem__(self, idx):
        """Return a dict with one tensor per encoding key plus a ``'labels'`` tensor."""
        item = {key: torch.tensor(val[idx]).to(self.device) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx]).to(self.device)
        return item

    def __len__(self):
        """Number of examples, taken from the label sequence."""
        return len(self.labels)


# Define the compute_metrics function
def compute_metrics(eval_pred):
    """Regression metrics for the Trainer's evaluation loop.

    This used to be defined twice in a row; the second (MAE) definition
    silently replaced the first (RMSE) one. Both metrics are now reported from
    a single definition.

    Args:
        eval_pred: a ``(predictions, labels)`` pair.

    Returns:
        dict with the keys ``"mae"`` and ``"rmse"``.
    """
    predictions, labels = eval_pred
    mae = mean_absolute_error(labels, predictions)
    # RMSE is computed as sqrt(MSE) rather than via `squared=False`, which
    # recent scikit-learn releases deprecate.
    rmse = float(np.sqrt(mean_squared_error(labels, predictions)))
    return {"mae": mae, "rmse": rmse}

# 1. Reading Data

## 1.1. OUM dataset

In [None]:
# Define the OUMDataset class
class OUMDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with one label per example.

    Args:
        encodings: mapping from feature name to a sequence holding one entry
            per example.
        labels: sequence with one label per example.
        device: device the returned tensors are moved to. When omitted it is
            resolved to ``'cuda'`` if available, otherwise ``'cpu'``, so the
            class does not rely on a global ``device`` defined in a later cell.
    """

    def __init__(self, encodings, labels, device=None):
        self.encodings = encodings
        self.labels = labels
        if device is None:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.device = device

    def __getitem__(self, idx):
        """Return a dict with one tensor per encoding key plus a ``'labels'`` tensor."""
        item = {key: torch.tensor(val[idx]).to(self.device) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx]).to(self.device)
        return item

    def __len__(self):
        """Number of examples, taken from the label sequence."""
        return len(self.labels)

# Load the data
def _oum_participant_stance(participant_info):
    """Derive the participant's stance ('yes', 'no' or 'none') from screening answers."""
    stance = 'none'
    # Brexit questions: any answer other than the two recognised ones resets
    # the stance to 'none'.
    brexit_questions = (
        ('Did you vote for (Leave) or against (Remain) Brexit in the 2016 UK referendum?',
         {'against (remain)': 'no', 'for (leave)': 'yes'}),
        ('In the referendum on whether the UK should remain a member of the EU (BREXIT), how did you vote?',
         {'remain (against brexit)': 'no', 'leave (for brexit)': 'yes'}),
    )
    for question, answers in brexit_questions:
        if question in participant_info:
            stance = answers.get(participant_info[question].lower(), 'none')
    # Plain yes/no questions: an unrecognised answer leaves the stance as it is.
    for question in ('Have you had at least one dose of an approved Covid-19 vaccine?',
                     'Are you a vegan?'):
        if question in participant_info:
            answer = participant_info[question].lower()
            if answer in ('yes', 'no'):
                stance = answer
    return stance


def _oum_dialogue_text(messages):
    """Join the non-admin messages that carry a 'modified_argument' into one transcript."""
    text = ''
    for message in messages:
        if message['role'] == 'admin' or 'modified_argument' not in message:
            continue
        text = text + '\n\n' + '<' + message['role'] + '>' + '\n' + message['modified_argument']
    return text.strip()


def load_data_oum(label='after'):
    """Read the three OUM dialogue files and return transcripts with their scores.

    The files ``wizards_dialogues.json``, ``argubot_final_exp.json`` and
    ``models_dialogues.json`` are opened relative to the working directory.
    A dialogue is dropped when any message comes from the 'wikibot' or
    'controlbot' model, or when no stance can be derived from the
    participant's screening answers. For the remaining dialogues, every
    "good reasons" question that survives the stance filters contributes its
    'after' score as a label.

    Args:
        label: when equal to ``'oum'``, every question from the "final" file is
            skipped. The returned labels are the 'after' scores either way (the
            after-minus-before variant was already disabled in this function).

    Returns:
        ``(final_convs, final_labels)``: a list of transcripts and a list of
        float labels of equal length.

    Raises:
        AssertionError: if the number of transcripts and labels differ, which
            happens when one dialogue yields more than one accepted question.
    """
    final_convs = []
    final_labels = []
    input_files = {"wizards": "wizards_dialogues.json", "final_argubot": "argubot_final_exp.json",
                   "models_dialogues": "models_dialogues.json"}
    # Per-dataset bookkeeping keyed by dialogue id; also used to add each
    # transcript only once.
    dials_with_scores = {key: {} for key in input_files}

    for key, input_file in input_files.items():
        with open(input_file, "r", encoding="utf-8") as f:
            data = json.load(f)
        is_final = "final" in input_file
        seen = dials_with_scores[key]

        for d in data:
            if any('model' in m and m['model'] in ('wikibot', 'controlbot') for m in d["messages"]):
                continue

            participant_info = d['participant_info']
            yes_no = _oum_participant_stance(participant_info)
            if yes_no == 'none' or 'Questions' not in participant_info:
                continue

            for q, scores in participant_info['Questions'].items():
                # -1 marks a missing score; the "final" file is only checked for 'after'.
                if is_final:
                    if label == 'oum' or scores['after'] == -1:
                        continue
                elif scores['before'] == -1 or scores['after'] == -1:
                    continue

                q_lower = q.lower()
                if 'good reasons' not in q_lower:
                    continue

                # Skip the question variant that the stance filters exclude.
                negated = 'not' in q_lower
                if d['topic'] != 'brexit' and (
                        (negated and yes_no == 'no') or (not negated and yes_no == 'yes')):
                    continue
                if ('leave' in q_lower and yes_no == 'yes') or ('remain' in q_lower and yes_no == 'no'):
                    continue

                if d["_id"] not in seen:
                    text = _oum_dialogue_text(d['messages'])
                    seen[d["_id"]] = {"topic": d["topic"], "dataset": key, "text": text}
                    final_convs.append(text)

                final_labels.append(float(scores['after']))
                has_before = 'before' in scores and scores['before'] != -1
                seen[d["_id"]]["good_reasons"] = {
                    "oum": None if is_final else scores['after'] - scores['before'],
                    "after": scores['after'],
                    "before": scores['before'] if has_before else None,
                }

    assert len(final_convs) == len(final_labels)
    return final_convs, final_labels
# Load the OUM transcripts and their labels (default label='after').
conversations, labels = load_data_oum()

## 1.2. Wikitactics

In [None]:
import json
import pandas as pd
import numpy as np
from collections import Counter

def load_data_wikitac(path='./wikitactics.json'):
    """Load the Wikitactics disputes as transcripts with escalation labels.

    Args:
        path: location of the JSON file; defaults to the path that was
            previously hard-coded.

    Returns:
        ``(conversations, labels)``: one transcript per dispute, in which every
        utterance is rendered as ``"<username>\\ntext\\n\\n"`` and the
        utterances are joined with a newline, and the matching
        ``escalation_label`` values.
    """
    # Explicit encoding so the result does not depend on the platform default.
    with open(path, encoding='utf-8') as f:
        data = json.load(f)

    conversations = []
    labels = []
    for dispute in data:
        turns = [
            f"<{utterance['username']}>\n{utterance['text']}\n\n"
            for utterance in dispute['utterances']
        ]
        conversations.append('\n'.join(turns))
        labels.append(dispute['escalation_label'])

    return conversations, labels

# Load the Wikitactics disputes. NOTE: this rebinds the module-level
# `conversations`/`labels` names that the OUM cell also assigns.
conversations, labels = load_data_wikitac()

## 1.3. AFD data

In [None]:
def load_data_afd(path='afd_1000_randomised_dialogues.json'):
    """Load the AFD dialogues from a JSON file.

    Args:
        path: location of the JSON file; defaults to the path that was
            previously hard-coded.

    Returns:
        ``(conversations, utterances, labels)`` where ``conversations`` and
        ``utterances`` are returned as stored under those keys and ``labels``
        is the stored label list inverted: 0 becomes 1, anything else becomes 0.
    """
    # Explicit encoding so the result does not depend on the platform default.
    with open(path, 'r', encoding='utf-8') as json_file:
        data_dict = json.load(json_file)

    conversations = data_dict['conversations']
    utterances = data_dict['utterances']
    labels = [1 if raw_label == 0 else 0 for raw_label in data_dict['labels']]
    return conversations, utterances, labels

# Load the AFD dialogues. NOTE: this rebinds the module-level
# `conversations`/`labels` names that the earlier loading cells also assign.
conversations, utterances, labels = load_data_afd()

# 2. Training the Model

## 2.1. Seven-fold Flat Cross Validation

### 2.1.1. OUM data

#### 2.1.1.1. Fine-tune the whole model

In [None]:
import random
import torch
from torch.nn import L1Loss
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_absolute_error
from scipy.stats import spearmanr
import itertools
import time

In [None]:
# Search space for the grid search.
LRs = [2e-5, 1e-4]
WARMUP_EPOCHS = [3, 1]
MAX_LENGTH = [2048, 4096]

# Full grid as (learning rate, warmup epochs, max length) tuples; the learning
# rate varies slowest and the max length fastest.
hyperparameter_combinations = [
    (lr_value, warmup_value, length_value)
    for lr_value in LRs
    for warmup_value in WARMUP_EPOCHS
    for length_value in MAX_LENGTH
]

#################################################
##### Define Trainer and Args class objects #####
#################################################
class CustomTrainer(Trainer):
    """Trainer that optimises mean absolute error (L1) on the model's logits."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the MAE between the flattened logits and the labels.

        Args:
            model: the model being trained.
            inputs: batch dict; must contain ``"labels"``.
            return_outputs: when true, return ``(loss, outputs)`` instead of
                only the loss.
            num_items_in_batch: accepted because newer ``Trainer`` versions
                pass it as a keyword argument; it is not used here.
        """
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")

        # Flatten both sides instead of `logits.squeeze()`: with a batch of one
        # example, squeeze() would turn the logits into a 0-d tensor while the
        # labels keep a batch dimension, so the shapes would no longer match.
        mae_loss = L1Loss()(logits.reshape(-1), labels.float().reshape(-1))

        return (mae_loss, outputs) if return_outputs else mae_loss

class Args:
    """Static run configuration, read as class attributes by the training code."""

    # Model selection
    model_type = 'longformer'
    model_name = 'allenai/longformer-large-4096'
    num_labels = 1

    # Optimisation
    num_epochs = 5
    train_batch_size = 1
    valid_batch_size = 1

    # Output, logging and run mode (a `save_steps = 300` setting is left commented out)
    model_path = 'models/'
    logging_steps = 1
    mode = 'train'
    labels = 'after'

import gc  # hoisted: it used to be re-imported inside every fold

args = Args()

########################################
##### Define seeds and model setup #####
########################################
device = 'cuda' if torch.cuda.is_available() else 'cpu'

seed = 42
random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True

if device == 'cuda':
    torch.cuda.manual_seed_all(seed)

####################################
##### Define Data and Training #####
####################################
x, y = load_data_oum()
# `all_data` must stay aligned with `x`/`y`: the fold indices below come from
# kf.split(x, y). Shuffling `all_data` separately (as was done before) made the
# folds pick different examples than the ones StratifiedKFold stratified on and
# made the saved "Index" column point into the shuffled copy. The folds are
# still shuffled by StratifiedKFold(shuffle=True, random_state=seed).
all_data = list(zip(x, y))

num_folds = 7 # 7-fold cross validation
kf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=seed)

# Resolved once: neither depends on the fold or on the hyperparameters.
MODEL_CLASSES = {
    "longformer": (LongformerForSequenceClassification, LongformerTokenizerFast)
}
model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
grad_accum_steps = 32  # with a per-device batch of 1, the true batch size is 32

results_file = "oum_hyperparameters_results.csv"
with open(results_file, "w") as f:
    f.write("Warmup Steps,Learning Rate,Max Length,Spearman Correlation,MAE\n")

for i, (lr, warmup_epochs, max_length) in enumerate(hyperparameter_combinations):
    print(f'\n\n{i}')
    # The grid value is a number of warmup epochs; the step count is derived per fold below.
    print(f"Training with hyperparameters: Warmup Epochs={warmup_epochs}, Learning Rate={lr}, Max Length={max_length}")

    all_predictions = []
    all_labels = []
    all_val_indexes = []  # positions in `x`/`y` of every validation example

    for fold, (train_index, val_index) in enumerate(kf.split(x, y), start=1):
        print(f"Fold {fold}")
        train_convs, train_labels = zip(*(all_data[j] for j in train_index))
        val_convs, val_labels = zip(*(all_data[j] for j in val_index))

        # A fresh model and tokenizer per fold so no weights leak between folds.
        model = model_class.from_pretrained(args.model_name, num_labels=args.num_labels).to(device)
        tokenizer = tokenizer_class.from_pretrained(args.model_name)
        tokenizer.model_max_length = max_length

        train_encodings = tokenizer(list(train_convs), truncation=True, padding=True, max_length=max_length)
        val_encodings = tokenizer(list(val_convs), truncation=True, padding=True, max_length=max_length)

        train_dataset = conv_data_loader(train_encodings, train_labels)
        val_dataset = conv_data_loader(val_encodings, val_labels)

        # Optimiser steps per epoch times the number of warmup epochs.
        warmup_steps = len(train_dataset) // (args.train_batch_size * grad_accum_steps) * warmup_epochs

        fold_dir = args.model_path + f'single_model_fold{fold}/'
        training_args = TrainingArguments(
            output_dir=fold_dir,
            num_train_epochs=args.num_epochs,
            per_device_train_batch_size=args.train_batch_size,
            per_device_eval_batch_size=args.valid_batch_size,
            warmup_steps=warmup_steps,
            learning_rate=lr,
            logging_dir=fold_dir + 'logs',
            load_best_model_at_end=False,
            metric_for_best_model='loss',
            logging_steps=args.logging_steps,
            evaluation_strategy="epoch",
            save_strategy="epoch",
            save_total_limit=1,
            gradient_accumulation_steps=grad_accum_steps,
            logging_strategy="steps",
            logging_first_step=True,
            seed=seed,  # keep the Trainer's own seeding in sync with the notebook seed
        )

        trainer = CustomTrainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            tokenizer=tokenizer,
            compute_metrics=compute_metrics
        )

        if args.mode == 'train':
            trainer.train()
            model.save_pretrained(fold_dir)
            tokenizer.save_pretrained(fold_dir)
            eval_output = trainer.evaluate()
            print('Evaluation results:', eval_output)

        output = trainer.predict(val_dataset)
        # reshape(-1) rather than squeeze(): a one-example fold would otherwise
        # give a 0-d array, which cannot be passed to list.extend().
        predictions = output.predictions.reshape(-1)
        all_predictions.extend(predictions)
        all_labels.extend(val_labels)
        all_val_indexes.extend(val_index)

        # Clean GPU memory before the next fold loads a new model.
        del model, train_dataset, val_dataset, train_convs, val_convs, train_labels, val_labels, tokenizer
        del train_encodings, val_encodings, trainer, output, predictions
        gc.collect()
        if device == 'cuda':
            torch.cuda.empty_cache()
            torch.cuda.reset_peak_memory_stats()

    spearman_corr = spearmanr(all_predictions, all_labels)[0]
    mae = mean_absolute_error(all_labels, all_predictions)

    print(f"Spearman Correlation: {spearman_corr}")
    print(f"MAE: {mae}")

    # `warmup_steps` is the value computed for the last fold.
    with open(results_file, "a") as f:
        f.write(f"{warmup_steps},{lr},{max_length},{spearman_corr},{mae}\n")

    predictions_labels_file = f"oum_predictions_labels_warmup={warmup_steps}_lr={lr}_maxlen={max_length}.csv"
    with open(predictions_labels_file, "w") as f:
        f.write("Prediction,Label,Index\n")
        for pred, label, index in zip(all_predictions, all_labels, all_val_indexes):
            f.write(f"{pred},{label},{index}\n")

    time.sleep(100)

In [None]:
#############################################
##### Using the optimal hyperparameters #####
#############################################
# Single configuration used for the multi-seed runs.
LRs = [2e-5]
WARMUP_EPOCHS = [1]
MAX_LENGTH = [2048]

# Same (learning rate, warmup epochs, max length) tuple layout as the grid search.
hyperparameter_combinations = [
    (lr_value, warmup_value, length_value)
    for lr_value in LRs
    for warmup_value in WARMUP_EPOCHS
    for length_value in MAX_LENGTH
]

#################################################
##### Define Trainer and Args class objects #####
#################################################
class CustomTrainer(Trainer):
    """Trainer that optimises mean absolute error (L1) on the model's logits."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the MAE between the flattened logits and the labels.

        Args:
            model: the model being trained.
            inputs: batch dict; must contain ``"labels"``.
            return_outputs: when true, return ``(loss, outputs)`` instead of
                only the loss.
            num_items_in_batch: accepted because newer ``Trainer`` versions
                pass it as a keyword argument; it is not used here.
        """
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")

        # Flatten both sides instead of `logits.squeeze()`: with a batch of one
        # example, squeeze() would turn the logits into a 0-d tensor while the
        # labels keep a batch dimension, so the shapes would no longer match.
        mae_loss = L1Loss()(logits.reshape(-1), labels.float().reshape(-1))

        return (mae_loss, outputs) if return_outputs else mae_loss

def compute_metrics(eval_pred):
    """Return the mean absolute error of a ``(predictions, labels)`` pair as ``{"mae": ...}``."""
    model_outputs, gold_scores = eval_pred
    return {"mae": mean_absolute_error(gold_scores, model_outputs)}

class Args:
    """Static run configuration, read as class attributes by the training code."""

    # Model selection
    model_type = 'longformer'
    model_name = 'allenai/longformer-large-4096'
    num_labels = 1

    # Optimisation
    num_epochs = 5
    train_batch_size = 1
    valid_batch_size = 1

    # Output, logging and run mode
    model_path = 'models/'
    logging_steps = 1
    mode = 'train'
    labels = 'after'

import gc  # hoisted: it used to be re-imported inside every fold

args = Args()

########################################
##### Define seeds and model setup #####
########################################
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Loaded once: neither the data nor the model classes depend on the seed.
x, y = load_data_oum()
# `all_data` must stay aligned with `x`/`y`: the fold indices below come from
# kf.split(x, y). Shuffling `all_data` separately (as was done before) made the
# folds pick different examples than the ones StratifiedKFold stratified on.
# The folds are still shuffled by StratifiedKFold(shuffle=True, random_state=seed).
all_data = list(zip(x, y))

MODEL_CLASSES = {
    "longformer": (LongformerForSequenceClassification, LongformerTokenizerFast)
}
model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
num_folds = 7
grad_accum_steps = 32  # gradient accumulation steps per optimiser step

for seed in [1, 2, 3]:
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

    if device == 'cuda':
        torch.cuda.manual_seed_all(seed)

    ####################################
    ##### Define Data and Training #####
    ####################################
    kf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=seed)

    for i, (lr, warmup_epochs, max_length) in enumerate(hyperparameter_combinations):
        print(f'\n\n{i}')
        # The grid value is a number of warmup epochs; the step count is derived per fold below.
        print(f"Training with hyperparameters: Warmup Epochs={warmup_epochs}, Learning Rate={lr}, Max Length={max_length}")

        all_predictions = []
        all_labels = []
        all_val_convs = []
        for fold, (train_index, val_index) in enumerate(kf.split(x, y), start=1):
            print(f"Fold {fold}")
            train_convs, train_labels = zip(*(all_data[j] for j in train_index))
            val_convs, val_labels = zip(*(all_data[j] for j in val_index))

            # A fresh model and tokenizer per fold so no weights leak between folds.
            model = model_class.from_pretrained(args.model_name, num_labels=args.num_labels).to(device)
            tokenizer = tokenizer_class.from_pretrained(args.model_name)
            tokenizer.model_max_length = max_length

            train_encodings = tokenizer(list(train_convs), truncation=True, padding=True, max_length=max_length)
            val_encodings = tokenizer(list(val_convs), truncation=True, padding=True, max_length=max_length)

            train_dataset = conv_data_loader(train_encodings, train_labels)
            val_dataset = conv_data_loader(val_encodings, val_labels)

            # Calculate warmup steps based on the current warmup_epochs value
            warmup_steps = len(train_dataset) // (args.train_batch_size * grad_accum_steps) * warmup_epochs

            fold_dir = args.model_path + f'single_model_fold{fold}/'
            training_args = TrainingArguments(
                output_dir=fold_dir,
                num_train_epochs=args.num_epochs,
                per_device_train_batch_size=args.train_batch_size,
                per_device_eval_batch_size=args.valid_batch_size,
                warmup_steps=warmup_steps,
                learning_rate=lr,
                logging_dir=fold_dir + 'logs',
                load_best_model_at_end=False,
                metric_for_best_model='loss',
                logging_steps=args.logging_steps,
                evaluation_strategy="epoch",
                save_strategy="epoch",
                save_total_limit=1,
                gradient_accumulation_steps=grad_accum_steps,
                logging_strategy="steps",
                logging_first_step=True,
                # Without this the Trainer re-seeds with its own default seed,
                # so the three "seeds" would share the same training randomness.
                seed=seed,
            )

            trainer = CustomTrainer(
                model=model,
                args=training_args,
                train_dataset=train_dataset,
                eval_dataset=val_dataset,
                tokenizer=tokenizer,
                compute_metrics=compute_metrics
            )

            if args.mode == 'train':
                trainer.train()
                model.save_pretrained(fold_dir)
                tokenizer.save_pretrained(fold_dir)
                eval_output = trainer.evaluate()
                print('Evaluation results:', eval_output)

            output = trainer.predict(val_dataset)
            # reshape(-1) rather than squeeze(): a one-example fold would
            # otherwise give a 0-d array, which cannot be passed to list.extend().
            predictions = output.predictions.reshape(-1)
            all_predictions.extend(predictions)
            all_labels.extend(val_labels)
            all_val_convs.extend(val_convs)

            # Clean GPU memory before the next fold loads a new model.
            del model, train_dataset, val_dataset, train_convs, val_convs, train_labels, val_labels, tokenizer
            del train_encodings, val_encodings, trainer, output, predictions
            gc.collect()
            if device == 'cuda':
                torch.cuda.empty_cache()
                torch.cuda.reset_peak_memory_stats()

        spearman_corr = spearmanr(all_predictions, all_labels)[0]
        mae = mean_absolute_error(all_labels, all_predictions)

        print(f"Spearman Correlation: {spearman_corr}")
        print(f"MAE: {mae}")

        # File name kept exactly as before (including its spelling) so existing
        # consumers of these files keep working.
        predictions_labels_file = f"oum_lonformer_predictions_seed={seed}.json"

        data = [
            {
                "Conversation": conv,
                "Prediction": float(pred),
                "Label": int(label)
            }
            for conv, pred, label in zip(all_val_convs, all_predictions, all_labels)
        ]

        with open(predictions_labels_file, "w") as f:
            json.dump(data, f, indent=4)

        time.sleep(100)

#### 2.1.1.2. Fine-tuning only the last layer

In [None]:
import random
import torch
from torch.nn import L1Loss
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_absolute_error
from scipy.stats import spearmanr
import itertools
import time
import os

# Search space for the last-layer fine-tuning grid search.
LRs = [2e-5, 1e-4]
WARMUP_EPOCHS = [3, 1]  # number of epochs spent warming up (3 or 1)
MAX_LENGTH = [2048, 4096]

# Full grid as (learning rate, warmup epochs, max length) tuples; the learning
# rate varies slowest and the max length fastest.
hyperparameter_combinations = [
    (lr_value, warmup_value, length_value)
    for lr_value in LRs
    for warmup_value in WARMUP_EPOCHS
    for length_value in MAX_LENGTH
]

#################################################
##### Define Trainer and Args class objects #####
#################################################
class CustomTrainer(Trainer):
    """Trainer that optimises mean absolute error (L1) on the model's logits."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the MAE between the flattened logits and the labels.

        Args:
            model: the model being trained.
            inputs: batch dict; must contain ``"labels"``.
            return_outputs: when true, return ``(loss, outputs)`` instead of
                only the loss.
            num_items_in_batch: accepted because newer ``Trainer`` versions
                pass it as a keyword argument; it is not used here.
        """
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")

        # Flatten both sides instead of `logits.squeeze()`: with a batch of one
        # example, squeeze() would turn the logits into a 0-d tensor while the
        # labels keep a batch dimension, so the shapes would no longer match.
        mae_loss = L1Loss()(logits.reshape(-1), labels.float().reshape(-1))

        return (mae_loss, outputs) if return_outputs else mae_loss

class Args:
    """Static run configuration, read as class attributes by the training code."""

    # Model selection
    model_type = 'longformer'
    model_name = 'allenai/longformer-large-4096'
    num_labels = 1

    # Optimisation
    num_epochs = 5
    train_batch_size = 1
    valid_batch_size = 1

    # Output, logging and run mode (a `save_steps = 300` setting is left commented out)
    model_path = 'models/'
    logging_steps = 1
    mode = 'train'
    labels = 'after'

import gc  # hoisted: it used to be re-imported inside every fold

args = Args()

########################################
##### Define seeds and model setup #####
########################################
device = 'cuda' if torch.cuda.is_available() else 'cpu'

seed = 42
random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True

if device == 'cuda':
    torch.cuda.manual_seed_all(seed)

####################################
##### Define Data and Training #####
####################################
x, y = load_data_oum()
# `all_data` must stay aligned with `x`/`y`: the fold indices below come from
# kf.split(x, y). Shuffling `all_data` separately (as was done before) made the
# folds pick different examples than the ones StratifiedKFold stratified on and
# made the saved "Index" column point into the shuffled copy. The folds are
# still shuffled by StratifiedKFold(shuffle=True, random_state=seed).
all_data = list(zip(x, y))

num_folds = 7 # 7-fold cross validation
kf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=seed)

# Resolved once: neither depends on the fold or on the hyperparameters.
MODEL_CLASSES = {
    "longformer": (LongformerForSequenceClassification, LongformerTokenizerFast)
}
model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
grad_accum_steps = 32  # with a per-device batch of 1, the true batch size is 32

results_file = "oum_hyperparameters_results_lf-last-layer.csv"

# The header is only written for a new file, so earlier results are appended to.
if not os.path.exists(results_file):
    with open(results_file, "w") as f:
        f.write("Warmup Steps,Learning Rate,Max Length,Spearman Correlation,MAE\n")

for i, (lr, warmup_epochs, max_length) in enumerate(hyperparameter_combinations):
    print(f'\n\n{i}')
    # The grid value is a number of warmup epochs; the step count is derived per fold below.
    print(f"Training with hyperparameters: Warmup Epochs={warmup_epochs}, Learning Rate={lr}, Max Length={max_length}")

    all_predictions = []
    all_labels = []
    all_val_indexes = [] # useful to know the original indexes of data points (e.g., do topical analysis).

    for fold, (train_index, val_index) in enumerate(kf.split(x, y), start=1):
        print(f"Fold {fold}")
        train_convs, train_labels = zip(*(all_data[j] for j in train_index))
        val_convs, val_labels = zip(*(all_data[j] for j in val_index))

        model = model_class.from_pretrained(args.model_name, num_labels=args.num_labels).to(device)

        # Freeze every parameter whose name does not contain 'classifier', so
        # only the classification head is trained.
        for name, param in model.named_parameters():
            if 'classifier' not in name:
                param.requires_grad = False

        tokenizer = tokenizer_class.from_pretrained(args.model_name)
        tokenizer.model_max_length = max_length

        train_encodings = tokenizer(list(train_convs), truncation=True, padding=True, max_length=max_length)
        val_encodings = tokenizer(list(val_convs), truncation=True, padding=True, max_length=max_length)

        train_dataset = conv_data_loader(train_encodings, train_labels)
        val_dataset = conv_data_loader(val_encodings, val_labels)

        # Calculate warmup steps based on the current warmup_epochs value
        warmup_steps = len(train_dataset) // (args.train_batch_size * grad_accum_steps) * warmup_epochs

        fold_dir = args.model_path + f'single_model_fold{fold}/'
        training_args = TrainingArguments(
            output_dir=fold_dir,
            num_train_epochs=args.num_epochs,
            per_device_train_batch_size=args.train_batch_size,
            per_device_eval_batch_size=args.valid_batch_size,
            warmup_steps=warmup_steps,
            learning_rate=lr,
            logging_dir=fold_dir + 'logs',
            load_best_model_at_end=False,
            metric_for_best_model='loss',
            logging_steps=args.logging_steps,
            evaluation_strategy="epoch",
            save_strategy="epoch",
            save_total_limit=1,
            gradient_accumulation_steps=grad_accum_steps,
            logging_strategy="steps",
            logging_first_step=True,
            seed=seed,  # keep the Trainer's own seeding in sync with the notebook seed
        )

        trainer = CustomTrainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            tokenizer=tokenizer,
            compute_metrics=compute_metrics
        )

        if args.mode == 'train':
            trainer.train()
            model.save_pretrained(fold_dir)
            tokenizer.save_pretrained(fold_dir)
            eval_output = trainer.evaluate()
            print('Evaluation results:', eval_output)

        output = trainer.predict(val_dataset)
        # reshape(-1) rather than squeeze(): a one-example fold would otherwise
        # give a 0-d array, which cannot be passed to list.extend().
        predictions = output.predictions.reshape(-1)
        all_predictions.extend(predictions)
        all_labels.extend(val_labels)
        all_val_indexes.extend(val_index)

        # Clean GPU memory before the next fold loads a new model.
        del model, train_dataset, val_dataset, train_convs, val_convs, train_labels, val_labels, tokenizer
        del train_encodings, val_encodings, trainer, output, predictions
        gc.collect()
        if device == 'cuda':
            torch.cuda.empty_cache()
            torch.cuda.reset_peak_memory_stats()

    spearman_corr = spearmanr(all_predictions, all_labels)[0]
    mae = mean_absolute_error(all_labels, all_predictions)

    print(f"Spearman Correlation: {spearman_corr}")
    print(f"MAE: {mae}")

    # `warmup_steps` is the value computed for the last fold.
    with open(results_file, "a") as f:
        f.write(f"{warmup_steps},{lr},{max_length},{spearman_corr},{mae}\n")

    predictions_labels_file = f"oum_predictions_labels_warmup={warmup_steps}_lr={lr}_maxlen={max_length}_lf-last-layer.csv"
    with open(predictions_labels_file, "w") as f:
        f.write("Prediction,Label,Index\n")
        for pred, label, index in zip(all_predictions, all_labels, all_val_indexes):
            f.write(f"{pred},{label},{index}\n")

    time.sleep(100)

### 2.1.2. AFD

#### 2.1.2.1. Fine-tuning the whole model

In [None]:
import random
import torch
from torch.nn import L1Loss
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_absolute_error
from scipy.stats import spearmanr
import itertools
import time
from transformers import AutoModel
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import f1_score, roc_auc_score, average_precision_score
# from torch.nn import CrossEntropyLoss
from torch.nn import BCEWithLogitsLoss, BCELoss
from sklearn.model_selection import train_test_split
from torch.nn import BCEWithLogitsLoss

In [None]:
# Search space for the AFD grid search.
LRs = [2e-5, 1e-4]
WARMUP_EPOCHS = [3, 1]  # number of epochs spent warming up (3 or 1)
MAX_LENGTH = [2048, 4096]

# Full grid as (learning rate, warmup epochs, max length) tuples; the learning
# rate varies slowest and the max length fastest.
hyperparameter_combinations = [
    (lr_value, warmup_value, length_value)
    for lr_value in LRs
    for warmup_value in WARMUP_EPOCHS
    for length_value in MAX_LENGTH
]

#################################################
##### Define Trainer and Args class objects #####
#################################################
class CustomTrainer(Trainer):
    """Trainer that optimises binary cross-entropy on the model's logits."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Run the model on ``inputs`` and return the binary cross-entropy loss.

        Args:
            model: The model being trained; called as ``model(**inputs)``.
            inputs: Batch mapping that contains a ``"labels"`` entry.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                just the loss.
            num_items_in_batch: Accepted (and ignored) so that Trainer versions
                which pass this keyword do not fail with a ``TypeError``.

        Returns:
            The scalar loss, or ``(loss, outputs)`` if ``return_outputs``.
        """
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        # BCEWithLogitsLoss applies the sigmoid internally in a numerically
        # stable way; a separate sigmoid followed by BCELoss saturates for
        # large-magnitude logits.
        loss_fct = BCEWithLogitsLoss()
        bce_loss = loss_fct(logits.view(-1), labels.float().view(-1))

        return (bce_loss, outputs) if return_outputs else bce_loss

def compute_metrics(eval_pred):
    """Compute AUROC and AUPR for one evaluation pass.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. Both are flattened with
            ``reshape(-1)`` and the predictions go through
            ``torch.from_numpy``, so NumPy arrays are expected.

    Returns:
        dict with the keys ``"auroc"`` and ``"aupr"``.
    """
    raw_scores, targets = eval_pred
    flat_scores = raw_scores.reshape(-1)
    flat_targets = targets.reshape(-1)

    # Squash the flattened raw scores through a sigmoid before scoring.
    probabilities = torch.from_numpy(flat_scores).sigmoid().numpy()

    return {
        "auroc": roc_auc_score(flat_targets, probabilities),
        "aupr": average_precision_score(flat_targets, probabilities),
    }


class Args:
    """Static run configuration read by the training script in this cell."""
    # Which checkpoint to load and how it is looked up in MODEL_CLASSES.
    model_type = 'longformer'
    model_name = 'allenai/longformer-large-4096'
    num_labels = 1
    # Training schedule and per-device batch sizes.
    num_epochs = 5
    train_batch_size = 1
    valid_batch_size = 1
    logging_steps = 1
    # Run control; training only happens when mode == 'train'.
    mode = 'train'
    labels = 'after'
    # Root directory for per-fold outputs.
    model_path = 'models/'

# Hyperparameter search for AFD: for every grid point, run 7-fold
# cross-validation, pool the out-of-fold predictions and report AUROC / AUPR.
import csv
import gc

args = Args()

########################################
##### Define seeds and model setup #####
########################################
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Micro-batches accumulated per optimizer step (the true batch size is
# train_batch_size * GRAD_ACCUM_STEPS).
GRAD_ACCUM_STEPS = 32

MODEL_CLASSES = {
    "longformer": (LongformerForSequenceClassification, LongformerTokenizerFast)
}

seed = 42
random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True

if device == 'cuda':
    torch.cuda.manual_seed_all(seed)

####################################
##### Define Data and Training #####
####################################
x, _, y = load_data_afd()

all_data = list(zip(x, y))
random.shuffle(all_data)
# The fold indices are applied to the shuffled `all_data`, so the split must
# be stratified on the labels in that same shuffled order (not on `y`).
shuffled_convs = [conv for conv, _ in all_data]
shuffled_labels = [label for _, label in all_data]

num_folds = 7 # 7-fold cross validation
kf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=seed)

results_file = "afd_hyperparameters_results.csv"
with open(results_file, "w") as f:
    f.write("Warmup Steps,Learning Rate,Max Length,AUROC,AUPR\n")

for i, (lr, warmup_epochs, max_length) in enumerate(hyperparameter_combinations):
    print(f'\n\n{i}')
    print(f"Training with hyperparameters: Warmup Steps={warmup_epochs}, Learning Rate={lr}, Max Length={max_length}")

    # Out-of-fold predictions pooled over all folds of this grid point.
    all_predictions = []
    all_labels = []
    all_val_convs = []

    for fold, (train_index, val_index) in enumerate(kf.split(shuffled_convs, shuffled_labels), start=1):
        print(f"Fold {fold}")
        train_data = [all_data[i] for i in train_index]
        val_data = [all_data[i] for i in val_index]
        train_convs, train_labels = zip(*train_data)
        val_convs, val_labels = zip(*val_data)

        # A fresh pretrained model per fold so no weights leak between folds.
        model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
        model = model_class.from_pretrained(args.model_name, num_labels=args.num_labels).to(device)
        tokenizer = tokenizer_class.from_pretrained(args.model_name) # do_lower_case=True
        tokenizer.model_max_length = max_length

        train_encodings = tokenizer(list(train_convs), truncation=True, padding=True, max_length=max_length)
        val_encodings = tokenizer(list(val_convs), truncation=True, padding=True, max_length=max_length)

        train_dataset = conv_data_loader(train_encodings, train_labels)
        val_dataset = conv_data_loader(val_encodings, val_labels)

        # Optimizer steps per epoch, times the number of warmup epochs.
        warmup_steps = len(train_dataset) // (args.train_batch_size * GRAD_ACCUM_STEPS) * warmup_epochs

        fold_dir = args.model_path + f'single_model_fold{fold}/'
        training_args = TrainingArguments(
            output_dir=fold_dir,
            num_train_epochs=args.num_epochs,
            per_device_train_batch_size=args.train_batch_size,
            per_device_eval_batch_size=args.valid_batch_size,
            warmup_steps=warmup_steps,
            learning_rate=lr,
            logging_dir=args.model_path + f'single_model_fold{fold}/logs',
            load_best_model_at_end=False,
            metric_for_best_model='loss',
            logging_steps=args.logging_steps,
            evaluation_strategy="epoch",
            save_strategy="epoch",
            save_total_limit=1,
            gradient_accumulation_steps=GRAD_ACCUM_STEPS,  # The true batch size is 32
            logging_strategy="steps",
            logging_first_step=True,
            seed=seed,  # Trainer re-seeds from this value when constructed
        )

        trainer = CustomTrainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            tokenizer=tokenizer,
            compute_metrics=compute_metrics
        )

        if args.mode == 'train':
            trainer.train()
            model.save_pretrained(fold_dir)
            tokenizer.save_pretrained(fold_dir)
            eval_output = trainer.evaluate()
            print('Evaluation results:', eval_output)

        output = trainer.predict(val_dataset)
        # reshape(-1) keeps a 1-D array even for a single validation sample,
        # where squeeze() would produce a 0-d array that cannot be extended.
        predictions = output.predictions.reshape(-1)
        all_predictions.extend(torch.sigmoid(torch.tensor(predictions)).tolist()) # apply sigmoid

        all_labels.extend(val_labels)
        all_val_convs.extend(val_convs)

        # Clean GPU memory: drop every reference to this fold's objects before
        # collecting, and only touch the CUDA allocator when CUDA is in use.
        model = trainer = tokenizer = None
        train_dataset = val_dataset = None
        train_data = val_data = None
        train_convs = val_convs = train_labels = val_labels = None
        train_encodings = val_encodings = None
        output = predictions = None
        gc.collect()
        if device == 'cuda':
            torch.cuda.empty_cache()
            torch.cuda.reset_peak_memory_stats()

    auroc = roc_auc_score(all_labels, all_predictions)
    aupr = average_precision_score(all_labels, all_predictions)

    print(f"AUROC: {auroc}")
    print(f"AUPR: {aupr}")

    # NOTE: warmup_steps is the value computed for the last fold.
    with open(results_file, "a") as f:
        f.write(f"{warmup_steps},{lr},{max_length},{auroc},{aupr}\n")

    predictions_labels_file = f"afd_predictions_labels_warmup={warmup_steps}_lr={lr}_maxlen={max_length}.csv"
    # csv.writer quotes conversations containing commas, quotes or newlines,
    # which would otherwise break the three-column layout.
    with open(predictions_labels_file, "w", newline="") as f:
        writer = csv.writer(f, lineterminator="\n")
        writer.writerow(["Conversation", "Prediction", "Label"])
        writer.writerows(zip(all_val_convs, all_predictions, all_labels))

    time.sleep(100)

In [None]:
#############################################
##### Using the optimal hyperparameters #####
#############################################
# Single grid point: the values selected as optimal for this task.
LRs = [2e-5]
WARMUP_EPOCHS = [3]
MAX_LENGTH = [2048]

# Same (learning rate, warmup epochs, max length) tuple layout as a full grid.
hyperparameter_combinations = [
    (grid_lr, grid_warmup, grid_len)
    for grid_lr in LRs
    for grid_warmup in WARMUP_EPOCHS
    for grid_len in MAX_LENGTH
]

#################################################
##### Define Trainer and Args class objects #####
#################################################
class CustomTrainer(Trainer):
    """Trainer that optimises binary cross-entropy on the model's logits."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Run the model on ``inputs`` and return the binary cross-entropy loss.

        Args:
            model: The model being trained; called as ``model(**inputs)``.
            inputs: Batch mapping that contains a ``"labels"`` entry.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                just the loss.
            num_items_in_batch: Accepted (and ignored) so that Trainer versions
                which pass this keyword do not fail with a ``TypeError``.

        Returns:
            The scalar loss, or ``(loss, outputs)`` if ``return_outputs``.
        """
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        # BCEWithLogitsLoss applies the sigmoid internally in a numerically
        # stable way; a separate sigmoid followed by BCELoss saturates for
        # large-magnitude logits.
        loss_fct = BCEWithLogitsLoss()
        bce_loss = loss_fct(logits.view(-1), labels.float().view(-1))

        return (bce_loss, outputs) if return_outputs else bce_loss

def compute_metrics(eval_pred):
    """Compute AUROC and AUPR for one evaluation pass.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. Both are flattened with
            ``reshape(-1)`` and the predictions go through
            ``torch.from_numpy``, so NumPy arrays are expected.

    Returns:
        dict with the keys ``"auroc"`` and ``"aupr"``.
    """
    raw_scores, targets = eval_pred
    flat_scores = raw_scores.reshape(-1)
    flat_targets = targets.reshape(-1)

    # Squash the flattened raw scores through a sigmoid before scoring.
    probabilities = torch.from_numpy(flat_scores).sigmoid().numpy()

    return {
        "auroc": roc_auc_score(flat_targets, probabilities),
        "aupr": average_precision_score(flat_targets, probabilities),
    }


class Args:
    """Static run configuration read by the training script in this cell."""
    # Which checkpoint to load and how it is looked up in MODEL_CLASSES.
    model_type = 'longformer'
    model_name = 'allenai/longformer-large-4096'
    num_labels = 1
    # Training schedule and per-device batch sizes.
    num_epochs = 5
    train_batch_size = 1
    valid_batch_size = 1
    logging_steps = 1
    # save_steps = 300
    # Run control; training only happens when mode == 'train'.
    mode = 'train'
    labels = 'after'
    # Root directory for per-fold outputs.
    model_path = 'models/'

# Final AFD runs with the selected hyperparameters: for each seed, run 7-fold
# cross-validation and dump the pooled out-of-fold predictions to JSON.
import gc

args = Args()

########################################
##### Define seeds and model setup #####
########################################
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Micro-batches accumulated per optimizer step (the true batch size is
# train_batch_size * GRAD_ACCUM_STEPS).
GRAD_ACCUM_STEPS = 32

MODEL_CLASSES = {
    "longformer": (LongformerForSequenceClassification, LongformerTokenizerFast)
}

for seed in [1,2,3]:
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

    if device == 'cuda':
        torch.cuda.manual_seed_all(seed)

    ####################################
    ##### Define Data and Training #####
    ####################################
    x, _, y = load_data_afd()

    all_data = list(zip(x, y))
    random.shuffle(all_data)
    # The fold indices are applied to the shuffled `all_data`, so the split
    # must be stratified on the labels in that same shuffled order (not `y`).
    shuffled_convs = [conv for conv, _ in all_data]
    shuffled_labels = [label for _, label in all_data]

    num_folds = 7 # 7-fold cross validation
    kf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=seed)

    for i, (lr, warmup_epochs, max_length) in enumerate(hyperparameter_combinations):
        print(f'\n\n{i}')
        print(f"Training with hyperparameters: Warmup Steps={warmup_epochs}, Learning Rate={lr}, Max Length={max_length}")

        # Out-of-fold predictions pooled over all folds of this run.
        all_predictions = []
        all_labels = []
        all_val_convs = []

        for fold, (train_index, val_index) in enumerate(kf.split(shuffled_convs, shuffled_labels), start=1):
            print(f"Fold {fold}")
            train_data = [all_data[i] for i in train_index]
            val_data = [all_data[i] for i in val_index]
            train_convs, train_labels = zip(*train_data)
            val_convs, val_labels = zip(*val_data)

            # A fresh pretrained model per fold so no weights leak between folds.
            model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
            model = model_class.from_pretrained(args.model_name, num_labels=args.num_labels).to(device)
            tokenizer = tokenizer_class.from_pretrained(args.model_name)
            tokenizer.model_max_length = max_length

            train_encodings = tokenizer(list(train_convs), truncation=True, padding=True, max_length=max_length)
            val_encodings = tokenizer(list(val_convs), truncation=True, padding=True, max_length=max_length)

            train_dataset = conv_data_loader(train_encodings, train_labels)
            val_dataset = conv_data_loader(val_encodings, val_labels)

            # Optimizer steps per epoch, times the number of warmup epochs.
            warmup_steps = len(train_dataset) // (args.train_batch_size * GRAD_ACCUM_STEPS) * warmup_epochs

            fold_dir = args.model_path + f'single_model_fold{fold}/'
            training_args = TrainingArguments(
                output_dir=fold_dir,
                num_train_epochs=args.num_epochs,
                per_device_train_batch_size=args.train_batch_size,
                per_device_eval_batch_size=args.valid_batch_size,
                warmup_steps=warmup_steps,
                learning_rate=lr,
                logging_dir=args.model_path + f'single_model_fold{fold}/logs',
                load_best_model_at_end=False,
                metric_for_best_model='loss',
                logging_steps=args.logging_steps,
                evaluation_strategy="epoch",
                save_strategy="epoch",
                save_total_limit=1,
                gradient_accumulation_steps=GRAD_ACCUM_STEPS,
                logging_strategy="steps",
                logging_first_step=True,
                # Without this the Trainer re-seeds with its default (42) on
                # construction, so the runs would not differ in training seed.
                seed=seed,
            )

            trainer = CustomTrainer(
                model=model,
                args=training_args,
                train_dataset=train_dataset,
                eval_dataset=val_dataset,
                tokenizer=tokenizer,
                compute_metrics=compute_metrics
            )

            if args.mode == 'train':
                trainer.train()
                model.save_pretrained(fold_dir)
                tokenizer.save_pretrained(fold_dir)
                eval_output = trainer.evaluate()
                print('Evaluation results:', eval_output)

            output = trainer.predict(val_dataset)
            # reshape(-1) keeps a 1-D array even for a single validation
            # sample, where squeeze() would produce a 0-d array.
            predictions = output.predictions.reshape(-1)

            all_predictions.extend(torch.sigmoid(torch.tensor(predictions)).tolist())

            all_labels.extend(val_labels)
            all_val_convs.extend(val_convs)

            # Clean GPU memory: drop every reference to this fold's objects
            # before collecting, and only touch CUDA when it is in use.
            model = trainer = tokenizer = None
            train_dataset = val_dataset = None
            train_data = val_data = None
            train_convs = val_convs = train_labels = val_labels = None
            train_encodings = val_encodings = None
            output = predictions = None
            gc.collect()
            if device == 'cuda':
                torch.cuda.empty_cache()
                torch.cuda.reset_peak_memory_stats()

        auroc = roc_auc_score(all_labels, all_predictions)
        aupr = average_precision_score(all_labels, all_predictions)

        print(f"AUROC: {auroc}")
        print(f"AUPR: {aupr}")

        predictions_labels_file = f"afd_lonformer_predictions_seed={seed}.json"

        # One record per validation conversation, in pooled fold order.
        data = [
            {
                "Conversation": conv,
                "Prediction": float(pred),
                "Label": int(label)
            }
            for conv, pred, label in zip(all_val_convs, all_predictions, all_labels)
        ]

        with open(predictions_labels_file, "w") as f:
            json.dump(data, f, indent=4)

        time.sleep(100)

#### 2.1.2.2. Fine-tuning only the last layer

In [None]:
# Search grid: every (learning rate, warmup epochs, max length) triple is tried.
LRs = [2e-5, 1e-4]
WARMUP_EPOCHS = [3, 1] # warmup_steps for 1 or 3 epochs.
MAX_LENGTH = [2048, 4096]

# Cartesian product, with the learning rate varying slowest and the
# maximum length varying fastest.
hyperparameter_combinations = [
    (grid_lr, grid_warmup, grid_len)
    for grid_lr in LRs
    for grid_warmup in WARMUP_EPOCHS
    for grid_len in MAX_LENGTH
]

#################################################
##### Define Trainer and Args class objects #####
#################################################
class CustomTrainer(Trainer):
    """Trainer that optimises binary cross-entropy on the model's logits."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Run the model on ``inputs`` and return the binary cross-entropy loss.

        Args:
            model: The model being trained; called as ``model(**inputs)``.
            inputs: Batch mapping that contains a ``"labels"`` entry.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                just the loss.
            num_items_in_batch: Accepted (and ignored) so that Trainer versions
                which pass this keyword do not fail with a ``TypeError``.

        Returns:
            The scalar loss, or ``(loss, outputs)`` if ``return_outputs``.
        """
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        # BCEWithLogitsLoss applies the sigmoid internally in a numerically
        # stable way; a separate sigmoid followed by BCELoss saturates for
        # large-magnitude logits.
        loss_fct = BCEWithLogitsLoss()
        bce_loss = loss_fct(logits.view(-1), labels.float().view(-1))

        return (bce_loss, outputs) if return_outputs else bce_loss

def compute_metrics(eval_pred):
    """Compute AUROC and AUPR for one evaluation pass.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. Both are flattened with
            ``reshape(-1)`` and the predictions go through
            ``torch.from_numpy``, so NumPy arrays are expected.

    Returns:
        dict with the keys ``"auroc"`` and ``"aupr"``.
    """
    raw_scores, targets = eval_pred
    flat_scores = raw_scores.reshape(-1)
    flat_targets = targets.reshape(-1)

    # Squash the flattened raw scores through a sigmoid before scoring.
    probabilities = torch.from_numpy(flat_scores).sigmoid().numpy()

    return {
        "auroc": roc_auc_score(flat_targets, probabilities),
        "aupr": average_precision_score(flat_targets, probabilities),
    }


class Args:
    """Static run configuration read by the training script in this cell."""
    # Which checkpoint to load and how it is looked up in MODEL_CLASSES.
    model_type = 'longformer'
    model_name = 'allenai/longformer-large-4096'
    num_labels = 1
    # Training schedule and per-device batch sizes.
    num_epochs = 5
    train_batch_size = 1
    valid_batch_size = 1
    logging_steps = 1
    # Run control; training only happens when mode == 'train'.
    mode = 'train'
    labels = 'after'
    # Root directory for per-fold outputs.
    model_path = 'models/'

# Hyperparameter search for AFD with only the classification head trainable:
# for every grid point, run 7-fold cross-validation, pool the out-of-fold
# predictions and report AUROC / AUPR.
import csv
import gc

args = Args()

########################################
##### Define seeds and model setup #####
########################################
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Micro-batches accumulated per optimizer step (the true batch size is
# train_batch_size * GRAD_ACCUM_STEPS).
GRAD_ACCUM_STEPS = 32

MODEL_CLASSES = {
    "longformer": (LongformerForSequenceClassification, LongformerTokenizerFast)
}

seed = 42
random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True

if device == 'cuda':
    torch.cuda.manual_seed_all(seed)

####################################
##### Define Data and Training #####
####################################
x, _, y = load_data_afd()

all_data = list(zip(x, y))
random.shuffle(all_data)
# The fold indices are applied to the shuffled `all_data`, so the split must
# be stratified on the labels in that same shuffled order (not on `y`).
shuffled_convs = [conv for conv, _ in all_data]
shuffled_labels = [label for _, label in all_data]

num_folds = 7
kf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=seed)

results_file = "afd_hyperparameters_results_lf-last-layer.csv"
with open(results_file, "w") as f:
    f.write("Warmup Steps,Learning Rate,Max Length,AUROC,AUPR\n")

for i, (lr, warmup_epochs, max_length) in enumerate(hyperparameter_combinations):
    print(f'\n\n{i}')
    print(f"Training with hyperparameters: Warmup Steps={warmup_epochs}, Learning Rate={lr}, Max Length={max_length}")

    # Out-of-fold predictions pooled over all folds of this grid point.
    all_predictions = []
    all_labels = []
    all_val_convs = []

    for fold, (train_index, val_index) in enumerate(kf.split(shuffled_convs, shuffled_labels), start=1):
        print(f"Fold {fold}")
        train_data = [all_data[i] for i in train_index]
        val_data = [all_data[i] for i in val_index]
        train_convs, train_labels = zip(*train_data)
        val_convs, val_labels = zip(*val_data)

        # A fresh pretrained model per fold so no weights leak between folds.
        model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
        model = model_class.from_pretrained(args.model_name, num_labels=args.num_labels).to(device)

        # Freeze every parameter whose name does not contain 'classifier',
        # leaving only the classification head trainable.
        for name, param in model.named_parameters():
            if 'classifier' not in name:
                param.requires_grad = False

        tokenizer = tokenizer_class.from_pretrained(args.model_name)
        tokenizer.model_max_length = max_length

        train_encodings = tokenizer(list(train_convs), truncation=True, padding=True, max_length=max_length)
        val_encodings = tokenizer(list(val_convs), truncation=True, padding=True, max_length=max_length)

        train_dataset = conv_data_loader(train_encodings, train_labels)
        val_dataset = conv_data_loader(val_encodings, val_labels)

        # Optimizer steps per epoch, times the number of warmup epochs.
        warmup_steps = len(train_dataset) // (args.train_batch_size * GRAD_ACCUM_STEPS) * warmup_epochs

        fold_dir = args.model_path + f'single_model_fold{fold}/'
        training_args = TrainingArguments(
            output_dir=fold_dir,
            num_train_epochs=args.num_epochs,
            per_device_train_batch_size=args.train_batch_size,
            per_device_eval_batch_size=args.valid_batch_size,
            warmup_steps=warmup_steps,
            learning_rate=lr,
            logging_dir=args.model_path + f'single_model_fold{fold}/logs',
            load_best_model_at_end=False,
            metric_for_best_model='loss',
            logging_steps=args.logging_steps,
            evaluation_strategy="epoch",
            save_strategy="epoch",
            save_total_limit=1,
            gradient_accumulation_steps=GRAD_ACCUM_STEPS,
            logging_strategy="steps",
            logging_first_step=True,
            seed=seed,  # Trainer re-seeds from this value when constructed
        )

        trainer = CustomTrainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            tokenizer=tokenizer,
            compute_metrics=compute_metrics
        )

        if args.mode == 'train':
            trainer.train()
            model.save_pretrained(fold_dir)
            tokenizer.save_pretrained(fold_dir)
            eval_output = trainer.evaluate()
            print('Evaluation results:', eval_output)

        output = trainer.predict(val_dataset)
        # reshape(-1) keeps a 1-D array even for a single validation sample,
        # where squeeze() would produce a 0-d array that cannot be extended.
        predictions = output.predictions.reshape(-1)
        all_predictions.extend(torch.sigmoid(torch.tensor(predictions)).tolist())

        all_labels.extend(val_labels)
        all_val_convs.extend(val_convs)

        # Clean GPU memory: drop every reference to this fold's objects before
        # collecting, and only touch the CUDA allocator when CUDA is in use.
        model = trainer = tokenizer = None
        train_dataset = val_dataset = None
        train_data = val_data = None
        train_convs = val_convs = train_labels = val_labels = None
        train_encodings = val_encodings = None
        output = predictions = None
        gc.collect()
        if device == 'cuda':
            torch.cuda.empty_cache()
            torch.cuda.reset_peak_memory_stats()

    auroc = roc_auc_score(all_labels, all_predictions)
    aupr = average_precision_score(all_labels, all_predictions)

    print(f"AUROC: {auroc}")
    print(f"AUPR: {aupr}")

    # NOTE: warmup_steps is the value computed for the last fold.
    with open(results_file, "a") as f:
        f.write(f"{warmup_steps},{lr},{max_length},{auroc},{aupr}\n")

    predictions_labels_file = f"afd_predictions_labels_warmup={warmup_steps}_lr={lr}_maxlen={max_length}_lf-last-layer.csv"
    # csv.writer quotes conversations containing commas, quotes or newlines,
    # which would otherwise break the three-column layout.
    with open(predictions_labels_file, "w", newline="") as f:
        writer = csv.writer(f, lineterminator="\n")
        writer.writerow(["Conversation", "Prediction", "Label"])
        writer.writerows(zip(all_val_convs, all_predictions, all_labels))

    time.sleep(100)

### 2.1.3. Wikitactics

#### 2.1.3.1. Fine-tuning the whole model

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import random
import torch
from torch.nn import L1Loss
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_absolute_error
from scipy.stats import spearmanr
import itertools
import time
from transformers import AutoModel
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import f1_score, roc_auc_score, average_precision_score
# from torch.nn import CrossEntropyLoss
from torch.nn import BCEWithLogitsLoss, BCELoss
from sklearn.model_selection import train_test_split
from torch.nn import BCEWithLogitsLoss

In [None]:
# Search grid: every (learning rate, warmup epochs, max length) triple is tried.
LRs = [2e-5, 1e-4]
WARMUP_EPOCHS = [3, 1]
MAX_LENGTH = [2048, 4096]

# Cartesian product, with the learning rate varying slowest and the
# maximum length varying fastest.
hyperparameter_combinations = [
    (grid_lr, grid_warmup, grid_len)
    for grid_lr in LRs
    for grid_warmup in WARMUP_EPOCHS
    for grid_len in MAX_LENGTH
]

#################################################
##### Define Trainer and Args class objects #####
#################################################
class CustomTrainer(Trainer):
    """Trainer that optimises binary cross-entropy on the model's logits."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Run the model on ``inputs`` and return the binary cross-entropy loss.

        Args:
            model: The model being trained; called as ``model(**inputs)``.
            inputs: Batch mapping that contains a ``"labels"`` entry.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                just the loss.
            num_items_in_batch: Accepted (and ignored) so that Trainer versions
                which pass this keyword do not fail with a ``TypeError``.

        Returns:
            The scalar loss, or ``(loss, outputs)`` if ``return_outputs``.
        """
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        # BCEWithLogitsLoss applies the sigmoid internally in a numerically
        # stable way; a separate sigmoid followed by BCELoss saturates for
        # large-magnitude logits.
        loss_fct = BCEWithLogitsLoss()
        bce_loss = loss_fct(logits.view(-1), labels.float().view(-1))

        return (bce_loss, outputs) if return_outputs else bce_loss

def compute_metrics(eval_pred):
    """Compute AUROC and AUPR for one evaluation pass.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. Both are flattened with
            ``reshape(-1)`` and the predictions go through
            ``torch.from_numpy``, so NumPy arrays are expected.

    Returns:
        dict with the keys ``"auroc"`` and ``"aupr"``.
    """
    raw_scores, targets = eval_pred
    flat_scores = raw_scores.reshape(-1)
    flat_targets = targets.reshape(-1)

    # Squash the flattened raw scores through a sigmoid before scoring.
    probabilities = torch.from_numpy(flat_scores).sigmoid().numpy()

    return {
        "auroc": roc_auc_score(flat_targets, probabilities),
        "aupr": average_precision_score(flat_targets, probabilities),
    }


class Args:
    """Static run configuration read by the training script in this cell."""
    # Which checkpoint to load and how it is looked up in MODEL_CLASSES.
    model_type = 'longformer'
    model_name = 'allenai/longformer-large-4096'
    num_labels = 1
    # Training schedule and per-device batch sizes.
    num_epochs = 5
    train_batch_size = 1
    valid_batch_size = 1
    logging_steps = 1
    # Run control; training only happens when mode == 'train'.
    mode = 'train'
    labels = 'after'
    # Root directory for per-fold outputs.
    model_path = 'models/'

# Hyperparameter search for Wikitactics: for every grid point, run 7-fold
# cross-validation, pool the out-of-fold predictions and report AUROC / AUPR.
import csv
import gc

args = Args()

########################################
##### Define seeds and model setup #####
########################################
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Micro-batches accumulated per optimizer step (the true batch size is
# train_batch_size * GRAD_ACCUM_STEPS).
GRAD_ACCUM_STEPS = 32

MODEL_CLASSES = {
    "longformer": (LongformerForSequenceClassification, LongformerTokenizerFast)
}

# Prefix of the per-fold directories under ./finetuned_longformers/.
task = 'wikitac'

seed = 42
random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True

if device == 'cuda':
    torch.cuda.manual_seed_all(seed)

####################################
##### Define Data and Training #####
####################################
x, y = load_data_wikitac()

all_data = list(zip(x, y))
random.shuffle(all_data)
# The fold indices are applied to the shuffled `all_data`, so the split must
# be stratified on the labels in that same shuffled order (not on `y`).
shuffled_convs = [conv for conv, _ in all_data]
shuffled_labels = [label for _, label in all_data]

num_folds = 7 # 7-fold cross validation
kf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=seed)

results_file = "wikitac_hyperparameters_results.csv"
with open(results_file, "w") as f:
    f.write("Warmup Steps,Learning Rate,Max Length,AUROC,AUPR\n")

for i, (lr, warmup_epochs, max_length) in enumerate(hyperparameter_combinations):

    print(f'\n\n{i}')
    print(f"Training with hyperparameters: Warmup Steps={warmup_epochs}, Learning Rate={lr}, Max Length={max_length}")

    # Out-of-fold predictions pooled over all folds of this grid point.
    all_predictions = []
    all_labels = []
    all_val_convs = []

    for fold, (train_index, val_index) in enumerate(kf.split(shuffled_convs, shuffled_labels), start=1):
        print(f"Fold {fold}")
        train_data = [all_data[i] for i in train_index]
        val_data = [all_data[i] for i in val_index]
        train_convs, train_labels = zip(*train_data)
        val_convs, val_labels = zip(*val_data)

        # A fresh pretrained model per fold so no weights leak between folds.
        model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
        model = model_class.from_pretrained(args.model_name, num_labels=args.num_labels).to(device)
        tokenizer = tokenizer_class.from_pretrained(args.model_name)
        tokenizer.model_max_length = max_length

        train_encodings = tokenizer(list(train_convs), truncation=True, padding=True, max_length=max_length)
        val_encodings = tokenizer(list(val_convs), truncation=True, padding=True, max_length=max_length)

        train_dataset = conv_data_loader(train_encodings, train_labels)
        val_dataset = conv_data_loader(val_encodings, val_labels)

        # Optimizer steps per epoch, times the number of warmup epochs.
        warmup_steps = len(train_dataset) // (args.train_batch_size * GRAD_ACCUM_STEPS) * warmup_epochs

        fold_dir = args.model_path + f'single_model_fold{fold}/'
        training_args = TrainingArguments(
            output_dir=fold_dir,
            num_train_epochs=args.num_epochs,
            per_device_train_batch_size=args.train_batch_size,
            per_device_eval_batch_size=args.valid_batch_size,
            warmup_steps=warmup_steps,
            learning_rate=lr,
            logging_dir=args.model_path + f'single_model_fold{fold}/logs',
            load_best_model_at_end=False,
            metric_for_best_model='loss',
            logging_steps=args.logging_steps,
            evaluation_strategy="epoch",
            save_strategy="epoch",
            save_total_limit=1,
            gradient_accumulation_steps=GRAD_ACCUM_STEPS,
            logging_strategy="steps",
            logging_first_step=True,
            seed=seed,  # Trainer re-seeds from this value when constructed
        )

        trainer = CustomTrainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=val_dataset,
            tokenizer=tokenizer,
            compute_metrics=compute_metrics
        )

        if args.mode == 'train':
            trainer.train()
            model.save_pretrained(fold_dir)
            tokenizer.save_pretrained(fold_dir)
            eval_output = trainer.evaluate()
            print('Evaluation results:', eval_output)

        output = trainer.predict(val_dataset)
        # reshape(-1) keeps a 1-D array even for a single validation sample,
        # where squeeze() would produce a 0-d array that cannot be extended.
        predictions = output.predictions.reshape(-1)
        all_predictions.extend(torch.sigmoid(torch.tensor(predictions)).tolist())

        all_labels.extend(val_labels)
        all_val_convs.extend(val_convs)

        # Second copy of the fold's model and tokenizer, keyed by task name.
        model_path = f"./finetuned_longformers/{task}_fold={fold}"
        model.save_pretrained(model_path)
        tokenizer.save_pretrained(model_path)

        # Clean GPU memory: drop every reference to this fold's objects before
        # collecting, and only touch the CUDA allocator when CUDA is in use.
        model = trainer = tokenizer = None
        train_dataset = val_dataset = None
        train_data = val_data = None
        train_convs = val_convs = train_labels = val_labels = None
        train_encodings = val_encodings = None
        output = predictions = None
        gc.collect()
        if device == 'cuda':
            torch.cuda.empty_cache()
            torch.cuda.reset_peak_memory_stats()

    auroc = roc_auc_score(all_labels, all_predictions)
    aupr = average_precision_score(all_labels, all_predictions)

    print(f"AUROC: {auroc}")
    print(f"AUPR: {aupr}")

    # NOTE: warmup_steps is the value computed for the last fold.
    with open(results_file, "a") as f:
        f.write(f"{warmup_steps},{lr},{max_length},{auroc},{aupr}\n")

    predictions_labels_file = f"./wikitac_predictions_labels_warmup={warmup_steps}_lr={lr}_maxlen={max_length}.csv"
    # csv.writer quotes conversations containing commas, quotes or newlines,
    # which would otherwise break the three-column layout.
    with open(predictions_labels_file, "w", newline="") as f:
        writer = csv.writer(f, lineterminator="\n")
        writer.writerow(["Conversation", "Prediction", "Label"])
        writer.writerows(zip(all_val_convs, all_predictions, all_labels))

    time.sleep(100)

#### 2.1.3.2. Fine-tuning the last layer only

In [None]:
# Training with hyperparameters: Warmup Steps=3, Learning Rate=0.0001, Max Length=4096 --> The best.

In [None]:
"""
Using the optimal combination of hyperparameters
"""
# Single grid point: the values selected as optimal for this task.
LRs = [1e-4]
WARMUP_EPOCHS = [3]
MAX_LENGTH = [4096]

# Same (learning rate, warmup epochs, max length) tuple layout as a full grid.
hyperparameter_combinations = [
    (grid_lr, grid_warmup, grid_len)
    for grid_lr in LRs
    for grid_warmup in WARMUP_EPOCHS
    for grid_len in MAX_LENGTH
]

#################################################
##### Define Trainer and Args class objects #####
#################################################
class CustomTrainer(Trainer):
    """Trainer that optimises binary cross-entropy on the model's logits."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Run the model on ``inputs`` and return the binary cross-entropy loss.

        Args:
            model: The model being trained; called as ``model(**inputs)``.
            inputs: Batch mapping that contains a ``"labels"`` entry.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                just the loss.
            num_items_in_batch: Accepted (and ignored) so that Trainer versions
                which pass this keyword do not fail with a ``TypeError``.

        Returns:
            The scalar loss, or ``(loss, outputs)`` if ``return_outputs``.
        """
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        # BCEWithLogitsLoss applies the sigmoid internally in a numerically
        # stable way; a separate sigmoid followed by BCELoss saturates for
        # large-magnitude logits.
        loss_fct = BCEWithLogitsLoss()
        bce_loss = loss_fct(logits.view(-1), labels.float().view(-1))

        return (bce_loss, outputs) if return_outputs else bce_loss

def compute_metrics(eval_pred):
    """Compute AUROC and AUPR for one evaluation pass.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. Both are flattened with
            ``reshape(-1)`` and the predictions go through
            ``torch.from_numpy``, so NumPy arrays are expected.

    Returns:
        dict with the keys ``"auroc"`` and ``"aupr"``.
    """
    raw_scores, targets = eval_pred
    flat_scores = raw_scores.reshape(-1)
    flat_targets = targets.reshape(-1)

    # Squash the flattened raw scores through a sigmoid before scoring.
    probabilities = torch.from_numpy(flat_scores).sigmoid().numpy()

    return {
        "auroc": roc_auc_score(flat_targets, probabilities),
        "aupr": average_precision_score(flat_targets, probabilities),
    }


class Args:
    """Static settings read by the fine-tuning cell via ``args = Args()``."""

    # --- run control ---
    mode = 'train'    # 'train' enables the train / save / evaluate branch
    labels = 'after'  # NOTE(review): not read anywhere in the visible code — confirm use

    # --- model ---
    model_type = 'longformer'  # key into MODEL_CLASSES
    model_name = 'allenai/longformer-large-4096'
    model_path = 'models/'     # prefix for per-fold output and logging directories
    num_labels = 1

    # --- optimisation ---
    num_epochs = 5
    train_batch_size = 1
    valid_batch_size = 1

    # --- logging / checkpointing ---
    logging_steps = 1
    # save_steps = 300

args = Args()

# Imported here so the cell is self-contained (gc was previously imported
# inside the fold loop; csv is needed for properly quoted output below).
import csv
import gc

# Micro-batches accumulated per optimizer step. Used by both the warmup-step
# computation and TrainingArguments so the two values cannot drift apart.
GRAD_ACCUM_STEPS = 32

# Model / tokenizer classes keyed by Args.model_type (loop-invariant).
MODEL_CLASSES = {
    "longformer": (LongformerForSequenceClassification, LongformerTokenizerFast)
}

# NOTE(review): the section heading says only the last layer is fine-tuned, but
# nothing in this cell freezes any parameters — confirm which one is intended.

########################################
##### Define seeds and model setup #####
########################################
device = 'cuda' if torch.cuda.is_available() else 'cpu'

for seed in [1, 2, 3]:

    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

    if device == 'cuda':
        torch.cuda.manual_seed_all(seed)

    ####################################
    ##### Define Data and Training #####
    ####################################
    x, y = load_data_wikitac()

    all_data = list(zip(x, y))
    random.shuffle(all_data)
    # The fold indices below are applied to the *shuffled* `all_data`, so the
    # stratification must be computed from the labels in that same order.
    # Splitting on the original `y` would stratify against the wrong rows.
    shuffled_convs, shuffled_labels = (list(column) for column in zip(*all_data))

    num_folds = 7 # 7-fold cross validation
    kf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=seed)


    for i, (lr, warmup_epochs, max_length) in enumerate(hyperparameter_combinations):
        print(f'\n\n{i}')
        print(f"Training with hyperparameters: Warmup Steps={warmup_epochs}, Learning Rate={lr}, Max Length={max_length}")

        # Out-of-fold results pooled over all folds of this seed / combination.
        all_predictions = []
        all_labels = []
        all_val_convs = []

        for fold, (train_index, val_index) in enumerate(kf.split(shuffled_convs, shuffled_labels), start=1):
            print(f"Fold {fold}")
            train_data = [all_data[idx] for idx in train_index]
            val_data = [all_data[idx] for idx in val_index]
            train_convs, train_labels = zip(*train_data)
            val_convs, val_labels = zip(*val_data)

            # Fresh pretrained model and tokenizer for every fold.
            model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
            model = model_class.from_pretrained(args.model_name, num_labels=args.num_labels).to(device)
            tokenizer = tokenizer_class.from_pretrained(args.model_name) # do_lower_case=True
            tokenizer.model_max_length = max_length

            train_encodings = tokenizer(list(train_convs), truncation=True, padding=True, max_length=max_length)
            val_encodings = tokenizer(list(val_convs), truncation=True, padding=True, max_length=max_length)

            train_dataset = conv_data_loader(train_encodings, train_labels)
            val_dataset = conv_data_loader(val_encodings, val_labels)

            # Optimizer steps per epoch (examples / effective batch size)
            # multiplied by the number of warmup epochs.
            warmup_steps = len(train_dataset) // (args.train_batch_size * GRAD_ACCUM_STEPS) * warmup_epochs

            fold_dir = args.model_path + f'single_model_fold{fold}/'
            training_args = TrainingArguments(
                output_dir=fold_dir,
                num_train_epochs=args.num_epochs,
                per_device_train_batch_size=args.train_batch_size,
                per_device_eval_batch_size=args.valid_batch_size,
                warmup_steps=warmup_steps,
                learning_rate=lr,
                logging_dir=fold_dir + 'logs',
                load_best_model_at_end=False,
                metric_for_best_model='loss',
                logging_steps=args.logging_steps,
                evaluation_strategy="epoch",
                save_strategy="epoch",
                save_total_limit=1,
                gradient_accumulation_steps=GRAD_ACCUM_STEPS,
                logging_strategy="steps",
                logging_first_step=True,
                # Trainer re-seeds the RNGs from TrainingArguments.seed when it
                # is constructed; without this every outer seed would train
                # with the library default instead of `seed`.
                seed=seed,
            )

            trainer = CustomTrainer(
                model=model,
                args=training_args,
                train_dataset=train_dataset,
                eval_dataset=val_dataset,
                tokenizer=tokenizer,
                compute_metrics=compute_metrics,
                # callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
            )

            if args.mode == 'train':
                trainer.train()
                model.save_pretrained(fold_dir)
                tokenizer.save_pretrained(fold_dir)
                eval_output = trainer.evaluate()
                print('Evaluation results:', eval_output)

            output = trainer.predict(val_dataset)
            # reshape(-1) rather than squeeze(): a one-example fold would
            # squeeze to a 0-d array whose .tolist() is a bare float, which
            # list.extend() cannot consume.
            predictions = output.predictions.reshape(-1)
            all_predictions.extend(torch.sigmoid(torch.tensor(predictions)).tolist())

            all_labels.extend(val_labels)
            all_val_convs.extend(val_convs)


            # Per-seed, per-fold copy of the fine-tuned model and tokenizer.
            task = 'wikitac'
            model_path = f"./finetuned_longformers/{task}_seed={seed}_fold={fold}"
            model.save_pretrained(model_path)
            tokenizer.save_pretrained(model_path)

            # Clean GPU memory: drop every per-fold reference (names are left
            # bound to None), then collect and release cached CUDA blocks.
            model = tokenizer = trainer = output = predictions = None
            train_dataset = val_dataset = train_encodings = val_encodings = None
            train_data = val_data = None
            train_convs = val_convs = train_labels = val_labels = None
            gc.collect()
            if device == 'cuda':
                # CUDA-only calls; guarded so the cell also runs on CPU.
                torch.cuda.empty_cache()
                torch.cuda.reset_peak_memory_stats()

        auroc = roc_auc_score(all_labels, all_predictions)
        aupr = average_precision_score(all_labels, all_predictions)

        print(f"AUROC: {auroc}")
        print(f"AUPR: {aupr}")

        # csv.writer quotes fields, so conversations containing commas, quotes
        # or newlines no longer corrupt the row structure.
        predictions_labels_file = f"./wikitac_predictions_labels_seed={seed}.csv"
        with open(predictions_labels_file, "w", newline="") as f:
            writer = csv.writer(f, lineterminator="\n")
            writer.writerow(["Conversation", "Prediction", "Label"])
            writer.writerows(zip(all_val_convs, all_predictions, all_labels))

        # NOTE(review): the purpose of this pause is not evident from the code
        # — presumably a cool-down between runs; confirm or remove.
        time.sleep(100)

# 3. Evaluation time

## 3.1. Overall Evaluation

In [None]:
from scipy.stats import spearmanr, pearsonr

# Evaluate the pooled out-of-fold predictions gathered by the cross-validation
# loop (as left by its final seed / hyperparameter iteration).
# That loop leaves `trainer`, `val_dataset`, `val_labels`, `val_convs` and
# `predictions` set to None after every fold, so calling
# trainer.predict(val_dataset) here fails on a clean top-to-bottom run.
# `all_predictions` already holds sigmoid probabilities, which is what the
# thresholding and the "Predicted Probability" plot further down expect.
predictions = np.asarray(all_predictions, dtype=float)
val_labels = list(all_labels)
val_convs = list(all_val_convs)
assert len(predictions) == len(val_labels) == len(val_convs), \
    "pooled predictions, labels and conversations must have the same length"

# Compute Spearman correlation
spearman_corr = spearmanr(predictions, val_labels)[0]

print(f"Spearman Correlation: {spearman_corr}")

In [None]:
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc, accuracy_score, cohen_kappa_score, f1_score, confusion_matrix, recall_score, precision_score
from collections import Counter
import matplotlib.pyplot as plt

# Decision threshold: fraction of labels equal to 1.
threshold = Counter(val_labels)[1] / len(val_labels)

# Binarise the scores once; every threshold-dependent metric reuses this list.
y_pred_val_binary = [int(p >= threshold) for p in predictions]

# Threshold-free metrics on the validation set
auroc_val = roc_auc_score(val_labels, predictions)
precision_val, recall_val, _ = precision_recall_curve(val_labels, predictions)
auprc_val = auc(recall_val, precision_val)

# Threshold-dependent metrics
accuracy_val = accuracy_score(val_labels, y_pred_val_binary)
kappa_val = cohen_kappa_score(val_labels, y_pred_val_binary)
f1_val = f1_score(val_labels, y_pred_val_binary, average='binary')
cm_val = confusion_matrix(val_labels, y_pred_val_binary)

# Recall, precision, and specificity (the latter from the confusion matrix)
recall = recall_score(val_labels, y_pred_val_binary)
precision = precision_score(val_labels, y_pred_val_binary)
tn, fp, fn, tp = cm_val.ravel()
specificity = tn / (tn + fp)

print('Validation Set:')
for metric_label, metric_value in (
    ("AUROC:", auroc_val),
    ("AUPRC:", auprc_val),
    ("Accuracy:", accuracy_val),
    ("Cohen's Kappa:", kappa_val),
    ("F1 Score:", f1_val),
    ("Recall:", recall),
    ("Precision:", precision),
    ("Specificity:", specificity),
):
    print(metric_label, metric_value)
print("Confusion Matrix:")
print(cm_val)

# Density plot of predicted score against true label
plt.figure(figsize=(8, 6))
plt.hexbin(predictions, val_labels, gridsize=50, cmap='viridis', mincnt=1)
plt.colorbar(label='Count in bin')
plt.xlabel('Predicted Probability')
plt.ylabel('True Label')
plt.title('Validation Data Predictions')
plt.tight_layout()
plt.show()

## 3.2. Robustness Evaluation (OUM)

In [None]:
from transformers_interpret import SequenceClassificationExplainer
import pandas as pd
from scipy import stats

# Persist conversations, labels and predictions, then reload them from disk so
# the analysis below works on exactly what was saved.
csv_filename = f'predict_oum_val_data_corr={round(spearman_corr, 2)}.csv'
pd.DataFrame(
    {
        'Conversations': val_convs,
        'Labels': val_labels,
        'Predictions': predictions,
    }
).to_csv(csv_filename, index=False)

df = pd.read_csv(csv_filename)

# Whitespace-delimited token count per conversation
df['#words'] = [len(conversation.split()) for conversation in df['Conversations']]

def topic_categorisation(conv):
    """Label a conversation by the first topic keyword it contains.

    Keywords are checked case-insensitively in the order 'veganism', 'covid',
    'brexit'; the first one found as a substring of ``conv`` is returned.

    Args:
        conv: Conversation text (must support ``.lower()``).

    Returns:
        The matching keyword, or 'other' when none of them occurs.
    """
    lowered = conv.lower()
    for keyword in ('veganism', 'covid', 'brexit'):
        if keyword in lowered:
            return keyword
    return 'other'

# Tag every conversation with its topic
df['topic'] = df['Conversations'].map(topic_categorisation)


# Correlation over the full frame
overall_corr = stats.spearmanr(df['Labels'], df['Predictions'])[0]
print(f"Overall Spearman correlation: {overall_corr:.3f}")

# Correlation within each topic; sort=False keeps topics in order of first
# appearance, matching iteration over df['topic'].unique().
topic_corrs = {
    topic: stats.spearmanr(topic_df['Labels'], topic_df['Predictions'])[0]
    for topic, topic_df in df.groupby('topic', sort=False)
}

for topic, corr in topic_corrs.items():
    print(f"Spearman correlation for topic '{topic}': {corr:.3f}")