In [2]:
import numpy as np
import pandas as pd
import os

In [3]:
os.chdir("/home/573/rh2942/WASSA-2023-EMP") # changing dir for evaluation file
os.environ["TOKENIZERS_PARALLELISM"] = "false" # due to huggingface warning

In [3]:
import transformers as trf
from datasets import Dataset
import torch

In [4]:
#padding="longest" is deferred to do dynamic padding
def tokenise(sentence):
    return tokeniser(sentence[feature_1], sentence[feature_2], truncation=True) #feature_1 and feature_2 to be concatenated by [SEP] token by Huggingface tokeniser

def load_tokenised_data(filename, task, tokenise_fn, train_test):
   
    input_data = pd.read_csv(filename, header=0, index_col=0)
    
    if train_test == "train":
        chosen_data = input_data[[feature_1, feature_2, task]]
    elif train_test == "test":
        chosen_data = input_data[[feature_1, feature_2]]  #test data shouldn't have output label
    
    hugging_dataset = Dataset.from_pandas(chosen_data, preserve_index=False)

    tokenised_hugging_dataset = hugging_dataset.map(tokenise_fn, batched=True, remove_columns = [feature_1, feature_2])
    
    if train_test == "train":
        tokenised_hugging_dataset = tokenised_hugging_dataset.rename_column(task, "labels") # as huggingface requires
    
    tokenised_hugging_dataset = tokenised_hugging_dataset.with_format("torch")

    return tokenised_hugging_dataset

# Prediction

In [5]:
NUM_EPOCH = 50

# train_filename = "CONV_train_train_paraphrased.csv"
# test_filename = "CONV_preprocessed_complete_dev.csv"

## during final test time
train_filename = "CONV_train_dev_paraphrased.csv"
test_filename = "CONV_preprocessed_test.csv"

#Chosen features
feature_1 = 'text'
feature_2 = 'article'

checkpoint = "bert-base-uncased"

tokeniser = trf.AutoTokenizer.from_pretrained(checkpoint)

# data collator due to variable max token length per batch size
data_collator = trf.DataCollatorWithPadding(tokenizer = tokeniser)

## Without huggingface acclerator

In [6]:
def train_test_wo_acc(task, lr, batch_size, seed):
    """
    train-test pipeline without huggingface accelerator
    """
    print(f"{task} prediction")  #task: "empathy" or "distress" or ...
    
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed) # just being extra cautious
    np.random.seed(seed)
    
    model = trf.AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=1)
    
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    model.to(device)
    
    opt = torch.optim.AdamW(model.parameters(), lr=lr)
  
    trainset = load_tokenised_data(filename=os.path.join("./processed_data", train_filename), task=task, tokenise_fn=tokenise, train_test="train")
       
    trainloader = torch.utils.data.DataLoader(
        trainset, shuffle=True, batch_size=batch_size, collate_fn=data_collator
    )
    
    training_steps = NUM_EPOCH * len(trainloader)
    lr_scheduler = trf.get_scheduler(
        "linear",
        optimizer=opt,
        num_warmup_steps=0,
        num_training_steps=training_steps
    )
    
    # evaluation data loader
    testset = load_tokenised_data(filename=os.path.join("./processed_data", test_filename), task=task, tokenise_fn=tokenise, train_test="test")
    testloader = torch.utils.data.DataLoader(
        testset, shuffle=False, batch_size=batch_size, collate_fn=data_collator
    )
    
    model.train()
    for epoch in range(0, NUM_EPOCH):        
        epoch_loss = 0
        num_batches = 0

        # Iterate over the DataLoader for training data
        for batch in trainloader:
            # Perform forward pass
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss

            loss.backward()
            opt.step()
            lr_scheduler.step()         
            opt.zero_grad()
            
            epoch_loss += loss.item()
            num_batches += 1

        # Process is complete.
        avg_epoch_loss = epoch_loss / num_batches
        print(f"Epoch {epoch+1}: average loss = {avg_epoch_loss}")    
            
        # Starting evaluation
        model.eval()
        y_pred = []

        for batch in testloader:
            with torch.no_grad():
                batch = {k: v.to(device) for k, v in batch.items()}
                outputs = model(**batch)

            batch_pred = [item for sublist in outputs.logits.tolist() for item in sublist]  #convert 2D list to 1D
            y_pred.extend(batch_pred)
  
    y_pred_df = pd.DataFrame({task: y_pred})
    filename = "./tmp/predictions_" + task + ".tsv"
    y_pred_df.to_csv(filename, sep='\t', header=False, index=False)

## Accelerator

In [None]:
from accelerate import Accelerator
from accelerate import notebook_launcher

In [None]:
def train_test(model, task, lr, batch_size):
    """
    train-test steps with huggingface accelerator
    """
    accelerator = Accelerator()
    
    accelerator.print(f"{task} prediction")  #task: "empathy" or "distress" or ...
    
    opt = torch.optim.AdamW(model.parameters(), lr=lr)
  
    trainset = load_tokenised_data(filename=os.path.join("./processed_data", train_filename), task=task, tokenise_fn=tokenise, train_test="train")
       
    trainloader = torch.utils.data.DataLoader(
        trainset, shuffle=True, batch_size=batch_size, collate_fn=data_collator
    )
    
    training_steps = NUM_EPOCH * len(trainloader)
    lr_scheduler = trf.get_scheduler(
        "linear",
        optimizer=opt,
        num_warmup_steps=0,
        num_training_steps=training_steps
    )

    trainloader, model, opt = accelerator.prepare(
        trainloader, model, opt    
    )
    
    # evaluation data loader
    testset = load_tokenised_data(filename=os.path.join("./processed_data", test_filename), task=task, tokenise_fn=tokenise, train_test="test")
    testloader = torch.utils.data.DataLoader(
        testset, shuffle=False, batch_size=batch_size, collate_fn=data_collator
    )
    
    model.train()
    for epoch in range(0, NUM_EPOCH):        
        epoch_loss = 0
        num_batches = 0

        # Iterate over the DataLoader for training data
        for batch in trainloader:
            # Perform forward pass
            outputs = model(**batch)
            
            loss = outputs.loss

            accelerator.backward(loss)
        
            opt.step()
            lr_scheduler.step()
            
            opt.zero_grad()
            
            epoch_loss += loss.item()
            num_batches += 1

        # Process is complete.
        avg_epoch_loss = epoch_loss / num_batches
        accelerator.print(f"Epoch {epoch+1}: average loss = {avg_epoch_loss}")    
            
    # Starting evaluation after full-training
    model.eval()
    y_pred = []

    for batch in testloader:
        with torch.no_grad():
            outputs = model(**batch)

        batch_pred = [item for sublist in outputs.logits.tolist() for item in sublist]  #convert 2D list to 1D
        y_pred.extend(batch_pred)
  
    y_pred_df = pd.DataFrame({task: y_pred})
    filename = "./prediction/predictions_" + task + ".tsv"
    y_pred_df.to_csv(filename, sep='\t', header=False, index=False)

def final_prediction(task, lr, batch, seed):
    """
    Run train-test using Huggingface accelerator
    """

    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    
    model = trf.AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=1)

    notebook_launcher(train_test, (model,task,lr,batch), num_processes=torch.cuda.device_count())

## train-test

In [4]:
task = "EmotionalPolarity"
train_test_wo_acc(task=task, lr=1.06e-05, batch_size=10, seed=96)
predictions_em_pol = pd.read_csv("./tmp/predictions_" + task + ".tsv", sep='\t', header=None)

In [5]:
task="Emotion"
train_test_wo_acc(task=task, lr=1.44e-05, batch_size=10, seed=87)
predictions_emo = pd.read_csv("./tmp/predictions_" + task + ".tsv", sep='\t', header=None)

In [6]:
task="Empathy"
train_test_wo_acc(tas=task, lr=1.97e-05, batch_size= 12, seed= 68)
predictions_emp = pd.read_csv("./tmp/predictions_" + task + ".tsv", sep='\t', header=None)

In [7]:
predictions_CONV = pd.concat([predictions_em_pol, predictions_emo, predictions_emp], axis=1)

predictions_CONV.to_csv("./prediction/predictions_CONV.tsv", sep='\t', header=False, index=False)

## Just checking the dev set performance

In [16]:
from evaluation import pearsonr

In [17]:
gold_dev = pd.read_csv('./dataset/dev/goldstandard_CONV_dev.tsv', sep='\t', header=None) # no header
# print(f"EmotionalPolarity: {pearsonr(gold_dev.loc[:,0].tolist(), predictions_em_pol.loc[:,0].tolist())}")
print(f"Emotion: {pearsonr(gold_dev.loc[:,1].tolist(), predictions_emo.loc[:,0].tolist())}")
# print(f"Empathy: {pearsonr(gold_dev.loc[:,2].tolist(), predictions_emp.loc[:,0].tolist())}")

Emotion: 0.765


# Hyperparam tuning

In [5]:
import torch
import transformers as trf
import optuna
import plotly
from functools import partial
from evaluation import pearsonr

In [6]:
NUM_EPOCH = 35

train_filename = "CONV_train_train_paraphrased.csv"
test_filename = "CONV_preprocessed_complete_dev.csv"

## during final test time
# train_filename = "CONV_train_dev_paraphrased.csv"
# test_filename = "CONV_preprocessed_test.csv"

#Chosen features
feature_1 = 'text'
feature_2 = 'article'

checkpoint = "bert-base-uncased"

tokeniser = trf.AutoTokenizer.from_pretrained(checkpoint)

# data collator due to variable max token length per batch size
data_collator = trf.DataCollatorWithPadding(tokenizer = tokeniser)

In [7]:
def objective(trial, task):    
    # Tuning hyperparams:
    LEARNING_RATE = trial.suggest_float("LEARNING_RATE", 1e-05, 1e-04, log=True)
    BATCH_SIZE = trial.suggest_int("BATCH_SIZE", 2, 16)
    SEED = trial.suggest_int("SEED", 1, 100)
    # checkpoint = trial.suggest_categorical("checkpoint", ("bert-base-uncased", "albert-base-v2"))
    
    torch.manual_seed(SEED)
    np.random.seed(SEED)
    
    model = trf.AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=1)

    device = torch.device("cuda:1") if torch.cuda.is_available() else torch.device("cpu")
    model.to(device)
    
    opt = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
    
    # Training by train set only
    trainset = load_tokenised_data(filename=os.path.join("./processed_data", train_filename), task=task, tokenise_fn=tokenise, train_test="train")
    
    trainloader = torch.utils.data.DataLoader(
        trainset, shuffle=True, batch_size=BATCH_SIZE, collate_fn=data_collator
    )
    
    training_steps = NUM_EPOCH * len(trainloader)
    lr_scheduler = trf.get_scheduler(
        "linear",
        optimizer=opt,
        num_warmup_steps=0,
        num_training_steps=training_steps
    )
    
    # Evaluation data loader
    testset = load_tokenised_data(filename=os.path.join("./processed_data", test_filename), task=task, tokenise_fn=tokenise, train_test="train")
    testloader = torch.utils.data.DataLoader(
        testset, shuffle=False, batch_size=BATCH_SIZE, collate_fn=data_collator
    )
    
    model.train()
    for epoch in range(0, NUM_EPOCH):
        # Iterate over the DataLoader for training data
        for batch in trainloader:
            # Perform forward pass
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss

            loss.backward()    
            opt.step()
            lr_scheduler.step()
            opt.zero_grad()

        # Evaluation   
        model.eval()
        y_pred = []
        y_true =[]

        for batch in testloader:
            with torch.no_grad():
                batch = {k: v.to(device) for k, v in batch.items()}
                outputs = model(**batch)

            batch_pred = [item for sublist in outputs.logits.tolist() for item in sublist]  #convert 2D list to 1D
            y_pred.extend(batch_pred)
            y_true.extend(batch['labels'].tolist())
        
        pearson_r = pearsonr(y_true, y_pred)
            
        trial.report(pearson_r, epoch)
            
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()
    
    return pearson_r

def optuna_tuner(task):
    """
    Run optuna study trial and generate plots
    """
    study = optuna.create_study(
        study_name = task,
        storage = "sqlite:///{}.db".format(task),
        sampler = optuna.samplers.TPESampler(seed=28),
        pruner = optuna.pruners.MedianPruner(),
        direction = "maximize",
        load_if_exists = True
    )
    
    objective_param = partial(objective, task=task) #sending parameters to the objective function
    study.optimize(objective_param, n_trials=100, show_progress_bar=True)

    trial_results = study.trials_dataframe() #trial results as a dataframe
    trial_results.to_csv("CONV_trial_results_" + task + ".csv")

    print(f"Best Pearson r: {study.best_value}")
    print(f"Best parameter: {study.best_params}")
    
    fig_1 = optuna.visualization.plot_slice(study)
    fig_1.show()
    fig_1.write_image("./prediction/CONV_" + task + "-param-plots.pdf")

    fig_2 = optuna.visualization.plot_param_importances(study)
    fig_2.show()
    fig_2.write_image("./prediction/CONV_" + task + "-param-importance.pdf")

In [None]:
tasks = [
    # 'EmotionalPolarity',
    'Emotion',
    # 'Empathy'
]

for task in tasks:
    optuna_tuner(task=task)

## Manual checking which features

In [6]:
from sklearn.model_selection import KFold
from evaluation import pearsonr, calculate_pearson

In [7]:
def train_test_kfold(model):
    accelerator = Accelerator()
    
    opt = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

    training_steps = NUM_EPOCH * len(trainloader)
    lr_scheduler = trf.get_scheduler(
        "linear",
        optimizer=opt,
        num_warmup_steps=0,
        num_training_steps=training_steps
    )
  
    trainloader_acclerate, model, opt = accelerator.prepare(
        trainloader, model, opt    
    )  
    
    for epoch in range(0, NUM_EPOCH):
        
        epoch_loss = 0
        num_batches = 0

        # Iterate over the DataLoader for training data
        for batch in trainloader_acclerate:
            # Perform forward pass
            outputs = model(**batch)
            
            loss = outputs.loss
#             loss = loss_function(outputs, targets)

            accelerator.backward(loss)
        
            opt.step()
            lr_scheduler.step()
            
            opt.zero_grad()
            
            epoch_loss += loss.item()
            num_batches += 1

        # Process is complete.
        avg_epoch_loss = epoch_loss / num_batches
        accelerator.print(f"Epoch {epoch}: average loss = {avg_epoch_loss}")
    
    
    # Evaluation
        
    model.eval()

    y_true =[]
    y_pred = []

    for batch in testloader:
        with torch.no_grad():
            outputs = model(**batch)

        y_true.extend(batch['labels'].tolist())

        batch_pred = [item for sublist in outputs.logits.tolist() for item in sublist]  #convert 2D list to 1D
        y_pred.extend(batch_pred)
        
    pearson_r = pearsonr(y_true, y_pred)
    
    accelerator.print('\n' + checkpoint + ' & ' + str(LEARNING_RATE) + ' & ' + str(BATCH_SIZE) + ' & ' + feature_1 + '-' + feature_2 + ' & ' + str(pearson_r) + ' fold-' + str(fold) + '\n')

In [None]:
NUM_EPOCH = 35
BATCH_SIZE = 32
LEARNING_RATE = 5e-5

train_dev_filename = "CONV_train_dev_paraphrased.csv"

#Chosen features
feature_1 = 'text'
feature_2 = 'article'

checkpoint = "bert-base-uncased"

tokeniser = trf.AutoTokenizer.from_pretrained(checkpoint)

# data collator due to variable max token length per batch size
data_collator = trf.DataCollatorWithPadding(tokenizer = tokeniser)

checkpoint = "bert-base-uncased"

#padding="longest" can be deferred to do dynamic padding
def tokenise(sentence):
    return tokeniser(sentence[feature_1], sentence[feature_2], truncation=True)
    # return tokeniser(sentence[feature_1], truncation=True)
    
# data collator due to variable max token length per batch size
data_collator = trf.DataCollatorWithPadding(tokenizer = tokeniser)

train_dev = load_tokenised_data(filename=os.path.join("./processed_data", train_dev_filename), task=task, tokenise_fn=tokenise, train_test="train")

K_FOLD = 5

# Set fixed random number seed
torch.manual_seed(42)

kfold = KFold(n_splits=K_FOLD, shuffle=True)

print('--------------------------------')

# K-fold Cross Validation model evaluation
for fold, (train_idx, test_idx) in enumerate(kfold.split(train_dev)):

    print(f'FOLD {fold}')
    print('--------------------------------')

    # Sample elements randomly from a given list of ids, no replacement.
    train_subsampler = torch.utils.data.SubsetRandomSampler(train_idx)
    test_subsampler = torch.utils.data.SubsetRandomSampler(test_idx)

    # Define data loaders for training and testing data in this fold
    trainloader = torch.utils.data.DataLoader(
        train_dev,
        batch_size=BATCH_SIZE,
        collate_fn=data_collator,
        sampler=train_subsampler
    )
    testloader = torch.utils.data.DataLoader(
        train_dev,
        batch_size=BATCH_SIZE,
        collate_fn=data_collator,
        sampler=test_subsampler
    )

    # Init the neural network
    model = trf.AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=1)

    notebook_launcher(train_test_kfold, (model,), num_processes=torch.cuda.device_count())

### for both features

In [None]:
# for both features

def load_tokenised_data(filename, task, tokenise_fn, train_test):
   
    input_data = pd.read_csv(filename, header=0, index_col=0)
    
    if train_test == "train":
        chosen_data = input_data[[feature_1, feature_2, task]]
    elif train_test == "test":
        chosen_data = input_data[[feature_1, feature_2]]  #test data shouldn't have output label
    

    hugging_dataset = Dataset.from_pandas(chosen_data, preserve_index=False)

    tokenised_hugging_dataset = hugging_dataset.map(tokenise_fn, batched=True, remove_columns = [feature_1, feature_2])
    
    if train_test == "train":
        tokenised_hugging_dataset = tokenised_hugging_dataset.rename_column(task, "labels") # as huggingface requires
    
    tokenised_hugging_dataset = tokenised_hugging_dataset.with_format("torch")

    return tokenised_hugging_dataset

### for single feature

In [None]:
def load_tokenised_data(filename, task, tokenise_fn, train_test):
   
    input_data = pd.read_csv(filename, header=0, index_col=0)
    
    # for single feature
    if train_test == "train":
        chosen_data = input_data[[feature_1, task]]
    elif train_test == "test":
        chosen_data = input_data[[feature_1]]  #test data shouldn't have output label

    hugging_dataset = Dataset.from_pandas(chosen_data, preserve_index=False)

    tokenised_hugging_dataset = hugging_dataset.map(tokenise_fn, batched=True, remove_columns = [feature_1])
    
    if train_test == "train":
        tokenised_hugging_dataset = tokenised_hugging_dataset.rename_column(task, "labels") # as huggingface requires
    
    tokenised_hugging_dataset = tokenised_hugging_dataset.with_format("torch")

    return tokenised_hugging_dataset