In [73]:
import numpy as np
import pandas as pd
import os
import random

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, TensorDataset

from transformers import (
    DataCollatorWithPadding,
    AutoModel,
    AutoTokenizer,
    get_linear_schedule_with_warmup
)
from datasets import Dataset

from sklearn.preprocessing import MinMaxScaler

os.chdir("/g/data/jr19/rh2942/text-empathy/")
from evaluation import pearsonr
from utils.utils import plot, get_device, set_all_seeds

In [74]:
os.environ['TOKENIZERS_PARALLELISM'] = 'false' # due to huggingface warning
os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = 'true'

In [85]:
class DataModule():
    def __init__(self, task, checkpoint, batch_size, feature_to_tokenise):
        super(DataModule, self).__init__()
        self.task = task
        self.checkpoint = checkpoint
        self.batch_size = batch_size
        self.tokeniser = AutoTokenizer.from_pretrained(
            self.checkpoint,
            use_fast=True
        )
        self.data_collator = DataCollatorWithPadding(tokenizer=self.tokeniser)
        self.feature_to_tokenise = feature_to_tokenise # to tokenise function
        assert len(self.feature_to_tokenise) == 1, 'feature_to_tokenise must be a list with one element'
    
    def _process_raw(self, path, send_label):
        data = pd.read_csv(path, sep='\t')
    
        if send_label:
            text = data[self.feature_to_tokenise + self.task]
        else:
            text = data[self.feature_to_tokenise]

        demog = ['gender', 'education', 'race', 'age', 'income']        
        data_demog = data[demog]
        scaler = MinMaxScaler()
        data_demog = pd.DataFrame(
            scaler.fit_transform(data_demog),
            columns=demog
        )
        data = pd.concat([text, data_demog], axis=1) 
            
        return data

    def _tokeniser_fn(self, sentence):
        assert len(self.feature_to_tokenise) == 1 # only one feature
        return self.tokeniser(sentence[self.feature_to_tokenise[0]], truncation=True)

    def _process_input(self, file, send_label):
        data = self._process_raw(path=file, send_label=send_label)
        data = data.reset_index(drop=True)
        data = Dataset.from_pandas(data, preserve_index=False) # convert to huggingface dataset
        data = data.map(self._tokeniser_fn, batched=True, remove_columns=self.feature_to_tokenise) # tokenise
        data = data.with_format('torch')
        return data

    # taken from https://pytorch.org/docs/stable/notes/randomness.html
    def _seed_worker(self, worker_id):
        worker_seed = torch.initial_seed() % 2**32
        np.random.seed(worker_seed)
        random.seed(worker_seed)     

    def dataloader(self, file, send_label, shuffle):
        data = self._process_input(file=file, send_label=send_label)

        # making sure the shuffling is reproducible
        g = torch.Generator()
        g.manual_seed(0)
        
        return DataLoader(
            data,
            batch_size=self.batch_size,
            shuffle=shuffle,
            collate_fn=self.data_collator,
            num_workers=24,
            worker_init_fn=self._seed_worker,
            generator=g
        )

In [93]:
class BiEncoder(nn.Module):
    def __init__(self, checkpoint):
        super(BiEncoder, self).__init__()
        self.transformer_article = AutoModel.from_pretrained(checkpoint, output_hidden_states=True, add_pooling_layer=False)
        self.transformer_essay = AutoModel.from_pretrained(checkpoint, output_hidden_states=True, add_pooling_layer=False)
        
        self.dropout = nn.Dropout(0.2)
        
        # self.fc1 = nn.Linear(1538, 768) #two hidden states (each of 768) and two similarity scores
        self.fc1 = nn.Linear(6151, 768) #two hidden states (each of 3072) and two similarity scores
        
        self.fc2 = nn.Linear(768, 512)
        self.fc3 = nn.Linear(512, 256)
        self.final = nn.Linear(256, 1)

    def _get_embeddings(self, output):
        layers = [-4, -3, -2, -1] #last four hidden states
        states = output.hidden_states
        embedding = torch.stack([states[i] for i in layers]).sum(0).squeeze()
        # print(embedding.shape)
        return embedding[:, 0, :] #return only CLS token's embedding
        
    def forward(
        self,
        input_ids_article=None,
        attention_mask_article=None,
        input_ids_essay=None,
        attention_mask_essay=None,
        gender=None,
        education=None,
        race=None,
        age=None,
        income=None
    ):

        output_article = self.transformer_article(
            input_ids=input_ids_article,
            attention_mask=attention_mask_article
        )
        
        embedding_article = self._get_embeddings(output_article)
        
        ## Using last hidden state only
        # output_article = output_article.last_hidden_state
        # output_article = output_article[:, 0] # CLS token. shape: (batch_size,768)

        ## Using last four hidden states
        output_article = output_article.hidden_states
        output_article = torch.cat([output_article[i] for i in [-4, -3, -2, -1]], dim=-1) #shape: (batch_size, seq_length, 768*4)
        output_article = output_article[:, 0] #CLS token. shape: (batch_size, 768*4)

        ### Essay

        output_essay = self.transformer_essay(
            input_ids=input_ids_essay,
            attention_mask=attention_mask_essay
        )
        embedding_essay = self._get_embeddings(output_essay)
        
        ## Using last hidden state only
        # output_essay = output_essay.last_hidden_state
        # output_essay = output_essay[:, 0]

        ## Using last four hidden states
        output_essay = output_essay.hidden_states
        output_essay = torch.cat([output_essay[i] for i in [-4, -3, -2, -1]], dim=-1) #shape: (batch_size, seq_length, 768*4)
        output_essay = output_essay[:, 0] #CLS token

        cosine = F.cosine_similarity(embedding_article, embedding_essay, dim=1).view(-1, 1) #shape: (batch_size, 1)
        euclidean = F.pairwise_distance(embedding_article, embedding_essay).view(-1, 1) #shape: (batch_size, 1)

        output = torch.cat((output_article, output_essay, cosine, euclidean,
                           gender, education, race, age, income), dim=1) # shape: (batch_size, 3072*2+1+5)
        
        output = self.fc1(output)
        output = nn.ReLU()(output)
        output = self.dropout(output)
        output = self.fc2(output)
        output = nn.ReLU()(output)
        output = self.fc3(output)
        output = nn.ReLU()(output)
        output = self.final(output)
        return output

In [94]:
class Trainer:
    def __init__(self, task, model, lr, n_epoch, train_loader_article, train_loader_essay,
                dev_loader_article, dev_loader_essay, dev_label_file):
        self.device = get_device(0)
        
        self.task = task
        self.model = model.to(self.device)
        
        self.lr = lr
        self.n_epoch = n_epoch
        self.train_loader_article = train_loader_article
        self.train_loader_essay = train_loader_essay
        self.dev_loader_article = dev_loader_article
        self.dev_loader_essay = dev_loader_essay
        self.dev_label_file = dev_label_file
        
        self.loss_fn = nn.MSELoss()
        
        self.optimiser = torch.optim.AdamW(
            params=self.model.parameters(),
            lr=self.lr,
            betas=(0.9, 0.999),
            eps=1e-06,
            weight_decay=0.01
        )

        n_training_step = self.n_epoch*len(self.train_loader_article)
        self.lr_scheduler = get_linear_schedule_with_warmup(
            optimizer=self.optimiser,
            num_warmup_steps=0.01*n_training_step,
            num_training_steps=n_training_step
        )
        
        self.best_pearson_r = -1.0 # initiliasation
        
        assert len(self.task) == 1, 'Task must be a list with one element'

    def _training_step(self, epoch):
        tr_loss = 0.0
        
        self.model.train()
    
        for (data_article, data_essay) in zip(self.train_loader_article, self.train_loader_essay):
            input_ids_article = data_article['input_ids'].to(self.device, dtype=torch.long)
            attention_mask_article = data_article['attention_mask'].to(self.device, dtype=torch.long)
            input_ids_essay = data_essay['input_ids'].to(self.device, dtype=torch.long)
            attention_mask_essay = data_essay['attention_mask'].to(self.device, dtype=torch.long)

            assert (data_article[self.task[0]].detach().numpy() == data_essay[self.task[0]].detach().numpy()).all(), 'Ground truth is different between encoders'
            targets = data_article[self.task[0]].to(self.device, dtype=torch.float).view(-1, 1)
            
            gender = data_article['gender'].to(self.device, dtype=torch.float).view(-1, 1)
            education = data_article['education'].to(self.device, dtype=torch.float).view(-1, 1)
            race = data_article['race'].to(self.device, dtype=torch.float).view(-1, 1)
            age = data_article['age'].to(self.device, dtype=torch.float).view(-1, 1)
            income = data_article['income'].to(self.device, dtype=torch.float).view(-1, 1)
            
            outputs = self.model(
                input_ids_article=input_ids_article,
                attention_mask_article=attention_mask_article,
                input_ids_essay=input_ids_essay,
                attention_mask_essay=attention_mask_essay,
                gender=gender,
                education=education,
                race=race,
                age=age,
                income=income
            )          
            loss = self.loss_fn(outputs, targets)
            self.optimiser.zero_grad()
            loss.backward()
            self.optimiser.step()
            self.lr_scheduler.step()
            
            tr_loss += loss.item()

        epoch_loss = tr_loss / len(self.train_loader_article)
        print(f'The total loss: {epoch_loss}')

    def fit(self, save_model=False):
        
        for epoch in range(self.n_epoch):
            print(f'Epoch: {epoch+1}')
            self._training_step(epoch)

            preds = self.evaluate(dataloader_article=self.dev_loader_article,
                                  dataloader_essay=self.dev_loader_essay, load_model=False)

            dev_label = pd.read_csv(self.dev_label_file, sep='\t', header=None)
            if self.task[0] == 'empathy':
                true = dev_label.iloc[:, 0].tolist()
            if self.task[0] == 'distress':
                true = dev_label.iloc[:, 1].tolist()
            pearson_r = pearsonr(true, preds)
            print(f'Pearson r: {pearson_r}')
            
            if save_model and (pearson_r > self.best_pearson_r):
                self.best_pearson_r = pearson_r   
                torch.save(self.model.state_dict(), 'roberta-empathy.pth')
                print("Saved the model in epoch " + str(epoch+1))

        torch.cuda.empty_cache()

    def evaluate(self, dataloader_article, dataloader_essay, load_model=False):
        if load_model:
            self.model.load_state_dict(torch.load('roberta-empathy.pth'))
    
        pred = torch.empty((len(dataloader_article.dataset), 1), device=self.device) # len(self.dev_loader.dataset) --> # of samples
        
        self.model.eval()
    
        with torch.no_grad():
            idx = 0
            for (data_article, data_essay) in zip(dataloader_article, dataloader_essay):
                input_ids_article = data_article['input_ids'].to(self.device, dtype=torch.long)
                attention_mask_article = data_article['attention_mask'].to(self.device, dtype=torch.long)
                input_ids_essay = data_essay['input_ids'].to(self.device, dtype=torch.long)
                attention_mask_essay = data_essay['attention_mask'].to(self.device, dtype=torch.long)

                #taken from article dataloader. can also be taken from essay dataloader
                gender = data_article['gender'].to(self.device, dtype=torch.float).view(-1, 1)
                education = data_article['education'].to(self.device, dtype=torch.float).view(-1, 1)
                race = data_article['race'].to(self.device, dtype=torch.float).view(-1, 1)
                age = data_article['age'].to(self.device, dtype=torch.float).view(-1, 1)
                income = data_article['income'].to(self.device, dtype=torch.float).view(-1, 1)
                
                outputs = self.model(
                    input_ids_article=input_ids_article,
                    attention_mask_article=attention_mask_article,
                    input_ids_essay=input_ids_essay,
                    attention_mask_essay=attention_mask_essay,
                    gender=gender,
                    education=education,
                    race=race,
                    age=age,
                    income=income
                )

                batch_size = outputs.shape[0]
                pred[idx:idx+batch_size, :] = outputs
                idx += batch_size
            
        return [float(k) for k in pred]

In [95]:
BATCH_SIZE = 8
lr = 1e-05
checkpoint = 'roberta-base'
task_list = ['empathy', 'distress']
feature_list = ['article', 'demographic_essay']

train_file = './data/PREPROCESSED-WS22-WS23-train.tsv'
# train_file = './data/COMBINED-PREPROCESSED-PARAPHRASED-WS22-WS23-train.tsv'

# WASSA 2022
# dev_file = './data/PREPROCESSED-WS22-dev.tsv'
# dev_label_file = './data/WASSA22/goldstandard_dev_2022.tsv'
# test_file = './data/PREPROCESSED-WS22-test.tsv'

# WASSA 2023
dev_file = './data/PREPROCESSED-WS23-dev.tsv'
dev_label_file = './data/WASSA23/goldstandard_dev.tsv'
test_file = './data/PREPROCESSED-WS23-test.tsv'

In [96]:
data_module_article = DataModule(
    task=[task_list[0]],
    checkpoint=checkpoint,
    batch_size=BATCH_SIZE,
    feature_to_tokenise=[feature_list[0]]
)

data_module_essay = DataModule(
    task=[task_list[0]],
    checkpoint=checkpoint,
    batch_size=BATCH_SIZE,
    feature_to_tokenise=[feature_list[1]]
)

In [97]:
train_loader_article = data_module_article.dataloader(file=train_file, send_label=True, shuffle=True)
dev_loader_article = data_module_article.dataloader(file=dev_file, send_label=False, shuffle=False)

train_loader_essay = data_module_essay.dataloader(file=train_file, send_label=True, shuffle=True)
dev_loader_essay = data_module_essay.dataloader(file=dev_file, send_label=False, shuffle=False)

Map:   0%|          | 0/2636 [00:00<?, ? examples/s]

Map:   0%|          | 0/208 [00:00<?, ? examples/s]

Map:   0%|          | 0/2636 [00:00<?, ? examples/s]

Map:   0%|          | 0/208 [00:00<?, ? examples/s]

In [98]:
set_all_seeds(0)
model = BiEncoder(checkpoint=checkpoint)

trainer = Trainer(
    task=[task_list[0]],
    model=model,
    lr=lr,
    n_epoch=10,
    train_loader_article=train_loader_article,
    train_loader_essay=train_loader_essay,
    dev_loader_article=dev_loader_article,
    dev_loader_essay=dev_loader_essay,
    dev_label_file=dev_label_file,
)

In [None]:
trainer.fit(
    save_model=False
)

Epoch: 1
The total loss: 5.759729770819346
Pearson r: 0.275
Epoch: 2
The total loss: 3.0137373674999584
Pearson r: 0.379
Epoch: 3
The total loss: 2.3595968879533538
Pearson r: 0.554
Epoch: 4
The total loss: 1.7387050343282295
Pearson r: 0.556
Epoch: 5
The total loss: 1.1739419195236582
Pearson r: 0.536
Epoch: 6
The total loss: 0.8594366723502224
Pearson r: 0.541
Epoch: 7
The total loss: 0.6777805758245063
Pearson r: 0.555
Epoch: 8
The total loss: 0.5228156495162032
Pearson r: 0.553
Epoch: 9


In [None]:
def objective(trial):
    lr = trial.suggest_float('Learning rate', 1e-05, 1e-03, log=True)
    num_warmup = trial.suggest_int('Warmup steps', 0, 100)
    beta_1 = 
    beta_2 = 

# Test

In [21]:
test_loader_article = data_module_article.dataloader(file=test_file, send_label=False, shuffle=False)
test_loader_essay = data_module_essay.dataloader(file=test_file, send_label=False, shuffle=False)
pred = trainer.evaluate(dataloader_article=test_loader_article, dataloader_essay=test_loader_essay, load_model=True)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [22]:
pred_df = pd.DataFrame({'emp': pred, 'dis': pred})

In [23]:
pred_df

Unnamed: 0,emp,dis
0,4.348185,4.348185
1,4.576570,4.576570
2,5.480320,5.480320
3,4.093744,4.093744
4,4.806672,4.806672
...,...,...
95,3.575655,3.575655
96,5.516463,5.516463
97,5.515535,5.515535
98,5.277962,5.277962


In [24]:
pred_df.to_csv('./tmp/predictions_EMP.tsv', sep='\t', index=None, header=None)