In [1]:
import numpy as np
import pandas as pd
import os

import torch
import torch.nn as nn
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader, TensorDataset

from transformers import (
    DataCollatorWithPadding,
    AutoModel,
    AutoTokenizer
)
from datasets import Dataset

os.chdir("/g/data/jr19/rh2942/text-empathy/")
from evaluation import pearsonr
from utils.utils import plot, get_device, set_all_seeds

In [2]:
os.environ['TOKENIZERS_PARALLELISM'] = 'false' # due to huggingface warning
os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = 'true'

In [20]:
class DataModule():
    def __init__(self, task_list, checkpoint, batch_size, feature_to_tokenise):
        super(DataModule, self).__init__()
        self.task_list = task_list
        self.checkpoint = checkpoint
        self.batch_size = batch_size
        self.tokeniser = AutoTokenizer.from_pretrained(
            self.checkpoint,
            use_fast=True
        )
        self.data_collator = DataCollatorWithPadding(tokenizer=self.tokeniser)
        self.feature_to_tokenise = feature_to_tokenise # to tokenise function
    
    def _process_raw(self, path, send_label):
        data = pd.read_csv(path, sep='\t')
    
        if send_label:
            data = data[self.feature_to_tokenise + self.task_list]
        else:
            data = data[self.feature_to_tokenise]
            
        return data

    def _tokeniser_fn(self, sentence):
        if len(self.feature_to_tokenise) == 1: # only one feature
            return self.tokeniser(sentence[self.feature_to_tokenise[0]], truncation=True)
        # otherwise tokenise a pair of sentence
        return self.tokeniser(sentence[self.feature_to_tokenise[0]], sentence[self.feature_to_tokenise[1]], truncation=True)

    def _process_input(self, file, send_label):
        data = self._process_raw(path=file, send_label=send_label)
        data = data.reset_index(drop=True)
        data = Dataset.from_pandas(data, preserve_index=False) # convert to huggingface dataset
        data = data.map(self._tokeniser_fn, batched=True, remove_columns=self.feature_to_tokenise) # tokenise
        data = data.with_format('torch')
        return data

    def dataloader(self, file, send_label, shuffle):
        data = self._process_input(file=file, send_label=send_label)
        return DataLoader(
            data,
            batch_size=self.batch_size,
            shuffle=shuffle,
            collate_fn=self.data_collator,
            num_workers=24
        )

In [21]:
class Guide(nn.Module):
    def __init__(self, checkpoint):
        super(Guide, self).__init__()
        self.transformer = AutoModel.from_pretrained(checkpoint)
        self.dropout = nn.Dropout(0.2)
        
        self.pre_classifier = nn.Linear(768, 768)
        
        self.classifier = nn.Linear(768, 512)
        self.pre_final = nn.Linear(512, 256)
        self.final = nn.Linear(256, 1)

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        # token_type_ids=None,
    ):

        output = self.transformer(
            input_ids=input_ids,
            attention_mask=attention_mask,
            # token_type_ids=token_type_ids
        )
        
        output = output[0]
        output = output[:, 0]

        output = self.pre_classifier(output)
        
        output = nn.ReLU()(output)
        output = self.dropout(output)
        output = self.classifier(output)
        output = nn.ReLU()(output)
        output = self.pre_final(output)
        output = nn.ReLU()(output)
        output = self.final(output)
        return output

In [22]:
class Predictor(nn.Module):
    def __init__(self, checkpoint):
        super(Predictor, self).__init__()
        self.transformer = AutoModel.from_pretrained(checkpoint)
        self.dropout = nn.Dropout(0.2)
        
        self.pre_classifier = nn.Linear(769, 768)
        
        self.classifier = nn.Linear(768, 512)
        self.pre_final = nn.Linear(512, 256)
        self.final = nn.Linear(256, 1)

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        # token_type_ids=None,
        guide=None
    ):

        output = self.transformer(
            input_ids=input_ids,
            attention_mask=attention_mask,
            # token_type_ids=token_type_ids
        )
        
        output = output[0]
        output = output[:, 0]

        output = torch.cat([output, guide], 1)
        output = self.pre_classifier(output)
    
        output = nn.ReLU()(output)
        output = self.dropout(output)
        output = self.classifier(output)
        output = nn.ReLU()(output)
        output = self.pre_final(output)
        output = nn.ReLU()(output)
        output = self.final(output)
        return output

In [23]:
class Trainer:
    def __init__(self, task_guide, task_predictor, model_guide, model_predictor, lr):
        self.device = get_device(0)
        
        self.task_guide = task_guide
        self.task_predictor = task_predictor
        self.model_guide = model_guide.to(self.device)
        self.model_predictor = model_predictor.to(self.device)
        
        self.lr = lr
        self.loss_fn = nn.MSELoss()
        
        self.optimizer_guide = torch.optim.Adam(params=self.model_guide.parameters(), lr = self.lr)
        self.optimizer_predictor = torch.optim.Adam(params=self.model_predictor.parameters(), lr = self.lr)
        
        self.best_pearson_r = -1.0 # initiliasation

    def _training_step(self, epoch, train_loader):
        tr_loss = 0.0
        
        self.model_guide.train()
        self.model_predictor.train()
    
        for i, data in enumerate(train_loader, 0):
            input_ids = data['input_ids'].to(self.device, dtype=torch.long)
            attention_mask = data['attention_mask'].to(self.device, dtype=torch.long)
            # token_type_ids = data['token_type_ids'].to(self.device, dtype=torch.long)
            guide = data[self.task_guide].to(self.device, dtype=torch.float).view(-1, 1)
            targets = data[self.task_predictor].to(self.device, dtype=torch.float).view(-1, 1)
            
            outputs = self.model_guide(
                input_ids=input_ids,
                attention_mask=attention_mask,
                # token_type_ids=token_type_ids,
            )          
            loss = self.loss_fn(outputs, guide)
            self.optimizer_guide.zero_grad()
            loss.backward(retain_graph=True)
            self.optimizer_guide.step()

            final_outputs = self.model_predictor(
                input_ids=input_ids,
                attention_mask=attention_mask,
                # token_type_ids=token_type_ids,
                guide=outputs
            )
            
            final_loss = self.loss_fn(final_outputs, targets)
            self.optimizer_predictor.zero_grad()
            final_loss.backward()
            self.optimizer_predictor.step()
            
            tr_loss += final_loss.item()
    
            if i % 50 == 0:
                loss_step = tr_loss / (i+1)
                print(f'Training loss per 50 steps: {loss_step}')
    
        epoch_loss = tr_loss / (i+1)
        print(f'Epoch: {epoch+1}')
        print(f'The total loss: {epoch_loss}')

    def fit(self, n_epochs, train_loader, dev_loader, output_label, save_model=False):
        for epoch in range(n_epochs):
            self._training_step(epoch, train_loader)

            preds = self.evaluate(dataloader=dev_loader, load_model=False)
            if self.task_predictor == 'empathy':
                true = output_label.iloc[:, 0].tolist()
            if self.task_predictor == 'distress':
                true = output_label.iloc[:, 1].tolist()
            pearson_r = pearsonr(true, preds)
            print(f'Pearson r: {pearson_r}')
            
            if save_model and (pearson_r > self.best_pearson_r):
                self.best_pearson_r = pearson_r   
                torch.save(self.model_guide.state_dict(), 'roberta-guide-empathy.pth')
                torch.save(self.model_predictor.state_dict(), 'roberta-predictor-empathy.pth')
                print("Saved the model in epoch " + str(epoch+1))

    def evaluate(self, dataloader, load_model=False):
        if load_model:
            self.model_guide.load_state_dict(torch.load('roberta-guide-empathy.pth'))
            self.model_predictor.load_state_dict(torch.load('roberta-predictor-empathy.pth'))
    
        pred = torch.empty((len(dataloader.dataset), 1), device=self.device) # len(self.dev_loader.dataset) --> # of samples
        
        self.model_guide.eval()
        self.model_predictor.eval()
    
        with torch.no_grad():
            idx = 0
            for data in dataloader:
                input_ids = data['input_ids'].to(self.device, dtype=torch.long)
                attention_mask = data['attention_mask'].to(self.device, dtype=torch.long)
                # token_type_ids = data['token_type_ids'].to(self.device, dtype=torch.long)
                
                outputs = self.model_guide(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    # token_type_ids=token_type_ids,
                )
    
                final_outputs = self.model_predictor(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    # token_type_ids=token_type_ids,
                    guide=outputs
                )

                batch_size = final_outputs.shape[0]
                pred[idx:idx+batch_size, :] = final_outputs
                idx += batch_size
            
        return [float(k) for k in pred]

In [24]:
BATCH_SIZE = 8
lr = 1e-05
checkpoint = 'roberta-base'
task_list = ['empathy', 'distress']

train_file = './data/PREPROCESSED-WS22-WS23-train.tsv'

# WASSA 2022
# dev_file = './data/PREPROCESSED-WS22-dev.tsv'
# dev_label_file = './data/WASSA22/goldstandard_dev_2022.tsv'
# test_file = './data/PREPROCESSED-WS22-test.tsv'

# WASSA 2023
dev_file = './data/PREPROCESSED-WS23-dev.tsv'
dev_label_file = './data/WASSA23/goldstandard_dev.tsv'
test_file = './data/PREPROCESSED-WS23-test.tsv'

In [25]:
data_module = DataModule(
    task_list=task_list,
    checkpoint=checkpoint,
    batch_size=BATCH_SIZE,
    feature_to_tokenise=['demographic_essay', 'article']
)

In [26]:
train_loader = data_module.dataloader(file=train_file, send_label=True, shuffle=True)
dev_loader = data_module.dataloader(file=dev_file, send_label=False, shuffle=False)

Map:   0%|          | 0/2636 [00:00<?, ? examples/s]

Map:   0%|          | 0/208 [00:00<?, ? examples/s]

In [27]:
dev_label = pd.read_csv(dev_label_file, sep='\t', header=None)

In [28]:
set_all_seeds(0)
model_guide = Guide(checkpoint=checkpoint)
model_predictor = Predictor(checkpoint=checkpoint)

trainer = Trainer(
    task_guide=task_list[1],
    task_predictor=task_list[0],
    model_guide=model_guide,
    model_predictor=model_predictor,
    lr=lr
)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [29]:
trainer.fit(n_epochs=20, train_loader=train_loader, dev_loader=dev_loader,
            output_label=dev_label, save_model=True)

Training loss per 50 steps: 18.687795639038086
Training loss per 50 steps: 16.35341345095167
Training loss per 50 steps: 12.853866619638877
Training loss per 50 steps: 10.030268036766557
Training loss per 50 steps: 8.461887044100026
Training loss per 50 steps: 7.525218012798355
Training loss per 50 steps: 6.91431711282445
Epoch: 1
The total loss: 6.610729033296758
Pearson r: 0.162
Saved the model in epoch 1
Training loss per 50 steps: 3.240436315536499
Training loss per 50 steps: 3.881721057143866
Training loss per 50 steps: 3.591704461244073
Training loss per 50 steps: 3.4685914923023704
Training loss per 50 steps: 3.462682224921326
Training loss per 50 steps: 3.5199079480304185
Training loss per 50 steps: 3.475900487646312
Epoch: 2
The total loss: 3.4165198667482897
Pearson r: 0.354
Saved the model in epoch 2
Training loss per 50 steps: 2.496554374694824
Training loss per 50 steps: 2.8197640741572663
Training loss per 50 steps: 2.719561696642696
Training loss per 50 steps: 2.85479490

# Test

In [42]:
test_loader = data_module.dataloader(file=test_file, send_label=False, shuffle=False)
pred = trainer.evaluate(dataloader=test_loader, load_model=True)

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [43]:
pred_df = pd.DataFrame({'emp': pred, 'dis': pred})

In [44]:
pred_df

Unnamed: 0,emp,dis
0,4.743536,4.743536
1,5.375017,5.375017
2,5.558261,5.558261
3,4.441684,4.441684
4,4.513950,4.513950
...,...,...
95,5.794048,5.794048
96,5.106700,5.106700
97,3.298420,3.298420
98,4.998303,4.998303


In [45]:
pred_df.to_csv('./tmp/predictions_EMP.tsv', sep='\t', index=None, header=None)

# Extra

## Combined guide and final

In [15]:
class GuidedRegression(nn.Module):
    def __init__(self, checkpoint):
        super(GuidedRegression, self).__init__()
        self.transformer = AutoModel.from_pretrained(checkpoint)
        self.dropout = nn.Dropout(0.2)
        
        self.pre_classifier = nn.Linear(768, 768)
        self.pre_classifier_guide = nn.Linear(769, 768)
        
        self.classifier = nn.Linear(768, 512)
        self.pre_final = nn.Linear(512, 256)
        self.final = nn.Linear(256, 1)

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        # token_type_ids=None,
        guide=None
    ):

        output = self.transformer(
            input_ids=input_ids,
            attention_mask=attention_mask,
            # token_type_ids=token_type_ids
        )
        
        output = output[0]
        output = output[:, 0]

        if guide is None:
            output = self.pre_classifier(output)
        else:
            output = torch.cat([output, guide], 1)
            output = self.pre_classifier_guide(output)
        
        output = nn.ReLU()(output)
        output = self.dropout(output)
        output = self.classifier(output)
        output = nn.ReLU()(output)
        output = self.pre_final(output)
        output = nn.ReLU()(output)
        output = self.final(output)
        return output

In [16]:
class Trainer:
    def __init__(self, task_guide, task_predictor, model, lr):
        self.device = get_device(0)
        
        self.task_guide = task_guide
        self.task_predictor = task_predictor
        self.model = model.to(self.device)
        
        self.lr = lr
        self.loss_fn = nn.MSELoss()
        
        self.optimizer = torch.optim.Adam(params=self.model.parameters(), lr = self.lr)
        
        self.best_pearson_r = -1.0 # initiliasation

    def _training_step(self, epoch, train_loader):
        tr_loss = 0.0
        
        self.model.train()
    
        for i, data in enumerate(train_loader, 0):
            input_ids = data['input_ids'].to(self.device, dtype=torch.long)
            attention_mask = data['attention_mask'].to(self.device, dtype=torch.long)
            # token_type_ids = data['token_type_ids'].to(self.device, dtype=torch.long)
            guide = data[self.task_guide].to(self.device, dtype=torch.float).view(-1, 1)
            targets = data[self.task_predictor].to(self.device, dtype=torch.float).view(-1, 1)
            
            outputs = self.model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                # token_type_ids=token_type_ids,
                guide=None
            )          
            loss = self.loss_fn(outputs, guide)
            self.optimizer.zero_grad()
            loss.backward(retain_graph=True)
            self.optimizer.step()

            final_outputs = self.model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                # token_type_ids=token_type_ids,
                guide=outputs
            )
            
            final_loss = self.loss_fn(final_outputs, targets)
            self.optimizer.zero_grad()
            final_loss.backward()
            self.optimizer.step()
            
            tr_loss += final_loss.item()
    
            if i % 50 == 0:
                loss_step = tr_loss / (i+1)
                print(f'Training loss per 50 steps: {loss_step}')
    
        epoch_loss = tr_loss / (i+1)
        print(f'Epoch: {epoch+1}')
        print(f'The total loss: {epoch_loss}')

    def fit(self, n_epochs, train_loader, dev_loader, output_label, save_model=False):
        set_all_seeds(0)
        for epoch in range(n_epochs):
            self._training_step(epoch, train_loader)

            preds = self.evaluate(dataloader=dev_loader, load_model=False)
            if self.task_predictor == 'empathy':
                true = output_label.iloc[:, 0].tolist()
            if self.task_predictor == 'distress':
                true = output_label.iloc[:, 1].tolist()
            pearson_r = pearsonr(true, preds)
            print(f'Pearson r: {pearson_r}')
            
            if save_model and (pearson_r > self.best_pearson_r):
                self.best_pearson_r = pearson_r   
                torch.save(self.model.state_dict(), 'roberta-empathy.pth')
                print("Saved the model in epoch " + str(epoch+1))

    def evaluate(self, dataloader, load_model=False):
        if load_model:
            self.model.load_state_dict(torch.load('roberta-empathy.pth'))
    
        pred = torch.empty((len(dataloader.dataset), 1), device=self.device) # len(self.dev_loader.dataset) --> # of samples
        
        self.model.eval()
    
        with torch.no_grad():
            idx = 0
            for data in dataloader:
                input_ids = data['input_ids'].to(self.device, dtype=torch.long)
                attention_mask = data['attention_mask'].to(self.device, dtype=torch.long)
                # token_type_ids = data['token_type_ids'].to(self.device, dtype=torch.long)
                
                outputs = self.model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    # token_type_ids=token_type_ids,
                    guide=None
                )
    
                final_outputs = self.model(
                    input_ids=input_ids,
                    attention_mask=attention_mask,
                    # token_type_ids=token_type_ids,
                    guide=outputs
                )

                batch_size = final_outputs.shape[0]
                pred[idx:idx+batch_size, :] = final_outputs
                idx += batch_size
            
        return [float(k) for k in pred]

In [17]:
model = GuidedRegression(checkpoint=checkpoint)

trainer = Trainer(
    task_guide=task_list[1],
    task_predictor=task_list[0],
    model=model,
    lr=lr
)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
trainer.fit(n_epochs=20, train_loader=train_loader, dev_loader=dev_loader,
            output_label=dev_label, save_model=False)

Training loss per 50 steps: 10.352550506591797
Training loss per 50 steps: 13.305788320653578
Training loss per 50 steps: 9.461502163717062
Training loss per 50 steps: 7.592380867888596
Training loss per 50 steps: 6.637593874883889
Training loss per 50 steps: 6.020316544282009
Training loss per 50 steps: 5.642060121824575
Epoch: 1
The total loss: 5.467915393728198
Pearson r: 0.125
Training loss per 50 steps: 4.183372974395752
Training loss per 50 steps: 3.317987002578436
Training loss per 50 steps: 3.424180183965381
Training loss per 50 steps: 3.474616882619479
Training loss per 50 steps: 3.39283124249966
Training loss per 50 steps: 3.3362700516484174
Training loss per 50 steps: 3.303164756377274
Epoch: 2
The total loss: 3.2939254133990317
Pearson r: 0.317
Training loss per 50 steps: 2.191816806793213
Training loss per 50 steps: 2.84420525560192
Training loss per 50 steps: 2.738071873636529
Training loss per 50 steps: 2.7836127170663794
Training loss per 50 steps: 2.7267616471247886
Tr