In [1]:
import pandas as pd
import numpy as np
import transformers as trf
from datasets import Dataset
import torch
from accelerate import Accelerator
from accelerate import notebook_launcher

In [2]:
import os
os.chdir("/home/573/rh2942/WASSA-2023-EMP") # changing dir for evaluation file
# print(os.getcwd())

In [3]:
os.environ["TOKENIZERS_PARALLELISM"] = "false" # due to huggingface warning

In [5]:
def load_tokenised_data(filename, tokeniser, train_test):
   
    input_data = pd.read_csv(filename, header=0, index_col=0)
    
    if train_test == "train":
        chosen_data = input_data[[feature_1, feature_2, task]]
    elif train_test == "test":
        chosen_data = input_data[[feature_1, feature_2]]  #test data shouldn't have output label

    hugging_dataset = Dataset.from_pandas(chosen_data, preserve_index=False)

    tokenised_hugging_dataset = hugging_dataset.map(tokenise, batched=True, remove_columns = [feature_1, feature_2])
    
    if train_test == "train":
        tokenised_hugging_dataset = tokenised_hugging_dataset.rename_column(task, "labels") # as huggingface requires
    
    tokenised_hugging_dataset = tokenised_hugging_dataset.with_format("torch")

    return tokenised_hugging_dataset

# Prediction

In [73]:
def train_test(model):
    accelerator = Accelerator()
    
    accelerator.print(f"{task} prediction")  #task: "empathy" or "distress"
    
    opt = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
    # loss_function = torch.nn.MSELoss()
  
    trainset = load_tokenised_data(filename=train_filename, tokeniser=tokeniser, train_test="train")
       
    trainloader = torch.utils.data.DataLoader(
        trainset, shuffle=True, batch_size=BATCH_SIZE, collate_fn=data_collator
    )
    
    training_steps = NUM_EPOCH * len(trainloader)
    lr_scheduler = trf.get_scheduler(
        "linear",
        optimizer=opt,
        num_warmup_steps=0,
        num_training_steps=training_steps
    )

    trainloader, model, opt = accelerator.prepare(
        trainloader, model, opt    
    )
    
    for epoch in range(0, NUM_EPOCH):

        # Print epoch
        accelerator.print(f'Starting epoch {epoch+1}')
        
        epoch_loss = 0
        num_batches = 0

        # Iterate over the DataLoader for training data
        for batch in trainloader:
            # Perform forward pass
            outputs = model(**batch)
            
            loss = outputs.loss
            # loss = loss_function(outputs.logits, batch["labels"])

            accelerator.backward(loss)
        
            opt.step()
            lr_scheduler.step()
            
            opt.zero_grad()
            
            epoch_loss += loss.item()
            num_batches += 1

        # Process is complete.
        avg_epoch_loss = epoch_loss / num_batches
        accelerator.print(f"Epoch {epoch}: average loss = {avg_epoch_loss}")
        
    
    # evaluation on test set
    testset = load_tokenised_data(filename=test_filename, tokeniser=tokeniser, train_test="test")
    testloader = torch.utils.data.DataLoader(
        testset, shuffle=False, batch_size=BATCH_SIZE, collate_fn=data_collator
    )

    # testloader = accelerator.prepare(testloader)
            
    model.eval()

    y_pred = []

    for batch in testloader:
        with torch.no_grad():
            outputs = model(**batch)

        batch_pred = [item for sublist in outputs.logits.tolist() for item in sublist]  #convert 2D list to 1D
        y_pred.extend(batch_pred)
  
    y_pred_df = pd.DataFrame({task: y_pred})
    filename = "predictions_" + task + ".tsv"
    y_pred_df.to_csv(filename, sep='\t', header=False, index=False)

In [80]:
NUM_EPOCH = 20
BATCH_SIZE = 2
LEARNING_RATE = 5e-5

train_filename = "preprocessed_train.csv"
test_filename = "preprocessed_test.csv"

#Chosen features
feature_1 = 'demographic_essay'
feature_2 = 'article'

# feature_1 = 'essay_demographic_prompt'
# feature_2 = 'article'
    
# checkpoint = "bert-base-uncased"
# checkpoint = "bhadresh-savani/bert-base-uncased-emotion"
checkpoint = "distilbert-base-uncased"
# checkpoint = "cardiffnlp/twitter-roberta-base-sentiment-latest"

tokeniser = trf.AutoTokenizer.from_pretrained(checkpoint)

#padding="longest" can be deferred to do dynamic padding
def tokenise(sentence):
    return tokeniser(sentence[feature_1], sentence[feature_2], truncation=True) 
  # return tokeniser(sentence["essay"], sentence["article"], padding="max_length", max_length=514, truncation=True)   #for Cardiff-emotion one
    
# data collator due to variable max token length per batch size
data_collator = trf.DataCollatorWithPadding(tokenizer = tokeniser)

model = trf.AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=1)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classi

In [75]:
task = "empathy"
notebook_launcher(train_test, (model,), num_processes=torch.cuda.device_count())

Launching training on 4 GPUs.
empathy prediction


Map:   0%|          | 0/779 [00:00<?, ? examples/s]

Starting epoch 1


You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch 0: average loss = 4.45981561240493
Starting epoch 2
Epoch 1: average loss = 1.908328813519709
Starting epoch 3
Epoch 2: average loss = 1.7248383440951607
Starting epoch 4
Epoch 3: average loss = 1.597538237823904
Starting epoch 5
Epoch 4: average loss = 0.9531372569460417
Starting epoch 6
Epoch 5: average loss = 0.49677760310456803
Starting epoch 7
Epoch 6: average loss = 0.40076955104702894
Starting epoch 8
Epoch 7: average loss = 0.24228849640527114
Starting epoch 9
Epoch 8: average loss = 0.16356863237376032
Starting epoch 10
Epoch 9: average loss = 0.07018695200308778
Starting epoch 11
Epoch 10: average loss = 0.051487028649865596
Starting epoch 12
Epoch 11: average loss = 0.03391433773801082
Starting epoch 13
Epoch 12: average loss = 0.022823318324113568
Starting epoch 14
Epoch 13: average loss = 0.017042009361867547
Starting epoch 15
Epoch 14: average loss = 0.01030592015930134
Starting epoch 16
Epoch 15: average loss = 0.007714364536473206
Starting epoch 17
Epoch 16: avera

Map:   0%|          | 0/208 [00:00<?, ? examples/s]

Map:   0%|          | 0/208 [00:00<?, ? examples/s]

Map:   0%|          | 0/208 [00:00<?, ? examples/s]

Map:   0%|          | 0/208 [00:00<?, ? examples/s]

In [81]:
task = "distress"
notebook_launcher(train_test, (model,), num_processes=torch.cuda.device_count())

Launching training on 4 GPUs.
distress prediction


Map:   0%|          | 0/779 [00:00<?, ? examples/s]

Starting epoch 1


You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch 0: average loss = 4.376671379184996
Starting epoch 2
Epoch 1: average loss = 2.7903449830375804
Starting epoch 3
Epoch 2: average loss = 1.6605603583347128
Starting epoch 4
Epoch 3: average loss = 1.2612255334531015
Starting epoch 5
Epoch 4: average loss = 0.6370725640195555
Starting epoch 6
Epoch 5: average loss = 0.3870368437813025
Starting epoch 7
Epoch 6: average loss = 0.29291493202112995
Starting epoch 8
Epoch 7: average loss = 0.14886259694425721
Starting epoch 9
Epoch 8: average loss = 0.06167113888533299
Starting epoch 10
Epoch 9: average loss = 0.03662283393989165
Starting epoch 11
Epoch 10: average loss = 0.03110947483412242
Starting epoch 12
Epoch 11: average loss = 0.01768521182371582
Starting epoch 13
Epoch 12: average loss = 0.013343499078993253
Starting epoch 14
Epoch 13: average loss = 0.009250169645290915
Starting epoch 15
Epoch 14: average loss = 0.005673151922880551
Starting epoch 16
Epoch 15: average loss = 0.007620463703739117
Starting epoch 17
Epoch 16: ave

Map:   0%|          | 0/208 [00:00<?, ? examples/s]

Map:   0%|          | 0/208 [00:00<?, ? examples/s]

Map:   0%|          | 0/208 [00:00<?, ? examples/s]

Map:   0%|          | 0/208 [00:00<?, ? examples/s]

In [82]:
# Creating the submission file as per requirement
predictions_empathy = pd.read_csv("predictions_empathy.tsv", sep='\t', header=None)
predictions_distress = pd.read_csv("predictions_distress.tsv", sep='\t', header=None)

predictions_EMP = pd.concat([predictions_empathy, predictions_distress], axis=1)

predictions_EMP.to_csv("predictions_EMP.tsv", sep='\t', header=False, index=False)

In [78]:
from evaluation import pearsonr, calculate_pearson

In [83]:
# Just checking the dev set performance
gold_dev = pd.read_csv('./dataset/dev/goldstandard_dev.tsv', sep='\t', header=None) # no header
pearson_empathy = pearsonr(gold_dev.loc[:,0].tolist(), predictions_empathy.loc[:,0].tolist())
print(f"Empathy: {pearson_empathy}")
pearson_distress = pearsonr(gold_dev.loc[:,1].tolist(), predictions_distress.loc[:,0].tolist())
print(f"Distress: {pearson_distress}")

Empathy: 0.6957
Distress: 0.5418


# Hyperparam tuning

In [6]:
import csv
from sklearn.model_selection import KFold
from evaluation import pearsonr, calculate_pearson

In [12]:
def train_test_kfold(model):
    accelerator = Accelerator()
    
    opt = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

    training_steps = NUM_EPOCH * len(trainloader)
    lr_scheduler = trf.get_scheduler(
        "linear",
        optimizer=opt,
        num_warmup_steps=0,
        num_training_steps=training_steps
    )
  
    trainloader_acclerate, model, opt = accelerator.prepare(
        trainloader, model, opt    
    )  
    
    for epoch in range(0, NUM_EPOCH):

        # Print epoch
        accelerator.print(f'Starting epoch {epoch+1}')
        
        epoch_loss = 0
        num_batches = 0

        # Iterate over the DataLoader for training data
        for batch in trainloader_acclerate:
            # Perform forward pass
            outputs = model(**batch)
            
            loss = outputs.loss
#             loss = loss_function(outputs, targets)

            accelerator.backward(loss)
        
            opt.step()
            lr_scheduler.step()
            
            opt.zero_grad()
            
            epoch_loss += loss.item()
            num_batches += 1

        # Process is complete.
        avg_epoch_loss = epoch_loss / num_batches
        accelerator.print(f"Epoch {epoch}: average loss = {avg_epoch_loss}")
    
    
    # Evaluation
        
    model.eval()

    y_true =[]
    y_pred = []

    for batch in testloader:
        with torch.no_grad():
            outputs = model(**batch)

        y_true.extend(batch['labels'].tolist())

        batch_pred = [item for sublist in outputs.logits.tolist() for item in sublist]  #convert 2D list to 1D
        y_pred.extend(batch_pred)
        
    pearson_r = pearsonr(y_true, y_pred)
    
    accelerator.print('\n' + checkpoint + ' & ' + str(LEARNING_RATE) + ' & ' + str(BATCH_SIZE) + ' & ' + feature_1 + '-' + feature_2 + ' & ' + str(pearson_r) + ' fold-' + str(fold) + '\n')

In [72]:
NUM_EPOCH = 10
BATCH_SIZE = 8
LEARNING_RATE = 5e-5

task = "empathy"

train_dev_filename = "preprocessed_train_dev.csv"

#Chosen features
feature_1 = 'demographic_essay'
feature_2 = 'article'

# feature_1 = 'essay'
# feature_2 = 'demographic'

# feature_1 = 'essay'
# feature_2 ='article'
    
# checkpoint = "bert-base-uncased"
# checkpoint = "distilbert-base-uncased"

tokeniser = trf.AutoTokenizer.from_pretrained(checkpoint)

#padding="longest" can be deferred to do dynamic padding
def tokenise(sentence):
    return tokeniser(sentence[feature_1], sentence[feature_2], truncation=True)
    
# data collator due to variable max token length per batch size
data_collator = trf.DataCollatorWithPadding(tokenizer = tokeniser)

train_dev = load_tokenised_data(filename=train_dev_filename, tokeniser=tokeniser, train_test="train")

K_FOLD = 5

# Set fixed random number seed
torch.manual_seed(42)

kfold = KFold(n_splits=K_FOLD, shuffle=True)

print('--------------------------------')

# K-fold Cross Validation model evaluation
for fold, (train_idx, test_idx) in enumerate(kfold.split(train_dev)):

    print(f'FOLD {fold}')
    print('--------------------------------')

    # Sample elements randomly from a given list of ids, no replacement.
    train_subsampler = torch.utils.data.SubsetRandomSampler(train_idx)
    test_subsampler = torch.utils.data.SubsetRandomSampler(test_idx)

    # Define data loaders for training and testing data in this fold
    trainloader = torch.utils.data.DataLoader(
        train_dev,
        batch_size=BATCH_SIZE,
        collate_fn=data_collator,
        sampler=train_subsampler
    )
    testloader = torch.utils.data.DataLoader(
        train_dev,
        batch_size=BATCH_SIZE,
        collate_fn=data_collator,
        sampler=test_subsampler
    )

    # Init the neural network
    model = trf.AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=1)

    notebook_launcher(train_test_kfold, (model,), num_processes=torch.cuda.device_count())

Map:   0%|          | 0/987 [00:00<?, ? examples/s]

--------------------------------
FOLD 0
--------------------------------


Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint a

Launching training on 4 GPUs.
Starting epoch 1


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch 0: average loss = 7.213689975738525
Starting epoch 2
Epoch 1: average loss = 3.504651494026184
Starting epoch 3
Epoch 2: average loss = 3.627466974258423
Starting epoch 4
Epoch 3: average loss = 1.8103101205825807
Starting epoch 5
Epoch 4: average loss = 1.518955283164978
Starting epoch 6
Epoch 5: average loss = 1.2208479496836662
Starting epoch 7
Epoch 6: average loss = 1.2117237094044686
Starting epoch 8
Epoch 7: average loss = 0.6970827853679658
Starting epoch 9
Epoch 8: average loss = 0.5291986376047134
Starting epoch 10
Epoch 9: average loss = 0.2736331722140312

bert-large-uncased & 5e-05 & 8 & demographic_essay-article & 0.7977 fold-0

FOLD 1
--------------------------------


Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint a

Launching training on 4 GPUs.
Starting epoch 1


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch 0: average loss = 10.47298833847046
Starting epoch 2
Epoch 1: average loss = 4.196836357116699
Starting epoch 3
Epoch 2: average loss = 4.3771005058288575
Starting epoch 4
Epoch 3: average loss = 4.045954556465149
Starting epoch 5
Epoch 4: average loss = 3.6713451862335207
Starting epoch 6
Epoch 5: average loss = 4.030801820755005
Starting epoch 7
Epoch 6: average loss = 3.761001286506653
Starting epoch 8
Epoch 7: average loss = 3.821267967224121
Starting epoch 9
Epoch 8: average loss = 3.99852801322937
Starting epoch 10
Epoch 9: average loss = 3.6609089183807373

bert-large-uncased & 5e-05 & 8 & demographic_essay-article & 0.0469 fold-1

FOLD 2
--------------------------------


Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint a

Launching training on 4 GPUs.
Starting epoch 1


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch 0: average loss = 11.501055421829223
Starting epoch 2
Epoch 1: average loss = 3.8675417041778566
Starting epoch 3
Epoch 2: average loss = 3.6557919073104856
Starting epoch 4
Epoch 3: average loss = 2.618813760280609
Starting epoch 5
Epoch 4: average loss = 1.6289004373550415
Starting epoch 6
Epoch 5: average loss = 1.5414224410057067
Starting epoch 7
Epoch 6: average loss = 0.8820622348785401
Starting epoch 8
Epoch 7: average loss = 0.5472309976816178
Starting epoch 9
Epoch 8: average loss = 0.3199688667058945
Starting epoch 10
Epoch 9: average loss = 0.16565409243106843

bert-large-uncased & 5e-05 & 8 & demographic_essay-article & 0.8002 fold-2

FOLD 3
--------------------------------


Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint a

Launching training on 4 GPUs.
Starting epoch 1


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch 0: average loss = 5.486358122825623
Starting epoch 2
Epoch 1: average loss = 3.3773480463027954
Starting epoch 3
Epoch 2: average loss = 3.858164029121399
Starting epoch 4
Epoch 3: average loss = 3.092499678134918
Starting epoch 5
Epoch 4: average loss = 2.3774873328208925
Starting epoch 6
Epoch 5: average loss = 2.290995116233826
Starting epoch 7
Epoch 6: average loss = 2.1909004259109497
Starting epoch 8
Epoch 7: average loss = 1.3470318806171417
Starting epoch 9
Epoch 8: average loss = 0.9972421827912331
Starting epoch 10
Epoch 9: average loss = 0.7596608233451844

bert-large-uncased & 5e-05 & 8 & demographic_essay-article & 0.7008 fold-3

FOLD 4
--------------------------------


Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint a

Launching training on 4 GPUs.
Starting epoch 1


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch 0: average loss = 8.129480237960815
Starting epoch 2
Epoch 1: average loss = 3.9551459836959837
Starting epoch 3
Epoch 2: average loss = 3.8354076099395753
Starting epoch 4
Epoch 3: average loss = 3.8489742612838747
Starting epoch 5
Epoch 4: average loss = 5.5751168823242185
Starting epoch 6
Epoch 5: average loss = 3.7704463481903074
Starting epoch 7
Epoch 6: average loss = 3.774736123085022
Starting epoch 8
Epoch 7: average loss = 3.518358378410339
Starting epoch 9
Epoch 8: average loss = 3.6325153732299804
Starting epoch 10
Epoch 9: average loss = 3.7241961479187013

bert-large-uncased & 5e-05 & 8 & demographic_essay-article & -0.035 fold-4



# Extras

In [123]:
# prompt_checkpoint = 'gpt2'
# prompt_generator = trf.pipeline('text-generation', model=prompt_checkpoint)

# def prompt_generate(text):
#     """
#     extend "text" to max_length. It will be a list of dictionaries. First item is the first return_sequence. 'generated_text' is self-explanatory.
#     """
#     prompt = prompt_generator(text, max_length=100, num_return_sequences=1)[0]['generated_text']

In [None]:
# checking length after tokenisation

# length = []
# for i in range(tokenised_hugging_dataset['train'].num_rows):
#   length.append(len(tokenised_hugging_dataset['train']['input_ids'][i]))

# print(f"Lengths: {length}")

In [None]:
# prediction_model.save_pretrained("model")

In [6]:
# from google.colab import drive
# mount_path = '/content/drive'
# drive.mount(mount_path)
# %cd $mount_path"/MyDrive/WASSA2023"

# !pip install transformers datasets sentencepiece

## Training by Huggingface API

In [None]:
from transformers import Trainer, TrainingArguments
from sklearn.metrics import mean_squared_error

In [None]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    rmse = mean_squared_error(labels, predictions, squared=False)
    return {"rmse": rmse}

In [None]:
training_args = TrainingArguments(output_dir="empathy-transformer",
                                  logging_strategy="epoch",
                                  evaluation_strategy="epoch",
                                  per_device_train_batch_size=16,
                                  per_device_eval_batch_size=16,
                                  num_train_epochs=3,
                                  # learning_rate=2e-5,
                                  save_total_limit=2,
                                  save_strategy='no',
                                  load_best_model_at_end=False)

trainer = Trainer(
    model=empathy_prediction,
    args=training_args,
    train_dataset=tokenised_hugging_dataset["train"],
    eval_dataset=tokenised_hugging_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokeniser,
    compute_metrics=compute_metrics
)

In [None]:
trainer.train()



Epoch,Training Loss,Validation Loss,Rmse
1,3.4157,3.094043,1.758989
2,2.1608,2.74757,1.65758
3,1.4134,2.890623,1.700183


TrainOutput(global_step=117, training_loss=2.329951457488231, metrics={'train_runtime': 30.014, 'train_samples_per_second': 62.271, 'train_steps_per_second': 3.898, 'total_flos': 76837223949486.0, 'train_loss': 2.329951457488231, 'epoch': 3.0})

In [None]:
raw_pred, _, _ = trainer.predict(tokenised_hugging_dataset["test"])

In [50]:
def train(model):

    device = "cuda:0"
    if torch.cuda.device_count() > 1:
            model = torch.nn.DataParallel(model)
    model.to(device)
    
    # device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    # model.to(device)
    
    # criterion = torch.nn.MSELoss()
    opt = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
    
    trainset = load_tokenised_data(raw_data, tokeniser) #train
       
    train_dataloader = torch.utils.data.DataLoader(
        trainset, shuffle=True, batch_size=BATCH_SIZE, collate_fn=data_collator
    )
    
    training_steps = NUM_EPOCH * len(train_dataloader)
    # lr_scheduler = torch.optim.lr_scheduler.StepLR(opt, step_size=1, gamma=0.1)
    lr_scheduler = trf.get_scheduler(
        "linear",
        optimizer=opt,
        num_warmup_steps=0,
        num_training_steps=training_steps
    )
    
    model.train()
    for epoch in range(NUM_EPOCH):
        epoch_loss = 0
        num_batches = 0
        for batch in train_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss 
            # loss = criterion(outputs.logits, batch['labels'])

            loss.backward()    
            opt.step()
            lr_scheduler.step()
            opt.zero_grad()

            epoch_loss += loss.item()
            num_batches += 1

        avg_epoch_loss = epoch_loss / num_batches
        print(f"Epoch {epoch}: average loss = {avg_epoch_loss}")

## Ray tune

In [11]:
from ray import tune

In [17]:
def train(config):
    model = trf.AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=1)

    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    model.to(device)
    
    opt = torch.optim.AdamW(prediction_model.parameters(), lr=config["learning_rate"], momentum=0.9)
    
    train_dev = load_tokenised_data(filename=train_dev_filename, tokeniser=tokeniser, train_test="train")
    
    train_portion = int(len(train_dev) * 0.8)
    validation_portion = len(train_dev) - train_portion
    train_subset, val_subset = torch.utils.data.random_split(train_dev, [train_portion, validation_portion])
    
    train_dataloader = torch.utils.data.DataLoader(
        train_subset, shuffle=True, batch_size=int(config["batch_size"]), collate_fn=data_collator
    )
    
    validation_dataloader = torch.utils.data.DataLoader(
        validation_subset, shuffle=True, batch_size=int(config["batch_size"]), collate_fn=data_collator
    )
    
    training_steps = NUM_EPOCH * len(train_dataloader)
    # lr_scheduler = torch.optim.lr_scheduler.StepLR(opt, step_size=1, gamma=0.1)
    lr_scheduler = trf.get_scheduler(
        "linear",
        optimizer=opt,
        num_warmup_steps=0,
        num_training_steps=training_steps
    )
    
    model.train()
    for epoch in range(NUM_EPOCH):
        epoch_loss = 0
        num_batches = 0
        for batch in train_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss 
            # loss = criterion(outputs.logits, batch['labels'])

            loss.backward()    
            opt.step()
            lr_scheduler.step()
            opt.zero_grad()

            epoch_loss += loss.item()
            num_batches += 1

        avg_epoch_loss = epoch_loss / num_batches
        tune.report(loss=avg_epoch_loss)

        # Evaluation    
        # model.eval()

        y_true =[]
        y_pred = []
        val_loss = 0.0
        val_step = 0

        for batch in validation_dataloader:
            with torch.no_grad():
                batch = {k: v.to(device) for k, v in batch.items()}
                outputs = model(**batch)

                y_true.extend(batch['labels'].tolist())
                batch_pred = [item for sublist in outputs.logits.tolist() for item in sublist]  #convert 2D list to 1D
                y_pred.extend(batch_pred)
                pearson_r = pearsonr(y_true, y_pred)
                
                loss = outputs.loss
                val_loss += loss.cpu().numpy()
                val_steps += 1
                
        tune.report(loss=(val_loss / val_steps), accuracy=pearson_r)
    
    print("Finished Training")

In [9]:
from functools import partial

In [19]:
config = {
    "learning_rate": tune.loguniform(6e-5, 2e-5),
    "batch_size": tune.choice([4, 8, 16])
}

scheduler = tune.schedulers.ASHAScheduler(
    metric="loss",
    mode="min",
    max_t = 10,
    grace_period=1,
    reduction_factor=2
)

reporter = tune.CLIReporter(metric_columns=["loss", "accuracy", "training_iteration"])

analysis = tune.run(
    train,
    config=config,
    num_samples=10,
    scheduler=scheduler,
    progress_reporter=reporter
)

best_trial = analysis.get_best_trial("loss", "min", "last")
print("Best trial config: {}".format(best_trial.config))
print("Best trial final validation loss: {}".format(best_trial.last_result["loss"]))
print("Best trial final validation accuracy: {}".format(best_trial.last_result["accuracy"]))



TuneError: Traceback (most recent call last):
  File "/scratch/jr19/rh2942/miniconda3/lib/python3.10/site-packages/ray/tune/execution/trial_runner.py", line 900, in _wait_and_handle_event
    event = self.trial_executor.get_next_executor_event(
  File "/scratch/jr19/rh2942/miniconda3/lib/python3.10/site-packages/ray/tune/execution/ray_trial_executor.py", line 1183, in get_next_executor_event
    self._stage_and_update_status(live_trials)
  File "/scratch/jr19/rh2942/miniconda3/lib/python3.10/site-packages/ray/tune/execution/ray_trial_executor.py", line 324, in _stage_and_update_status
    self._resource_manager.request_resources(resource_request=resource_request)
  File "/scratch/jr19/rh2942/miniconda3/lib/python3.10/site-packages/ray/air/execution/resources/placement_group.py", line 143, in request_resources
    future = pg.ready()
  File "/scratch/jr19/rh2942/miniconda3/lib/python3.10/site-packages/ray/util/placement_group.py", line 81, in ready
    ).remote(self)
  File "/scratch/jr19/rh2942/miniconda3/lib/python3.10/site-packages/ray/remote_function.py", line 226, in remote
    return func_cls._remote(args=args, kwargs=kwargs, **updated_options)
  File "/scratch/jr19/rh2942/miniconda3/lib/python3.10/site-packages/ray/util/tracing/tracing_helper.py", line 307, in _invocation_remote_span
    return method(self, args, kwargs, *_args, **_kwargs)
  File "/scratch/jr19/rh2942/miniconda3/lib/python3.10/site-packages/ray/remote_function.py", line 412, in _remote
    return invocation(args, kwargs)
  File "/scratch/jr19/rh2942/miniconda3/lib/python3.10/site-packages/ray/remote_function.py", line 387, in invocation
    object_refs = worker.core_worker.submit_task(
  File "python/ray/_raylet.pyx", line 1969, in ray._raylet.CoreWorker.submit_task
  File "python/ray/_raylet.pyx", line 1973, in ray._raylet.CoreWorker.submit_task
  File "python/ray/_raylet.pyx", line 425, in ray._raylet.prepare_args_and_increment_put_refs
  File "python/ray/_raylet.pyx", line 416, in ray._raylet.prepare_args_and_increment_put_refs
  File "python/ray/_raylet.pyx", line 462, in ray._raylet.prepare_args_internal
  File "/scratch/jr19/rh2942/miniconda3/lib/python3.10/site-packages/ray/_private/worker.py", line 539, in get_serialization_context
    context_map[job_id] = serialization.SerializationContext(self)
  File "/scratch/jr19/rh2942/miniconda3/lib/python3.10/site-packages/ray/_private/serialization.py", line 135, in __init__
    serialization_addons.apply(self)
  File "/scratch/jr19/rh2942/miniconda3/lib/python3.10/site-packages/ray/util/serialization_addons.py", line 58, in apply
    register_pydantic_serializer(serialization_context)
  File "/scratch/jr19/rh2942/miniconda3/lib/python3.10/site-packages/ray/util/serialization_addons.py", line 21, in register_pydantic_serializer
    pydantic.fields.ModelField,
AttributeError: module 'pydantic.fields' has no attribute 'ModelField'
