In [1]:
import pandas as pd
import numpy as np
import transformers as trf
from datasets import Dataset
import torch
from accelerate import Accelerator
from accelerate import notebook_launcher

In [2]:
import os
os.chdir("/home/573/rh2942/WASSA-2023-EMP") # changing dir for evaluation file
# print(os.getcwd())

In [3]:
os.environ["TOKENIZERS_PARALLELISM"] = "false" # due to huggingface warning

In [4]:
def load_tokenised_data(filename, tokeniser, train_test):
   
    input_data = pd.read_csv(filename, header=0, index_col=0)
    
    if train_test == "train":
        chosen_data = input_data[[feature_1, feature_2, task]]
    elif train_test == "test":
        chosen_data = input_data[[feature_1, feature_2]]  #test data shouldn't have output label

    hugging_dataset = Dataset.from_pandas(chosen_data, preserve_index=False)

    tokenised_hugging_dataset = hugging_dataset.map(tokenise, batched=True, remove_columns = [feature_1, feature_2])
    
    if train_test == "train":
        tokenised_hugging_dataset = tokenised_hugging_dataset.rename_column(task, "labels") # as huggingface requires
    
    tokenised_hugging_dataset = tokenised_hugging_dataset.with_format("torch")

    return tokenised_hugging_dataset

# Prediction

In [14]:
def train_test(model):
    accelerator = Accelerator()
    
    accelerator.print(f"{task} prediction")  #task: "empathy" or "distress"
    
    opt = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
  
    trainset = load_tokenised_data(filename=os.path.join("./processed_data", train_filename), tokeniser=tokeniser, train_test="train")
       
    trainloader = torch.utils.data.DataLoader(
        trainset, shuffle=True, batch_size=BATCH_SIZE, collate_fn=data_collator
    )
    
    training_steps = NUM_EPOCH * len(trainloader)
    lr_scheduler = trf.get_scheduler(
        "linear",
        optimizer=opt,
        num_warmup_steps=0,
        num_training_steps=training_steps
    )

    trainloader, model, opt = accelerator.prepare(
        trainloader, model, opt    
    )
    
    for epoch in range(0, NUM_EPOCH):

        # Print epoch
        accelerator.print(f'Starting epoch {epoch+1}')
        
        epoch_loss = 0
        num_batches = 0

        # Iterate over the DataLoader for training data
        for batch in trainloader:
            # Perform forward pass
            outputs = model(**batch)
            
            loss = outputs.loss

            accelerator.backward(loss)
        
            opt.step()
            lr_scheduler.step()
            
            opt.zero_grad()
            
            epoch_loss += loss.item()
            num_batches += 1

        # Process is complete.
        avg_epoch_loss = epoch_loss / num_batches
        accelerator.print(f"Epoch {epoch}: average loss = {avg_epoch_loss}")
        
    
    # evaluation on test set
    testset = load_tokenised_data(filename=os.path.join("./processed_data", test_filename), tokeniser=tokeniser, train_test="test")
    testloader = torch.utils.data.DataLoader(
        testset, shuffle=False, batch_size=BATCH_SIZE, collate_fn=data_collator
    )
            
    model.eval()

    y_pred = []

    for batch in testloader:
        with torch.no_grad():
            outputs = model(**batch)

        batch_pred = [item for sublist in outputs.logits.tolist() for item in sublist]  #convert 2D list to 1D
        y_pred.extend(batch_pred)
  
    y_pred_df = pd.DataFrame({task: y_pred})
    filename = "./prediction/predictions_" + task + ".tsv"
    y_pred_df.to_csv(filename, sep='\t', header=False, index=False)

In [15]:
NUM_EPOCH = 20
BATCH_SIZE = 8
LEARNING_RATE = 5e-5

# train_filename = "preprocessed_train.csv"
test_filename = "preprocessed_dev.csv"
# test_filename = "preprocessed_test.csv"

train_filename = "preprocessed_train_dev_WS23_train_WS22.csv"

#Chosen features
feature_1 = 'demographic_essay'
feature_2 = 'article'

# checkpoint = "bert-base-uncased"
checkpoint = "distilbert-base-uncased"

# torch.manual_seed(42)

tokeniser = trf.AutoTokenizer.from_pretrained(checkpoint)

#padding="longest" can be deferred to do dynamic padding
def tokenise(sentence):
    return tokeniser(sentence[feature_1], sentence[feature_2], truncation=True)
    # return tokeniser(sentence[feature_1], truncation=True)
    
# data collator due to variable max token length per batch size
data_collator = trf.DataCollatorWithPadding(tokenizer = tokeniser)

model = trf.AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=1)

In [16]:
task = "empathy"
notebook_launcher(train_test, (model,), num_processes=torch.cuda.device_count())
predictions_empathy = pd.read_csv("./prediction/predictions_" + task + ".tsv", sep='\t', header=None)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'classifier.w

Launching training on 4 GPUs.
empathy prediction


Map:   0%|          | 0/2847 [00:00<?, ? examples/s]

Starting epoch 1


You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch 0: average loss = 4.721951723098755
Starting epoch 2
Epoch 1: average loss = 2.661773997746157
Starting epoch 3
Epoch 2: average loss = 1.6683810192547488
Starting epoch 4
Epoch 3: average loss = 0.8401355250330453
Starting epoch 5
Epoch 4: average loss = 0.3753522360760174
Starting epoch 6
Epoch 5: average loss = 0.188796532874027
Starting epoch 7
Epoch 6: average loss = 0.1023019790188985
Starting epoch 8
Epoch 7: average loss = 0.054939922981299044
Starting epoch 9
Epoch 8: average loss = 0.033041609686621454
Starting epoch 10
Epoch 9: average loss = 0.018698784546256904
Starting epoch 11
Epoch 10: average loss = 0.016343772039863834
Starting epoch 12
Epoch 11: average loss = 0.01864119366418277
Starting epoch 13
Epoch 12: average loss = 0.013285775289588263
Starting epoch 14
Epoch 13: average loss = 0.014584841779590156
Starting epoch 15
Epoch 14: average loss = 0.014458280967108989
Starting epoch 16
Epoch 15: average loss = 0.01674141861718106
Starting epoch 17
Epoch 16: ave

Map:   0%|          | 0/208 [00:00<?, ? examples/s]

Map:   0%|          | 0/208 [00:00<?, ? examples/s]

Map:   0%|          | 0/208 [00:00<?, ? examples/s]

Map:   0%|          | 0/208 [00:00<?, ? examples/s]

In [17]:
task = "distress"
notebook_launcher(train_test, (model,), num_processes=torch.cuda.device_count())
predictions_distress = pd.read_csv("./prediction/predictions_" + task + ".tsv", sep='\t', header=None)

Launching training on 4 GPUs.
distress prediction


Map:   0%|          | 0/2847 [00:00<?, ? examples/s]

Starting epoch 1


You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch 0: average loss = 4.774682466903429
Starting epoch 2
Epoch 1: average loss = 3.0983424682295726
Starting epoch 3
Epoch 2: average loss = 2.1682705939485785
Starting epoch 4
Epoch 3: average loss = 0.9734360507030165
Starting epoch 5
Epoch 4: average loss = 0.4192623151989465
Starting epoch 6
Epoch 5: average loss = 0.22665803504877546
Starting epoch 7
Epoch 6: average loss = 0.10624297905001748
Starting epoch 8
Epoch 7: average loss = 0.047975477856699
Starting epoch 9
Epoch 8: average loss = 0.031031839246076814
Starting epoch 10
Epoch 9: average loss = 0.020957409725995378
Starting epoch 11
Epoch 10: average loss = 0.0148684437507031
Starting epoch 12
Epoch 11: average loss = 0.014369738214973653
Starting epoch 13
Epoch 12: average loss = 0.014332627268047647
Starting epoch 14
Epoch 13: average loss = 0.015060791855657034
Starting epoch 15
Epoch 14: average loss = 0.013388310246473032
Starting epoch 16
Epoch 15: average loss = 0.0128290754588572
Starting epoch 17
Epoch 16: aver

Map:   0%|          | 0/208 [00:00<?, ? examples/s]

Map:   0%|          | 0/208 [00:00<?, ? examples/s]

Map:   0%|          | 0/208 [00:00<?, ? examples/s]

Map:   0%|          | 0/208 [00:00<?, ? examples/s]

In [18]:
predictions_EMP = pd.concat([predictions_empathy, predictions_distress], axis=1)

predictions_EMP.to_csv("./prediction/predictions_EMP.tsv", sep='\t', header=False, index=False)

In [19]:
from evaluation import pearsonr, calculate_pearson

In [20]:
# Just checking the dev set performance
gold_dev = pd.read_csv('./dataset/dev/goldstandard_dev.tsv', sep='\t', header=None) # no header
pearson_empathy = pearsonr(gold_dev.loc[:,0].tolist(), predictions_empathy.loc[:,0].tolist())
print(f"Empathy: {pearson_empathy}")
pearson_distress = pearsonr(gold_dev.loc[:,1].tolist(), predictions_distress.loc[:,0].tolist())
print(f"Distress: {pearson_distress}")

Empathy: 0.9813
Distress: 0.9747


# Hyperparam tuning

In [5]:
from sklearn.model_selection import KFold
from evaluation import pearsonr, calculate_pearson

In [6]:
def train_test_kfold(model):
    accelerator = Accelerator()
    
    opt = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

    training_steps = NUM_EPOCH * len(trainloader)
    lr_scheduler = trf.get_scheduler(
        "linear",
        optimizer=opt,
        num_warmup_steps=0,
        num_training_steps=training_steps
    )
  
    trainloader_acclerate, model, opt = accelerator.prepare(
        trainloader, model, opt    
    )  
    
    for epoch in range(0, NUM_EPOCH):

        # Print epoch
        accelerator.print(f'Starting epoch {epoch+1}')
        
        epoch_loss = 0
        num_batches = 0

        # Iterate over the DataLoader for training data
        for batch in trainloader_acclerate:
            # Perform forward pass
            outputs = model(**batch)
            
            loss = outputs.loss
#             loss = loss_function(outputs, targets)

            accelerator.backward(loss)
        
            opt.step()
            lr_scheduler.step()
            
            opt.zero_grad()
            
            epoch_loss += loss.item()
            num_batches += 1

        # Process is complete.
        avg_epoch_loss = epoch_loss / num_batches
        accelerator.print(f"Epoch {epoch}: average loss = {avg_epoch_loss}")
    
    
    # Evaluation
        
    model.eval()

    y_true =[]
    y_pred = []

    for batch in testloader:
        with torch.no_grad():
            outputs = model(**batch)

        y_true.extend(batch['labels'].tolist())

        batch_pred = [item for sublist in outputs.logits.tolist() for item in sublist]  #convert 2D list to 1D
        y_pred.extend(batch_pred)
        
    pearson_r = pearsonr(y_true, y_pred)
    
    accelerator.print('\n' + checkpoint + ' & ' + str(LEARNING_RATE) + ' & ' + str(BATCH_SIZE) + ' & ' + feature_1 + '-' + feature_2 + ' & ' + str(pearson_r) + ' fold-' + str(fold) + '\n')

In [8]:
NUM_EPOCH = 20
BATCH_SIZE = 4
LEARNING_RATE = 5e-5

task = "empathy"

# train_dev_filename = "preprocessed_train_dev.csv"
# train_dev_filename = "preprocessed_train_dev_WS23_train_WS22.csv"
train_dev_filename = "train_dev_paraphrased.csv"

# Chosen features
# feature_1 = 'demographic_essay'
# feature_2 = 'article'
# feature_2 = ' '

feature_1 = 'demographic'
feature_2 = 'essay'

# feature_1 = 'essay'
# feature_2 ='article'
    
# checkpoint = "bert-base-uncased"
checkpoint = "distilbert-base-uncased"

tokeniser = trf.AutoTokenizer.from_pretrained(checkpoint)

#padding="longest" can be deferred to do dynamic padding
def tokenise(sentence):
    return tokeniser(sentence[feature_1], sentence[feature_2], truncation=True)
    # return tokeniser(sentence[feature_1], truncation=True)
    
# data collator due to variable max token length per batch size
data_collator = trf.DataCollatorWithPadding(tokenizer = tokeniser)

train_dev = load_tokenised_data(filename=os.path.join("./processed_data", train_dev_filename), tokeniser=tokeniser, train_test="train")

K_FOLD = 5

# Set fixed random number seed
torch.manual_seed(42)

kfold = KFold(n_splits=K_FOLD, shuffle=True)

print('--------------------------------')

# K-fold Cross Validation model evaluation
for fold, (train_idx, test_idx) in enumerate(kfold.split(train_dev)):

    print(f'FOLD {fold}')
    print('--------------------------------')

    # Sample elements randomly from a given list of ids, no replacement.
    train_subsampler = torch.utils.data.SubsetRandomSampler(train_idx)
    test_subsampler = torch.utils.data.SubsetRandomSampler(test_idx)

    # Define data loaders for training and testing data in this fold
    trainloader = torch.utils.data.DataLoader(
        train_dev,
        batch_size=BATCH_SIZE,
        collate_fn=data_collator,
        sampler=train_subsampler
    )
    testloader = torch.utils.data.DataLoader(
        train_dev,
        batch_size=BATCH_SIZE,
        collate_fn=data_collator,
        sampler=test_subsampler
    )

    # Init the neural network
    model = trf.AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=1)

    notebook_launcher(train_test_kfold, (model,), num_processes=torch.cuda.device_count())

Map:   0%|          | 0/1974 [00:00<?, ? examples/s]

--------------------------------
FOLD 0
--------------------------------


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier

Launching training on 4 GPUs.
Starting epoch 1


You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch 0: average loss = 4.510551939106951
Starting epoch 2
Epoch 1: average loss = 1.6533049421027453
Starting epoch 3
Epoch 2: average loss = 1.0555894731384035
Starting epoch 4
Epoch 3: average loss = 0.8073061403602061
Starting epoch 5
Epoch 4: average loss = 0.41208254172457287
Starting epoch 6
Epoch 5: average loss = 0.25229320513329123
Starting epoch 7
Epoch 6: average loss = 0.1588532264494911
Starting epoch 8
Epoch 7: average loss = 0.062482508766050965
Starting epoch 9
Epoch 8: average loss = 0.0487110253442267
Starting epoch 10
Epoch 9: average loss = 0.01988965539953135
Starting epoch 11
Epoch 10: average loss = 0.013818338413421779
Starting epoch 12
Epoch 11: average loss = 0.009479726417780374
Starting epoch 13
Epoch 12: average loss = 0.009872810438867997
Starting epoch 14
Epoch 13: average loss = 0.008118996367262996
Starting epoch 15
Epoch 14: average loss = 0.005186281256634046
Starting epoch 16
Epoch 15: average loss = 0.00455647812259026
Starting epoch 17
Epoch 16: a

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier

Launching training on 4 GPUs.
Starting epoch 1


You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch 0: average loss = 4.209940472034493
Starting epoch 2
Epoch 1: average loss = 1.7505121942270885
Starting epoch 3
Epoch 2: average loss = 1.137016478813056
Starting epoch 4
Epoch 3: average loss = 0.7005320207940208
Starting epoch 5
Epoch 4: average loss = 0.27245323630896484
Starting epoch 6
Epoch 5: average loss = 0.16893821153225322
Starting epoch 7
Epoch 6: average loss = 0.09297431770222958
Starting epoch 8
Epoch 7: average loss = 0.050752144586765253
Starting epoch 9
Epoch 8: average loss = 0.037301428267976854
Starting epoch 10
Epoch 9: average loss = 0.019128825150975824
Starting epoch 11
Epoch 10: average loss = 0.011662440101681935
Starting epoch 12
Epoch 11: average loss = 0.007959888976083768
Starting epoch 13
Epoch 12: average loss = 0.006229279217994631
Starting epoch 14
Epoch 13: average loss = 0.006459889223362846
Starting epoch 15
Epoch 14: average loss = 0.006060472422195927
Starting epoch 16
Epoch 15: average loss = 0.006153367633221087
Starting epoch 17
Epoch 1

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier

Launching training on 4 GPUs.
Starting epoch 1


You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch 0: average loss = 4.518348755860569
Starting epoch 2
Epoch 1: average loss = 1.7266797650643069
Starting epoch 3
Epoch 2: average loss = 1.5298522864327286
Starting epoch 4
Epoch 3: average loss = 1.235623741917538
Starting epoch 5
Epoch 4: average loss = 0.668731046539515
Starting epoch 6
Epoch 5: average loss = 0.35294430806405014
Starting epoch 7
Epoch 6: average loss = 0.15986699688088413
Starting epoch 8
Epoch 7: average loss = 0.10086654371233901
Starting epoch 9
Epoch 8: average loss = 0.07531826408706944
Starting epoch 10
Epoch 9: average loss = 0.04676288156299805
Starting epoch 11
Epoch 10: average loss = 0.026434426436981545
Starting epoch 12
Epoch 11: average loss = 0.01231037893548909
Starting epoch 13
Epoch 12: average loss = 0.009713458707509092
Starting epoch 14
Epoch 13: average loss = 0.0063725550032239595
Starting epoch 15
Epoch 14: average loss = 0.0043520579401772905
Starting epoch 16
Epoch 15: average loss = 0.003061638318497283
Starting epoch 17
Epoch 16: a

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier

Launching training on 4 GPUs.
Starting epoch 1


You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch 0: average loss = 4.213976401572276
Starting epoch 2
Epoch 1: average loss = 1.6635887875701443
Starting epoch 3
Epoch 2: average loss = 1.339716133783863
Starting epoch 4
Epoch 3: average loss = 1.0947936238213019
Starting epoch 5
Epoch 4: average loss = 0.5228192353188389
Starting epoch 6
Epoch 5: average loss = 0.269949582866346
Starting epoch 7
Epoch 6: average loss = 0.12388649941751301
Starting epoch 8
Epoch 7: average loss = 0.0708079221693686
Starting epoch 9
Epoch 8: average loss = 0.03322708794748354
Starting epoch 10
Epoch 9: average loss = 0.02437974347331975
Starting epoch 11
Epoch 10: average loss = 0.015077010560263362
Starting epoch 12
Epoch 11: average loss = 0.010007047782901374
Starting epoch 13
Epoch 12: average loss = 0.006685545526896462
Starting epoch 14
Epoch 13: average loss = 0.004278165826396405
Starting epoch 15
Epoch 14: average loss = 0.004858244802333024
Starting epoch 16
Epoch 15: average loss = 0.00403662517018889
Starting epoch 17
Epoch 16: avera

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_projector.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier

Launching training on 4 GPUs.
Starting epoch 1


You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch 0: average loss = 4.352602944229588
Starting epoch 2
Epoch 1: average loss = 1.6437665316643137
Starting epoch 3
Epoch 2: average loss = 1.0304440025691732
Starting epoch 4
Epoch 3: average loss = 0.7720403926766882
Starting epoch 5
Epoch 4: average loss = 0.3134985157298018
Starting epoch 6
Epoch 5: average loss = 0.2176166498525576
Starting epoch 7
Epoch 6: average loss = 0.11474446215281131
Starting epoch 8
Epoch 7: average loss = 0.056914517310250434
Starting epoch 9
Epoch 8: average loss = 0.030805336861786516
Starting epoch 10
Epoch 9: average loss = 0.020951356645908695
Starting epoch 11
Epoch 10: average loss = 0.015938912544192538
Starting epoch 12
Epoch 11: average loss = 0.00790104888805254
Starting epoch 13
Epoch 12: average loss = 0.005654610282473379
Starting epoch 14
Epoch 13: average loss = 0.004328064857735158
Starting epoch 15
Epoch 14: average loss = 0.004188275664799019
Starting epoch 16
Epoch 15: average loss = 0.004064644805056918
Starting epoch 17
Epoch 16:

# Extras

In [123]:
prompt_checkpoint = 'gpt2'
prompt_generator = trf.pipeline('text-generation', model=prompt_checkpoint)

def prompt_generate(text):
    """
    extend "text" to max_length. It will be a list of dictionaries. First item is the first return_sequence. 'generated_text' is self-explanatory.
    """
    prompt = prompt_generator(text, max_length=100, num_return_sequences=1)[0]['generated_text']

In [None]:
# checking length after tokenisation

# length = []
# for i in range(tokenised_hugging_dataset['train'].num_rows):
#   length.append(len(tokenised_hugging_dataset['train']['input_ids'][i]))

# print(f"Lengths: {length}")

In [None]:
# prediction_model.save_pretrained("model")

In [6]:
# from google.colab import drive
# mount_path = '/content/drive'
# drive.mount(mount_path)
# %cd $mount_path"/MyDrive/WASSA2023"

# !pip install transformers datasets sentencepiece

## Training by Huggingface API

In [None]:
from transformers import Trainer, TrainingArguments
from sklearn.metrics import mean_squared_error

In [None]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    rmse = mean_squared_error(labels, predictions, squared=False)
    return {"rmse": rmse}

In [None]:
training_args = TrainingArguments(output_dir="empathy-transformer",
                                  logging_strategy="epoch",
                                  evaluation_strategy="epoch",
                                  per_device_train_batch_size=16,
                                  per_device_eval_batch_size=16,
                                  num_train_epochs=3,
                                  # learning_rate=2e-5,
                                  save_total_limit=2,
                                  save_strategy='no',
                                  load_best_model_at_end=False)

trainer = Trainer(
    model=empathy_prediction,
    args=training_args,
    train_dataset=tokenised_hugging_dataset["train"],
    eval_dataset=tokenised_hugging_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokeniser,
    compute_metrics=compute_metrics
)

In [None]:
trainer.train()



Epoch,Training Loss,Validation Loss,Rmse
1,3.4157,3.094043,1.758989
2,2.1608,2.74757,1.65758
3,1.4134,2.890623,1.700183


TrainOutput(global_step=117, training_loss=2.329951457488231, metrics={'train_runtime': 30.014, 'train_samples_per_second': 62.271, 'train_steps_per_second': 3.898, 'total_flos': 76837223949486.0, 'train_loss': 2.329951457488231, 'epoch': 3.0})

In [None]:
raw_pred, _, _ = trainer.predict(tokenised_hugging_dataset["test"])

In [50]:
def train(model):

    device = "cuda:0"
    if torch.cuda.device_count() > 1:
            model = torch.nn.DataParallel(model)
    model.to(device)
    
    # device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    # model.to(device)
    
    # criterion = torch.nn.MSELoss()
    opt = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
    
    trainset = load_tokenised_data(raw_data, tokeniser) #train
       
    train_dataloader = torch.utils.data.DataLoader(
        trainset, shuffle=True, batch_size=BATCH_SIZE, collate_fn=data_collator
    )
    
    training_steps = NUM_EPOCH * len(train_dataloader)
    # lr_scheduler = torch.optim.lr_scheduler.StepLR(opt, step_size=1, gamma=0.1)
    lr_scheduler = trf.get_scheduler(
        "linear",
        optimizer=opt,
        num_warmup_steps=0,
        num_training_steps=training_steps
    )
    
    model.train()
    for epoch in range(NUM_EPOCH):
        epoch_loss = 0
        num_batches = 0
        for batch in train_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss 
            # loss = criterion(outputs.logits, batch['labels'])

            loss.backward()    
            opt.step()
            lr_scheduler.step()
            opt.zero_grad()

            epoch_loss += loss.item()
            num_batches += 1

        avg_epoch_loss = epoch_loss / num_batches
        print(f"Epoch {epoch}: average loss = {avg_epoch_loss}")

## Ray tune

In [11]:
from ray import tune

In [17]:
def train(config):
    model = trf.AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=1)

    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    model.to(device)
    
    opt = torch.optim.AdamW(prediction_model.parameters(), lr=config["learning_rate"], momentum=0.9)
    
    train_dev = load_tokenised_data(filename=train_dev_filename, tokeniser=tokeniser, train_test="train")
    
    train_portion = int(len(train_dev) * 0.8)
    validation_portion = len(train_dev) - train_portion
    train_subset, val_subset = torch.utils.data.random_split(train_dev, [train_portion, validation_portion])
    
    train_dataloader = torch.utils.data.DataLoader(
        train_subset, shuffle=True, batch_size=int(config["batch_size"]), collate_fn=data_collator
    )
    
    validation_dataloader = torch.utils.data.DataLoader(
        validation_subset, shuffle=True, batch_size=int(config["batch_size"]), collate_fn=data_collator
    )
    
    training_steps = NUM_EPOCH * len(train_dataloader)
    # lr_scheduler = torch.optim.lr_scheduler.StepLR(opt, step_size=1, gamma=0.1)
    lr_scheduler = trf.get_scheduler(
        "linear",
        optimizer=opt,
        num_warmup_steps=0,
        num_training_steps=training_steps
    )
    
    model.train()
    for epoch in range(NUM_EPOCH):
        epoch_loss = 0
        num_batches = 0
        for batch in train_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss 
            # loss = criterion(outputs.logits, batch['labels'])

            loss.backward()    
            opt.step()
            lr_scheduler.step()
            opt.zero_grad()

            epoch_loss += loss.item()
            num_batches += 1

        avg_epoch_loss = epoch_loss / num_batches
        tune.report(loss=avg_epoch_loss)

        # Evaluation    
        # model.eval()

        y_true =[]
        y_pred = []
        val_loss = 0.0
        val_step = 0

        for batch in validation_dataloader:
            with torch.no_grad():
                batch = {k: v.to(device) for k, v in batch.items()}
                outputs = model(**batch)

                y_true.extend(batch['labels'].tolist())
                batch_pred = [item for sublist in outputs.logits.tolist() for item in sublist]  #convert 2D list to 1D
                y_pred.extend(batch_pred)
                pearson_r = pearsonr(y_true, y_pred)
                
                loss = outputs.loss
                val_loss += loss.cpu().numpy()
                val_steps += 1
                
        tune.report(loss=(val_loss / val_steps), accuracy=pearson_r)
    
    print("Finished Training")

In [9]:
from functools import partial

In [None]:
config = {
    "learning_rate": tune.loguniform(6e-5, 2e-5),
    "batch_size": tune.choice([4, 8, 16])
}

scheduler = tune.schedulers.ASHAScheduler(
    metric="loss",
    mode="min",
    max_t = 10,
    grace_period=1,
    reduction_factor=2
)

reporter = tune.CLIReporter(metric_columns=["loss", "accuracy", "training_iteration"])

analysis = tune.run(
    train,
    config=config,
    num_samples=10,
    scheduler=scheduler,
    progress_reporter=reporter
)

best_trial = analysis.get_best_trial("loss", "min", "last")
print("Best trial config: {}".format(best_trial.config))
print("Best trial final validation loss: {}".format(best_trial.last_result["loss"]))
print("Best trial final validation accuracy: {}".format(best_trial.last_result["accuracy"]))