In [1]:
import pandas as pd
import numpy as np
import transformers as trf
from datasets import Dataset
import torch
from tqdm.auto import tqdm
from sklearn.model_selection import KFold
from accelerate import Accelerator
from accelerate import notebook_launcher
from ray import tune

In [2]:
import os
os.chdir("/home/573/rh2942/WASSA-2023-EMP") # changing dir for evaluation file
# print(os.getcwd())

from evaluation import pearsonr, calculate_pearson

In [16]:
# raw_data = pd.read_csv("./essay_article_text_train_dev.csv", index_col=0)
raw_data = pd.read_csv("./essay_article_id_train.csv", index_col=0)
raw_data.sample(3)

Unnamed: 0,conversation_id,article_id,essay,speaker_id,gender,education,race,age,income,speaker_number,essay_id,empathy,distress
467,100,57,All the poor animals that died in that zoo. ...,8,2.0,6.0,1.0,62.0,29000.0,2,599,2.0,7.0
379,481,292,There are a lot of troubled people in the worl...,42,1.0,4.0,1.0,45.0,40000.0,1,480,3.5,2.125
225,290,71,I feel for both sides on this one. The police ...,53,2.0,3.0,1.0,27.0,25000.0,1,289,6.0,6.0


In [17]:
test_data = pd.read_csv('./dataset/dev/WASSA23_essay_level_dev.tsv', sep='\t', header=0)

In [20]:
test_data.sample(3)

Unnamed: 0,conversation_id,article_id,essay,speaker_id,gender,education,race,age,income,speaker_number,split,essay_id
120,57,72,It's somewhat surprising that the EU wasn't in...,81,1,4,1,30,27000,2,dev,556
23,86,304,I just read an article about suicides in child...,74,1,4,1,29,39000,1,dev,85
6,16,336,I didn't know coal mining had such adverse eff...,81,1,4,1,30,27000,1,dev,15


In [38]:
# prompt_checkpoint = 'gpt2'

In [45]:
# prompt_generator = trf.pipeline('text-generation', model=prompt_checkpoint)

In [89]:
# prompt_generator('I am male', max_length=20, num_return_sequences=5)

In [30]:
# demographic_cols = list(input_data.columns)
# demographic_cols.remove('empathy')
# demographic_cols.remove('distress')

In [18]:
def num_to_text(raw_data):
    input_data = raw_data.copy() #mandatory step as dataframe is mutable
    categories = {1.: "I am male.", 2.: "I am female"}
    input_data['gender'] = input_data['gender'].map(categories)

    categories = {
        1.: "My education level Less than a high school diploma.",
        2.: "My education level High School diploma.",
        3.: "My education level Technical/Vocational School.",
        4.: "My education level Some college but no degree.",
        5.: "My education level Two year associate degree.",
        6.: "My education level Four year bachelor’s degree.",
        7.: "My education level Postgradute or professional degree."
    }
    input_data['education'] = input_data['education'].map(categories)

    categories = {
        1.: "My race White.",
        2.: "My race Hispanic or Latino.",
        3.: "My race Black or African American.",
        4.: "My race Native American or American Indian.",
        5.: "My race Asian/Pacific Islander.",
        6.: "My race other."
    }
    input_data['race'] = input_data['race'].map(categories)

    input_data['age'] = input_data['age'].apply(lambda x: "My age "+str(x)+" years.")
    input_data['income'] = input_data['income'].apply(lambda x: "My income "+str(x)+".")
    
    assert input_data.isna().any().any() == False #no NA values
    
    input_data['demographic'] = input_data['gender'] + ' ' + input_data['age'] + ' ' + input_data['education'] + ' ' + input_data['race'] + ' ' + input_data['income']
    
    return input_data

In [19]:
def load_tokenised_train_data(raw_data, tokeniser, task):
    '''
    task: "empathy" or "distress"
    '''
    
    input_data = num_to_text(raw_data)
    # chosen_data = input_data[['article', 'essay', 'empathy']]
    chosen_data = input_data[['essay', 'demographic', task]]

    hugging_dataset = Dataset.from_pandas(chosen_data, preserve_index=False)

    tokenised_hugging_dataset = hugging_dataset.map(tokenise, batched=True)
    # tokenised_hugging_dataset

    tokenised_hugging_dataset = tokenised_hugging_dataset.remove_columns(["essay"]) # no longer required as encoding done
    # tokenised_hugging_dataset = tokenised_hugging_dataset.remove_columns(["article"]) # no longer required as encoding done
    tokenised_hugging_dataset = tokenised_hugging_dataset.remove_columns(['demographic']) # no longer required as encoding done
    tokenised_hugging_dataset = tokenised_hugging_dataset.rename_column(task, "labels") # as huggingface requires
    tokenised_hugging_dataset = tokenised_hugging_dataset.with_format("torch")

    return tokenised_hugging_dataset

In [20]:
def load_tokenised_test_data(raw_data, tokeniser):
    
    input_data = num_to_text(raw_data)
    # chosen_data = input_data[['article', 'essay', 'empathy']]
    chosen_data = input_data[['essay', 'demographic']]

    hugging_dataset = Dataset.from_pandas(chosen_data, preserve_index=False)

    tokenised_hugging_dataset = hugging_dataset.map(tokenise, batched=True)
    # tokenised_hugging_dataset

    tokenised_hugging_dataset = tokenised_hugging_dataset.remove_columns(["essay"]) # no longer required as encoding done
    # tokenised_hugging_dataset = tokenised_hugging_dataset.remove_columns(["article"]) # no longer required as encoding done
    tokenised_hugging_dataset = tokenised_hugging_dataset.remove_columns(['demographic']) # no longer required as encoding done
    tokenised_hugging_dataset = tokenised_hugging_dataset.with_format("torch")

    return tokenised_hugging_dataset

In [21]:
def train_test(model):
    accelerator = Accelerator()
    
    accelerator.print(f"{task} prediction")  #task: "empathy" or "distress"
    
    opt = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
    # loss_function = torch.nn.MSELoss()
  
    trainset = load_tokenised_train_data(raw_data, tokeniser, task)
       
    trainloader = torch.utils.data.DataLoader(
        trainset, shuffle=True, batch_size=BATCH_SIZE, collate_fn=data_collator
    )
    
    training_steps = NUM_EPOCH * len(trainloader)
    lr_scheduler = trf.get_scheduler(
        "linear",
        optimizer=opt,
        num_warmup_steps=0,
        num_training_steps=training_steps
    )

    trainloader, model, opt = accelerator.prepare(
        trainloader, model, opt    
    )
    
    for epoch in range(0, NUM_EPOCH):

        # Print epoch
        accelerator.print(f'Starting epoch {epoch+1}')
        
        epoch_loss = 0
        num_batches = 0

        # Iterate over the DataLoader for training data
        for batch in trainloader:
            # Perform forward pass
            outputs = model(**batch)
            
            loss = outputs.loss
            # loss = loss_function(outputs.logits, batch["labels"])

            accelerator.backward(loss)
        
            opt.step()
            lr_scheduler.step()
            
            opt.zero_grad()
            
            epoch_loss += loss.item()
            num_batches += 1

        # Process is complete.
        avg_epoch_loss = epoch_loss / num_batches
        accelerator.print(f"Epoch {epoch}: average loss = {avg_epoch_loss}")
        
    # Evaluation
    
    testset = load_tokenised_test_data(test_data, tokeniser)
    testloader = torch.utils.data.DataLoader(
        testset, shuffle=False, batch_size=BATCH_SIZE, collate_fn=data_collator
    )

    # testloader = accelerator.prepare(testloader)
        
    model.eval()

    y_pred = []

    for batch in testloader:
        with torch.no_grad():
            outputs = model(**batch)

        batch_pred = [item for sublist in outputs.logits.tolist() for item in sublist]  #convert 2D list to 1D
        y_pred.extend(batch_pred)
  
    y_pred_df = pd.DataFrame({task: y_pred})
    filename = "predictions_" + task + ".tsv"
    y_pred_df.to_csv(filename, sep='\t', header=False, index=False)

In [22]:
NUM_EPOCH = 20
BATCH_SIZE = 8
LEARNING_RATE = 5e-5

# checkpoint = "bert-base-uncased"
# checkpoint = "bhadresh-savani/bert-base-uncased-emotion"
checkpoint = "distilbert-base-uncased"
# checkpoint = "cardiffnlp/twitter-roberta-base-sentiment-latest"

tokeniser = trf.AutoTokenizer.from_pretrained(checkpoint)

#padding="longest" can be deferred to do dynamic padding
def tokenise(sentence):
    # return tokeniser(sentence["essay"], sentence["article"], truncation=True)
    return tokeniser(sentence["essay"], sentence["demographic"], truncation=True) 
  # return tokeniser(sentence["essay"], sentence["article"], padding="max_length", max_length=514, truncation=True)   #for Cardiff-emotion one
  # return tokeniser(sentence["essay"], truncation=True) 
  # return tokeniser(sentence["article"], sentence["essay"], truncation=True) 
    
# data collator due to variable max token length per batch size
data_collator = trf.DataCollatorWithPadding(tokenizer = tokeniser)

model = trf.AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=1)

os.environ["TOKENIZERS_PARALLELISM"] = "false" # due to huggingface warning

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'classifier

In [23]:
task = "empathy"
notebook_launcher(train_test, (model,), num_processes=torch.cuda.device_count())

Launching training on 4 GPUs.
empathy prediction


Map:   0%|          | 0/779 [00:00<?, ? examples/s]

Starting epoch 1


You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch 0: average loss = 8.682883920669555
Starting epoch 2
Epoch 1: average loss = 3.7889806175231935
Starting epoch 3
Epoch 2: average loss = 2.523026633262634
Starting epoch 4
Epoch 3: average loss = 1.5091352593898772
Starting epoch 5
Epoch 4: average loss = 1.2626645803451537
Starting epoch 6
Epoch 5: average loss = 0.8389039313793183
Starting epoch 7
Epoch 6: average loss = 0.390454942882061
Starting epoch 8
Epoch 7: average loss = 0.45853771656751635
Starting epoch 9
Epoch 8: average loss = 0.22529063552618026
Starting epoch 10
Epoch 9: average loss = 0.16186995953321456
Starting epoch 11
Epoch 10: average loss = 0.13217000126838685
Starting epoch 12
Epoch 11: average loss = 0.1076544077694416
Starting epoch 13
Epoch 12: average loss = 0.06294035792350769
Starting epoch 14
Epoch 13: average loss = 0.04074248170480132
Starting epoch 15
Epoch 14: average loss = 0.02290039908140898
Starting epoch 16
Epoch 15: average loss = 0.019191861487925052
Starting epoch 17
Epoch 16: average lo

Map:   0%|          | 0/208 [00:00<?, ? examples/s]

Map:   0%|          | 0/208 [00:00<?, ? examples/s]

Map:   0%|          | 0/208 [00:00<?, ? examples/s]

Map:   0%|          | 0/208 [00:00<?, ? examples/s]

In [24]:
task = "distress"
notebook_launcher(train_test, (model,), num_processes=torch.cuda.device_count())

Launching training on 4 GPUs.
distress prediction


Map:   0%|          | 0/779 [00:00<?, ? examples/s]

Starting epoch 1


You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch 0: average loss = 6.75927152633667
Starting epoch 2
Epoch 1: average loss = 4.1686233806610105
Starting epoch 3
Epoch 2: average loss = 2.6896072387695313
Starting epoch 4
Epoch 3: average loss = 1.5125374686717987
Starting epoch 5
Epoch 4: average loss = 1.3759362149238585
Starting epoch 6
Epoch 5: average loss = 0.7311315959692002
Starting epoch 7
Epoch 6: average loss = 0.3396488809585571
Starting epoch 8
Epoch 7: average loss = 0.27004228889942167
Starting epoch 9
Epoch 8: average loss = 0.15612321779131888
Starting epoch 10
Epoch 9: average loss = 0.09518387511372567
Starting epoch 11
Epoch 10: average loss = 0.08685502890497446
Starting epoch 12
Epoch 11: average loss = 0.11066222324967384
Starting epoch 13
Epoch 12: average loss = 0.04582958009094
Starting epoch 14
Epoch 13: average loss = 0.042835943885147575
Starting epoch 15
Epoch 14: average loss = 0.02387387316673994
Starting epoch 16
Epoch 15: average loss = 0.018345572464168073
Starting epoch 17
Epoch 16: average lo

Map:   0%|          | 0/208 [00:00<?, ? examples/s]

Map:   0%|          | 0/208 [00:00<?, ? examples/s]

Map:   0%|          | 0/208 [00:00<?, ? examples/s]

Map:   0%|          | 0/208 [00:00<?, ? examples/s]

In [25]:
# Creating the submission file as per requirement
predictions_empathy = pd.read_csv("predictions_empathy.tsv", sep='\t', header=None)
predictions_distress = pd.read_csv("predictions_distress.tsv", sep='\t', header=None)

predictions_EMP = pd.concat([predictions_empathy, predictions_distress], axis=1)

predictions_EMP.to_csv("predictions_EMP.tsv", sep='\t', header=False, index=False)

In [26]:
# Just checking the dev set performance
gold_dev = pd.read_csv('./dataset/dev/goldstandard_dev.tsv', sep='\t', header=None) # no header
pearson_empathy = pearsonr(gold_dev.loc[:,0].tolist(), predictions_empathy.loc[:,0].tolist())
print(f"Empathy: {pearson_empathy}")
pearson_distress = pearsonr(gold_dev.loc[:,1].tolist(), predictions_distress.loc[:,0].tolist())
print(f"Distress: {pearson_distress}")

Empathy: 0.6608
Distress: 0.6411


# Hyperparam tuning

In [22]:
def training_loop(model,trainloader,testloader):
    accelerator = Accelerator()
    
    opt = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

    training_steps = NUM_EPOCH * len(trainloader)
    lr_scheduler = trf.get_scheduler(
        "linear",
        optimizer=opt,
        num_warmup_steps=0,
        num_training_steps=training_steps
    )
  
    trainloader, model, opt = accelerator.prepare(
        trainloader, model, opt    
    )  
    
    for epoch in range(0, NUM_EPOCH):

        # Print epoch
        accelerator.print(f'Starting epoch {epoch+1}')
        
        epoch_loss = 0
        num_batches = 0

        # Iterate over the DataLoader for training data
        for batch in trainloader:
            # Perform forward pass
            outputs = model(**batch)
            
            loss = outputs.loss
#             loss = loss_function(outputs, targets)

            accelerator.backward(loss)
        
            opt.step()
            lr_scheduler.step()
            
            opt.zero_grad()
            
            epoch_loss += loss.item()
            num_batches += 1

        # Process is complete.
        avg_epoch_loss = epoch_loss / num_batches
        accelerator.print(f"Epoch {epoch}: average loss = {avg_epoch_loss}")
    
    
    # Evaluation
        
    model.eval()

    y_true =[]
    y_pred = []

    for batch in testloader:
        with torch.no_grad():
            outputs = model(**batch)

        y_true.extend(batch['labels'].tolist())

        batch_pred = [item for sublist in outputs.logits.tolist() for item in sublist]  #convert 2D list to 1D
        y_pred.extend(batch_pred)
        
    accelerator.print(f"Pearson r: {pearsonr(y_true, y_pred)}")

In [23]:
# Configuration options
K_FOLD = 5
NUM_EPOCH = 6
BATCH_SIZE = 8
LEARNING_RATE = 5e-5



# loss_function = torch.nn.MSELoss()

# For fold results
results = {}

# Set fixed random number seed
# torch.manual_seed(42)

kfold = KFold(n_splits=K_FOLD, shuffle=True)

print('--------------------------------')

# K-fold Cross Validation model evaluation
for fold, (train_idx, test_idx) in enumerate(kfold.split(tokenised_hugging_dataset)):

    print(f'FOLD {fold}')
    print('--------------------------------')

    # Sample elements randomly from a given list of ids, no replacement.
    train_subsampler = torch.utils.data.SubsetRandomSampler(train_idx)
    test_subsampler = torch.utils.data.SubsetRandomSampler(test_idx)

   

    # Define data loaders for training and testing data in this fold
    trainloader = torch.utils.data.DataLoader(
        tokenised_hugging_dataset,
        batch_size=BATCH_SIZE,
        collate_fn=data_collator,
        sampler=train_subsampler
    )
    testloader = torch.utils.data.DataLoader(
        tokenised_hugging_dataset,
        batch_size=BATCH_SIZE,
        collate_fn=data_collator,
        sampler=test_subsampler
    )

    # Init the neural network
    prediction_model = trf.AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=1)
    # prediction_model = trf.AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=1, ignore_mismatched_sizes=True)

    notebook_launcher(training_loop, (prediction_model,trainloader,testloader,), num_processes=torch.cuda.device_count())

#     print('Starting testing')

#     # pearson_r = evaluation_loop(prediction_model, testloader)

#     print('Pearson r for fold %d: %f' % (fold, pearson_r))
#     print('--------------------------------')

#     results[fold] = pearson_r
    
#     # Saving the model
# #     save_path = f'./model-fold-{fold}.pth'
# #     torch.save(network.state_dict(), save_path)

# # Print fold results
# print(f'K-FOLD CROSS VALIDATION RESULTS FOR {K_FOLD} FOLDS')
# print('--------------------------------')
# sum = 0.0
# for key, value in results.items():
#     print(f'Fold {key}: {value}')
#     sum += value
# print(f'Average: {sum/len(results.items())}')

--------------------------------
FOLD 0
--------------------------------


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'pre_clas

Launching training on 3 GPUs.
Starting epoch 1


You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch 0: average loss = 6.678773800532023
Starting epoch 2
Epoch 1: average loss = 3.8564927794716577
Starting epoch 3
Epoch 2: average loss = 2.9299150596965444
Starting epoch 4
Epoch 3: average loss = 2.0145766735076904
Starting epoch 5
Epoch 4: average loss = 1.617049124204751
Starting epoch 6
Epoch 5: average loss = 1.21444748071107
Pearson r: 0.7278
FOLD 1
--------------------------------


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'pre_clas

Launching training on 3 GPUs.
Starting epoch 1


You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch 0: average loss = 6.45009006875934
Starting epoch 2
Epoch 1: average loss = 3.2881694372856254
Starting epoch 3
Epoch 2: average loss = 1.5550217447858867
Starting epoch 4
Epoch 3: average loss = 1.2373908857504528
Starting epoch 5
Epoch 4: average loss = 0.71229301302722
Starting epoch 6
Epoch 5: average loss = 0.47287860783663666
Pearson r: 0.7354
FOLD 2
--------------------------------


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'pre_clas

Launching training on 3 GPUs.
Starting epoch 1


You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch 0: average loss = 7.302216992233738
Starting epoch 2
Epoch 1: average loss = 3.01614493673498
Starting epoch 3
Epoch 2: average loss = 1.7510810377019825
Starting epoch 4
Epoch 3: average loss = 1.281657469995094
Starting epoch 5
Epoch 4: average loss = 1.2912235955397289
Starting epoch 6
Epoch 5: average loss = 0.7392225187610496
Pearson r: 0.8645
FOLD 3
--------------------------------


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'pre_clas

Launching training on 3 GPUs.
Starting epoch 1


You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch 0: average loss = 7.3006276867606426
Starting epoch 2
Epoch 1: average loss = 3.6075385736696646
Starting epoch 3
Epoch 2: average loss = 2.260803410501191
Starting epoch 4
Epoch 3: average loss = 1.2905460596084595
Starting epoch 5
Epoch 4: average loss = 0.9426887498208971
Starting epoch 6
Epoch 5: average loss = 0.7497554994893797
Pearson r: 0.8479
FOLD 4
--------------------------------


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'pre_clas

Launching training on 3 GPUs.
Starting epoch 1


You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch 0: average loss = 7.131105628880587
Starting epoch 2
Epoch 1: average loss = 3.875572630853364
Starting epoch 3
Epoch 2: average loss = 2.2853002891396033
Starting epoch 4
Epoch 3: average loss = 1.439516207485488
Starting epoch 5
Epoch 4: average loss = 1.4772590760028723
Starting epoch 6
Epoch 5: average loss = 1.0030029969233456
Pearson r: 0.8651


# Train-test split

In [11]:
BATCH_SIZE = 8
LEARNING_RATE = 5e-5
NUM_EPOCH = 3

chosen_data = input_data[['article', 'essay', 'empathy']]

hugging_dataset = Dataset.from_pandas(chosen_data, preserve_index=False)
hugging_dataset = hugging_dataset.train_test_split(test_size = 0.2)

# checkpoint = "bert-base-uncased"
# checkpoint = "bhadresh-savani/bert-base-uncased-emotion"
checkpoint = "distilbert-base-uncased"
# checkpoint = "cardiffnlp/twitter-roberta-base-sentiment-latest"

tokeniser = trf.AutoTokenizer.from_pretrained(checkpoint)

#padding="longest" can be deferred to do dynamic padding
def tokenise(sentence):
  return tokeniser(sentence["essay"], sentence["article"], truncation=True) 
  # return tokeniser(sentence["essay"], sentence["article"], padding="max_length", max_length=514, truncation=True)   #for Cardiff-emotion one
  # return tokeniser(sentence["essay"], truncation=True) 
  # return tokeniser(sentence["article"], sentence["essay"], truncation=True) 
    
tokenised_hugging_dataset = hugging_dataset.map(tokenise, batched=True)
# tokenised_hugging_dataset

tokenised_hugging_dataset = tokenised_hugging_dataset.remove_columns(["article","essay"]) # no longer required as encoding done
tokenised_hugging_dataset = tokenised_hugging_dataset.rename_column("empathy", "labels") # as huggingface requires
tokenised_hugging_dataset = tokenised_hugging_dataset.with_format("torch")

# tokenised_hugging_dataset

data_collator = trf.DataCollatorWithPadding(tokenizer = tokeniser)

train_dataloader = torch.utils.data.DataLoader(
    tokenised_hugging_dataset["train"], shuffle=True, batch_size=BATCH_SIZE, collate_fn=data_collator
)

test_dataloader = torch.utils.data.DataLoader(
    tokenised_hugging_dataset["test"], batch_size=BATCH_SIZE, collate_fn=data_collator
)

Map:   0%|          | 0/789 [00:00<?, ? examples/s]

Map:   0%|          | 0/198 [00:00<?, ? examples/s]

In [None]:
# hugging_dataset

In [None]:
# hugging_dataset['train']['essay'][:5]

In [None]:
# checking length after tokenisation

# length = []
# for i in range(tokenised_hugging_dataset['train'].num_rows):
#   length.append(len(tokenised_hugging_dataset['train']['input_ids'][i]))

# print(f"Lengths: {length}")

# Prediction model

In [5]:
prediction_model = trf.AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=1)
# prediction_model = trf.AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=1, ignore_mismatched_sizes=True)

opt = torch.optim.AdamW(prediction_model.parameters(), lr=LEARNING_RATE)

training_steps = NUM_EPOCH * len(train_dataloader)
lr_scheduler = trf.get_scheduler(
    "linear",
    optimizer=opt,
    num_warmup_steps=0,
    num_training_steps=training_steps
)

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
prediction_model.to(device)

print(device)


# criterion = torch.nn.MSELoss()

progress_bar = tqdm(range(training_steps))

prediction_model.train()
for epoch in range(NUM_EPOCH):
  epoch_loss = 0
  num_batches = 0
  for batch in train_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    outputs = prediction_model(**batch)
    loss = outputs.loss
    # loss = criterion(outputs.logits, batch['labels'])
    loss.backward()

    opt.step()
    lr_scheduler.step()
    opt.zero_grad()
    progress_bar.update(1)

    epoch_loss += loss.item()
    num_batches += 1

  avg_epoch_loss = epoch_loss / num_batches
  print(f"Epoch {epoch}: average loss = {avg_epoch_loss}")

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'classifier.w

cuda


  0%|          | 0/297 [00:00<?, ?it/s]

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch 0: average loss = 4.808817552797722
Epoch 1: average loss = 3.0758890127292786
Epoch 2: average loss = 2.118770224879486


## Evaluation

In [12]:
prediction_model.eval()

predictions = []

for batch in test_dataloader:
  batch = {k: v.to(device) for k, v in batch.items()}
  with torch.no_grad():
    outputs = prediction_model(**batch)
    
  batch_pred = [item for sublist in outputs.logits.tolist() for item in sublist]  #convert 2D list to 1D
  predictions.append(batch_pred)

y_pred = [item for sublist in predictions for item in sublist]  #convert batch-wise 2D list to 1D

y_true = hugging_dataset["test"]["empathy"]

pearsonr(y_true,y_pred)

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


0.6985

In [None]:
# prediction_model.save_pretrained("model")

In [None]:
# y_pred

In [None]:
# y_true

# Extras

In [6]:
# from google.colab import drive
# mount_path = '/content/drive'
# drive.mount(mount_path)
# %cd $mount_path"/MyDrive/WASSA2023"

# !pip install transformers datasets sentencepiece

## Training by Huggingface API

In [None]:
from transformers import Trainer, TrainingArguments
from sklearn.metrics import mean_squared_error

In [None]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    rmse = mean_squared_error(labels, predictions, squared=False)
    return {"rmse": rmse}

In [None]:
training_args = TrainingArguments(output_dir="empathy-transformer",
                                  logging_strategy="epoch",
                                  evaluation_strategy="epoch",
                                  per_device_train_batch_size=16,
                                  per_device_eval_batch_size=16,
                                  num_train_epochs=3,
                                  # learning_rate=2e-5,
                                  save_total_limit=2,
                                  save_strategy='no',
                                  load_best_model_at_end=False)

trainer = Trainer(
    model=empathy_prediction,
    args=training_args,
    train_dataset=tokenised_hugging_dataset["train"],
    eval_dataset=tokenised_hugging_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokeniser,
    compute_metrics=compute_metrics
)

In [None]:
trainer.train()



Epoch,Training Loss,Validation Loss,Rmse
1,3.4157,3.094043,1.758989
2,2.1608,2.74757,1.65758
3,1.4134,2.890623,1.700183


TrainOutput(global_step=117, training_loss=2.329951457488231, metrics={'train_runtime': 30.014, 'train_samples_per_second': 62.271, 'train_steps_per_second': 3.898, 'total_flos': 76837223949486.0, 'train_loss': 2.329951457488231, 'epoch': 3.0})

In [None]:
raw_pred, _, _ = trainer.predict(tokenised_hugging_dataset["test"])

In [50]:
def train(model):

    device = "cuda:0"
    if torch.cuda.device_count() > 1:
            model = torch.nn.DataParallel(model)
    model.to(device)
    
    # device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    # model.to(device)
    
    # criterion = torch.nn.MSELoss()
    opt = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
    
    trainset = load_tokenised_data(raw_data, tokeniser) #train
       
    train_dataloader = torch.utils.data.DataLoader(
        trainset, shuffle=True, batch_size=BATCH_SIZE, collate_fn=data_collator
    )
    
    training_steps = NUM_EPOCH * len(train_dataloader)
    # lr_scheduler = torch.optim.lr_scheduler.StepLR(opt, step_size=1, gamma=0.1)
    lr_scheduler = trf.get_scheduler(
        "linear",
        optimizer=opt,
        num_warmup_steps=0,
        num_training_steps=training_steps
    )
    
    model.train()
    for epoch in range(NUM_EPOCH):
        epoch_loss = 0
        num_batches = 0
        for batch in train_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss 
            # loss = criterion(outputs.logits, batch['labels'])

            loss.backward()    
            opt.step()
            lr_scheduler.step()
            opt.zero_grad()

            epoch_loss += loss.item()
            num_batches += 1

        avg_epoch_loss = epoch_loss / num_batches
        print(f"Epoch {epoch}: average loss = {avg_epoch_loss}")

## Ray tune

In [30]:
def train(config):
    model = trf.AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=1)

    device ="cpu"
    if torch.cuda.is_available():
        device = "cuda:0"
        if torch.cuda.device_count() > 1:
            model = torch.nn.DataParallel(model)
    model.to(device)
    
    criterion = torch.nn.MSELoss()
    opt = torch.optim.AdamW(prediction_model.parameters(), lr=config["learning_rate"], momentum=0.9)
    
    train_dev_set = load_tokenised_data(raw_data, tokeniser) #train
    
    train_portion = int(len(train_dev_set) * 0.8)
    validation_portion = len(train_dev_set) - train_portion
    train_subset, val_subset = torch.utils.data.random_split(train_dev_set, [train_portion, validation_portion])
    
    train_dataloader = torch.utils.data.DataLoader(
        train_subset, shuffle=True, batch_size=int(config["batch_size"]), collate_fn=data_collator, num_workers=8
    )
    
    validation_dataloader = torch.utils.data.DataLoader(
        validation_subset, shuffle=True, batch_size=int(config["batch_size"]), collate_fn=data_collator, num_workers=9
    )
    
    training_steps = NUM_EPOCH * len(train_dataloader)
    # lr_scheduler = torch.optim.lr_scheduler.StepLR(opt, step_size=1, gamma=0.1)
    lr_scheduler = trf.get_scheduler(
        "linear",
        optimizer=opt,
        num_warmup_steps=0,
        num_training_steps=training_steps
    )
    
    model.train()
    for epoch in range(NUM_EPOCH):
        epoch_loss = 0
        num_batches = 0
        for batch in train_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss 
            # loss = criterion(outputs.logits, batch['labels'])

            loss.backward()    
            opt.step()
            lr_scheduler.step()
            opt.zero_grad()

            epoch_loss += loss.item()
            num_batches += 1

        avg_epoch_loss = epoch_loss / num_batches
        tune.report(loss=avg_epoch_loss)

        # Evaluation    
        # model.eval()

        y_true =[]
        y_pred = []
        val_loss = 0.0
        val_step = 0

        for batch in validation_dataloader:
            with torch.no_grad():
                batch = {k: v.to(device) for k, v in batch.items()}
                outputs = model(**batch)

                y_true.extend(batch['labels'].tolist())
                batch_pred = [item for sublist in outputs.logits.tolist() for item in sublist]  #convert 2D list to 1D
                y_pred.extend(batch_pred)
                pearson_r = pearsonr(y_true, y_pred)
                
                loss = outputs.loss
                val_loss += loss.cpu().numpy()
                val_steps += 1
                
        tune.report(loss=(val_loss / val_steps), accuracy=pearson_r)
    
    print("Finished Training")

In [26]:
from functools import partial

In [None]:
config = {
    "learning_rate": tune.loguniform(6e-5, 2e-5),
    "batch_size": tune.choice([4, 8, 16])
}

scheduler = tune.schedulers.ASHAScheduler(
    metric="loss",
    mode="min",
    max_t = 10,
    grace_period=1,
    reduction_factor=2
)

reporter = tune.CLIReporter(metric_columns=["loss", "accuracy", "training_iteration"])

analysis = tune.run(
    partial(train),
    resources_per_trial={"cpu": 1, "gpu": 2},
    config=config,
    num_samples=10,
    scheduler=scheduler,
    progress_reporter=reporter
)

best_trial = analysis.get_best_trial("loss", "min", "last")
print("Best trial config: {}".format(best_trial.config))
print("Best trial final validation loss: {}".format(best_trial.last_result["loss"]))
print("Best trial final validation accuracy: {}".format(best_trial.last_result["accuracy"]))