In [1]:
import pandas as pd
import numpy as np
import transformers as trf
from datasets import Dataset
import torch
from accelerate import Accelerator
from accelerate import notebook_launcher

In [118]:
import os
os.chdir("/home/573/rh2942/WASSA-2023-EMP") # changing dir for evaluation file
# print(os.getcwd())

In [119]:
raw_train = pd.read_csv('./dataset/WASSA23_essay_level_with_labels_train.tsv', sep='\t', na_values='unknown', header=0) # raw csv file consists of 'unknown' values
raw_train.sample(3)

Unnamed: 0,conversation_id,article_id,essay,empathy,distress,speaker_id,gender,education,race,age,...,personality_agreeableness,personality_stability,iri_perspective_taking,iri_personal_distress,iri_fantasy,iri_empathatic_concern,speaker_number,split,essay_id,emotion
512,163,375,It is difficult to convey my thoughts and feel...,3.166667,1.0,35,1.0,6.0,3.0,33.0,...,4.5,7.0,3.714,1.0,2.429,1.429,2,train,662,Neutral
366,466,326,People love to blame Trump for anything and ev...,1.0,1.125,24,2.0,7.0,1.0,38.0,...,5.5,6.5,3.429,2.714,2.571,3.857,1,train,465,Neutral
553,214,273,Well while I am sorry for the people affected ...,1.0,1.125,24,2.0,7.0,1.0,38.0,...,5.5,6.5,3.429,2.714,2.571,3.857,2,train,713,Anger


In [120]:
train_dev = pd.read_csv("./essay_article_id_train_dev.csv", index_col=0)
train_dev.sample(3)

Unnamed: 0,conversation_id,article_id,essay,speaker_id,gender,education,race,age,income,speaker_number,essay_id,empathy,distress
556,228,139,I just read an article about ranavirus that af...,25,1.0,4.0,1.0,29.0,39000.0,2,727,2.0,2.0
404,24,281,We always think the moment someone is rich and...,40,1.0,7.0,1.0,28.0,165000.0,2,523,6.333333,6.25
490,145,94,Its always sad when these things happen becaus...,19,1.0,6.0,2.0,32.0,35000.0,2,644,5.833333,6.125


In [121]:
article_raw = pd.read_csv('./dataset/articles_adobe_AMT.csv', header=0, index_col=0)
article_raw.sample(3)

Unnamed: 0_level_0,text
article_id,Unnamed: 1_level_1
218,Millions exposed to dangerous lead levels in U...
150,"Horrific crash kills Yu Xu, 1st woman to fly C..."
50,America's war — The most common adjective emp...


In [122]:
raw_dev = pd.read_csv('./dataset/dev/WASSA23_essay_level_dev.tsv', sep='\t', header=0)
raw_dev.sample(3)

Unnamed: 0,conversation_id,article_id,essay,speaker_id,gender,education,race,age,income,speaker_number,split,essay_id
124,72,3,I'm amazed that this ride just pass inspection...,81,1,4,1,30,27000,2,dev,571
78,376,91,This is truly disgusting. I will never be able...,68,2,2,1,21,20000,1,dev,375
28,101,67,"it's a shame that this keeps happening,but it'...",59,1,2,1,30,55000,1,dev,100


In [None]:
# raw_test = pd.read_csv('./dataset/dev/', sep='\t', header=0)
# raw_test.sample(3)

In [123]:
# prompt_checkpoint = 'gpt2'
# prompt_generator = trf.pipeline('text-generation', model=prompt_checkpoint)

# def prompt_generate(text):
#     """
#     extend "text" to max_length. It will be a list of dictionaries. First item is the first return_sequence. 'generated_text' is self-explanatory.
#     """
#     prompt = prompt_generator(text, max_length=100, num_return_sequences=1)[0]['generated_text']

In [129]:
def num_to_text(raw_data):
    input_data = raw_data.copy() #mandatory step as dataframe is mutable
    
    input_data.dropna(inplace=True)
    print(f"Existing non-numeic columns:\n {input_data.select_dtypes(exclude=['number']).columns.tolist()}") #Just checking if any non-numeric value exist on numeric columns
    
    #converting article id to corresponding article texts
    input_data['article'] = input_data['article_id'].apply(lambda x: article_raw.loc[x, 'text'])
    
    categories = {1.: "I am male.", 2.: "I am female"}
    input_data['gender'] = input_data['gender'].map(categories)

    categories = {
        1.: "My education level is Less than a high school diploma.",
        2.: "My education level is High School diploma.",
        3.: "My education level is echnical/Vocational School.",
        4.: "My education level is Some college but no degree.",
        5.: "My education level is Two year associate degree.",
        6.: "My education level is Four year bachelor’s degree.",
        7.: "My education level is Postgradute or professional degree."
    }
    input_data['education'] = input_data['education'].map(categories)

    categories = {
        1.: "My race is White.",
        2.: "My race is Hispanic or Latino.",
        3.: "My race is Black or African American.",
        4.: "My race is Native American or American Indian.",
        5.: "My race is Asian/Pacific Islander.",
        6.: "My race is other."
    }
    input_data['race'] = input_data['race'].map(categories)

    input_data['age'] = input_data['age'].apply(lambda x: "My age is "+str(x)+" years.")
    input_data['income'] = input_data['income'].apply(lambda x: "My income is "+str(x)+".")
    
    assert input_data.isna().any().any() == False #no NA values
    
    input_data['demographic'] = input_data['gender'] + ' ' + input_data['age'] + ' ' + input_data['education'] + ' ' + input_data['race'] + ' ' + input_data['income']    
    input_data['demographic_essay'] = input_data['demographic'] + ' ' + input_data['essay']
    
    # empathic_prompt = " Based on my gender, education, race, age and income, my empathy level is "
    # input_data['demographic_prompt'] = input_data['demographic'] + empathic_prompt
    # input_data['demographic_prompt'] = input_data['demographic_prompt'].apply(prompt_generate)
    # input_data['essay_demographic_prompt'] = input_data['essay'] + input_data['demographic_prompt']
    
    return input_data

In [130]:
def save_preprocessed(df, dataname):
    raw_data = df.copy()
    processed_df = num_to_text(raw_data)
    processed_df.to_csv("./preprocessed_" + dataname + ".csv")

In [131]:
save_preprocessed(raw_train, dataname="train")
save_preprocessed(train_dev, dataname="train_dev")
save_preprocessed(raw_dev, dataname="dev")
# save_preprocessed(raw_test, dataname="test")

Existing non-numeic columns:
 ['essay', 'split', 'emotion']
Existing non-numeic columns:
 ['essay']
Existing non-numeic columns:
 ['essay', 'split']


# Prediction

In [85]:
def load_tokenised_data(filename, tokeniser, train_test):
   
    input_data = pd.read_csv(filename, header=0, index_col=0)
    
    if train_test == "train":
        chosen_data = input_data[[feature_1, feature_2, task]]
    elif train_test == "test":
        chosen_data = input_data[[feature_1, feature_2]]  #test data shouldn't have output label

    hugging_dataset = Dataset.from_pandas(chosen_data, preserve_index=False)

    tokenised_hugging_dataset = hugging_dataset.map(tokenise, batched=True, remove_columns = [feature_1, feature_2])
    
    if train_test == "train":
        tokenised_hugging_dataset = tokenised_hugging_dataset.rename_column(task, "labels") # as huggingface requires
    
    tokenised_hugging_dataset = tokenised_hugging_dataset.with_format("torch")

    return tokenised_hugging_dataset

In [87]:
def train_test(model):
    accelerator = Accelerator()
    
    accelerator.print(f"{task} prediction")  #task: "empathy" or "distress"
    
    opt = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
    # loss_function = torch.nn.MSELoss()
  
    trainset = load_tokenised_data(filename=train_filename, tokeniser=tokeniser, train_test="train")
       
    trainloader = torch.utils.data.DataLoader(
        trainset, shuffle=True, batch_size=BATCH_SIZE, collate_fn=data_collator
    )
    
    training_steps = NUM_EPOCH * len(trainloader)
    lr_scheduler = trf.get_scheduler(
        "linear",
        optimizer=opt,
        num_warmup_steps=0,
        num_training_steps=training_steps
    )

    trainloader, model, opt = accelerator.prepare(
        trainloader, model, opt    
    )
    
    for epoch in range(0, NUM_EPOCH):

        # Print epoch
        accelerator.print(f'Starting epoch {epoch+1}')
        
        epoch_loss = 0
        num_batches = 0

        # Iterate over the DataLoader for training data
        for batch in trainloader:
            # Perform forward pass
            outputs = model(**batch)
            
            loss = outputs.loss
            # loss = loss_function(outputs.logits, batch["labels"])

            accelerator.backward(loss)
        
            opt.step()
            lr_scheduler.step()
            
            opt.zero_grad()
            
            epoch_loss += loss.item()
            num_batches += 1

        # Process is complete.
        avg_epoch_loss = epoch_loss / num_batches
        accelerator.print(f"Epoch {epoch}: average loss = {avg_epoch_loss}")
        
    
    # evaluation on test set
    testset = load_tokenised_data(filename=test_filename, tokeniser=tokeniser, train_test="test")
    testloader = torch.utils.data.DataLoader(
        testset, shuffle=False, batch_size=BATCH_SIZE, collate_fn=data_collator
    )

    # testloader = accelerator.prepare(testloader)
            
    model.eval()

    y_pred = []

    for batch in testloader:
        with torch.no_grad():
            outputs = model(**batch)

        batch_pred = [item for sublist in outputs.logits.tolist() for item in sublist]  #convert 2D list to 1D
        y_pred.extend(batch_pred)
  
    y_pred_df = pd.DataFrame({task: y_pred})
    filename = "predictions_" + task + ".tsv"
    y_pred_df.to_csv(filename, sep='\t', header=False, index=False)

In [90]:
NUM_EPOCH = 20
BATCH_SIZE = 8
LEARNING_RATE = 5e-5

train_filename = "preprocessed_train.csv"
test_filename = "preprocessed_test.csv"

#Chosen features
feature_1 = 'demographic_essay'
feature_2 = 'article'

# feature_1 = 'essay_demographic_prompt'
# feature_2 = 'article'
    
# checkpoint = "bert-base-uncased"
# checkpoint = "bhadresh-savani/bert-base-uncased-emotion"
checkpoint = "distilbert-base-uncased"
# checkpoint = "cardiffnlp/twitter-roberta-base-sentiment-latest"

tokeniser = trf.AutoTokenizer.from_pretrained(checkpoint)

#padding="longest" can be deferred to do dynamic padding
def tokenise(sentence):
    return tokeniser(sentence[feature_1], sentence[feature_2], truncation=True) 
  # return tokeniser(sentence["essay"], sentence["article"], padding="max_length", max_length=514, truncation=True)   #for Cardiff-emotion one
    
# data collator due to variable max token length per batch size
data_collator = trf.DataCollatorWithPadding(tokenizer = tokeniser)

model = trf.AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=1)

os.environ["TOKENIZERS_PARALLELISM"] = "false" # due to huggingface warning

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'pre_clas

In [91]:
task = "empathy"
notebook_launcher(train_test, (model,), num_processes=torch.cuda.device_count())

Launching training on 4 GPUs.
empathy prediction


Map:   0%|          | 0/779 [00:00<?, ? examples/s]

Starting epoch 1


You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch 0: average loss = 8.495573530197143
Starting epoch 2
Epoch 1: average loss = 3.8171209716796874
Starting epoch 3
Epoch 2: average loss = 2.7714894247055053
Starting epoch 4
Epoch 3: average loss = 1.7521403813362122
Starting epoch 5
Epoch 4: average loss = 1.434909600019455
Starting epoch 6
Epoch 5: average loss = 1.3444206261634826
Starting epoch 7
Epoch 6: average loss = 0.5434165447950363
Starting epoch 8
Epoch 7: average loss = 0.45733905509114264
Starting epoch 9
Epoch 8: average loss = 0.3026233220100403
Starting epoch 10
Epoch 9: average loss = 0.11470949187874795
Starting epoch 11
Epoch 10: average loss = 0.09811431452631951
Starting epoch 12
Epoch 11: average loss = 0.06085887026041746
Starting epoch 13
Epoch 12: average loss = 0.040282936841249464
Starting epoch 14
Epoch 13: average loss = 0.032316692005842924
Starting epoch 15
Epoch 14: average loss = 0.020710504185408353
Starting epoch 16
Epoch 15: average loss = 0.016092198453843593
Starting epoch 17
Epoch 16: averag

Map:   0%|          | 0/208 [00:00<?, ? examples/s]

Map:   0%|          | 0/208 [00:00<?, ? examples/s]

Map:   0%|          | 0/208 [00:00<?, ? examples/s]

Map:   0%|          | 0/208 [00:00<?, ? examples/s]

In [92]:
task = "distress"
notebook_launcher(train_test, (model,), num_processes=torch.cuda.device_count())

Launching training on 4 GPUs.
distress prediction


Map:   0%|          | 0/779 [00:00<?, ? examples/s]

Starting epoch 1


You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch 0: average loss = 6.650102024078369
Starting epoch 2
Epoch 1: average loss = 4.2290793800354
Starting epoch 3
Epoch 2: average loss = 3.013800930976868
Starting epoch 4
Epoch 3: average loss = 2.0563190460205076
Starting epoch 5
Epoch 4: average loss = 1.2695067822933197
Starting epoch 6
Epoch 5: average loss = 0.801202487051487
Starting epoch 7
Epoch 6: average loss = 0.6858681753277779
Starting epoch 8
Epoch 7: average loss = 0.31337237447500227
Starting epoch 9
Epoch 8: average loss = 0.14200541526079177
Starting epoch 10
Epoch 9: average loss = 0.0814309460669756
Starting epoch 11
Epoch 10: average loss = 0.04676732301712036
Starting epoch 12
Epoch 11: average loss = 0.02929296424612403
Starting epoch 13
Epoch 12: average loss = 0.022756516747176648
Starting epoch 14
Epoch 13: average loss = 0.014264544351026415
Starting epoch 15
Epoch 14: average loss = 0.009471170483157038
Starting epoch 16
Epoch 15: average loss = 0.007364917527884245
Starting epoch 17
Epoch 16: average lo

Map:   0%|          | 0/208 [00:00<?, ? examples/s]

Map:   0%|          | 0/208 [00:00<?, ? examples/s]

Map:   0%|          | 0/208 [00:00<?, ? examples/s]

Map:   0%|          | 0/208 [00:00<?, ? examples/s]

In [93]:
# Creating the submission file as per requirement
predictions_empathy = pd.read_csv("predictions_empathy.tsv", sep='\t', header=None)
predictions_distress = pd.read_csv("predictions_distress.tsv", sep='\t', header=None)

predictions_EMP = pd.concat([predictions_empathy, predictions_distress], axis=1)

predictions_EMP.to_csv("predictions_EMP.tsv", sep='\t', header=False, index=False)

In [94]:
from evaluation import pearsonr, calculate_pearson

In [95]:
# Just checking the dev set performance
gold_dev = pd.read_csv('./dataset/dev/goldstandard_dev.tsv', sep='\t', header=None) # no header
pearson_empathy = pearsonr(gold_dev.loc[:,0].tolist(), predictions_empathy.loc[:,0].tolist())
print(f"Empathy: {pearson_empathy}")
pearson_distress = pearsonr(gold_dev.loc[:,1].tolist(), predictions_distress.loc[:,0].tolist())
print(f"Distress: {pearson_distress}")

Empathy: 0.7362
Distress: 0.5931


# Hyperparam tuning

In [136]:
def train_test_kfold(model,trainloader,testloader):
    accelerator = Accelerator()
    
    opt = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

    training_steps = NUM_EPOCH * len(trainloader)
    lr_scheduler = trf.get_scheduler(
        "linear",
        optimizer=opt,
        num_warmup_steps=0,
        num_training_steps=training_steps
    )
  
    trainloader, model, opt = accelerator.prepare(
        trainloader, model, opt    
    )  
    
    for epoch in range(0, NUM_EPOCH):

        # Print epoch
        accelerator.print(f'Starting epoch {epoch+1}')
        
        epoch_loss = 0
        num_batches = 0

        # Iterate over the DataLoader for training data
        for batch in trainloader:
            # Perform forward pass
            outputs = model(**batch)
            
            loss = outputs.loss
#             loss = loss_function(outputs, targets)

            accelerator.backward(loss)
        
            opt.step()
            lr_scheduler.step()
            
            opt.zero_grad()
            
            epoch_loss += loss.item()
            num_batches += 1

        # Process is complete.
        avg_epoch_loss = epoch_loss / num_batches
        accelerator.print(f"Epoch {epoch}: average loss = {avg_epoch_loss}")
    
    
    # Evaluation
        
    model.eval()

    y_true =[]
    y_pred = []

    for batch in testloader:
        with torch.no_grad():
            outputs = model(**batch)

        y_true.extend(batch['labels'].tolist())

        batch_pred = [item for sublist in outputs.logits.tolist() for item in sublist]  #convert 2D list to 1D
        y_pred.extend(batch_pred)
        
    accelerator.print(f"Pearson r: {pearsonr(y_true, y_pred)}")

In [132]:
from sklearn.model_selection import KFold

In [137]:
NUM_EPOCH = 20
BATCH_SIZE = 8
LEARNING_RATE = 5e-5

train_dev_filename = "preprocessed_train_dev.csv"

#Chosen features
feature_1 = 'demographic_essay'
feature_2 = 'article'

# feature_1 = 'essay_demographic_prompt'
# feature_2 = 'article'
    
# checkpoint = "bert-base-uncased"
# checkpoint = "bhadresh-savani/bert-base-uncased-emotion"
checkpoint = "distilbert-base-uncased"
# checkpoint = "cardiffnlp/twitter-roberta-base-sentiment-latest"

tokeniser = trf.AutoTokenizer.from_pretrained(checkpoint)

#padding="longest" can be deferred to do dynamic padding
def tokenise(sentence):
    return tokeniser(sentence[feature_1], sentence[feature_2], truncation=True) 
  # return tokeniser(sentence["essay"], sentence["article"], padding="max_length", max_length=514, truncation=True)   #for Cardiff-emotion one
    
# data collator due to variable max token length per batch size
data_collator = trf.DataCollatorWithPadding(tokenizer = tokeniser)

model = trf.AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=1)

train_dev = load_tokenised_data(filename=train_dev_filename, tokeniser=tokeniser, train_test="train")

K_FOLD = 5

# For fold results
results = {}

# Set fixed random number seed
# torch.manual_seed(42)

kfold = KFold(n_splits=K_FOLD, shuffle=True)

print('--------------------------------')

# K-fold Cross Validation model evaluation
for fold, (train_idx, test_idx) in enumerate(kfold.split(train_dev)):

    print(f'FOLD {fold}')
    print('--------------------------------')

    # Sample elements randomly from a given list of ids, no replacement.
    train_subsampler = torch.utils.data.SubsetRandomSampler(train_idx)
    test_subsampler = torch.utils.data.SubsetRandomSampler(test_idx)

   

    # Define data loaders for training and testing data in this fold
    trainloader = torch.utils.data.DataLoader(
        train_dev,
        batch_size=BATCH_SIZE,
        collate_fn=data_collator,
        sampler=train_subsampler
    )
    testloader = torch.utils.data.DataLoader(
        train_dev,
        batch_size=BATCH_SIZE,
        collate_fn=data_collator,
        sampler=test_subsampler
    )

    # Init the neural network
    model = trf.AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=1)
    # prediction_model = trf.AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=1, ignore_mismatched_sizes=True)

    notebook_launcher(train_test_kfold, (model,trainloader,testloader), num_processes=torch.cuda.device_count())

#     print('Starting testing')

#     # pearson_r = evaluation_loop(prediction_model, testloader)

#     print('Pearson r for fold %d: %f' % (fold, pearson_r))
#     print('--------------------------------')

#     results[fold] = pearson_r
    
#     # Saving the model
# #     save_path = f'./model-fold-{fold}.pth'
# #     torch.save(network.state_dict(), save_path)

# # Print fold results
# print(f'K-FOLD CROSS VALIDATION RESULTS FOR {K_FOLD} FOLDS')
# print('--------------------------------')
# sum = 0.0
# for key, value in results.items():
#     print(f'Fold {key}: {value}')
#     sum += value
# print(f'Average: {sum/len(results.items())}')

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'pre_clas

Map:   0%|          | 0/987 [00:00<?, ? examples/s]

--------------------------------
FOLD 0
--------------------------------


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'pre_clas

Launching training on 4 GPUs.
Starting epoch 1


You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch 0: average loss = 6.7019599914550785
Starting epoch 2
Epoch 1: average loss = 3.9741732406616213
Starting epoch 3
Epoch 2: average loss = 2.1624271965026853
Starting epoch 4
Epoch 3: average loss = 1.50183021068573
Starting epoch 5
Epoch 4: average loss = 1.0829607927799225
Starting epoch 6
Epoch 5: average loss = 0.5943052309751511
Starting epoch 7
Epoch 6: average loss = 0.3498567819595337
Starting epoch 8
Epoch 7: average loss = 0.304621284455061
Starting epoch 9
Epoch 8: average loss = 0.15605652704834938
Starting epoch 10
Epoch 9: average loss = 0.09013329096138477
Starting epoch 11
Epoch 10: average loss = 0.060516525506973264
Starting epoch 12
Epoch 11: average loss = 0.046703271958976986
Starting epoch 13
Epoch 12: average loss = 0.0254407518170774
Starting epoch 14
Epoch 13: average loss = 0.0180107778403908
Starting epoch 15
Epoch 14: average loss = 0.01127586814807728
Starting epoch 16
Epoch 15: average loss = 0.00978437946178019
Starting epoch 17
Epoch 16: average los

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'pre_clas

Launching training on 4 GPUs.
Starting epoch 1


You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch 0: average loss = 7.274856114387513
Starting epoch 2
Epoch 1: average loss = 4.014234528541565
Starting epoch 3
Epoch 2: average loss = 2.8300724029541016
Starting epoch 4
Epoch 3: average loss = 1.7337887907028198
Starting epoch 5
Epoch 4: average loss = 1.1620377737283707
Starting epoch 6
Epoch 5: average loss = 0.8615784388780594
Starting epoch 7
Epoch 6: average loss = 0.5769422573596239
Starting epoch 8
Epoch 7: average loss = 0.2225670225173235
Starting epoch 9
Epoch 8: average loss = 0.16906991794705392
Starting epoch 10
Epoch 9: average loss = 0.0945047552883625
Starting epoch 11
Epoch 10: average loss = 0.057689938545227054
Starting epoch 12
Epoch 11: average loss = 0.03892501873895526
Starting epoch 13
Epoch 12: average loss = 0.019084552321583032
Starting epoch 14
Epoch 13: average loss = 0.014424612652510405
Starting epoch 15
Epoch 14: average loss = 0.011792213553562761
Starting epoch 16
Epoch 15: average loss = 0.00941085782367736
Starting epoch 17
Epoch 16: average

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'pre_clas

Launching training on 4 GPUs.
Starting epoch 1


You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch 0: average loss = 6.205025091171264
Starting epoch 2
Epoch 1: average loss = 3.605410346984863
Starting epoch 3
Epoch 2: average loss = 2.1473122024536133
Starting epoch 4
Epoch 3: average loss = 1.017997864484787
Starting epoch 5
Epoch 4: average loss = 0.6603041756153106
Starting epoch 6
Epoch 5: average loss = 0.25558152690529823
Starting epoch 7
Epoch 6: average loss = 0.1737029016017914
Starting epoch 8
Epoch 7: average loss = 0.07249147601425648
Starting epoch 9
Epoch 8: average loss = 0.06414311714470386
Starting epoch 10
Epoch 9: average loss = 0.03607641974464059
Starting epoch 11
Epoch 10: average loss = 0.02985933463089168
Starting epoch 12
Epoch 11: average loss = 0.02334444869309664
Starting epoch 13
Epoch 12: average loss = 0.014602485853247344
Starting epoch 14
Epoch 13: average loss = 0.010757031235843897
Starting epoch 15
Epoch 14: average loss = 0.008689859677106141
Starting epoch 16
Epoch 15: average loss = 0.0071972179226577285
Starting epoch 17
Epoch 16: aver

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'pre_clas

Launching training on 4 GPUs.
Starting epoch 1


You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch 0: average loss = 7.041970872879029
Starting epoch 2
Epoch 1: average loss = 3.7633567237854004
Starting epoch 3
Epoch 2: average loss = 2.093328607082367
Starting epoch 4
Epoch 3: average loss = 1.2539209443330765
Starting epoch 5
Epoch 4: average loss = 1.1508275112509727
Starting epoch 6
Epoch 5: average loss = 0.5586384752392769
Starting epoch 7
Epoch 6: average loss = 0.48270922869443894
Starting epoch 8
Epoch 7: average loss = 0.2722008703649044
Starting epoch 9
Epoch 8: average loss = 0.20548804670572282

Starting epoch 10Epoch 9: average loss = 0.1928584533929825
Starting epoch 11
Epoch 10: average loss = 0.08330711431801319
Starting epoch 12
Epoch 11: average loss = 0.06038831308484077
Starting epoch 13
Epoch 12: average loss = 0.04660717751830816
Starting epoch 14
Epoch 13: average loss = 0.03247637189924717
Starting epoch 15
Epoch 14: average loss = 0.03611655261367559
Starting epoch 16
Epoch 15: average loss = 0.016296269670128823
Starting epoch 17
Epoch 16: average l

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'pre_clas

Launching training on 4 GPUs.
Starting epoch 1


You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch 0: average loss = 6.9509720039367675
Starting epoch 2
Epoch 1: average loss = 3.812184066772461
Starting epoch 3
Epoch 2: average loss = 2.462792706489563
Starting epoch 4
Epoch 3: average loss = 1.4527007365226745
Starting epoch 5
Epoch 4: average loss = 1.1702595520019532
Starting epoch 6
Epoch 5: average loss = 0.6875379419326783
Starting epoch 7
Epoch 6: average loss = 0.3116438892483711
Starting epoch 8
Epoch 7: average loss = 0.175219154804945
Starting epoch 9
Epoch 8: average loss = 0.14051428005099298
Starting epoch 10
Epoch 9: average loss = 0.11624352924525738
Starting epoch 11
Epoch 10: average loss = 0.05993941556662321
Starting epoch 12
Epoch 11: average loss = 0.06296095579862594
Starting epoch 13
Epoch 12: average loss = 0.050549547392874955
Starting epoch 14
Epoch 13: average loss = 0.024937705677002667
Starting epoch 15
Epoch 14: average loss = 0.02094799729064107
Starting epoch 16
Epoch 15: average loss = 0.014208995215594768
Starting epoch 17
Epoch 16: average 

# Extras

In [None]:
# checking length after tokenisation

# length = []
# for i in range(tokenised_hugging_dataset['train'].num_rows):
#   length.append(len(tokenised_hugging_dataset['train']['input_ids'][i]))

# print(f"Lengths: {length}")

In [None]:
# prediction_model.save_pretrained("model")

In [6]:
# from google.colab import drive
# mount_path = '/content/drive'
# drive.mount(mount_path)
# %cd $mount_path"/MyDrive/WASSA2023"

# !pip install transformers datasets sentencepiece

## Training by Huggingface API

In [None]:
from transformers import Trainer, TrainingArguments
from sklearn.metrics import mean_squared_error

In [None]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    rmse = mean_squared_error(labels, predictions, squared=False)
    return {"rmse": rmse}

In [None]:
training_args = TrainingArguments(output_dir="empathy-transformer",
                                  logging_strategy="epoch",
                                  evaluation_strategy="epoch",
                                  per_device_train_batch_size=16,
                                  per_device_eval_batch_size=16,
                                  num_train_epochs=3,
                                  # learning_rate=2e-5,
                                  save_total_limit=2,
                                  save_strategy='no',
                                  load_best_model_at_end=False)

trainer = Trainer(
    model=empathy_prediction,
    args=training_args,
    train_dataset=tokenised_hugging_dataset["train"],
    eval_dataset=tokenised_hugging_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokeniser,
    compute_metrics=compute_metrics
)

In [None]:
trainer.train()



Epoch,Training Loss,Validation Loss,Rmse
1,3.4157,3.094043,1.758989
2,2.1608,2.74757,1.65758
3,1.4134,2.890623,1.700183


TrainOutput(global_step=117, training_loss=2.329951457488231, metrics={'train_runtime': 30.014, 'train_samples_per_second': 62.271, 'train_steps_per_second': 3.898, 'total_flos': 76837223949486.0, 'train_loss': 2.329951457488231, 'epoch': 3.0})

In [None]:
raw_pred, _, _ = trainer.predict(tokenised_hugging_dataset["test"])

In [50]:
def train(model):

    device = "cuda:0"
    if torch.cuda.device_count() > 1:
            model = torch.nn.DataParallel(model)
    model.to(device)
    
    # device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    # model.to(device)
    
    # criterion = torch.nn.MSELoss()
    opt = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
    
    trainset = load_tokenised_data(raw_data, tokeniser) #train
       
    train_dataloader = torch.utils.data.DataLoader(
        trainset, shuffle=True, batch_size=BATCH_SIZE, collate_fn=data_collator
    )
    
    training_steps = NUM_EPOCH * len(train_dataloader)
    # lr_scheduler = torch.optim.lr_scheduler.StepLR(opt, step_size=1, gamma=0.1)
    lr_scheduler = trf.get_scheduler(
        "linear",
        optimizer=opt,
        num_warmup_steps=0,
        num_training_steps=training_steps
    )
    
    model.train()
    for epoch in range(NUM_EPOCH):
        epoch_loss = 0
        num_batches = 0
        for batch in train_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss 
            # loss = criterion(outputs.logits, batch['labels'])

            loss.backward()    
            opt.step()
            lr_scheduler.step()
            opt.zero_grad()

            epoch_loss += loss.item()
            num_batches += 1

        avg_epoch_loss = epoch_loss / num_batches
        print(f"Epoch {epoch}: average loss = {avg_epoch_loss}")

## Ray tune

In [30]:
def train(config):
    model = trf.AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=1)

    device ="cpu"
    if torch.cuda.is_available():
        device = "cuda:0"
        if torch.cuda.device_count() > 1:
            model = torch.nn.DataParallel(model)
    model.to(device)
    
    criterion = torch.nn.MSELoss()
    opt = torch.optim.AdamW(prediction_model.parameters(), lr=config["learning_rate"], momentum=0.9)
    
    train_dev_set = load_tokenised_data(raw_data, tokeniser) #train
    
    train_portion = int(len(train_dev_set) * 0.8)
    validation_portion = len(train_dev_set) - train_portion
    train_subset, val_subset = torch.utils.data.random_split(train_dev_set, [train_portion, validation_portion])
    
    train_dataloader = torch.utils.data.DataLoader(
        train_subset, shuffle=True, batch_size=int(config["batch_size"]), collate_fn=data_collator, num_workers=8
    )
    
    validation_dataloader = torch.utils.data.DataLoader(
        validation_subset, shuffle=True, batch_size=int(config["batch_size"]), collate_fn=data_collator, num_workers=9
    )
    
    training_steps = NUM_EPOCH * len(train_dataloader)
    # lr_scheduler = torch.optim.lr_scheduler.StepLR(opt, step_size=1, gamma=0.1)
    lr_scheduler = trf.get_scheduler(
        "linear",
        optimizer=opt,
        num_warmup_steps=0,
        num_training_steps=training_steps
    )
    
    model.train()
    for epoch in range(NUM_EPOCH):
        epoch_loss = 0
        num_batches = 0
        for batch in train_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss 
            # loss = criterion(outputs.logits, batch['labels'])

            loss.backward()    
            opt.step()
            lr_scheduler.step()
            opt.zero_grad()

            epoch_loss += loss.item()
            num_batches += 1

        avg_epoch_loss = epoch_loss / num_batches
        tune.report(loss=avg_epoch_loss)

        # Evaluation    
        # model.eval()

        y_true =[]
        y_pred = []
        val_loss = 0.0
        val_step = 0

        for batch in validation_dataloader:
            with torch.no_grad():
                batch = {k: v.to(device) for k, v in batch.items()}
                outputs = model(**batch)

                y_true.extend(batch['labels'].tolist())
                batch_pred = [item for sublist in outputs.logits.tolist() for item in sublist]  #convert 2D list to 1D
                y_pred.extend(batch_pred)
                pearson_r = pearsonr(y_true, y_pred)
                
                loss = outputs.loss
                val_loss += loss.cpu().numpy()
                val_steps += 1
                
        tune.report(loss=(val_loss / val_steps), accuracy=pearson_r)
    
    print("Finished Training")

In [26]:
from functools import partial

In [None]:
config = {
    "learning_rate": tune.loguniform(6e-5, 2e-5),
    "batch_size": tune.choice([4, 8, 16])
}

scheduler = tune.schedulers.ASHAScheduler(
    metric="loss",
    mode="min",
    max_t = 10,
    grace_period=1,
    reduction_factor=2
)

reporter = tune.CLIReporter(metric_columns=["loss", "accuracy", "training_iteration"])

analysis = tune.run(
    partial(train),
    resources_per_trial={"cpu": 1, "gpu": 2},
    config=config,
    num_samples=10,
    scheduler=scheduler,
    progress_reporter=reporter
)

best_trial = analysis.get_best_trial("loss", "min", "last")
print("Best trial config: {}".format(best_trial.config))
print("Best trial final validation loss: {}".format(best_trial.last_result["loss"]))
print("Best trial final validation accuracy: {}".format(best_trial.last_result["accuracy"]))