In [1]:
import pandas as pd
import numpy as np
import transformers as trf
from datasets import Dataset
import torch
from tqdm.auto import tqdm
from evaluation import pearsonr, calculate_pearson
from sklearn.model_selection import KFold

In [2]:
raw_data = pd.read_csv("./essay_article_text_train_dev.csv", index_col=0)
raw_data.head(2)

Unnamed: 0,conversation_id,article,essay,speaker_id,gender,education,race,age,income,speaker_number,essay_id,empathy,distress
0,2,"A month after Hurricane Matthew, 800,000 Haiti...",It breaks my heart to see people living in tho...,30,1.0,6.0,3.0,37.0,40000.0,1,1,6.833333,6.625
1,3,"A month after Hurricane Matthew, 800,000 Haiti...",I wonder why there aren't more people trying t...,19,1.0,6.0,2.0,32.0,35000.0,1,2,5.833333,6.0


# Empathy

## K-fold cross-validation

In [51]:
chosen_data = raw_data[['article', 'essay', 'empathy']]

hugging_dataset = Dataset.from_pandas(chosen_data, preserve_index=False)

# checkpoint = "bert-base-uncased"
# checkpoint = "bhadresh-savani/bert-base-uncased-emotion"
checkpoint = "distilbert-base-uncased"
# checkpoint = "cardiffnlp/twitter-roberta-base-sentiment-latest"

tokeniser = trf.AutoTokenizer.from_pretrained(checkpoint)

#padding="longest" can be deferred to do dynamic padding
def tokenise(sentence):
    return tokeniser(sentence["essay"], sentence["article"], truncation=True) 
  # return tokeniser(sentence["essay"], sentence["article"], padding="max_length", max_length=514, truncation=True)   #for Cardiff-emotion one
  # return tokeniser(sentence["essay"], truncation=True) 
  # return tokeniser(sentence["article"], sentence["essay"], truncation=True) 
    
tokenised_hugging_dataset = hugging_dataset.map(tokenise, batched=True)
# tokenised_hugging_dataset

tokenised_hugging_dataset = tokenised_hugging_dataset.remove_columns(["article","essay"]) # no longer required as encoding done
tokenised_hugging_dataset = tokenised_hugging_dataset.rename_column("empathy", "labels") # as huggingface requires
tokenised_hugging_dataset = tokenised_hugging_dataset.with_format("torch")

tokenised_hugging_dataset

Map:   0%|          | 0/987 [00:00<?, ? examples/s]

Dataset({
    features: ['labels', 'input_ids', 'attention_mask'],
    num_rows: 987
})

In [52]:
def training_loop(prediction_model, opt, trainloader):
    # Run the training loop for defined number of epochs
    for epoch in range(0, NUM_EPOCH):

        # Print epoch
        print(f'Starting epoch {epoch+1}')
        
        epoch_loss = 0
        num_batches = 0

        # Iterate over the DataLoader for training data
        for batch in trainloader:
            # Get inputs
            batch = {k: v.to(device) for k, v in batch.items()}

            # Perform forward pass
            outputs = prediction_model(**batch)
            
            loss = outputs.loss
#             loss = loss_function(outputs, targets)

            loss.backward()
            opt.step()
            lr_scheduler.step()
            
            opt.zero_grad()
            
            epoch_loss += loss.item()
            num_batches += 1

        # Process is complete.
        avg_epoch_loss = epoch_loss / num_batches
        print(f"Epoch {epoch}: average loss = {avg_epoch_loss}")

In [55]:
def evaluation_loop(model, test_data):
    model.eval()

    y_true =[]
    y_pred = []

    for batch in test_data:
        
        batch = {k: v.to(device) for k, v in batch.items()}
        features = batch['input_ids'].to(device)

        with torch.no_grad():
            outputs = model(**batch)
#             outputs = model(features)

        y_true.extend(batch['labels'].tolist())

        batch_pred = [item for sublist in outputs.logits.tolist() for item in sublist]  #convert 2D list to 1D
        y_pred.extend(batch_pred)
    
    return pearsonr(y_true, y_pred)

In [57]:
# Configuration options
K_FOLD = 5
NUM_EPOCH = 3
BATCH_SIZE = 8
LEARNING_RATE = 5e-5

# loss_function = torch.nn.MSELoss()

# For fold results
results = {}

# Set fixed random number seed
# torch.manual_seed(42)

kfold = KFold(n_splits=K_FOLD, shuffle=True)

print('--------------------------------')

# K-fold Cross Validation model evaluation
for fold, (train_idx, test_idx) in enumerate(kfold.split(tokenised_hugging_dataset)):

    print(f'FOLD {fold}')
    print('--------------------------------')

    # Sample elements randomly from a given list of ids, no replacement.
    train_subsampler = torch.utils.data.SubsetRandomSampler(train_idx)
    test_subsampler = torch.utils.data.SubsetRandomSampler(test_idx)

    # data collator due to variable max token length per batch size
    data_collator = trf.DataCollatorWithPadding(tokenizer = tokeniser)

    # Define data loaders for training and testing data in this fold
    trainloader = torch.utils.data.DataLoader(
        tokenised_hugging_dataset,
        batch_size=BATCH_SIZE,
        collate_fn=data_collator,
        sampler=train_subsampler
    )
    testloader = torch.utils.data.DataLoader(
        tokenised_hugging_dataset,
        batch_size=BATCH_SIZE,
        collate_fn=data_collator,
        sampler=test_subsampler
    )

    # Init the neural network
    prediction_model = trf.AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=1)
    # prediction_model = trf.AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=1, ignore_mismatched_sizes=True)

    # network.apply(reset_weights)

    opt = torch.optim.AdamW(prediction_model.parameters(), lr=LEARNING_RATE)

    training_steps = NUM_EPOCH * len(trainloader)
    lr_scheduler = trf.get_scheduler(
        "linear",
        optimizer=opt,
        num_warmup_steps=0,
        num_training_steps=training_steps
    )

    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    prediction_model.to(device)

    print(f"Device: {device}")
    
    training_loop(prediction_model, opt, trainloader)

    print('Starting testing')

    pearson_r = evaluation_loop(prediction_model, testloader)

    print('Pearson r for fold %d: %f' % (fold, pearson_r))
    print('--------------------------------')

    results[fold] = pearson_r
    
    # Saving the model
#     save_path = f'./model-fold-{fold}.pth'
#     torch.save(network.state_dict(), save_path)

# Print fold results
print(f'K-FOLD CROSS VALIDATION RESULTS FOR {K_FOLD} FOLDS')
print('--------------------------------')
sum = 0.0
for key, value in results.items():
    print(f'Fold {key}: {value}')
    sum += value
print(f'Average: {sum/len(results.items())}')

--------------------------------
FOLD 0
--------------------------------


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'pre_classi

Using cuda
Starting epoch 1
Epoch 0: average loss = 4.817841511784178
Starting epoch 2
Epoch 1: average loss = 3.288772120620265
Starting epoch 3
Epoch 2: average loss = 1.9197451415086033
Starting testing
Pearson r for fold 0: 0.621500
--------------------------------
FOLD 1
--------------------------------


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'pre_classi

Using cuda
Starting epoch 1
Epoch 0: average loss = 4.863694449867865
Starting epoch 2
Epoch 1: average loss = 3.1594221092233754
Starting epoch 3
Epoch 2: average loss = 1.605109966463513
Starting testing
Pearson r for fold 1: 0.533700
--------------------------------
FOLD 2
--------------------------------


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'pre_classi

Using cuda
Starting epoch 1
Epoch 0: average loss = 4.727649494253024
Starting epoch 2
Epoch 1: average loss = 3.697584842190598
Starting epoch 3
Epoch 2: average loss = 3.252989512501341
Starting testing
Pearson r for fold 2: 0.518200
--------------------------------
FOLD 3
--------------------------------


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'pre_classi

Using cuda
Starting epoch 1
Epoch 0: average loss = 4.72607496892563
Starting epoch 2
Epoch 1: average loss = 3.810416649086307
Starting epoch 3
Epoch 2: average loss = 3.0758098243462917
Starting testing
Pearson r for fold 3: 0.388200
--------------------------------
FOLD 4
--------------------------------


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.bias', 'pre_classi

Using cuda
Starting epoch 1
Epoch 0: average loss = 4.802791798957671
Starting epoch 2
Epoch 1: average loss = 3.6104334905894118
Starting epoch 3
Epoch 2: average loss = 2.605684275578971
Starting testing
Pearson r for fold 4: 0.560000
--------------------------------
K-FOLD CROSS VALIDATION RESULTS FOR 5 FOLDS
--------------------------------
Fold 0: 0.6215
Fold 1: 0.5337
Fold 2: 0.5182
Fold 3: 0.3882
Fold 4: 0.56
Average: 0.52432


## Train-test split

In [11]:
BATCH_SIZE = 8
LEARNING_RATE = 5e-5
NUM_EPOCH = 3

chosen_data = raw_data[['article', 'essay', 'empathy']]

hugging_dataset = Dataset.from_pandas(chosen_data, preserve_index=False)
hugging_dataset = hugging_dataset.train_test_split(test_size = 0.2)

# checkpoint = "bert-base-uncased"
# checkpoint = "bhadresh-savani/bert-base-uncased-emotion"
checkpoint = "distilbert-base-uncased"
# checkpoint = "cardiffnlp/twitter-roberta-base-sentiment-latest"

tokeniser = trf.AutoTokenizer.from_pretrained(checkpoint)

#padding="longest" can be deferred to do dynamic padding
def tokenise(sentence):
  return tokeniser(sentence["essay"], sentence["article"], truncation=True) 
  # return tokeniser(sentence["essay"], sentence["article"], padding="max_length", max_length=514, truncation=True)   #for Cardiff-emotion one
  # return tokeniser(sentence["essay"], truncation=True) 
  # return tokeniser(sentence["article"], sentence["essay"], truncation=True) 
    
tokenised_hugging_dataset = hugging_dataset.map(tokenise, batched=True)
# tokenised_hugging_dataset

tokenised_hugging_dataset = tokenised_hugging_dataset.remove_columns(["article","essay"]) # no longer required as encoding done
tokenised_hugging_dataset = tokenised_hugging_dataset.rename_column("empathy", "labels") # as huggingface requires
tokenised_hugging_dataset = tokenised_hugging_dataset.with_format("torch")

# tokenised_hugging_dataset

data_collator = trf.DataCollatorWithPadding(tokenizer = tokeniser)

train_dataloader = torch.utils.data.DataLoader(
    tokenised_hugging_dataset["train"], shuffle=True, batch_size=BATCH_SIZE, collate_fn=data_collator
)

test_dataloader = torch.utils.data.DataLoader(
    tokenised_hugging_dataset["test"], batch_size=BATCH_SIZE, collate_fn=data_collator
)

Map:   0%|          | 0/789 [00:00<?, ? examples/s]

Map:   0%|          | 0/198 [00:00<?, ? examples/s]

In [None]:
# hugging_dataset

In [None]:
# hugging_dataset['train']['essay'][:5]

In [None]:
# checking length after tokenisation

# length = []
# for i in range(tokenised_hugging_dataset['train'].num_rows):
#   length.append(len(tokenised_hugging_dataset['train']['input_ids'][i]))

# print(f"Lengths: {length}")

### Prediction model

In [5]:
prediction_model = trf.AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=1)
# prediction_model = trf.AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=1, ignore_mismatched_sizes=True)

opt = torch.optim.AdamW(prediction_model.parameters(), lr=LEARNING_RATE)

training_steps = NUM_EPOCH * len(train_dataloader)
lr_scheduler = trf.get_scheduler(
    "linear",
    optimizer=opt,
    num_warmup_steps=0,
    num_training_steps=training_steps
)

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
prediction_model.to(device)

print(device)


# criterion = torch.nn.MSELoss()

progress_bar = tqdm(range(training_steps))

prediction_model.train()
for epoch in range(NUM_EPOCH):
  epoch_loss = 0
  num_batches = 0
  for batch in train_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    outputs = prediction_model(**batch)
    loss = outputs.loss
    # loss = criterion(outputs.logits, batch['labels'])
    loss.backward()

    opt.step()
    lr_scheduler.step()
    opt.zero_grad()
    progress_bar.update(1)

    epoch_loss += loss.item()
    num_batches += 1

  avg_epoch_loss = epoch_loss / num_batches
  print(f"Epoch {epoch}: average loss = {avg_epoch_loss}")

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'classifier.w

cuda


  0%|          | 0/297 [00:00<?, ?it/s]

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch 0: average loss = 4.808817552797722
Epoch 1: average loss = 3.0758890127292786
Epoch 2: average loss = 2.118770224879486


## Hyperparameter tuning

In [21]:
from ray import tune

In [22]:
def train(config):
  prediction_model = trf.AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=1)

  learning_rate = config["learning_rate"]
  num_epochs = config["num_epochs"]
  batch_size = config["batch_size"]
  momentum = config["momentum"]

  opt = torch.optim.AdamW(prediction_model.parameters(), lr=learning_rate, momentum=momentum)
  # lr_scheduler = torch.optim.lr_scheduler.StepLR(opt, step_size=1, gamma=0.1)
  lr_scheduler = trf.get_scheduler(
    "linear",
    optimizer=opt,
    num_warmup_steps=0,
    num_training_steps=training_steps
  )
  # criterion = torch.nn.MSELoss()

  train_dataloader = torch.utils.data.DataLoader(
    tokenised_hugging_dataset["train"], shuffle=True, batch_size=batch_size, collate_fn=data_collator
    )
  
  prediction_model.train()
  for epoch in range(num_epochs):
    epoch_loss = 0
    num_batches = 0
    for batch in train_dataloader:
      batch = {k: v.to(device) for k, v in batch.items()}
      outputs = empathy_prediction(**batch)
      loss = outputs.loss 
      # loss = criterion(outputs.logits, batch['labels'])

      loss.backward()    
      opt.step()
      lr_scheduler.step()
      opt.zero_grad()
      progress_bar.update(1)

      epoch_loss += loss.item()
      num_batches += 1

    avg_epoch_loss = epoch_loss / num_batches
    tune.report(loss=avg_epoch_loss)

config = {
    "learning_rate": tune.loguniform(6e-5, 2e-5),
    "num_epochs": tune.choice([3, 5, 7]),
    "batch_size": tune.choice([4, 8, 16]),
    "momentum": tune.uniform(0.1, 0.9)
}

scheduler = tune.schedulers.ASHAScheduler(
    max_t = training_steps,
    grace_period=1,
    reduction_factor=2
)

analysis = tune.run(
    train,
    resources_per_trial={"cpu": 1, "gpu": 0.5},
    config=config,
    num_samples=10,
    scheduler=scheduler,
    progress_reporter=tune.CLIReporter(),
    local_dir="./ray_results"
)

best_trial = analysis.get_best_trial("loss", "min", "last")
print(f"Best trial config: {best_trial.config}")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


2023-04-17 22:00:56,056	ERROR services.py:1169 -- Failed to start the dashboard , return code 1
2023-04-17 22:00:56,058	ERROR services.py:1194 -- Error should be written to 'dashboard.log' or 'dashboard.err'. We are printing the last 20 lines for you. See 'https://docs.ray.io/en/master/ray-observability/ray-logging.html#logging-directory-structure' to find where the log file is.
2023-04-17 22:00:56,060	ERROR services.py:1238 -- 
The last 20 lines of /jobfs/79653017.gadi-pbs/ray/session_2023-04-17_22-00-53_370436_2302884/logs/dashboard.log (it contains the error message from the dashboard): 
  File "/scratch/jr19/rh2942/miniconda3/lib/python3.10/site-packages/ray/dashboard/head.py", line 204, in _load_modules
    head_cls_list = dashboard_utils.get_all_modules(DashboardHeadModule)
  File "/scratch/jr19/rh2942/miniconda3/lib/python3.10/site-packages/ray/dashboard/utils.py", line 121, in get_all_modules
    importlib.import_module(name)
  File "/scratch/jr19/rh2942/miniconda3/lib/python3.

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


[2m[33m(raylet)[0m [2023-04-17 22:00:59,651 E 2307631 2307680] (raylet) agent_manager.cc:135: The raylet exited immediately because the Ray agent failed. The raylet fate shares with the agent. This can happen because the Ray agent was unexpectedly killed or failed. Agent can fail when
[2m[33m(raylet)[0m - The version of `grpcio` doesn't follow Ray's requirement. Agent can segfault with the incorrect `grpcio` version. Check the grpcio version `pip freeze | grep grpcio`.
[2m[33m(raylet)[0m - The agent failed to start because of unexpected error or port conflict. Read the log `cat /tmp/ray/session_latest/dashboard_agent.log`. You can find the log file structure here https://docs.ray.io/en/master/ray-observability/ray-logging.html#logging-directory-structure.
[2m[33m(raylet)[0m - The agent is killed by the OS (e.g., out of memory).


TypeError: ray.cloudpickle.dumps(<class 'ray.tune.trainable.function_trainable.wrap_function.<locals>.ImplicitFunc'>) failed.
To check which non-serializable variables are captured in scope, re-run the ray script with 'RAY_PICKLE_VERBOSE_DEBUG=1'. Other options: 
-Try reproducing the issue by calling `pickle.dumps(trainable)`. 
-If the error is typing-related, try removing the type annotations and try again.

## Evaluation

In [12]:
prediction_model.eval()

predictions = []

for batch in test_dataloader:
  batch = {k: v.to(device) for k, v in batch.items()}
  with torch.no_grad():
    outputs = prediction_model(**batch)
    
  batch_pred = [item for sublist in outputs.logits.tolist() for item in sublist]  #convert 2D list to 1D
  predictions.append(batch_pred)

y_pred = [item for sublist in predictions for item in sublist]  #convert batch-wise 2D list to 1D

y_true = hugging_dataset["test"]["empathy"]

pearsonr(y_true,y_pred)

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


0.6985

In [None]:
# prediction_model.save_pretrained("model")

In [None]:
# y_pred

In [None]:
# y_true

In [6]:
# from google.colab import drive
# mount_path = '/content/drive'
# drive.mount(mount_path)
# %cd $mount_path"/MyDrive/WASSA2023"

# !pip install transformers datasets sentencepiece

## Training by Huggingface API

In [None]:
from transformers import Trainer, TrainingArguments
from sklearn.metrics import mean_squared_error

In [None]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    rmse = mean_squared_error(labels, predictions, squared=False)
    return {"rmse": rmse}

In [None]:
training_args = TrainingArguments(output_dir="empathy-transformer",
                                  logging_strategy="epoch",
                                  evaluation_strategy="epoch",
                                  per_device_train_batch_size=16,
                                  per_device_eval_batch_size=16,
                                  num_train_epochs=3,
                                  # learning_rate=2e-5,
                                  save_total_limit=2,
                                  save_strategy='no',
                                  load_best_model_at_end=False)

trainer = Trainer(
    model=empathy_prediction,
    args=training_args,
    train_dataset=tokenised_hugging_dataset["train"],
    eval_dataset=tokenised_hugging_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokeniser,
    compute_metrics=compute_metrics
)

In [None]:
trainer.train()



Epoch,Training Loss,Validation Loss,Rmse
1,3.4157,3.094043,1.758989
2,2.1608,2.74757,1.65758
3,1.4134,2.890623,1.700183


TrainOutput(global_step=117, training_loss=2.329951457488231, metrics={'train_runtime': 30.014, 'train_samples_per_second': 62.271, 'train_steps_per_second': 3.898, 'total_flos': 76837223949486.0, 'train_loss': 2.329951457488231, 'epoch': 3.0})

In [None]:
raw_pred, _, _ = trainer.predict(tokenised_hugging_dataset["test"])