In [1]:
import pandas as pd
import numpy as np
import transformers as trf
from datasets import Dataset
import torch
from accelerate import Accelerator
from accelerate import notebook_launcher

In [2]:
import os
os.chdir("/home/573/rh2942/WASSA-2023-EMP") # changing dir for evaluation file
# print(os.getcwd())

In [3]:
os.environ["TOKENIZERS_PARALLELISM"] = "false" # due to huggingface warning

In [4]:
def load_tokenised_data(filename, tokenise_fn, train_test):
   
    input_data = pd.read_csv(filename, header=0, index_col=0)
    
    if train_test == "train":
        chosen_data = input_data[[feature_1, feature_2, task]]
    elif train_test == "test":
        chosen_data = input_data[[feature_1, feature_2]]  #test data shouldn't have output label

    hugging_dataset = Dataset.from_pandas(chosen_data, preserve_index=False)

    tokenised_hugging_dataset = hugging_dataset.map(tokenise_fn, batched=True, remove_columns = [feature_1, feature_2])
    
    if train_test == "train":
        tokenised_hugging_dataset = tokenised_hugging_dataset.rename_column(task, "labels") # as huggingface requires
    
    tokenised_hugging_dataset = tokenised_hugging_dataset.with_format("torch")

    return tokenised_hugging_dataset

# Prediction

In [14]:
def train_test(model):
    accelerator = Accelerator()
    
    accelerator.print(f"{task} prediction")
    
    opt = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
  
    trainset = load_tokenised_data(filename=os.path.join("./processed_data", train_filename), tokenise_fn=tokenise, train_test="train")
       
    trainloader = torch.utils.data.DataLoader(
        trainset, shuffle=True, batch_size=BATCH_SIZE, collate_fn=data_collator
    )
    
    training_steps = NUM_EPOCH * len(trainloader)
    lr_scheduler = trf.get_scheduler(
        "linear",
        optimizer=opt,
        num_warmup_steps=0,
        num_training_steps=training_steps
    )

    trainloader, model, opt = accelerator.prepare(
        trainloader, model, opt    
    )
    
    for epoch in range(0, NUM_EPOCH):

        # Print epoch
        accelerator.print(f'Starting epoch {epoch+1}')
        
        epoch_loss = 0
        num_batches = 0

        # Iterate over the DataLoader for training data
        for batch in trainloader:
            # Perform forward pass
            outputs = model(**batch)
            
            loss = outputs.loss

            accelerator.backward(loss)
        
            opt.step()
            lr_scheduler.step()
            
            opt.zero_grad()
            
            epoch_loss += loss.item()
            num_batches += 1

        # Process is complete.
        avg_epoch_loss = epoch_loss / num_batches
        accelerator.print(f"Epoch {epoch}: average loss = {avg_epoch_loss}")
        
    
    # evaluation on test set
    testset = load_tokenised_data(filename=os.path.join("./processed_data", test_filename), ttokenise_fn=tokenise, train_test="test")
    testloader = torch.utils.data.DataLoader(
        testset, shuffle=False, batch_size=BATCH_SIZE, collate_fn=data_collator
    )
   
    model.eval()

    y_pred = []

    for batch in testloader:
        with torch.no_grad():
            outputs = model(**batch)

        batch_pred = [item for sublist in outputs.logits.tolist() for item in sublist]  #convert 2D list to 1D
        y_pred.extend(batch_pred)
  
    y_pred_df = pd.DataFrame({task: y_pred})
    filename = "./prediction/predictions_" + task + ".tsv"
    y_pred_df.to_csv(filename, sep='\t', header=False, index=False)

In [15]:
NUM_EPOCH = 35

train_filename = "preprocessed_train.csv"
test_filename = "preprocessed_dev.csv"
# test_filename = "preprocessed_test.csv"

#Chosen features
feature_1 = 'demographic_essay'
feature_2 = 'article'
    
checkpoint = "bert-base-uncased"
# checkpoint = "distilbert-base-uncased"

tokeniser = trf.AutoTokenizer.from_pretrained(checkpoint)

#padding="longest" can be deferred to do dynamic padding
def tokenise(sentence):
    return tokeniser(sentence[feature_1], sentence[feature_2], truncation=True)
    
# data collator due to variable max token length per batch size
data_collator = trf.DataCollatorWithPadding(tokenizer = tokeniser)

In [None]:
def final_prediction(task, lr, batch, seed):
    LEARNING_RATE = lr
    BATCH_SIZE = batch
    SEED = seed

    torch.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)
    np.random.seed(SEED)
    
    model = trf.AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=1)

    notebook_launcher(train_test, (model,), num_processes=torch.cuda.device_count())
    predictions_empathy = pd.read_csv("./prediction/predictions_" + task + ".tsv", sep='\t', header=None)

In [None]:
final_prediction(task='personality_conscientiousness', lr=5.98e-05, batch=7, seed=30)

In [None]:
final_prediction(task='personality_openess', lr=1.80e-05, batch=2, seed=81)

In [None]:
final_prediction(task='personality_extraversion', lr=1.61e-05, batch=7, seed=34)

In [None]:
final_prediction(task='personality_agreeableness', lr=, batch=, seed=)

In [None]:
final_prediction(task='personality_stability', lr=, batch=, seed=)

In [24]:
predictions_PER = pd.concat([predictions_cons, predictions_open, predictions_extr, predictions_agre, predictions_stab], axis=1)

predictions_PER.to_csv("./prediction/predictions_PER.tsv", sep='\t', header=False, index=False)

In [3]:
from evaluation import pearsonr, calculate_pearson

In [7]:
# # Just checking the dev set performance
gold_dev = pd.read_csv('./dataset/dev/goldstandard_dev.tsv', sep='\t', header=None) # no header
predictions_PER= pd.read_csv("./prediction/predictions_PER.tsv", sep='\t', header=None)

print(f"cons: {pearsonr(gold_dev.loc[:,3].tolist(), predictions_PER.loc[:,0].tolist())}")
print(f"open: {pearsonr(gold_dev.loc[:,4].tolist(), predictions_PER.loc[:,1].tolist())}")
print(f"extr: {pearsonr(gold_dev.loc[:,5].tolist(), predictions_PER.loc[:,2].tolist())}")
print(f"agre: {pearsonr(gold_dev.loc[:,6].tolist(), predictions_PER.loc[:,3].tolist())}")
print(f"stab: {pearsonr(gold_dev.loc[:,7].tolist(), predictions_PER.loc[:,4].tolist())}")

cons: 0.6117
open: 0.6471
extr: 0.5229
agre: 0.1422
stab: 0.2999


# Hyperparam tuning

In [10]:
import optuna
import plotly
from evaluation import pearsonr, calculate_pearson

In [19]:
# task = "personality_conscientiousness"
# task = 'personality_openess'
# task = 'personality_extraversion'
# task = 'personality_agreeableness'
task = 'personality_stability'

NUM_EPOCH = 35

# validation by dev set only
train_filename = "train_train_paraphrased.csv"
dev_filename = "dev_summarised.csv"

# Chosen features
feature_1 = 'demographic_essay'
feature_2 = 'article'

checkpoint = "bert-base-uncased"

tokeniser = trf.AutoTokenizer.from_pretrained(checkpoint)
    
#padding="longest" can be deferred to do dynamic padding
def tokenise(sentence):
    return tokeniser(sentence[feature_1], sentence[feature_2], truncation=True)

# data collator due to variable max token length per batch size
data_collator = trf.DataCollatorWithPadding(tokenizer = tokeniser)

In [20]:
def objective(trial):    
    # Tuning hyperparams:
    LEARNING_RATE = trial.suggest_float("LEARNING_RATE", 1e-05, 1e-04, log=True)
    BATCH_SIZE = trial.suggest_int("BATCH_SIZE", 2, 8)
    SEED = trial.suggest_int("SEED", 1, 100)
    # checkpoint = trial.suggest_categorical("checkpoint", ("bert-base-uncased", "albert-base-v2"))
    
    torch.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)
    np.random.seed(SEED)
    
    model = trf.AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=1)

    # device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    device = "cuda:1"
    model.to(device)
    
    opt = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
    
    ## Split train-test from thw whole train-dev dataset
    # train_dev = load_tokenised_data(filename=os.path.join("./processed_data", train_filename), tokenise_fn=tokenise, train_test="train")
    # train_portion = int(len(train_dev) * 0.8)
    # validation_portion = len(train_dev) - train_portion
    # train_subset, val_subset = torch.utils.data.random_split(train_dev, [train_portion, validation_portion])
    
    # Training by train set only
    train_subset = load_tokenised_data(filename=os.path.join("./processed_data", train_filename), tokenise_fn=tokenise, train_test="train")
    
    train_dataloader = torch.utils.data.DataLoader(
        train_subset, shuffle=True, batch_size=BATCH_SIZE, collate_fn=data_collator
    ) 
    
    training_steps = NUM_EPOCH * len(train_dataloader)
   
    lr_scheduler = trf.get_scheduler(
        "linear",
        optimizer=opt,
        num_warmup_steps=0,
        num_training_steps=training_steps
    )
    
    model.train()
    for epoch in range(NUM_EPOCH):
        for batch in train_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss

            loss.backward()    
            opt.step()
            lr_scheduler.step()
            opt.zero_grad()

        # Evaluation   
        val_subset = load_tokenised_data(filename=os.path.join("./processed_data", dev_filename), tokenise_fn=tokenise, train_test="train") #treain_test="train" ensures output labesl are also passed
        validation_dataloader = torch.utils.data.DataLoader(
            val_subset, shuffle=False, batch_size=BATCH_SIZE, collate_fn=data_collator
        )
        model.eval()
        y_true =[]
        y_pred = []

        for batch in validation_dataloader:
            with torch.no_grad():
                batch = {k: v.to(device) for k, v in batch.items()}
                outputs = model(**batch)

            y_true.extend(batch['labels'].tolist())
            batch_pred = [item for sublist in outputs.logits.tolist() for item in sublist]  #convert 2D list to 1D
            y_pred.extend(batch_pred)
        
        pearson_r = pearsonr(y_true, y_pred)
            
        trial.report(pearson_r, epoch)
            
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()
    
    return pearson_r

In [None]:
study = optuna.create_study(
    sampler=optuna.samplers.TPESampler(seed=28),
    pruner=optuna.pruners.MedianPruner(),
    direction="maximize"
)
study.optimize(objective, n_trials=50, show_progress_bar=True)

trial_results = study.trials_dataframe() #trial results as a dataframe
trial_results.to_csv("trial_results_" + task + ".csv")

print(f"Best Pearson r: {study.best_value}")
print(f"Best parameter: {study.best_params}")

[32m[I 2023-05-02 03:50:07,052][0m A new study created in memory with name: no-name-99eba766-8ea3-49b8-9e89-c6c893386ded[0m

Progress bar is experimental (supported from v1.2.0). The interface can change in the future.



  0%|          | 0/50 [00:00<?, ?it/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Map:   0%|          | 0/1558 [00:00<?, ? examples/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Map:   0%|          | 0/208 [00:00<?, ? examples/s]

Map:   0%|          | 0/208 [00:00<?, ? examples/s]

Map:   0%|          | 0/208 [00:00<?, ? examples/s]

Map:   0%|          | 0/208 [00:00<?, ? examples/s]

Map:   0%|          | 0/208 [00:00<?, ? examples/s]

Map:   0%|          | 0/208 [00:00<?, ? examples/s]

Map:   0%|          | 0/208 [00:00<?, ? examples/s]

Map:   0%|          | 0/208 [00:00<?, ? examples/s]

Map:   0%|          | 0/208 [00:00<?, ? examples/s]

Map:   0%|          | 0/208 [00:00<?, ? examples/s]

Map:   0%|          | 0/208 [00:00<?, ? examples/s]

Map:   0%|          | 0/208 [00:00<?, ? examples/s]

Map:   0%|          | 0/208 [00:00<?, ? examples/s]

Map:   0%|          | 0/208 [00:00<?, ? examples/s]

Map:   0%|          | 0/208 [00:00<?, ? examples/s]

Map:   0%|          | 0/208 [00:00<?, ? examples/s]

Map:   0%|          | 0/208 [00:00<?, ? examples/s]

Map:   0%|          | 0/208 [00:00<?, ? examples/s]

Map:   0%|          | 0/208 [00:00<?, ? examples/s]

Map:   0%|          | 0/208 [00:00<?, ? examples/s]

Map:   0%|          | 0/208 [00:00<?, ? examples/s]

Map:   0%|          | 0/208 [00:00<?, ? examples/s]

Map:   0%|          | 0/208 [00:00<?, ? examples/s]

Map:   0%|          | 0/208 [00:00<?, ? examples/s]

Map:   0%|          | 0/208 [00:00<?, ? examples/s]

In [None]:
fig_1 = optuna.visualization.plot_slice(study)
fig_1.show()
fig_1.write_image("./prediction/" + task + "-param-plots.pdf")

In [None]:
fig_2 = optuna.visualization.plot_param_importances(study)
fig_2.show()
fig_2.write_image("./prediction/" + task + "-param-importance.pdf")

## Manual

In [6]:
import csv
from sklearn.model_selection import KFold
from evaluation import pearsonr, calculate_pearson

In [12]:
def train_test_kfold(model):
    accelerator = Accelerator()
    
    opt = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

    training_steps = NUM_EPOCH * len(trainloader)
    lr_scheduler = trf.get_scheduler(
        "linear",
        optimizer=opt,
        num_warmup_steps=0,
        num_training_steps=training_steps
    )
  
    trainloader_acclerate, model, opt = accelerator.prepare(
        trainloader, model, opt    
    )  
    
    for epoch in range(0, NUM_EPOCH):

        # Print epoch
        accelerator.print(f'Starting epoch {epoch+1}')
        
        epoch_loss = 0
        num_batches = 0

        # Iterate over the DataLoader for training data
        for batch in trainloader_acclerate:
            # Perform forward pass
            outputs = model(**batch)
            
            loss = outputs.loss
#             loss = loss_function(outputs, targets)

            accelerator.backward(loss)
        
            opt.step()
            lr_scheduler.step()
            
            opt.zero_grad()
            
            epoch_loss += loss.item()
            num_batches += 1

        # Process is complete.
        avg_epoch_loss = epoch_loss / num_batches
        accelerator.print(f"Epoch {epoch}: average loss = {avg_epoch_loss}")
    
    
    # Evaluation
        
    model.eval()

    y_true =[]
    y_pred = []

    for batch in testloader:
        with torch.no_grad():
            outputs = model(**batch)

        y_true.extend(batch['labels'].tolist())

        batch_pred = [item for sublist in outputs.logits.tolist() for item in sublist]  #convert 2D list to 1D
        y_pred.extend(batch_pred)
        
    pearson_r = pearsonr(y_true, y_pred)
    
    accelerator.print('\n' + checkpoint + ' & ' + str(LEARNING_RATE) + ' & ' + str(BATCH_SIZE) + ' & ' + feature_1 + '-' + feature_2 + ' & ' + str(pearson_r) + ' fold-' + str(fold) + '\n')

In [72]:
NUM_EPOCH = 10
BATCH_SIZE = 8
LEARNING_RATE = 5e-5

task = "empathy"

train_dev_filename = "preprocessed_train_dev.csv"

#Chosen features
feature_1 = 'demographic_essay'
feature_2 = 'article'

# feature_1 = 'essay'
# feature_2 = 'demographic'

# feature_1 = 'essay'
# feature_2 ='article'
    
# checkpoint = "bert-base-uncased"
# checkpoint = "distilbert-base-uncased"

tokeniser = trf.AutoTokenizer.from_pretrained(checkpoint)

#padding="longest" can be deferred to do dynamic padding
def tokenise(sentence):
    return tokeniser(sentence[feature_1], sentence[feature_2], truncation=True)
    
# data collator due to variable max token length per batch size
data_collator = trf.DataCollatorWithPadding(tokenizer = tokeniser)

train_dev = load_tokenised_data(filename=os.path.join("./processed_data", train_dev_filename), tokeniser=tokeniser, train_test="train")

K_FOLD = 5

# Set fixed random number seed
torch.manual_seed(42)

kfold = KFold(n_splits=K_FOLD, shuffle=True)

print('--------------------------------')

# K-fold Cross Validation model evaluation
for fold, (train_idx, test_idx) in enumerate(kfold.split(train_dev)):

    print(f'FOLD {fold}')
    print('--------------------------------')

    # Sample elements randomly from a given list of ids, no replacement.
    train_subsampler = torch.utils.data.SubsetRandomSampler(train_idx)
    test_subsampler = torch.utils.data.SubsetRandomSampler(test_idx)

    # Define data loaders for training and testing data in this fold
    trainloader = torch.utils.data.DataLoader(
        train_dev,
        batch_size=BATCH_SIZE,
        collate_fn=data_collator,
        sampler=train_subsampler
    )
    testloader = torch.utils.data.DataLoader(
        train_dev,
        batch_size=BATCH_SIZE,
        collate_fn=data_collator,
        sampler=test_subsampler
    )

    # Init the neural network
    model = trf.AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=1)

    notebook_launcher(train_test_kfold, (model,), num_processes=torch.cuda.device_count())

Map:   0%|          | 0/987 [00:00<?, ? examples/s]

--------------------------------
FOLD 0
--------------------------------


Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint a

Launching training on 4 GPUs.
Starting epoch 1


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch 0: average loss = 7.213689975738525
Starting epoch 2
Epoch 1: average loss = 3.504651494026184
Starting epoch 3
Epoch 2: average loss = 3.627466974258423
Starting epoch 4
Epoch 3: average loss = 1.8103101205825807
Starting epoch 5
Epoch 4: average loss = 1.518955283164978
Starting epoch 6
Epoch 5: average loss = 1.2208479496836662
Starting epoch 7
Epoch 6: average loss = 1.2117237094044686
Starting epoch 8
Epoch 7: average loss = 0.6970827853679658
Starting epoch 9
Epoch 8: average loss = 0.5291986376047134
Starting epoch 10
Epoch 9: average loss = 0.2736331722140312

bert-large-uncased & 5e-05 & 8 & demographic_essay-article & 0.7977 fold-0

FOLD 1
--------------------------------


Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint a

Launching training on 4 GPUs.
Starting epoch 1


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch 0: average loss = 10.47298833847046
Starting epoch 2
Epoch 1: average loss = 4.196836357116699
Starting epoch 3
Epoch 2: average loss = 4.3771005058288575
Starting epoch 4
Epoch 3: average loss = 4.045954556465149
Starting epoch 5
Epoch 4: average loss = 3.6713451862335207
Starting epoch 6
Epoch 5: average loss = 4.030801820755005
Starting epoch 7
Epoch 6: average loss = 3.761001286506653
Starting epoch 8
Epoch 7: average loss = 3.821267967224121
Starting epoch 9
Epoch 8: average loss = 3.99852801322937
Starting epoch 10
Epoch 9: average loss = 3.6609089183807373

bert-large-uncased & 5e-05 & 8 & demographic_essay-article & 0.0469 fold-1

FOLD 2
--------------------------------


Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint a

Launching training on 4 GPUs.
Starting epoch 1


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch 0: average loss = 11.501055421829223
Starting epoch 2
Epoch 1: average loss = 3.8675417041778566
Starting epoch 3
Epoch 2: average loss = 3.6557919073104856
Starting epoch 4
Epoch 3: average loss = 2.618813760280609
Starting epoch 5
Epoch 4: average loss = 1.6289004373550415
Starting epoch 6
Epoch 5: average loss = 1.5414224410057067
Starting epoch 7
Epoch 6: average loss = 0.8820622348785401
Starting epoch 8
Epoch 7: average loss = 0.5472309976816178
Starting epoch 9
Epoch 8: average loss = 0.3199688667058945
Starting epoch 10
Epoch 9: average loss = 0.16565409243106843

bert-large-uncased & 5e-05 & 8 & demographic_essay-article & 0.8002 fold-2

FOLD 3
--------------------------------


Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint a

Launching training on 4 GPUs.
Starting epoch 1


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch 0: average loss = 5.486358122825623
Starting epoch 2
Epoch 1: average loss = 3.3773480463027954
Starting epoch 3
Epoch 2: average loss = 3.858164029121399
Starting epoch 4
Epoch 3: average loss = 3.092499678134918
Starting epoch 5
Epoch 4: average loss = 2.3774873328208925
Starting epoch 6
Epoch 5: average loss = 2.290995116233826
Starting epoch 7
Epoch 6: average loss = 2.1909004259109497
Starting epoch 8
Epoch 7: average loss = 1.3470318806171417
Starting epoch 9
Epoch 8: average loss = 0.9972421827912331
Starting epoch 10
Epoch 9: average loss = 0.7596608233451844

bert-large-uncased & 5e-05 & 8 & demographic_essay-article & 0.7008 fold-3

FOLD 4
--------------------------------


Some weights of the model checkpoint at bert-large-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint a

Launching training on 4 GPUs.
Starting epoch 1


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch 0: average loss = 8.129480237960815
Starting epoch 2
Epoch 1: average loss = 3.9551459836959837
Starting epoch 3
Epoch 2: average loss = 3.8354076099395753
Starting epoch 4
Epoch 3: average loss = 3.8489742612838747
Starting epoch 5
Epoch 4: average loss = 5.5751168823242185
Starting epoch 6
Epoch 5: average loss = 3.7704463481903074
Starting epoch 7
Epoch 6: average loss = 3.774736123085022
Starting epoch 8
Epoch 7: average loss = 3.518358378410339
Starting epoch 9
Epoch 8: average loss = 3.6325153732299804
Starting epoch 10
Epoch 9: average loss = 3.7241961479187013

bert-large-uncased & 5e-05 & 8 & demographic_essay-article & -0.035 fold-4

