# Train a BERT classifier - "When to ask clarifying question" ❓
# Textual Grid world State Baseline


In [None]:
from dataclasses import dataclass
import os
import datetime
import pickle
import numpy as np
import random
import numpy as np
from numpy.random import Generator,default_rng
import torch
import pandas as pd
from torch.utils.data import TensorDataset
from transformers import BertTokenizer, RobertaTokenizer, BartTokenizer, AutoTokenizer
from sklearn.metrics import precision_score, recall_score, f1_score
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler,WeightedRandomSampler
from torch.nn import CrossEntropyLoss, MSELoss
from transformers import (
    BertForSequenceClassification,
    AdamW,
    get_linear_schedule_with_warmup,
)
from transformers import (
    RobertaForSequenceClassification,
    BartForSequenceClassification,
    AutoModelForSequenceClassification,
)
from tqdm.auto import tqdm
from sklearn.model_selection import KFold, StratifiedKFold


## Hyperparameters

In [None]:
model_name = "derbertav3_base_oversampling"
model_hug = "microsoft/deberta-v3-base"
max_seq_length = 320
batch_size = 16
epoch = 15
lr = 1e-5
seed_val = 42

## Proprocessing - Tokenize the text data for BERT

In [None]:
def get_tensor_dataset(df, tokenizer):

    # Tokenize all of the sentences and map the tokens to thier word IDs.
    input_ids = []
    token_type_ids = []
    attention_masks = []
    labels = []
    topic_ids = []

    for count, item in tqdm(
        enumerate(
            zip(
                df["GameId"],
                df["bylevel_color_context"],  # df["nonspatial_color_context"],
                df["InputInstruction"],
                df["IsInstructionClear"],
            )
        ),
        total=len(df),
        desc="Tokenizing data",
    ):
        z, w, x, y = item
        encoded_dict = tokenizer.encode_plus(
            w,
            x,
            add_special_tokens=True,
            max_length=max_seq_length,
            padding="max_length",  # use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'`
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt",
        )

        input_ids.append(encoded_dict["input_ids"])

        if "token_type_ids" in encoded_dict:
            token_type_ids.append(encoded_dict["token_type_ids"])

        attention_masks.append(encoded_dict["attention_mask"])
        labels.append(y)

        topic_ids.append(z)

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(labels)
    return TensorDataset(input_ids, attention_masks, labels)


In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_hug)
tokenizer.save_pretrained(f"saved_tokenizer_{model_hug}")

('saved_tokenizer_microsoft/deberta-v3-base/tokenizer_config.json',
 'saved_tokenizer_microsoft/deberta-v3-base/special_tokens_map.json',
 'saved_tokenizer_microsoft/deberta-v3-base/spm.model',
 'saved_tokenizer_microsoft/deberta-v3-base/added_tokens.json',
 'saved_tokenizer_microsoft/deberta-v3-base/tokenizer.json')

In [None]:
from sklearn.model_selection import train_test_split
data_path='public_data/clarifying_questions_train.csv'
df = pd.read_csv(data_path, sep=",")
data_path = "public_data/clarifying_questions_train.csv"


## Train Classifier 🏋️‍♀️

In [None]:
import sys
import importlib
from preprocess.tools import block_tools
from preprocess.gridworld_preprocess import complete_df_with_grid_state

importlib.reload(preprocess.gridworld_preprocess)
sys.path.append("../")
df = complete_df_with_grid_state(df)


In [None]:
from preprocess.tools.block_tools import (
    transform_block,
    count_block_colors,
    create_context_colour_count,
)
importlib.reload(block_tools)


In [None]:
df["blocks"] = df["blocks"].apply(
    lambda blocks: [transform_block(block) for block in blocks]
)
df["blocks"]


0       [(0, -1, 0, 5), (0, 0, 0, 5), (0, 1, 0, 5), (0...
1       [(-2, 2, -1, 1), (-1, 0, -1, 5), (-1, 1, -1, 5...
2       [(-2, 2, -1, 1), (-1, 0, -1, 5), (-1, 1, -1, 5...
3       [(-1, -1, 1, 5), (-1, 0, 1, 5), (-1, 1, 1, 5),...
4       [(-5, -1, -5, 3), (-5, -1, -4, 3), (-5, -1, 4,...
                              ...                        
6823    [(-4, -1, -2, 2), (-4, -1, -1, 2), (0, -1, -1,...
6824    [(3, -1, 1, 2), (3, 1, 1, 2), (4, -1, 1, 2), (...
6825    [(-5, -1, 0, 3), (-4, -1, 0, 3), (-4, 0, 0, 3)...
6826    [(-4, -1, -1, 2), (0, -1, -1, 1), (0, 0, -1, 1...
6827    [(-4, 0, 0, 5), (-2, 0, 0, 5), (0, 0, 0, 6), (...
Name: blocks, Length: 6828, dtype: object

In [None]:
color_block_freq = df["blocks"].apply(count_block_colors)
color_block_freq


0                                         {'purple': 13}
1                     {'blue': 2, 'purple': 9, 'red': 2}
2                     {'blue': 2, 'purple': 9, 'red': 2}
3                                          {'purple': 5}
4       {'red': 4, 'purple': 4, 'yellow': 4, 'green': 4}
                              ...                       
6823                             {'green': 2, 'blue': 4}
6824                                        {'green': 8}
6825                                          {'red': 7}
6826                             {'green': 1, 'blue': 4}
6827              {'purple': 2, 'yellow': 4, 'green': 9}
Name: blocks, Length: 6828, dtype: object

In [None]:
color_block_freq_byL = df["blocks"].apply(block_tools.get_color_counter_by_level)
color_block_freq_byL


0       [{'purple': 1}, {'purple': 1}, {'purple': 2}, ...
1       [{'purple': 3, 'red': 1}, {'purple': 3, 'red':...
2       [{'purple': 3, 'red': 1}, {'purple': 3, 'red':...
3       [{'purple': 1}, {'purple': 1}, {'purple': 1}, ...
4       [{'red': 3, 'purple': 3, 'yellow': 3, 'green':...
                              ...                        
6823    [{'green': 2, 'blue': 1}, {'blue': 1}, {'blue'...
6824    [{'green': 3}, {'green': 1}, {'green': 3}, {'g...
6825                             [{'red': 4}, {'red': 3}]
6826    [{'green': 1, 'blue': 1}, {'blue': 1}, {'blue'...
6827    [{'green': 3}, {'purple': 2, 'yellow': 2, 'gre...
Name: blocks, Length: 6828, dtype: object

In [None]:
df["nonspatial_color_context"] = color_block_freq.apply(block_tools.create_context_colour_count)
df["nonspatial_color_context"].apply(len).describe()


count    6828.000000
mean      101.247803
std        25.457803
min        58.000000
25%        83.000000
50%       106.000000
75%       110.000000
max       161.000000
Name: nonspatial_color_context, dtype: float64

In [None]:
df["bylevel_color_context"] = color_block_freq_byL.apply(
    block_tools.create_context_colour_by_height_level
)

df["bylevel_color_context"].str.split(' ').apply(len).describe()
df.to_csv("public_data/clarifying_questions_w_context.csv", index=False)


count    6828.000000
mean       52.279145
std        27.529863
min         9.000000
25%        32.000000
50%        50.000000
75%        68.000000
max       125.000000
Name: bylevel_color_context, dtype: float64

In [None]:
df,df_test = train_test_split(df,test_size=0.1,stratify=df['IsInstructionClear'],random_state=42)
df = pd.read_csv("public_data/clarifying_questions_train_split.csv")
df_test = pd.read_csv("public_data/clarifying_questions_test.csv")
df_or = pd.read_csv("public_data/clarifying_questions_w_context.csv")

In [None]:
df = df.merge(df_or[['GameId',
       'InputInstructionWithGameID', 'pos', 'look', 'blocks',
       'nonspatial_color_context', 'bylevel_color_context']],on='GameId',how='left')

df_test = df_test.merge(df_or[['GameId',
       'InputInstructionWithGameID', 'pos', 'look', 'blocks',
       'nonspatial_color_context', 'bylevel_color_context']],on='GameId',how='left')

In [None]:
from sklearn.metrics import classification_report

def eval_result( labels,preds):
    """ Calculate the accuracy, f1, precision, recall of our predictions vs labels
    """
    if len(preds.shape) ==2:
        y_pred = np.argmax(preds, axis=1).flatten()
    else:
        y_pred=preds
    y_true = labels.flatten()

    precision = precision_score(y_true, y_pred,zero_division=1)
    recall = recall_score(y_true, y_pred,zero_division=1)
    f1 = f1_score(y_true, y_pred, average='macro',zero_division=1)
    accuracy = np.sum(y_pred == y_true) / len(y_true) 

    return (precision, recall, f1, accuracy)



In [None]:
def train_model(
    model_name,
    model,
    train_dataloader,
    val_dataloader,
    scheduler,
    optimizer,
    criterion,
    epochs,
    lr,
    fold=0,
):
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)

    if torch.cuda.is_available():
        device = torch.device("cuda")

    else:
        device = torch.device("cpu")
    print(device)
    scaler = torch.cuda.amp.GradScaler()
    # For each epoch...
    for epoch in range(epochs):

        total_train_loss = 0
        train_n_correct = 0
        nb_tr_examples = 0

        model.train()

        for _, batch in tqdm(
            enumerate(train_dataloader),
            total=len(train_dataloader),
            desc=f"Train epoch {epoch+1}/{epochs}",
        ):

            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            model.zero_grad()
            with torch.cuda.amp.autocast():
                result = model(b_input_ids, attention_mask=b_input_mask)

                loss = criterion(result.logits, b_labels)
            #loss.backward()
            scaler.scale(loss).backward()
            total_train_loss += loss.item()

            #optimizer.step()
            scaler.step(optimizer)
            scaler.update()
            
            scheduler.step()

            logits = result.logits.detach().cpu().numpy()
            label_ids = b_labels.to("cpu").numpy()
            _, _, _, accuracy = eval_result(label_ids,logits)
            train_n_correct += accuracy
            nb_tr_examples += b_input_ids.size(0)

        avg_train_loss = total_train_loss / len(train_dataloader)
        train_acc = train_n_correct / len(train_dataloader)

        print(
            "Epoch [{}/{}], Train Loss: {:.4f}, Train Accuracy: {:.4f} ".format(
                epoch + 1, epochs, avg_train_loss, train_acc
            )
        )

        model.eval()

        test_results = []
        test_labels = []
        total_val_loss=0
        test_results_predicted_lavels = []
        test_results_predicted_lavels2 = []
        test_results_predicted_lavels3 = []
        for batch in tqdm(val_dataloader, desc="Eval model"):
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            with torch.no_grad():
                result = model(
                    b_input_ids,
                    attention_mask=b_input_mask,
                    labels=b_labels,
                    return_dict=True,
                )
                loss = criterion(result.logits, b_labels)
            total_val_loss += loss.item()
            logits = torch.softmax(result.logits,axis=1)#result.logits
            logits = logits.detach().cpu().numpy()
            test_results.extend(logits.tolist())
            tmp = np.asarray(logits.tolist())

            # test_results_predicted_lavels.extend(np.max(tmp,axis=1).flatten())

            test_results_predicted_lavels.extend(tmp[:,1]>0.15)
            test_results_predicted_lavels2.extend(tmp[:,1]>0.5)
            test_results_predicted_lavels3.extend(tmp[:,1]>0.7)
            
            avg_val_loss = total_val_loss / len(val_dataloader)
            label_ids = b_labels.to("cpu").numpy()
            test_labels.extend(label_ids)

        print(
            classification_report(
                np.asarray(test_labels), np.asarray(test_results_predicted_lavels)
            )
        )
        (precision, recall, f1, accuracy) = eval_result(
            np.asarray(test_labels), np.asarray(test_results_predicted_lavels)
        )

        print(
            "Epoch [{}/{}], Test Loss: {:.4f},Test Precision: {:.4f}, Test Recall: {:.4f}, Test Macro F1: {:.4f}, Test Accuracy: {:.4f} ".format(
                epoch + 1, epochs, avg_val_loss, precision, recall, f1, accuracy
            )
        )


        print(classification_report( np.asarray(test_labels), np.asarray(test_results_predicted_lavels2)))
        (precision, recall, f1, accuracy) = eval_result(np.asarray(test_labels), np.asarray(test_results_predicted_lavels2))

        print('Test Precision: {:.4f}, Test Recall: {:.4f}, Test Macro F1: {:.4f}, Test Accuracy: {:.4f} ' .format(precision, recall, f1, accuracy))

        (precision, recall, f1, accuracy) = eval_result(np.asarray(test_labels), np.asarray(test_results_predicted_lavels3))

        print('Test Precision: {:.4f}, Test Recall: {:.4f}, Test Macro F1: {:.4f}, Test Accuracy: {:.4f} ' .format(precision, recall, f1, accuracy))

        model.save_pretrained(f"saved_model/{model_name}_{epoch}/{epochs}e_{lr}lr_f{fold}")

    print("Training complete!")

    # model.save_pretrained(f"drive/MyDrive/IGLU-cq-data/{model_name}_{epochs}e_{lr}lr")


In [26]:
batch_size = 8
epoch = 7
FOLDS = 5


In [53]:
lr = 1e-5
batch_size=24

In [49]:
df2 =df.copy()

In [54]:
df2["IsInstructionClear"] = df2.IsInstructionClear.replace(
        {"Yes": 0, "No": 1}
    )

NameError: name 'df2' is not defined

In [51]:
df2["IsInstructionClear"].apply(lambda x: 4 if x==1 else 1).values

array([1, 1, 1, ..., 1, 1, 1])

In [52]:
datasettr = get_tensor_dataset(df2, tokenizer)

Tokenizing data:   0%|          | 0/6145 [00:00<?, ?it/s]

In [31]:
model_name="derbertav3_base_oversampling"

In [32]:
os.environ["TOKENIZERS_PARALLELISM"] = 'true'

In [161]:
print("============================================================")
print(f" {epoch} -- {lr} -- {datetime.datetime.now()}")
print("============================================================")

# with open(f'{model_name}_train.pkl', 'rb') as f:
#     train_dataset = pickle.load(f)
skf = StratifiedKFold(FOLDS, random_state=42, shuffle=True)
for fold, (tr_id, val_id) in enumerate(skf.split(df, df["IsInstructionClear"])):
    # dfdev.to_csv('public_data/clarifying_questions_val_w_context.csv', index=False)
    dftrain = df.iloc[tr_id].copy()
    dfdev = df.iloc[val_id].copy()

    dftrain["IsInstructionClear"] = dftrain.IsInstructionClear.replace(
        {"Yes": 0, "No": 1}
    )
    oversampling_weights= dftrain["IsInstructionClear"].apply(lambda x: 4 if x==1 else 1).values

    dfdev["IsInstructionClear"] = dfdev.IsInstructionClear.replace({"Yes": 0, "No": 1})
    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained(model_hug)

    datasettr = get_tensor_dataset(dftrain, tokenizer)
    datasetval = get_tensor_dataset(dfdev, tokenizer)

    train_dataloader = DataLoader(
        datasettr,
        sampler=WeightedRandomSampler(oversampling_weights,num_samples=len(datasettr)),#RandomSampler(datasettr),
        batch_size=batch_size,
        pin_memory=True,
        num_workers=4,
    )
    val_dataloader = DataLoader(
        datasetval, batch_size=batch_size, pin_memory=True, num_workers=4
    )

    from transformers import AutoModelForSequenceClassification

    model = AutoModelForSequenceClassification.from_pretrained(model_hug, num_labels=2)
    model.cuda()

    optimizer = AdamW(
        model.parameters(), lr=lr, weight_decay=0.05  # 0.03 73.4 ar epoch 6
    )

    criterion = CrossEntropyLoss()
    total_steps = len(train_dataloader) * epoch
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=total_steps
    )
    
    # Train Model
    train_model(
        model_name,
        model,
        train_dataloader,
        val_dataloader,
        scheduler,
        optimizer,
        criterion,
        epochs,
        lr,
        fold,
    )


 4 -- 1e-05 -- 2022-09-20 15:57:21.055736


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Tokenizing data: 100%|██████████| 4916/4916 [00:02<00:00, 1825.26it/s]
Tokenizing data: 100%|██████████| 1229/1229 [00:00<00:00, 1794.99it/s]
Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2ForSequenceClassification: ['mask_predictions.dense.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.dense.weight', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias']
- This IS expected if you are initializing DebertaV2ForSequenceClassification from the

cuda


Train epoch 1/7: 100%|██████████| 205/205 [01:12<00:00,  2.82it/s]


Epoch [1/7], Train Loss: 0.5928, Train Accuracy: 0.6837 


Eval model: 100%|██████████| 52/52 [00:10<00:00,  5.11it/s]


              precision    recall  f1-score   support

           0       0.98      0.49      0.65      1072
           1       0.21      0.92      0.34       157

    accuracy                           0.54      1229
   macro avg       0.59      0.70      0.49      1229
weighted avg       0.88      0.54      0.61      1229

Epoch [1/7], Test Loss: 0.3347,Test Precision: 0.2072, Test Recall: 0.9172, Test Macro F1: 0.4934, Test Accuracy: 0.5411 
              precision    recall  f1-score   support

           0       0.93      0.93      0.93      1072
           1       0.54      0.54      0.54       157

    accuracy                           0.88      1229
   macro avg       0.74      0.73      0.74      1229
weighted avg       0.88      0.88      0.88      1229

Test Precision: 0.5419, Test Recall: 0.5350, Test Macro F1: 0.7357, Test Accuracy: 0.8828 
Test Precision: 0.6289, Test Recall: 0.3885, Test Macro F1: 0.7102, Test Accuracy: 0.8926 


Train epoch 2/7: 100%|██████████| 205/205 [01:21<00:00,  2.50it/s]


Epoch [2/7], Train Loss: 0.4645, Train Accuracy: 0.8059 


Eval model: 100%|██████████| 52/52 [00:11<00:00,  4.35it/s]


              precision    recall  f1-score   support

           0       0.96      0.65      0.78      1072
           1       0.26      0.83      0.40       157

    accuracy                           0.67      1229
   macro avg       0.61      0.74      0.59      1229
weighted avg       0.87      0.67      0.73      1229

Epoch [2/7], Test Loss: 0.3444,Test Precision: 0.2594, Test Recall: 0.8344, Test Macro F1: 0.5865, Test Accuracy: 0.6745 
              precision    recall  f1-score   support

           0       0.94      0.91      0.92      1072
           1       0.48      0.58      0.53       157

    accuracy                           0.87      1229
   macro avg       0.71      0.74      0.72      1229
weighted avg       0.88      0.87      0.87      1229

Test Precision: 0.4815, Test Recall: 0.5796, Test Macro F1: 0.7242, Test Accuracy: 0.8666 
Test Precision: 0.5584, Test Recall: 0.5478, Test Macro F1: 0.7442, Test Accuracy: 0.8869 


Train epoch 3/7: 100%|██████████| 205/205 [01:23<00:00,  2.46it/s]


Epoch [3/7], Train Loss: 0.3842, Train Accuracy: 0.8496 


Eval model: 100%|██████████| 52/52 [00:14<00:00,  3.71it/s]


              precision    recall  f1-score   support

           0       0.95      0.81      0.87      1072
           1       0.35      0.71      0.47       157

    accuracy                           0.79      1229
   macro avg       0.65      0.76      0.67      1229
weighted avg       0.87      0.79      0.82      1229

Epoch [3/7], Test Loss: 0.3282,Test Precision: 0.3511, Test Recall: 0.7134, Test Macro F1: 0.6717, Test Accuracy: 0.7950 
              precision    recall  f1-score   support

           0       0.93      0.93      0.93      1072
           1       0.53      0.54      0.53       157

    accuracy                           0.88      1229
   macro avg       0.73      0.74      0.73      1229
weighted avg       0.88      0.88      0.88      1229

Test Precision: 0.5280, Test Recall: 0.5414, Test Macro F1: 0.7327, Test Accuracy: 0.8796 
Test Precision: 0.5786, Test Recall: 0.5159, Test Macro F1: 0.7415, Test Accuracy: 0.8902 


Train epoch 4/7: 100%|██████████| 205/205 [01:26<00:00,  2.37it/s]


Epoch [4/7], Train Loss: 0.3729, Train Accuracy: 0.8564 


Eval model: 100%|██████████| 52/52 [00:13<00:00,  3.82it/s]


              precision    recall  f1-score   support

           0       0.95      0.74      0.83      1072
           1       0.29      0.73      0.42       157

    accuracy                           0.74      1229
   macro avg       0.62      0.74      0.62      1229
weighted avg       0.87      0.74      0.78      1229

Epoch [4/7], Test Loss: 0.3427,Test Precision: 0.2911, Test Recall: 0.7325, Test Macro F1: 0.6239, Test Accuracy: 0.7380 
              precision    recall  f1-score   support

           0       0.93      0.93      0.93      1072
           1       0.52      0.56      0.54       157

    accuracy                           0.88      1229
   macro avg       0.73      0.74      0.74      1229
weighted avg       0.88      0.88      0.88      1229

Test Precision: 0.5238, Test Recall: 0.5605, Test Macro F1: 0.7358, Test Accuracy: 0.8788 
Test Precision: 0.5556, Test Recall: 0.5096, Test Macro F1: 0.7331, Test Accuracy: 0.8853 


Train epoch 5/7: 100%|██████████| 205/205 [01:24<00:00,  2.42it/s]


Epoch [5/7], Train Loss: 0.3488, Train Accuracy: 0.8663 


Eval model: 100%|██████████| 52/52 [00:15<00:00,  3.42it/s]


              precision    recall  f1-score   support

           0       0.95      0.74      0.83      1072
           1       0.29      0.73      0.42       157

    accuracy                           0.74      1229
   macro avg       0.62      0.74      0.62      1229
weighted avg       0.87      0.74      0.78      1229

Epoch [5/7], Test Loss: 0.3427,Test Precision: 0.2911, Test Recall: 0.7325, Test Macro F1: 0.6239, Test Accuracy: 0.7380 
              precision    recall  f1-score   support

           0       0.93      0.93      0.93      1072
           1       0.52      0.56      0.54       157

    accuracy                           0.88      1229
   macro avg       0.73      0.74      0.74      1229
weighted avg       0.88      0.88      0.88      1229

Test Precision: 0.5238, Test Recall: 0.5605, Test Macro F1: 0.7358, Test Accuracy: 0.8788 
Test Precision: 0.5556, Test Recall: 0.5096, Test Macro F1: 0.7331, Test Accuracy: 0.8853 


Train epoch 6/7: 100%|██████████| 205/205 [01:21<00:00,  2.50it/s]


Epoch [6/7], Train Loss: 0.3426, Train Accuracy: 0.8655 


Eval model: 100%|██████████| 52/52 [00:14<00:00,  3.47it/s]


              precision    recall  f1-score   support

           0       0.95      0.74      0.83      1072
           1       0.29      0.73      0.42       157

    accuracy                           0.74      1229
   macro avg       0.62      0.74      0.62      1229
weighted avg       0.87      0.74      0.78      1229

Epoch [6/7], Test Loss: 0.3427,Test Precision: 0.2911, Test Recall: 0.7325, Test Macro F1: 0.6239, Test Accuracy: 0.7380 
              precision    recall  f1-score   support

           0       0.93      0.93      0.93      1072
           1       0.52      0.56      0.54       157

    accuracy                           0.88      1229
   macro avg       0.73      0.74      0.74      1229
weighted avg       0.88      0.88      0.88      1229

Test Precision: 0.5238, Test Recall: 0.5605, Test Macro F1: 0.7358, Test Accuracy: 0.8788 
Test Precision: 0.5556, Test Recall: 0.5096, Test Macro F1: 0.7331, Test Accuracy: 0.8853 


Train epoch 7/7: 100%|██████████| 205/205 [01:27<00:00,  2.35it/s]


Epoch [7/7], Train Loss: 0.3505, Train Accuracy: 0.8621 


Eval model: 100%|██████████| 52/52 [00:12<00:00,  4.16it/s]


              precision    recall  f1-score   support

           0       0.95      0.74      0.83      1072
           1       0.29      0.73      0.42       157

    accuracy                           0.74      1229
   macro avg       0.62      0.74      0.62      1229
weighted avg       0.87      0.74      0.78      1229

Epoch [7/7], Test Loss: 0.3427,Test Precision: 0.2911, Test Recall: 0.7325, Test Macro F1: 0.6239, Test Accuracy: 0.7380 
              precision    recall  f1-score   support

           0       0.93      0.93      0.93      1072
           1       0.52      0.56      0.54       157

    accuracy                           0.88      1229
   macro avg       0.73      0.74      0.74      1229
weighted avg       0.88      0.88      0.88      1229

Test Precision: 0.5238, Test Recall: 0.5605, Test Macro F1: 0.7358, Test Accuracy: 0.8788 
Test Precision: 0.5556, Test Recall: 0.5096, Test Macro F1: 0.7331, Test Accuracy: 0.8853 
Training complete!


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Tokenizing data: 100%|██████████| 4916/4916 [00:02<00:00, 1872.51it/s]
Tokenizing data: 100%|██████████| 1229/1229 [00:00<00:00, 1782.63it/s]
Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2ForSequenceClassification: ['mask_predictions.dense.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.dense.weight', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias']
- This IS expected if you are initializing DebertaV2ForSequenceClassification from the

cuda


Train epoch 1/7: 100%|██████████| 205/205 [01:21<00:00,  2.51it/s]


Epoch [1/7], Train Loss: 0.5928, Train Accuracy: 0.6913 


Eval model: 100%|██████████| 52/52 [00:15<00:00,  3.27it/s]


              precision    recall  f1-score   support

           0       0.94      0.77      0.85      1072
           1       0.29      0.65      0.40       157

    accuracy                           0.76      1229
   macro avg       0.62      0.71      0.63      1229
weighted avg       0.86      0.76      0.79      1229

Epoch [1/7], Test Loss: 0.3327,Test Precision: 0.2939, Test Recall: 0.6497, Test Macro F1: 0.6256, Test Accuracy: 0.7559 
              precision    recall  f1-score   support

           0       0.93      0.93      0.93      1072
           1       0.52      0.49      0.50       157

    accuracy                           0.88      1229
   macro avg       0.72      0.71      0.72      1229
weighted avg       0.87      0.88      0.87      1229

Test Precision: 0.5168, Test Recall: 0.4904, Test Macro F1: 0.7163, Test Accuracy: 0.8763 
Test Precision: 1.0000, Test Recall: 0.0000, Test Macro F1: 0.4659, Test Accuracy: 0.8723 


Train epoch 2/7: 100%|██████████| 205/205 [01:32<00:00,  2.23it/s]


Epoch [2/7], Train Loss: 0.4566, Train Accuracy: 0.8078 


Eval model: 100%|██████████| 52/52 [00:13<00:00,  3.97it/s]


              precision    recall  f1-score   support

           0       0.95      0.66      0.78      1072
           1       0.24      0.74      0.36       157

    accuracy                           0.67      1229
   macro avg       0.59      0.70      0.57      1229
weighted avg       0.85      0.67      0.72      1229

Epoch [2/7], Test Loss: 0.3785,Test Precision: 0.2402, Test Recall: 0.7389, Test Macro F1: 0.5690, Test Accuracy: 0.6680 
              precision    recall  f1-score   support

           0       0.93      0.91      0.92      1072
           1       0.48      0.54      0.51       157

    accuracy                           0.87      1229
   macro avg       0.71      0.73      0.72      1229
weighted avg       0.87      0.87      0.87      1229

Test Precision: 0.4802, Test Recall: 0.5414, Test Macro F1: 0.7159, Test Accuracy: 0.8666 
Test Precision: 0.5241, Test Recall: 0.4841, Test Macro F1: 0.7169, Test Accuracy: 0.8779 


Train epoch 3/7: 100%|██████████| 205/205 [01:29<00:00,  2.30it/s]


Epoch [3/7], Train Loss: 0.3878, Train Accuracy: 0.8468 


Eval model: 100%|██████████| 52/52 [00:13<00:00,  3.83it/s]


              precision    recall  f1-score   support

           0       0.95      0.66      0.78      1072
           1       0.25      0.75      0.37       157

    accuracy                           0.67      1229
   macro avg       0.60      0.71      0.58      1229
weighted avg       0.86      0.67      0.73      1229

Epoch [3/7], Test Loss: 0.4318,Test Precision: 0.2463, Test Recall: 0.7516, Test Macro F1: 0.5758, Test Accuracy: 0.6745 
              precision    recall  f1-score   support

           0       0.93      0.88      0.91      1072
           1       0.42      0.58      0.49       157

    accuracy                           0.85      1229
   macro avg       0.68      0.73      0.70      1229
weighted avg       0.87      0.85      0.86      1229

Test Precision: 0.4233, Test Recall: 0.5796, Test Macro F1: 0.6991, Test Accuracy: 0.8454 
Test Precision: 0.4889, Test Recall: 0.5605, Test Macro F1: 0.7232, Test Accuracy: 0.8690 


Train epoch 4/7: 100%|██████████| 205/205 [01:26<00:00,  2.36it/s]


Epoch [4/7], Train Loss: 0.3532, Train Accuracy: 0.8628 


Eval model: 100%|██████████| 52/52 [00:13<00:00,  3.97it/s]


              precision    recall  f1-score   support

           0       0.94      0.71      0.81      1072
           1       0.27      0.71      0.39       157

    accuracy                           0.71      1229
   macro avg       0.60      0.71      0.60      1229
weighted avg       0.86      0.71      0.76      1229

Epoch [4/7], Test Loss: 0.3953,Test Precision: 0.2656, Test Recall: 0.7070, Test Macro F1: 0.5993, Test Accuracy: 0.7128 
              precision    recall  f1-score   support

           0       0.93      0.91      0.92      1072
           1       0.48      0.55      0.52       157

    accuracy                           0.87      1229
   macro avg       0.71      0.73      0.72      1229
weighted avg       0.88      0.87      0.87      1229

Test Precision: 0.4833, Test Recall: 0.5541, Test Macro F1: 0.7197, Test Accuracy: 0.8674 
Test Precision: 0.5238, Test Recall: 0.4904, Test Macro F1: 0.7185, Test Accuracy: 0.8779 


Train epoch 5/7: 100%|██████████| 205/205 [01:22<00:00,  2.48it/s]


Epoch [5/7], Train Loss: 0.3287, Train Accuracy: 0.8762 


Eval model: 100%|██████████| 52/52 [00:13<00:00,  3.83it/s]


              precision    recall  f1-score   support

           0       0.94      0.71      0.81      1072
           1       0.27      0.71      0.39       157

    accuracy                           0.71      1229
   macro avg       0.60      0.71      0.60      1229
weighted avg       0.86      0.71      0.76      1229

Epoch [5/7], Test Loss: 0.3953,Test Precision: 0.2656, Test Recall: 0.7070, Test Macro F1: 0.5993, Test Accuracy: 0.7128 
              precision    recall  f1-score   support

           0       0.93      0.91      0.92      1072
           1       0.48      0.55      0.52       157

    accuracy                           0.87      1229
   macro avg       0.71      0.73      0.72      1229
weighted avg       0.88      0.87      0.87      1229

Test Precision: 0.4833, Test Recall: 0.5541, Test Macro F1: 0.7197, Test Accuracy: 0.8674 
Test Precision: 0.5238, Test Recall: 0.4904, Test Macro F1: 0.7185, Test Accuracy: 0.8779 


Train epoch 6/7: 100%|██████████| 205/205 [01:29<00:00,  2.30it/s]


Epoch [6/7], Train Loss: 0.3384, Train Accuracy: 0.8700 


Eval model: 100%|██████████| 52/52 [00:13<00:00,  3.91it/s]


              precision    recall  f1-score   support

           0       0.94      0.71      0.81      1072
           1       0.27      0.71      0.39       157

    accuracy                           0.71      1229
   macro avg       0.60      0.71      0.60      1229
weighted avg       0.86      0.71      0.76      1229

Epoch [6/7], Test Loss: 0.3953,Test Precision: 0.2656, Test Recall: 0.7070, Test Macro F1: 0.5993, Test Accuracy: 0.7128 
              precision    recall  f1-score   support

           0       0.93      0.91      0.92      1072
           1       0.48      0.55      0.52       157

    accuracy                           0.87      1229
   macro avg       0.71      0.73      0.72      1229
weighted avg       0.88      0.87      0.87      1229

Test Precision: 0.4833, Test Recall: 0.5541, Test Macro F1: 0.7197, Test Accuracy: 0.8674 
Test Precision: 0.5238, Test Recall: 0.4904, Test Macro F1: 0.7185, Test Accuracy: 0.8779 


Train epoch 7/7: 100%|██████████| 205/205 [01:30<00:00,  2.26it/s]


Epoch [7/7], Train Loss: 0.3611, Train Accuracy: 0.8621 


Eval model: 100%|██████████| 52/52 [00:13<00:00,  3.93it/s]


              precision    recall  f1-score   support

           0       0.94      0.71      0.81      1072
           1       0.27      0.71      0.39       157

    accuracy                           0.71      1229
   macro avg       0.60      0.71      0.60      1229
weighted avg       0.86      0.71      0.76      1229

Epoch [7/7], Test Loss: 0.3953,Test Precision: 0.2656, Test Recall: 0.7070, Test Macro F1: 0.5993, Test Accuracy: 0.7128 
              precision    recall  f1-score   support

           0       0.93      0.91      0.92      1072
           1       0.48      0.55      0.52       157

    accuracy                           0.87      1229
   macro avg       0.71      0.73      0.72      1229
weighted avg       0.88      0.87      0.87      1229

Test Precision: 0.4833, Test Recall: 0.5541, Test Macro F1: 0.7197, Test Accuracy: 0.8674 
Test Precision: 0.5238, Test Recall: 0.4904, Test Macro F1: 0.7185, Test Accuracy: 0.8779 
Training complete!


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Tokenizing data: 100%|██████████| 4916/4916 [00:02<00:00, 1684.13it/s]
Tokenizing data: 100%|██████████| 1229/1229 [00:00<00:00, 1578.64it/s]
Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2ForSequenceClassification: ['mask_predictions.dense.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.dense.weight', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias']
- This IS expected if you are initializing DebertaV2ForSequenceClassification from the

cuda


Train epoch 1/7: 100%|██████████| 205/205 [01:24<00:00,  2.42it/s]


Epoch [1/7], Train Loss: 0.5959, Train Accuracy: 0.6779 


Eval model: 100%|██████████| 52/52 [00:15<00:00,  3.33it/s]


              precision    recall  f1-score   support

           0       0.96      0.58      0.72      1071
           1       0.23      0.84      0.35       158

    accuracy                           0.61      1229
   macro avg       0.59      0.71      0.54      1229
weighted avg       0.87      0.61      0.67      1229

Epoch [1/7], Test Loss: 0.3480,Test Precision: 0.2253, Test Recall: 0.8354, Test Macro F1: 0.5374, Test Accuracy: 0.6094 
              precision    recall  f1-score   support

           0       0.94      0.88      0.91      1071
           1       0.43      0.60      0.50       158

    accuracy                           0.85      1229
   macro avg       0.68      0.74      0.71      1229
weighted avg       0.87      0.85      0.86      1229

Test Precision: 0.4318, Test Recall: 0.6013, Test Macro F1: 0.7061, Test Accuracy: 0.8470 
Test Precision: 1.0000, Test Recall: 0.0000, Test Macro F1: 0.4657, Test Accuracy: 0.8714 


Train epoch 2/7: 100%|██████████| 205/205 [01:29<00:00,  2.30it/s]


Epoch [2/7], Train Loss: 0.4188, Train Accuracy: 0.8322 


Eval model: 100%|██████████| 52/52 [00:11<00:00,  4.36it/s]


              precision    recall  f1-score   support

           0       0.95      0.76      0.85      1071
           1       0.31      0.73      0.44       158

    accuracy                           0.76      1229
   macro avg       0.63      0.75      0.64      1229
weighted avg       0.87      0.76      0.79      1229

Epoch [2/7], Test Loss: 0.3430,Test Precision: 0.3144, Test Recall: 0.7342, Test Macro F1: 0.6437, Test Accuracy: 0.7600 
              precision    recall  f1-score   support

           0       0.94      0.92      0.93      1071
           1       0.52      0.61      0.56       158

    accuracy                           0.88      1229
   macro avg       0.73      0.76      0.74      1229
weighted avg       0.89      0.88      0.88      1229

Test Precision: 0.5189, Test Recall: 0.6076, Test Macro F1: 0.7442, Test Accuracy: 0.8771 
Test Precision: 0.5676, Test Recall: 0.5316, Test Macro F1: 0.7424, Test Accuracy: 0.8877 


Train epoch 3/7: 100%|██████████| 205/205 [01:23<00:00,  2.45it/s]


Epoch [3/7], Train Loss: 0.4069, Train Accuracy: 0.8284 


Eval model: 100%|██████████| 52/52 [00:13<00:00,  3.95it/s]


              precision    recall  f1-score   support

           0       0.95      0.63      0.76      1071
           1       0.24      0.78      0.37       158

    accuracy                           0.65      1229
   macro avg       0.60      0.71      0.56      1229
weighted avg       0.86      0.65      0.71      1229

Epoch [3/7], Test Loss: 0.4033,Test Precision: 0.2385, Test Recall: 0.7848, Test Macro F1: 0.5621, Test Accuracy: 0.6501 
              precision    recall  f1-score   support

           0       0.94      0.87      0.91      1071
           1       0.42      0.64      0.51       158

    accuracy                           0.84      1229
   macro avg       0.68      0.76      0.71      1229
weighted avg       0.88      0.84      0.85      1229

Test Precision: 0.4226, Test Recall: 0.6392, Test Macro F1: 0.7071, Test Accuracy: 0.8413 
Test Precision: 0.5112, Test Recall: 0.5759, Test Macro F1: 0.7345, Test Accuracy: 0.8747 


Train epoch 4/7: 100%|██████████| 205/205 [01:28<00:00,  2.31it/s]


Epoch [4/7], Train Loss: 0.3409, Train Accuracy: 0.8670 


Eval model: 100%|██████████| 52/52 [00:12<00:00,  4.03it/s]


              precision    recall  f1-score   support

           0       0.95      0.74      0.84      1071
           1       0.30      0.75      0.43       158

    accuracy                           0.75      1229
   macro avg       0.63      0.75      0.63      1229
weighted avg       0.87      0.75      0.78      1229

Epoch [4/7], Test Loss: 0.3730,Test Precision: 0.3028, Test Recall: 0.7532, Test Macro F1: 0.6339, Test Accuracy: 0.7453 
              precision    recall  f1-score   support

           0       0.94      0.89      0.92      1071
           1       0.47      0.65      0.55       158

    accuracy                           0.86      1229
   macro avg       0.71      0.77      0.73      1229
weighted avg       0.88      0.86      0.87      1229

Test Precision: 0.4722, Test Recall: 0.6456, Test Macro F1: 0.7319, Test Accuracy: 0.8617 
Test Precision: 0.5263, Test Recall: 0.5696, Test Macro F1: 0.7386, Test Accuracy: 0.8788 


Train epoch 5/7: 100%|██████████| 205/205 [01:24<00:00,  2.41it/s]


Epoch [5/7], Train Loss: 0.3335, Train Accuracy: 0.8680 


Eval model: 100%|██████████| 52/52 [00:14<00:00,  3.62it/s]


              precision    recall  f1-score   support

           0       0.95      0.74      0.84      1071
           1       0.30      0.75      0.43       158

    accuracy                           0.75      1229
   macro avg       0.63      0.75      0.63      1229
weighted avg       0.87      0.75      0.78      1229

Epoch [5/7], Test Loss: 0.3730,Test Precision: 0.3028, Test Recall: 0.7532, Test Macro F1: 0.6339, Test Accuracy: 0.7453 
              precision    recall  f1-score   support

           0       0.94      0.89      0.92      1071
           1       0.47      0.65      0.55       158

    accuracy                           0.86      1229
   macro avg       0.71      0.77      0.73      1229
weighted avg       0.88      0.86      0.87      1229

Test Precision: 0.4722, Test Recall: 0.6456, Test Macro F1: 0.7319, Test Accuracy: 0.8617 
Test Precision: 0.5263, Test Recall: 0.5696, Test Macro F1: 0.7386, Test Accuracy: 0.8788 


Train epoch 6/7: 100%|██████████| 205/205 [01:31<00:00,  2.24it/s]


Epoch [6/7], Train Loss: 0.3277, Train Accuracy: 0.8749 


Eval model: 100%|██████████| 52/52 [00:12<00:00,  4.07it/s]


              precision    recall  f1-score   support

           0       0.95      0.74      0.84      1071
           1       0.30      0.75      0.43       158

    accuracy                           0.75      1229
   macro avg       0.63      0.75      0.63      1229
weighted avg       0.87      0.75      0.78      1229

Epoch [6/7], Test Loss: 0.3730,Test Precision: 0.3028, Test Recall: 0.7532, Test Macro F1: 0.6339, Test Accuracy: 0.7453 
              precision    recall  f1-score   support

           0       0.94      0.89      0.92      1071
           1       0.47      0.65      0.55       158

    accuracy                           0.86      1229
   macro avg       0.71      0.77      0.73      1229
weighted avg       0.88      0.86      0.87      1229

Test Precision: 0.4722, Test Recall: 0.6456, Test Macro F1: 0.7319, Test Accuracy: 0.8617 
Test Precision: 0.5263, Test Recall: 0.5696, Test Macro F1: 0.7386, Test Accuracy: 0.8788 


Train epoch 7/7: 100%|██████████| 205/205 [01:22<00:00,  2.47it/s]


Epoch [7/7], Train Loss: 0.3347, Train Accuracy: 0.8730 


Eval model: 100%|██████████| 52/52 [00:12<00:00,  4.12it/s]


              precision    recall  f1-score   support

           0       0.95      0.74      0.84      1071
           1       0.30      0.75      0.43       158

    accuracy                           0.75      1229
   macro avg       0.63      0.75      0.63      1229
weighted avg       0.87      0.75      0.78      1229

Epoch [7/7], Test Loss: 0.3730,Test Precision: 0.3028, Test Recall: 0.7532, Test Macro F1: 0.6339, Test Accuracy: 0.7453 
              precision    recall  f1-score   support

           0       0.94      0.89      0.92      1071
           1       0.47      0.65      0.55       158

    accuracy                           0.86      1229
   macro avg       0.71      0.77      0.73      1229
weighted avg       0.88      0.86      0.87      1229

Test Precision: 0.4722, Test Recall: 0.6456, Test Macro F1: 0.7319, Test Accuracy: 0.8617 
Test Precision: 0.5263, Test Recall: 0.5696, Test Macro F1: 0.7386, Test Accuracy: 0.8788 
Training complete!


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Tokenizing data: 100%|██████████| 4916/4916 [00:02<00:00, 1752.86it/s]
Tokenizing data: 100%|██████████| 1229/1229 [00:00<00:00, 1962.14it/s]
Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2ForSequenceClassification: ['mask_predictions.dense.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.dense.weight', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias']
- This IS expected if you are initializing DebertaV2ForSequenceClassification from the

cuda


Train epoch 1/7: 100%|██████████| 205/205 [01:22<00:00,  2.48it/s]


Epoch [1/7], Train Loss: 0.6004, Train Accuracy: 0.6778 


Eval model: 100%|██████████| 52/52 [00:14<00:00,  3.55it/s]


              precision    recall  f1-score   support

           0       0.95      0.72      0.82      1071
           1       0.28      0.75      0.41       158

    accuracy                           0.72      1229
   macro avg       0.62      0.73      0.61      1229
weighted avg       0.87      0.72      0.76      1229

Epoch [1/7], Test Loss: 0.3178,Test Precision: 0.2813, Test Recall: 0.7532, Test Macro F1: 0.6135, Test Accuracy: 0.7209 
              precision    recall  f1-score   support

           0       0.93      0.94      0.93      1071
           1       0.55      0.53      0.54       158

    accuracy                           0.88      1229
   macro avg       0.74      0.73      0.74      1229
weighted avg       0.88      0.88      0.88      1229

Test Precision: 0.5490, Test Recall: 0.5316, Test Macro F1: 0.7368, Test Accuracy: 0.8836 
Test Precision: 1.0000, Test Recall: 0.0000, Test Macro F1: 0.4657, Test Accuracy: 0.8714 


Train epoch 2/7: 100%|██████████| 205/205 [01:23<00:00,  2.46it/s]


Epoch [2/7], Train Loss: 0.4647, Train Accuracy: 0.8021 


Eval model: 100%|██████████| 52/52 [00:14<00:00,  3.53it/s]


              precision    recall  f1-score   support

           0       0.96      0.68      0.80      1071
           1       0.27      0.78      0.40       158

    accuracy                           0.70      1229
   macro avg       0.61      0.73      0.60      1229
weighted avg       0.87      0.70      0.75      1229

Epoch [2/7], Test Loss: 0.3191,Test Precision: 0.2672, Test Recall: 0.7848, Test Macro F1: 0.5975, Test Accuracy: 0.6957 
              precision    recall  f1-score   support

           0       0.93      0.95      0.94      1071
           1       0.58      0.50      0.54       158

    accuracy                           0.89      1229
   macro avg       0.75      0.72      0.74      1229
weighted avg       0.88      0.89      0.89      1229

Test Precision: 0.5766, Test Recall: 0.5000, Test Macro F1: 0.7361, Test Accuracy: 0.8885 
Test Precision: 0.6569, Test Recall: 0.4241, Test Macro F1: 0.7290, Test Accuracy: 0.8975 


Train epoch 3/7: 100%|██████████| 205/205 [01:26<00:00,  2.36it/s]


Epoch [3/7], Train Loss: 0.3989, Train Accuracy: 0.8383 


Eval model: 100%|██████████| 52/52 [00:13<00:00,  3.88it/s]


              precision    recall  f1-score   support

           0       0.95      0.64      0.77      1071
           1       0.24      0.78      0.37       158

    accuracy                           0.66      1229
   macro avg       0.60      0.71      0.57      1229
weighted avg       0.86      0.66      0.72      1229

Epoch [3/7], Test Loss: 0.3592,Test Precision: 0.2441, Test Recall: 0.7848, Test Macro F1: 0.5696, Test Accuracy: 0.6599 
              precision    recall  f1-score   support

           0       0.93      0.91      0.92      1071
           1       0.48      0.54      0.51       158

    accuracy                           0.87      1229
   macro avg       0.71      0.73      0.72      1229
weighted avg       0.87      0.87      0.87      1229

Test Precision: 0.4831, Test Recall: 0.5443, Test Macro F1: 0.7173, Test Accuracy: 0.8666 
Test Precision: 0.5833, Test Recall: 0.4873, Test Macro F1: 0.7342, Test Accuracy: 0.8893 


Train epoch 4/7: 100%|██████████| 205/205 [01:23<00:00,  2.44it/s]


Epoch [4/7], Train Loss: 0.3424, Train Accuracy: 0.8636 


Eval model: 100%|██████████| 52/52 [00:12<00:00,  4.15it/s]


              precision    recall  f1-score   support

           0       0.95      0.74      0.83      1071
           1       0.29      0.74      0.42       158

    accuracy                           0.74      1229
   macro avg       0.62      0.74      0.63      1229
weighted avg       0.87      0.74      0.78      1229

Epoch [4/7], Test Loss: 0.3400,Test Precision: 0.2932, Test Recall: 0.7405, Test Macro F1: 0.6251, Test Accuracy: 0.7372 
              precision    recall  f1-score   support

           0       0.93      0.93      0.93      1071
           1       0.54      0.55      0.55       158

    accuracy                           0.88      1229
   macro avg       0.74      0.74      0.74      1229
weighted avg       0.88      0.88      0.88      1229

Test Precision: 0.5404, Test Recall: 0.5506, Test Macro F1: 0.7388, Test Accuracy: 0.8820 
Test Precision: 0.6148, Test Recall: 0.4747, Test Macro F1: 0.7380, Test Accuracy: 0.8942 


Train epoch 5/7: 100%|██████████| 205/205 [01:27<00:00,  2.35it/s]


Epoch [5/7], Train Loss: 0.3409, Train Accuracy: 0.8648 


Eval model: 100%|██████████| 52/52 [00:13<00:00,  3.89it/s]


              precision    recall  f1-score   support

           0       0.95      0.74      0.83      1071
           1       0.29      0.74      0.42       158

    accuracy                           0.74      1229
   macro avg       0.62      0.74      0.63      1229
weighted avg       0.87      0.74      0.78      1229

Epoch [5/7], Test Loss: 0.3400,Test Precision: 0.2932, Test Recall: 0.7405, Test Macro F1: 0.6251, Test Accuracy: 0.7372 
              precision    recall  f1-score   support

           0       0.93      0.93      0.93      1071
           1       0.54      0.55      0.55       158

    accuracy                           0.88      1229
   macro avg       0.74      0.74      0.74      1229
weighted avg       0.88      0.88      0.88      1229

Test Precision: 0.5404, Test Recall: 0.5506, Test Macro F1: 0.7388, Test Accuracy: 0.8820 
Test Precision: 0.6148, Test Recall: 0.4747, Test Macro F1: 0.7380, Test Accuracy: 0.8942 


Train epoch 6/7: 100%|██████████| 205/205 [01:31<00:00,  2.25it/s]


Epoch [6/7], Train Loss: 0.3256, Train Accuracy: 0.8757 


Eval model: 100%|██████████| 52/52 [00:13<00:00,  3.92it/s]


              precision    recall  f1-score   support

           0       0.95      0.74      0.83      1071
           1       0.29      0.74      0.42       158

    accuracy                           0.74      1229
   macro avg       0.62      0.74      0.63      1229
weighted avg       0.87      0.74      0.78      1229

Epoch [6/7], Test Loss: 0.3400,Test Precision: 0.2932, Test Recall: 0.7405, Test Macro F1: 0.6251, Test Accuracy: 0.7372 
              precision    recall  f1-score   support

           0       0.93      0.93      0.93      1071
           1       0.54      0.55      0.55       158

    accuracy                           0.88      1229
   macro avg       0.74      0.74      0.74      1229
weighted avg       0.88      0.88      0.88      1229

Test Precision: 0.5404, Test Recall: 0.5506, Test Macro F1: 0.7388, Test Accuracy: 0.8820 
Test Precision: 0.6148, Test Recall: 0.4747, Test Macro F1: 0.7380, Test Accuracy: 0.8942 


Train epoch 7/7: 100%|██████████| 205/205 [01:25<00:00,  2.40it/s]


Epoch [7/7], Train Loss: 0.3411, Train Accuracy: 0.8639 


Eval model: 100%|██████████| 52/52 [00:14<00:00,  3.50it/s]


              precision    recall  f1-score   support

           0       0.95      0.74      0.83      1071
           1       0.29      0.74      0.42       158

    accuracy                           0.74      1229
   macro avg       0.62      0.74      0.63      1229
weighted avg       0.87      0.74      0.78      1229

Epoch [7/7], Test Loss: 0.3400,Test Precision: 0.2932, Test Recall: 0.7405, Test Macro F1: 0.6251, Test Accuracy: 0.7372 
              precision    recall  f1-score   support

           0       0.93      0.93      0.93      1071
           1       0.54      0.55      0.55       158

    accuracy                           0.88      1229
   macro avg       0.74      0.74      0.74      1229
weighted avg       0.88      0.88      0.88      1229

Test Precision: 0.5404, Test Recall: 0.5506, Test Macro F1: 0.7388, Test Accuracy: 0.8820 
Test Precision: 0.6148, Test Recall: 0.4747, Test Macro F1: 0.7380, Test Accuracy: 0.8942 
Training complete!


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Tokenizing data: 100%|██████████| 4916/4916 [00:02<00:00, 1729.75it/s]
Tokenizing data: 100%|██████████| 1229/1229 [00:00<00:00, 2091.34it/s]
Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2ForSequenceClassification: ['mask_predictions.dense.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.dense.weight', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias']
- This IS expected if you are initializing DebertaV2ForSequenceClassification from the

cuda


Train epoch 1/7: 100%|██████████| 205/205 [01:28<00:00,  2.31it/s]


Epoch [1/7], Train Loss: 0.6029, Train Accuracy: 0.6801 


Eval model: 100%|██████████| 52/52 [00:12<00:00,  4.02it/s]


              precision    recall  f1-score   support

           0       0.96      0.30      0.46      1071
           1       0.16      0.92      0.28       158

    accuracy                           0.38      1229
   macro avg       0.56      0.61      0.37      1229
weighted avg       0.86      0.38      0.44      1229

Epoch [1/7], Test Loss: 0.3452,Test Precision: 0.1629, Test Recall: 0.9177, Test Macro F1: 0.3696, Test Accuracy: 0.3832 
              precision    recall  f1-score   support

           0       0.93      0.93      0.93      1071
           1       0.55      0.56      0.55       158

    accuracy                           0.88      1229
   macro avg       0.74      0.74      0.74      1229
weighted avg       0.89      0.88      0.88      1229

Test Precision: 0.5500, Test Recall: 0.5570, Test Macro F1: 0.7436, Test Accuracy: 0.8845 
Test Precision: 1.0000, Test Recall: 0.0000, Test Macro F1: 0.4657, Test Accuracy: 0.8714 


Train epoch 2/7: 100%|██████████| 205/205 [01:20<00:00,  2.54it/s]


Epoch [2/7], Train Loss: 0.4484, Train Accuracy: 0.8164 


Eval model: 100%|██████████| 52/52 [00:15<00:00,  3.44it/s]


              precision    recall  f1-score   support

           0       0.96      0.64      0.76      1071
           1       0.24      0.80      0.37       158

    accuracy                           0.66      1229
   macro avg       0.60      0.72      0.57      1229
weighted avg       0.86      0.66      0.71      1229

Epoch [2/7], Test Loss: 0.3304,Test Precision: 0.2442, Test Recall: 0.7975, Test Macro F1: 0.5687, Test Accuracy: 0.6566 
              precision    recall  f1-score   support

           0       0.94      0.93      0.93      1071
           1       0.56      0.57      0.56       158

    accuracy                           0.89      1229
   macro avg       0.75      0.75      0.75      1229
weighted avg       0.89      0.89      0.89      1229

Test Precision: 0.5556, Test Recall: 0.5696, Test Macro F1: 0.7485, Test Accuracy: 0.8861 
Test Precision: 0.6311, Test Recall: 0.4873, Test Macro F1: 0.7461, Test Accuracy: 0.8975 


Train epoch 3/7: 100%|██████████| 205/205 [01:25<00:00,  2.41it/s]


Epoch [3/7], Train Loss: 0.4025, Train Accuracy: 0.8333 


Eval model: 100%|██████████| 52/52 [00:15<00:00,  3.35it/s]


              precision    recall  f1-score   support

           0       0.95      0.81      0.88      1071
           1       0.36      0.70      0.47       158

    accuracy                           0.80      1229
   macro avg       0.65      0.76      0.67      1229
weighted avg       0.87      0.80      0.82      1229

Epoch [3/7], Test Loss: 0.3067,Test Precision: 0.3558, Test Recall: 0.7025, Test Macro F1: 0.6738, Test Accuracy: 0.7982 
              precision    recall  f1-score   support

           0       0.94      0.94      0.94      1071
           1       0.58      0.58      0.58       158

    accuracy                           0.89      1229
   macro avg       0.76      0.76      0.76      1229
weighted avg       0.89      0.89      0.89      1229

Test Precision: 0.5759, Test Recall: 0.5759, Test Macro F1: 0.7567, Test Accuracy: 0.8910 
Test Precision: 0.6087, Test Recall: 0.5316, Test Macro F1: 0.7542, Test Accuracy: 0.8959 


Train epoch 4/7: 100%|██████████| 205/205 [01:29<00:00,  2.29it/s]


Epoch [4/7], Train Loss: 0.3537, Train Accuracy: 0.8600 


Eval model: 100%|██████████| 52/52 [00:13<00:00,  3.90it/s]


              precision    recall  f1-score   support

           0       0.95      0.79      0.86      1071
           1       0.32      0.70      0.44       158

    accuracy                           0.77      1229
   macro avg       0.63      0.74      0.65      1229
weighted avg       0.87      0.77      0.80      1229

Epoch [4/7], Test Loss: 0.3056,Test Precision: 0.3235, Test Recall: 0.6962, Test Macro F1: 0.6500, Test Accuracy: 0.7738 
              precision    recall  f1-score   support

           0       0.93      0.94      0.94      1071
           1       0.58      0.54      0.56       158

    accuracy                           0.89      1229
   macro avg       0.76      0.74      0.75      1229
weighted avg       0.89      0.89      0.89      1229

Test Precision: 0.5811, Test Recall: 0.5443, Test Macro F1: 0.7499, Test Accuracy: 0.8910 
Test Precision: 0.6638, Test Recall: 0.4873, Test Macro F1: 0.7535, Test Accuracy: 0.9024 


Train epoch 5/7: 100%|██████████| 205/205 [01:30<00:00,  2.26it/s]


Epoch [5/7], Train Loss: 0.3602, Train Accuracy: 0.8594 


Eval model: 100%|██████████| 52/52 [00:12<00:00,  4.02it/s]


              precision    recall  f1-score   support

           0       0.95      0.79      0.86      1071
           1       0.32      0.70      0.44       158

    accuracy                           0.77      1229
   macro avg       0.63      0.74      0.65      1229
weighted avg       0.87      0.77      0.80      1229

Epoch [5/7], Test Loss: 0.3056,Test Precision: 0.3235, Test Recall: 0.6962, Test Macro F1: 0.6500, Test Accuracy: 0.7738 
              precision    recall  f1-score   support

           0       0.93      0.94      0.94      1071
           1       0.58      0.54      0.56       158

    accuracy                           0.89      1229
   macro avg       0.76      0.74      0.75      1229
weighted avg       0.89      0.89      0.89      1229

Test Precision: 0.5811, Test Recall: 0.5443, Test Macro F1: 0.7499, Test Accuracy: 0.8910 
Test Precision: 0.6638, Test Recall: 0.4873, Test Macro F1: 0.7535, Test Accuracy: 0.9024 


Train epoch 6/7: 100%|██████████| 205/205 [01:28<00:00,  2.32it/s]


Epoch [6/7], Train Loss: 0.3635, Train Accuracy: 0.8587 


Eval model: 100%|██████████| 52/52 [00:12<00:00,  4.29it/s]


              precision    recall  f1-score   support

           0       0.95      0.79      0.86      1071
           1       0.32      0.70      0.44       158

    accuracy                           0.77      1229
   macro avg       0.63      0.74      0.65      1229
weighted avg       0.87      0.77      0.80      1229

Epoch [6/7], Test Loss: 0.3056,Test Precision: 0.3235, Test Recall: 0.6962, Test Macro F1: 0.6500, Test Accuracy: 0.7738 
              precision    recall  f1-score   support

           0       0.93      0.94      0.94      1071
           1       0.58      0.54      0.56       158

    accuracy                           0.89      1229
   macro avg       0.76      0.74      0.75      1229
weighted avg       0.89      0.89      0.89      1229

Test Precision: 0.5811, Test Recall: 0.5443, Test Macro F1: 0.7499, Test Accuracy: 0.8910 
Test Precision: 0.6638, Test Recall: 0.4873, Test Macro F1: 0.7535, Test Accuracy: 0.9024 


Train epoch 7/7: 100%|██████████| 205/205 [01:26<00:00,  2.36it/s]


Epoch [7/7], Train Loss: 0.3440, Train Accuracy: 0.8665 


Eval model: 100%|██████████| 52/52 [00:13<00:00,  3.79it/s]


              precision    recall  f1-score   support

           0       0.95      0.79      0.86      1071
           1       0.32      0.70      0.44       158

    accuracy                           0.77      1229
   macro avg       0.63      0.74      0.65      1229
weighted avg       0.87      0.77      0.80      1229

Epoch [7/7], Test Loss: 0.3056,Test Precision: 0.3235, Test Recall: 0.6962, Test Macro F1: 0.6500, Test Accuracy: 0.7738 
              precision    recall  f1-score   support

           0       0.93      0.94      0.94      1071
           1       0.58      0.54      0.56       158

    accuracy                           0.89      1229
   macro avg       0.76      0.74      0.75      1229
weighted avg       0.89      0.89      0.89      1229

Test Precision: 0.5811, Test Recall: 0.5443, Test Macro F1: 0.7499, Test Accuracy: 0.8910 
Test Precision: 0.6638, Test Recall: 0.4873, Test Macro F1: 0.7535, Test Accuracy: 0.9024 
Training complete!


In [35]:
f"saved_model/{model_name}_5/7e_{lr}lr_"

'saved_model/derbertav3_base_oversampling_5/7e_1e-05lr_'

In [36]:
epochs=7

In [162]:

df_test["IsInstructionClear"] = df_test.IsInstructionClear.replace({"Yes": 0, "No": 1})
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_hug)


datasetval = get_tensor_dataset(df_test, tokenizer)


val_dataloader = DataLoader(
    datasetval, batch_size=batch_size, pin_memory=True, num_workers=4
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Tokenizing data: 100%|██████████| 683/683 [00:00<00:00, 1439.61it/s]


In [90]:
device ='cuda'

In [91]:
f"saved_model/{model_name}_{epoch}/{epochs}e_{lr}lr_f{fold}/pytorch_model.bin"

'saved_model/deberta_v3_base_proper_5/7e_1e-05lr_f4/pytorch_model.bin'

In [97]:
all_res=[]
for fold in range(FOLDS):
  res=[]
  true_labels=[]
  model = AutoModelForSequenceClassification.from_pretrained(model_hug, num_labels=2)
  model.cuda()
  model.load_state_dict(torch.load(f"saved_model/{model_name}_{epoch}/{epochs}e_{lr}lr_f{fold}/pytorch_model.bin"))
  model.eval() 
  
  for _, batch in tqdm(
            enumerate(val_dataloader),
            total=len(val_dataloader),
            #desc=f"Train epoch {epoch+1}/{epochs}",
        ):
    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_labels = batch[2].to(device)
    with torch.no_grad():
      pred = torch.softmax(model(b_input_ids,attention_mask=b_input_mask).logits,axis=1).detach().cpu().numpy()

    #res = int(pred[:,1]>0.5)
    res.append((pred[:,1]>0.5).astype(int))
    true_labels.append(b_labels.detach().cpu().numpy())
  all_res.append(np.concatenate(res))
all_res = np.stack(all_res,axis=1)
final = np.mean(all_res,axis=1)>3/5
labels=np.concatenate(true_labels)


f1_score(labels,final,average="macro")

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2ForSequenceClassification: ['mask_predictions.dense.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.dense.weight', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias']
- This IS expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a

0.7495075027244531

In [247]:
all_res=[]
for fold in range(FOLDS):
  res=[]
  true_labels=[]
  model = AutoModelForSequenceClassification.from_pretrained(model_hug, num_labels=2)
  model.cuda()
  model.load_state_dict(torch.load(f"saved_model/{model_name}_{epoch}/{epochs}e_{lr}lr_f{fold}/pytorch_model.bin"))
  model.eval() 
  
  for _, batch in tqdm(
            enumerate(val_dataloader),
            total=len(val_dataloader),
            #desc=f"Train epoch {epoch+1}/{epochs}",
        ):
    b_input_ids = batch[0].to(device)
    b_input_mask = batch[1].to(device)
    b_labels = batch[2].to(device)
    with torch.no_grad():
      pred = torch.softmax(model(b_input_ids,attention_mask=b_input_mask).logits,axis=1).detach().cpu().numpy()

    #res = int(pred[:,1]>0.5)
    #res.append((pred[:,1]>0.5).astype(int))
    res.append((pred[:,1]))
    true_labels.append(b_labels.detach().cpu().numpy())
  all_res.append(np.concatenate(res))
all_res = np.stack(all_res,axis=1)
final = np.mean(all_res,axis=1)>0.5
labels=np.concatenate(true_labels)


f1_score(labels,final,average="macro")

Some weights of the model checkpoint at microsoft/deberta-v3-base were not used when initializing DebertaV2ForSequenceClassification: ['mask_predictions.dense.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.dense.weight', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.bias']
- This IS expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a

0.7554600787683494

In [245]:
final = np.mean(all_res,axis=1)>0.65
labels=np.concatenate(true_labels)
f1_score(labels,final,average="macro")

0.7671983950761561

In [39]:
block_colour_name_map = {
    # voxelworld's colour id : iglu colour id
    0: "air",
    1: "blue",
    6: "yellow",
    2: "green",
    4: "orange",
    5: "purple",
    3: "red",
}

In [40]:

original_colours =[val for key,val in block_colour_name_map.items()if val !='air']

In [None]:
gen = default_rng(seed=42)
gen.permutation(original_colours)
cur_per = gen.permutation(original_colours)
no_per = {key:key for key in original_colours}
per_dict ={ key:val for key,val in zip(original_colours,cur_per)}

In [51]:
import re
def replace_full_words(text, dic):
    for i, j in dic.items():
        text = re.sub(r"\b%s\b" % i, j, text)
        # r"\b%s\b"% enables replacing by whole word matches only
    return text

In [52]:
sentence = 'There are 6 levels. There are 12 different blocks. At the 0th level there are 2 red  blocks  Above at the 1st level there are 4 red  blocks  Above at the 2nd level there are 2 red and 1 green  blocks  Above at the 3rd level there are 1 green  blocks  Above at the 4th level there are 1 green  blocks  Above at the 5th level there are 1 green  blocks '

In [53]:
replace_full_words(sentence,per_dict)

'There are 6 levels. There are 12 different blocks. At the 0th level there are 2 red  blocks  Above at the 1st level there are 4 red  blocks  Above at the 2nd level there are 2 red and 1 blue  blocks  Above at the 3rd level there are 1 blue  blocks  Above at the 4th level there are 1 blue  blocks  Above at the 5th level there are 1 blue  blocks '

In [None]:
def get_tensor_dataset_with_permutations(df, tokenizer,per_dict=no_per):

    # Tokenize all of the sentences and map the tokens to thier word IDs.
    input_ids = []
    token_type_ids = []
    attention_masks = []
    labels = []
    topic_ids = []

    for count, item in tqdm(
        enumerate(
            zip(
                df["GameId"],
                df["bylevel_color_context"],  # df["nonspatial_color_context"],
                df["InputInstruction"],
                df["IsInstructionClear"],
            )
        ),
        total=len(df),
        desc="Tokenizing data",
    ):
        z, w, x, y = item

        w = replace_full_words(w,per_dict)
        x = replace_full_words(x,per_dict)
        encoded_dict = tokenizer.encode_plus(
            w,
            x,
            add_special_tokens=True,
            max_length=max_seq_length,
            padding="max_length",  # use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'`
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt",
        )

        input_ids.append(encoded_dict["input_ids"])

        if "token_type_ids" in encoded_dict:
            token_type_ids.append(encoded_dict["token_type_ids"])

        attention_masks.append(encoded_dict["attention_mask"])
        labels.append(y)

        topic_ids.append(z)

    input_ids = torch.cat(input_ids, dim=0)
    attention_masks = torch.cat(attention_masks, dim=0)
    labels = torch.tensor(labels)
    return TensorDataset(input_ids, attention_masks, labels)


In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_hug)
all_per_preds=[]
for step in range(5):
  if step==0:
      datasetval = get_tensor_dataset_with_permutations(df_test, tokenizer)
  else:
      cur_per = gen.permutation(original_colours)
      per_dict ={ key:val for key,val in zip(original_colours,cur_per)}
      datasetval = get_tensor_dataset_with_permutations(df_test, tokenizer,per_dict)

  val_dataloader = DataLoader(
      datasetval, batch_size=batch_size, pin_memory=True, num_workers=4
  )

  all_res=[]
  for fold in range(FOLDS):
    res=[]
    true_labels=[]
    model = AutoModelForSequenceClassification.from_pretrained(model_hug, num_labels=2)
    model.cuda()
    model.load_state_dict(torch.load(f"saved_model/{model_name}_{epoch}/{epochs}e_{lr}lr_f{fold}/pytorch_model.bin"))
    model.eval() 
    
    for _, batch in tqdm(
              enumerate(val_dataloader),
              total=len(val_dataloader),
              #desc=f"Train epoch {epoch+1}/{epochs}",
          ):
      b_input_ids = batch[0].to(device)
      b_input_mask = batch[1].to(device)
      b_labels = batch[2].to(device)
      with torch.no_grad():
        pred = torch.softmax(model(b_input_ids,attention_mask=b_input_mask).logits,axis=1).detach().cpu().numpy()

      #res = int(pred[:,1]>0.5)
      #res.append((pred[:,1]>0.5).astype(int))
      res.append((pred[:,1]))
      true_labels.append(b_labels.detach().cpu().numpy())
    all_res.append(np.concatenate(res))
  all_res = np.stack(all_res,axis=1)
  final = np.mean(all_res,axis=1)>0.5
  labels=np.concatenate(true_labels)


  f1_score(labels,final,average="macro")
  del model
  del batch
  torch.cuda.empty_cache()

  all_per_preds.append(all_res)