In [1]:

from IPython.display import clear_output
import torch
import evaluate
import numpy as np
from datasets import load_from_disk, disable_caching, concatenate_datasets
from sklearn.metrics import f1_score
from transformers import (
    AutoTokenizer,
    DataCollatorWithPadding,
    Trainer,
    TrainingArguments,
    set_seed,
)
import sys
import os
sys.path.append(os.path.abspath('../../modules'))
from experiment_1.RoBERTaEntity import RoBERTaEntity
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
import random
seeds = [random.randint(0, 1e9) for _ in range(5)]
seeds

[900263936, 237090229, 900569911, 553032617, 115865595]

In [3]:
disable_caching()

In [4]:
num_labels = 5
id2label = {
    0: "reject",
    1: "B_supplies_A",
    2: "A_supplies_B",
    3: "ambiguous",
    4: "ownership",
}
label2id = {
    "reject": 0,
    "B_supplies_A": 1,
    "A_supplies_B": 2,
    "ambiguous": 3,
    "ownership": 4,
}
metric = evaluate.load("f1")

In [5]:
model_name = "FacebookAI/roberta-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.add_special_tokens(
    {"additional_special_tokens": ["__NE_FROM__", "__NE_TO__", "__NE_OTHER__"]}
)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [6]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    f1_micro = f1_score(labels, predictions, average='micro')
    f1_macro = f1_score(labels, predictions, average='macro')
    f1_classwise = f1_score(labels, predictions, average=None)

    return {
        "f1_micro": f1_micro,
        "f1_macro": f1_macro,
        **{f"f1_class_{i}": score for i, score in enumerate(f1_classwise)}
    }


def model_init():
    model = RoBERTaEntity.from_pretrained(
        model_name,
        num_labels=5,
        id2label=id2label,
        label2id=label2id,
    )
    model.resize_token_embeddings(len(tokenizer))
    return model

In [7]:
ds_zero = load_from_disk("../../datasets/ZeroShotDataset")
ds_zero = ds_zero.select_columns(["masked_text", "label"])
ds_zero = ds_zero.rename_column("masked_text", "text")
ds_zero = ds_zero.map(preprocess_function, batched=True)

ds_few = load_from_disk("../../datasets/TenShotDataset")
ds_few = ds_few.select_columns(["masked_text", "label"])
ds_few = ds_few.rename_column("masked_text", "text")
ds_few = ds_few.map(preprocess_function, batched=True)

ds_two = load_from_disk("../../datasets/TwoStageDataset")
ds_two = ds_two.select_columns(["masked_text", "label"])
ds_two = ds_two.rename_column("masked_text", "text")
ds_two = ds_two.map(preprocess_function, batched=True)

ds = load_from_disk("../../datasets/ManualDataset")
ds = ds.select_columns(["masked_text", "label"])
ds = ds.rename_column("masked_text", "text")
ds = ds.map(preprocess_function, batched=True)

Map:   0%|          | 0/2246 [00:00<?, ? examples/s]

Map:   0%|          | 0/2954 [00:00<?, ? examples/s]

Map:   0%|          | 0/1765 [00:00<?, ? examples/s]

Map:   0%|          | 0/2559 [00:00<?, ? examples/s]

Map:   0%|          | 0/418 [00:00<?, ? examples/s]

Map:   0%|          | 0/745 [00:00<?, ? examples/s]

In [8]:
def run_experiment(seed, train_dataset, output_dir="logs/experiment_2"):
    set_seed(seed)
    training_args = TrainingArguments(
        seed=seed,
        data_seed=seed,
        tf32=True,
        output_dir=output_dir,
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=10,
        weight_decay=0.01,
        eval_strategy="epoch",
        save_strategy="epoch",
        warmup_ratio=0.1,
        load_best_model_at_end=True,
        save_total_limit=1,
        report_to=[],
        save_only_model=True,
    )
    trainer = Trainer(
        model_init=model_init,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=ds["valid"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )
    trainer.train()
    test_results = trainer.predict(ds["test"])
    return test_results

# ZeroShotDataset

In [9]:
seeds = [115624676, 313063585, 725591958, 966378087, 644779121]
all_results = []
for seed in seeds:
    results = run_experiment(seed, ds_zero)
    all_results.append(results)
clear_output(wait=False)
metrics = ["test_f1_micro", "test_f1_macro"] + [f"test_f1_class_{i}" for i in range(5)]
avg_results = {}
for metric in metrics:
    scores = [r[metric] for r in [x.metrics for x in all_results]]
    avg_results[metric] = {
        'mean': np.mean(scores),
        'std': np.std(scores)
    }
for metric in metrics:
    print(f"Average {metric}: {avg_results[metric]['mean']:.4f} ± {avg_results[metric]['std']:.4f}")
torch.cuda.empty_cache()

Average test_f1_micro: 0.5431 ± 0.0295
Average test_f1_macro: 0.4733 ± 0.0318
Average test_f1_class_0: 0.6474 ± 0.0221
Average test_f1_class_1: 0.4486 ± 0.0641
Average test_f1_class_2: 0.5203 ± 0.0772
Average test_f1_class_3: 0.2480 ± 0.0415
Average test_f1_class_4: 0.5021 ± 0.0387


# TenShotDataset

In [10]:
seeds = [764077453, 710632782, 865707432, 482052655, 452353057]
all_results = []
for seed in seeds:
    results = run_experiment(seed, ds_few)
    all_results.append(results)
clear_output(wait=False)
metrics = ["test_f1_micro", "test_f1_macro"] + [f"test_f1_class_{i}" for i in range(5)]
avg_results = {}
for metric in metrics:
    scores = [r[metric] for r in [x.metrics for x in all_results]]
    avg_results[metric] = {
        'mean': np.mean(scores),
        'std': np.std(scores)
    }
for metric in metrics:
    print(f"Average {metric}: {avg_results[metric]['mean']:.4f} ± {avg_results[metric]['std']:.4f}")
torch.cuda.empty_cache()

Average test_f1_micro: 0.6239 ± 0.0277
Average test_f1_macro: 0.5753 ± 0.0340
Average test_f1_class_0: 0.6927 ± 0.0143
Average test_f1_class_1: 0.5756 ± 0.0189
Average test_f1_class_2: 0.6093 ± 0.0729
Average test_f1_class_3: 0.4202 ± 0.0778
Average test_f1_class_4: 0.5787 ± 0.0398


# TwoStageDataset

In [11]:
seeds = [624420665, 719682587, 749390431, 356261854, 724384563]
all_results = []
for seed in seeds:
    results = run_experiment(seed, ds_two)
    all_results.append(results)
clear_output(wait=False)
metrics = ["test_f1_micro", "test_f1_macro"] + [f"test_f1_class_{i}" for i in range(5)]
avg_results = {}
for metric in metrics:
    scores = [r[metric] for r in [x.metrics for x in all_results]]
    avg_results[metric] = {
        'mean': np.mean(scores),
        'std': np.std(scores)
    }
for metric in metrics:
    print(f"Average {metric}: {avg_results[metric]['mean']:.4f} ± {avg_results[metric]['std']:.4f}")
torch.cuda.empty_cache()

Average test_f1_micro: 0.5611 ± 0.0314
Average test_f1_macro: 0.5249 ± 0.0249
Average test_f1_class_0: 0.6402 ± 0.0348
Average test_f1_class_1: 0.4791 ± 0.0915
Average test_f1_class_2: 0.5472 ± 0.0284
Average test_f1_class_3: 0.3940 ± 0.0399
Average test_f1_class_4: 0.5640 ± 0.0472


# ManualDataset + ZeroShotDataset

In [12]:
seeds = [230818434, 598147794, 339224527, 228940900, 111815430]
all_results = []
for seed in seeds:
    results = run_experiment(seed, concatenate_datasets([ds_zero, ds['train']]))
    all_results.append(results)
clear_output(wait=False)
metrics = ["test_f1_micro", "test_f1_macro"] + [f"test_f1_class_{i}" for i in range(5)]
avg_results = {}
for metric in metrics:
    scores = [r[metric] for r in [x.metrics for x in all_results]]
    avg_results[metric] = {
        'mean': np.mean(scores),
        'std': np.std(scores)
    }
for metric in metrics:
    print(f"Average {metric}: {avg_results[metric]['mean']:.4f} ± {avg_results[metric]['std']:.4f}")
torch.cuda.empty_cache()

Average test_f1_micro: 0.7936 ± 0.0107
Average test_f1_macro: 0.7867 ± 0.0122
Average test_f1_class_0: 0.7926 ± 0.0185
Average test_f1_class_1: 0.8202 ± 0.0235
Average test_f1_class_2: 0.8259 ± 0.0088
Average test_f1_class_3: 0.6668 ± 0.0148
Average test_f1_class_4: 0.8278 ± 0.0188


# ManualDataset + TenShotDataset

In [13]:
seeds = [173548468, 351960359, 597931470, 910980050, 774195904]
all_results = []
for seed in seeds:
    results = run_experiment(seed, concatenate_datasets([ds_few, ds['train']]))
    all_results.append(results)
clear_output(wait=False)
metrics = ["test_f1_micro", "test_f1_macro"] + [f"test_f1_class_{i}" for i in range(5)]
avg_results = {}
for metric in metrics:
    scores = [r[metric] for r in [x.metrics for x in all_results]]
    avg_results[metric] = {
        'mean': np.mean(scores),
        'std': np.std(scores)
    }
for metric in metrics:
    print(f"Average {metric}: {avg_results[metric]['mean']:.4f} ± {avg_results[metric]['std']:.4f}")
torch.cuda.empty_cache()

Average test_f1_micro: 0.7981 ± 0.0112
Average test_f1_macro: 0.7816 ± 0.0109
Average test_f1_class_0: 0.8160 ± 0.0100
Average test_f1_class_1: 0.7951 ± 0.0194
Average test_f1_class_2: 0.8366 ± 0.0110
Average test_f1_class_3: 0.6714 ± 0.0272
Average test_f1_class_4: 0.7889 ± 0.0191


# ManualDataset + TwoStageDataset

In [14]:
seeds = [271054931, 88464423, 222017585, 272392254, 668356033]
all_results = []
for seed in seeds:
    results = run_experiment(seed, concatenate_datasets([ds_two, ds['train']]))
    all_results.append(results)
clear_output(wait=False)
metrics = ["test_f1_micro", "test_f1_macro"] + [f"test_f1_class_{i}" for i in range(5)]
avg_results = {}
for metric in metrics:
    scores = [r[metric] for r in [x.metrics for x in all_results]]
    avg_results[metric] = {
        'mean': np.mean(scores),
        'std': np.std(scores)
    }
for metric in metrics:
    print(f"Average {metric}: {avg_results[metric]['mean']:.4f} ± {avg_results[metric]['std']:.4f}")
torch.cuda.empty_cache()

Average test_f1_micro: 0.7817 ± 0.0100
Average test_f1_macro: 0.7667 ± 0.0115
Average test_f1_class_0: 0.7807 ± 0.0171
Average test_f1_class_1: 0.8039 ± 0.0184
Average test_f1_class_2: 0.8314 ± 0.0153
Average test_f1_class_3: 0.6264 ± 0.0275
Average test_f1_class_4: 0.7912 ± 0.0498


## Reduced Dataset

In [10]:
ds_reduce = load_from_disk("../../datasets/ManualReducedDataset")
ds_reduce = ds_reduce.select_columns(["masked_text", "label"])
ds_reduce = ds_reduce.rename_column("masked_text", "text")
ds_reduce = ds_reduce.map(preprocess_function, batched=True)

ds_zero_reduce = load_from_disk("../../datasets/ZeroShotReducedDataset")
ds_zero_reduce = ds_zero_reduce.select_columns(["masked_text", "label"])
ds_zero_reduce = ds_zero_reduce.rename_column("masked_text", "text")
ds_zero_reduce = ds_zero_reduce.map(preprocess_function, batched=True)

ds_few_reduce = load_from_disk("../../datasets/TenShotReducedDataset")
ds_few_reduce = ds_few_reduce.select_columns(["masked_text", "label"])
ds_few_reduce = ds_few_reduce.rename_column("masked_text", "text")
ds_few_reduce = ds_few_reduce.map(preprocess_function, batched=True)

Map:   0%|          | 0/181 [00:00<?, ? examples/s]

Map:   0%|          | 0/1968 [00:00<?, ? examples/s]

Map:   0%|          | 0/2699 [00:00<?, ? examples/s]

# ManualReducedDataset

In [16]:
seeds = [286683549, 6420921, 197067174, 959206791, 413404281]
all_results = []
for seed in seeds:
    results = run_experiment(seed, ds_reduce)
    all_results.append(results)
clear_output(wait=False)
metrics = ["test_f1_micro", "test_f1_macro"] + [f"test_f1_class_{i}" for i in range(5)]
avg_results = {}
for metric in metrics:
    scores = [r[metric] for r in [x.metrics for x in all_results]]
    avg_results[metric] = {
        'mean': np.mean(scores),
        'std': np.std(scores)
    }
for metric in metrics:
    print(f"Average {metric}: {avg_results[metric]['mean']:.4f} ± {avg_results[metric]['std']:.4f}")
torch.cuda.empty_cache()

Average test_f1_micro: 0.5930 ± 0.0270
Average test_f1_macro: 0.5231 ± 0.0372
Average test_f1_class_0: 0.6801 ± 0.0140
Average test_f1_class_1: 0.3799 ± 0.1069
Average test_f1_class_2: 0.6292 ± 0.0365
Average test_f1_class_3: 0.3996 ± 0.0294
Average test_f1_class_4: 0.5266 ± 0.0298


# ZeroShotReducedDataset

In [17]:
seeds = [320896692, 932940650, 883472909, 968454980, 421657822]
all_results = []
for seed in seeds:
    results = run_experiment(seed, ds_zero_reduce)
    all_results.append(results)
clear_output(wait=False)
metrics = ["test_f1_micro", "test_f1_macro"] + [f"test_f1_class_{i}" for i in range(5)]
avg_results = {}
for metric in metrics:
    scores = [r[metric] for r in [x.metrics for x in all_results]]
    avg_results[metric] = {
        'mean': np.mean(scores),
        'std': np.std(scores)
    }
for metric in metrics:
    print(f"Average {metric}: {avg_results[metric]['mean']:.4f} ± {avg_results[metric]['std']:.4f}")
torch.cuda.empty_cache()

Average test_f1_micro: 0.5348 ± 0.0141
Average test_f1_macro: 0.4797 ± 0.0175
Average test_f1_class_0: 0.6535 ± 0.0132
Average test_f1_class_1: 0.4567 ± 0.0221
Average test_f1_class_2: 0.5069 ± 0.0524
Average test_f1_class_3: 0.3217 ± 0.0543
Average test_f1_class_4: 0.4598 ± 0.0351


# TenShotReducedDataset

In [18]:
seeds = [706375894, 14365483, 593256681, 791003381, 210000475]
all_results = []
for seed in seeds:
    results = run_experiment(seed, ds_few_reduce)
    all_results.append(results)
clear_output(wait=False)
metrics = ["test_f1_micro", "test_f1_macro"] + [f"test_f1_class_{i}" for i in range(5)]
avg_results = {}
for metric in metrics:
    scores = [r[metric] for r in [x.metrics for x in all_results]]
    avg_results[metric] = {
        'mean': np.mean(scores),
        'std': np.std(scores)
    }
for metric in metrics:
    print(f"Average {metric}: {avg_results[metric]['mean']:.4f} ± {avg_results[metric]['std']:.4f}")
torch.cuda.empty_cache()

Average test_f1_micro: 0.6113 ± 0.0214
Average test_f1_macro: 0.5394 ± 0.0244
Average test_f1_class_0: 0.6832 ± 0.0196
Average test_f1_class_1: 0.5462 ± 0.0421
Average test_f1_class_2: 0.6441 ± 0.0477
Average test_f1_class_3: 0.3175 ± 0.0485
Average test_f1_class_4: 0.5063 ± 0.0214


# ManualReducedDataset + ZeroShotReducedDataset

In [19]:
seeds = [686579303, 119540831, 26855092, 796233790, 295310485]
all_results = []
for seed in seeds:
    results = run_experiment(seed, concatenate_datasets([ds_zero_reduce, ds_reduce]))
    all_results.append(results)
clear_output(wait=False)
metrics = ["test_f1_micro", "test_f1_macro"] + [f"test_f1_class_{i}" for i in range(5)]
avg_results = {}
for metric in metrics:
    scores = [r[metric] for r in [x.metrics for x in all_results]]
    avg_results[metric] = {
        'mean': np.mean(scores),
        'std': np.std(scores)
    }
for metric in metrics:
    print(f"Average {metric}: {avg_results[metric]['mean']:.4f} ± {avg_results[metric]['std']:.4f}")
torch.cuda.empty_cache()

Average test_f1_micro: 0.6231 ± 0.0452
Average test_f1_macro: 0.5822 ± 0.0417
Average test_f1_class_0: 0.6125 ± 0.1074
Average test_f1_class_1: 0.6012 ± 0.0341
Average test_f1_class_2: 0.7172 ± 0.0132
Average test_f1_class_3: 0.3893 ± 0.0911
Average test_f1_class_4: 0.5908 ± 0.0463


# ManualReducedDataset + TenShotReducedDataset

In [20]:
seeds = [262950628, 239670711, 149827706, 790779946, 110053353]
all_results = []
for seed in seeds:
    results = run_experiment(seed, concatenate_datasets([ds_few_reduce, ds_reduce]))
    all_results.append(results)
clear_output(wait=False)
metrics = ["test_f1_micro", "test_f1_macro"] + [f"test_f1_class_{i}" for i in range(5)]
avg_results = {}
for metric in metrics:
    scores = [r[metric] for r in [x.metrics for x in all_results]]
    avg_results[metric] = {
        'mean': np.mean(scores),
        'std': np.std(scores)
    }
for metric in metrics:
    print(f"Average {metric}: {avg_results[metric]['mean']:.4f} ± {avg_results[metric]['std']:.4f}")
torch.cuda.empty_cache()

Average test_f1_micro: 0.6623 ± 0.0092
Average test_f1_macro: 0.5905 ± 0.0083
Average test_f1_class_0: 0.7026 ± 0.0080
Average test_f1_class_1: 0.6466 ± 0.0115
Average test_f1_class_2: 0.7328 ± 0.0181
Average test_f1_class_3: 0.3661 ± 0.0413
Average test_f1_class_4: 0.5046 ± 0.0367


# ManualReducedDataset + TwoStageReducedDataset

In [21]:
seeds = [802453211, 158255423, 704616756, 802301731, 241810196]
all_results = []
for seed in seeds:
    results = run_experiment(seed, concatenate_datasets([ds_two, ds_reduce]))
    all_results.append(results)
clear_output(wait=False)
metrics = ["test_f1_micro", "test_f1_macro"] + [f"test_f1_class_{i}" for i in range(5)]
avg_results = {}
for metric in metrics:
    scores = [r[metric] for r in [x.metrics for x in all_results]]
    avg_results[metric] = {
        'mean': np.mean(scores),
        'std': np.std(scores)
    }
for metric in metrics:
    print(f"Average {metric}: {avg_results[metric]['mean']:.4f} ± {avg_results[metric]['std']:.4f}")
torch.cuda.empty_cache()

Average test_f1_micro: 0.6204 ± 0.0358
Average test_f1_macro: 0.5874 ± 0.0419
Average test_f1_class_0: 0.6621 ± 0.0218
Average test_f1_class_1: 0.5667 ± 0.0875
Average test_f1_class_2: 0.6460 ± 0.0824
Average test_f1_class_3: 0.4775 ± 0.0202
Average test_f1_class_4: 0.5846 ± 0.0359


## Used for Case Study

In [11]:
run_experiment(42, concatenate_datasets([ds_few_reduce, ds_reduce]), "../../CaseStudyModel")

Some weights of RoBERTaEntity were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of RoBERTaEntity were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/1800 [00:00<?, ?it/s]

  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 1.1119701862335205, 'eval_f1_micro': 0.5956937799043063, 'eval_f1_macro': 0.49572818003252783, 'eval_f1_class_0': 0.6826923076923077, 'eval_f1_class_1': 0.4155844155844156, 'eval_f1_class_2': 0.6044444444444445, 'eval_f1_class_3': 0.391304347826087, 'eval_f1_class_4': 0.38461538461538464, 'eval_runtime': 0.3599, 'eval_samples_per_second': 1161.418, 'eval_steps_per_second': 75.02, 'epoch': 1.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 0.988167941570282, 'eval_f1_micro': 0.638755980861244, 'eval_f1_macro': 0.559051808858369, 'eval_f1_class_0': 0.7025641025641025, 'eval_f1_class_1': 0.4523809523809524, 'eval_f1_class_2': 0.6747967479674797, 'eval_f1_class_3': 0.4827586206896552, 'eval_f1_class_4': 0.4827586206896552, 'eval_runtime': 0.3604, 'eval_samples_per_second': 1159.678, 'eval_steps_per_second': 74.907, 'epoch': 2.0}
{'loss': 0.7515, 'grad_norm': 9.953536033630371, 'learning_rate': 1.6049382716049385e-05, 'epoch': 2.78}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 1.2224868535995483, 'eval_f1_micro': 0.6291866028708134, 'eval_f1_macro': 0.5326521675667786, 'eval_f1_class_0': 0.717948717948718, 'eval_f1_class_1': 0.3870967741935484, 'eval_f1_class_2': 0.6334841628959276, 'eval_f1_class_3': 0.40860215053763443, 'eval_f1_class_4': 0.5161290322580645, 'eval_runtime': 0.3773, 'eval_samples_per_second': 1107.739, 'eval_steps_per_second': 71.552, 'epoch': 3.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 1.1359838247299194, 'eval_f1_micro': 0.6674641148325359, 'eval_f1_macro': 0.5743029095754962, 'eval_f1_class_0': 0.712468193384224, 'eval_f1_class_1': 0.45901639344262296, 'eval_f1_class_2': 0.7368421052631579, 'eval_f1_class_3': 0.4470588235294118, 'eval_f1_class_4': 0.5161290322580645, 'eval_runtime': 0.4269, 'eval_samples_per_second': 979.169, 'eval_steps_per_second': 63.248, 'epoch': 4.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 1.1057027578353882, 'eval_f1_micro': 0.6698564593301436, 'eval_f1_macro': 0.5767814554494064, 'eval_f1_class_0': 0.7304785894206549, 'eval_f1_class_1': 0.44776119402985076, 'eval_f1_class_2': 0.728, 'eval_f1_class_3': 0.46153846153846156, 'eval_f1_class_4': 0.5161290322580645, 'eval_runtime': 0.4, 'eval_samples_per_second': 1045.039, 'eval_steps_per_second': 67.503, 'epoch': 5.0}
{'loss': 0.3482, 'grad_norm': 19.060897827148438, 'learning_rate': 9.876543209876543e-06, 'epoch': 5.56}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 1.1563602685928345, 'eval_f1_micro': 0.69377990430622, 'eval_f1_macro': 0.6243627850765329, 'eval_f1_class_0': 0.764102564102564, 'eval_f1_class_1': 0.5671641791044776, 'eval_f1_class_2': 0.7142857142857143, 'eval_f1_class_3': 0.5137614678899083, 'eval_f1_class_4': 0.5625, 'eval_runtime': 0.4017, 'eval_samples_per_second': 1040.621, 'eval_steps_per_second': 67.217, 'epoch': 6.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 1.2259196043014526, 'eval_f1_micro': 0.6507177033492823, 'eval_f1_macro': 0.5799449155613539, 'eval_f1_class_0': 0.7513227513227513, 'eval_f1_class_1': 0.5172413793103449, 'eval_f1_class_2': 0.6484018264840182, 'eval_f1_class_3': 0.4827586206896552, 'eval_f1_class_4': 0.5, 'eval_runtime': 0.3595, 'eval_samples_per_second': 1162.624, 'eval_steps_per_second': 75.098, 'epoch': 7.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 1.2206114530563354, 'eval_f1_micro': 0.7033492822966507, 'eval_f1_macro': 0.6214245192148005, 'eval_f1_class_0': 0.7647058823529411, 'eval_f1_class_1': 0.5714285714285714, 'eval_f1_class_2': 0.7509881422924901, 'eval_f1_class_3': 0.52, 'eval_f1_class_4': 0.5, 'eval_runtime': 0.3764, 'eval_samples_per_second': 1110.382, 'eval_steps_per_second': 71.723, 'epoch': 8.0}
{'loss': 0.2113, 'grad_norm': 11.433853149414062, 'learning_rate': 3.7037037037037037e-06, 'epoch': 8.33}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 1.2487915754318237, 'eval_f1_micro': 0.6961722488038278, 'eval_f1_macro': 0.6303399887770154, 'eval_f1_class_0': 0.7506702412868632, 'eval_f1_class_1': 0.5714285714285714, 'eval_f1_class_2': 0.7419354838709677, 'eval_f1_class_3': 0.5321100917431193, 'eval_f1_class_4': 0.5555555555555556, 'eval_runtime': 0.3899, 'eval_samples_per_second': 1072.079, 'eval_steps_per_second': 69.249, 'epoch': 9.0}


  0%|          | 0/27 [00:00<?, ?it/s]

{'eval_loss': 1.2816280126571655, 'eval_f1_micro': 0.7105263157894737, 'eval_f1_macro': 0.6421365927969038, 'eval_f1_class_0': 0.7643979057591623, 'eval_f1_class_1': 0.5671641791044776, 'eval_f1_class_2': 0.75, 'eval_f1_class_3': 0.5576923076923077, 'eval_f1_class_4': 0.5714285714285714, 'eval_runtime': 0.3665, 'eval_samples_per_second': 1140.59, 'eval_steps_per_second': 73.674, 'epoch': 10.0}
{'train_runtime': 98.4021, 'train_samples_per_second': 292.677, 'train_steps_per_second': 18.292, 'train_loss': 0.38877283520168726, 'epoch': 10.0}


  0%|          | 0/47 [00:00<?, ?it/s]

PredictionOutput(predictions=array([[ 3.4418604 , -0.866702  , -1.2586852 ,  0.16438948, -0.4466757 ],
       [ 3.4636855 , -1.3378413 , -0.99504375,  0.08696233, -0.14384496],
       [ 3.9783578 , -1.8114694 , -1.2508426 , -0.06303007,  0.23558894],
       ...,
       [-0.02859818, -0.6438564 ,  4.405754  , -0.86341256, -2.1857705 ],
       [ 1.509085  , -1.2781395 ,  2.6367295 ,  0.13552351, -1.7677331 ],
       [-0.23224574,  3.0701632 ,  1.209892  , -1.4276651 , -1.9107246 ]],
      dtype=float32), label_ids=array([3, 0, 4, 2, 2, 2, 1, 3, 2, 1, 4, 4, 0, 2, 2, 2, 1, 4, 0, 0, 3, 3,
       3, 2, 0, 0, 1, 2, 4, 0, 1, 4, 2, 3, 1, 2, 2, 0, 1, 1, 0, 0, 4, 0,
       0, 2, 1, 3, 2, 0, 1, 1, 0, 3, 0, 1, 0, 0, 1, 3, 2, 2, 2, 0, 2, 0,
       0, 4, 3, 1, 2, 1, 2, 0, 2, 2, 2, 0, 0, 3, 0, 1, 1, 2, 3, 0, 0, 0,
       0, 1, 0, 3, 2, 2, 1, 4, 2, 2, 2, 2, 0, 2, 2, 1, 2, 2, 2, 0, 0, 2,
       0, 3, 3, 2, 1, 2, 0, 0, 2, 0, 4, 2, 2, 0, 0, 0, 0, 0, 2, 1, 4, 2,
       0, 4, 3, 3, 1, 2, 0, 2, 2, 0, 2, 2, 1

In [13]:
import shutil
base_dir = "../../CaseStudyModel"
subdirs = [d for d in os.listdir(base_dir) if os.path.isdir(os.path.join(base_dir, d))]
subdir = subdirs[0]
subdir_path = os.path.join(base_dir, subdir)
for item in os.listdir(subdir_path):
    s = os.path.join(subdir_path, item)
    d = os.path.join(base_dir, item)
    if os.path.isdir(s):
        shutil.move(s, d)
    else:
        shutil.move(s, d)
os.rmdir(subdir_path)