In [1]:
import os
import sys
import json
import torch

sys.path.append("../")
from lib.utils import get_device
from lib.utils.constants import Subtask, Track, PreprocessTextLevel, DatasetType
from lib.utils.training import EarlyStopping
from lib.data.loading import load_train_dev_test_df, build_data_loader
from lib.data.tokenizer import get_tokenizer
from lib.models import get_model
from lib.training.loss import get_loss_fn
from lib.training.metric import get_metric


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the experiment configuration, select the compute device and make sure
# the directory that will receive the run results exists.
CONFIG_FILE_PATH = os.path.relpath("../config.json")

# JSON is UTF-8 by specification; passing the encoding explicitly avoids
# depending on the platform's default text encoding.
with open(CONFIG_FILE_PATH, "r", encoding="utf-8") as config_file:
    config = json.load(config_file)

DEVICE = get_device()
print(f"Using device: {DEVICE}")

# NOTE(review): the run directory is hard-coded to SubtaskC, while the subtask
# itself is read from `config["task"]` further down -- confirm that the two
# are meant to stay in sync manually.
results_dir = os.path.relpath("../runs/SubtaskC")
# exist_ok=True replaces the separate exists()/makedirs() pair, which could
# race with another process creating the directory between the two calls.
os.makedirs(results_dir, exist_ok=True)

print(f"Will save results to: {results_dir}")

Using device: mps
Will save results to: ../runs/SubtaskC


In [3]:
# The subtask is mandatory: a config without a "task" key aborts the cell.
task = Subtask(config["task"]) if "task" in config else None
if task is None:
    raise ValueError("Task not specified in config")

# The track is optional: a missing "track" key only produces a warning.
track = Track(config["track"]) if "track" in config else None
if track is None:
    print(f"Warning: Track not specified in config for subtask: {task}")

# Everything below reads from the "data" section of the config.
data_cfg = config["data"]

# Dataset wrapper type; falls back to the truncation dataset when the config
# does not name one.
dataset_type = (
    DatasetType(data_cfg["dataset_type"])
    if "dataset_type" in data_cfg
    else DatasetType.TransformerTruncationDataset
)

# Optional extra settings for the chosen dataset type (None when absent).
dataset_type_settings = data_cfg.get("dataset_type_settings")

# Load the three dataframes; the data directory from the config is resolved
# relative to the parent of the notebook's working directory.
df_train, df_dev, df_test = load_train_dev_test_df(
    task=task,
    track=track,
    data_dir=f"../{data_cfg['data_dir']}",
    label_column=data_cfg["label_column"],
    test_size=data_cfg["test_size"],
    preprocess_text_level=PreprocessTextLevel(data_cfg["preprocess_text_level"]),
)

# Quick sanity check of the split sizes.
print(f"df_train.shape: {df_train.shape}")
print(f"df_dev.shape: {df_dev.shape}")
print(f"df_test.shape: {df_test.shape}")

Loading train data...
Train/dev split... (df_train.shape: (3649, 3))
Loading test data....././data/original_data/SubtaskC/SubtaskC_dev.jsonl
df_train.shape: (3284, 3)
df_dev.shape: (365, 3)
df_test.shape: (505, 3)


In [4]:
# Shrink every split to a small subsample so the rest of the notebook runs
# quickly while debugging.
#
# The sample is seeded so that Restart & Run All selects the same rows every
# time, and the requested size is capped at the split length because
# DataFrame.sample raises when asked for more rows than are available
# (sampling without replacement).
#
# NOTE: this rebinds df_train / df_dev / df_test; the full splits can only be
# recovered by re-running the loading cell.
DEBUG_SAMPLE_SIZE = 20
DEBUG_SAMPLE_SEED = 42

df_train = df_train.sample(
    n=min(DEBUG_SAMPLE_SIZE, len(df_train)), random_state=DEBUG_SAMPLE_SEED
)
df_dev = df_dev.sample(
    n=min(DEBUG_SAMPLE_SIZE, len(df_dev)), random_state=DEBUG_SAMPLE_SEED
)
df_test = df_test.sample(
    n=min(DEBUG_SAMPLE_SIZE, len(df_test)), random_state=DEBUG_SAMPLE_SEED
)

In [5]:
# Build the tokenizer from the "tokenizer" section of the config; every key in
# that section is forwarded to get_tokenizer as a keyword argument.
tokenizer_kwargs = config["tokenizer"]
tokenizer = get_tokenizer(**tokenizer_kwargs)

In [5]:
# Inspect the id of the tokenizer's beginning-of-sequence token.
tokenizer.bos_token_id

0

In [6]:
# Inspect the tokenizer's end-of-sequence token string.
tokenizer.eos_token

'</s>'

In [6]:
# NOTE(review): this cell recomputes dataset_type / dataset_type_settings with
# the same logic as the data-loading cell earlier in the notebook; one of the
# two copies is redundant.
data_cfg = config["data"]

# Dataset wrapper type; falls back to the truncation dataset when the config
# does not name one.
dataset_type = (
    DatasetType(data_cfg["dataset_type"])
    if "dataset_type" in data_cfg
    else DatasetType.TransformerTruncationDataset
)

# Optional extra settings for the chosen dataset type (None when absent).
dataset_type_settings = data_cfg.get("dataset_type_settings")

In [8]:
# Display which DatasetType was resolved from the config.
dataset_type

<DatasetType.LongformerDataset: 'longformer_dataset'>

In [7]:
# Keyword arguments shared by the train, dev and test loaders.
loader_kwargs = dict(
    max_len=config["data"]["max_len"],
    batch_size=config["data"]["batch_size"],
    label_column=config["data"]["label_column"],
    dataset_type=dataset_type,
    dataset_type_settings=dataset_type_settings,
    device=DEVICE,
)

# Only the training loader asks for shuffling.
train_dataloader = build_data_loader(
    df_train, tokenizer, shuffle=True, **loader_kwargs
)

dev_dataloader = build_data_loader(df_dev, tokenizer, **loader_kwargs)

# has_targets is False only when the config's test_size is None.
test_dataloader = build_data_loader(
    df_test,
    tokenizer,
    has_targets=config["data"]["test_size"] is not None,
    **loader_kwargs,
)

In [8]:
# Collect everything the training loop needs from the config.
#
# NOTE(review): the saved output of this cell ends with
# `AttributeError: 'Longformer' object has no attribute 'bert'`. The traceback
# is not included, so the failing statement cannot be identified from here --
# presumably it is raised while building the model; confirm in lib.models.
training_cfg = config["training"]

num_epochs = training_cfg["num_epochs"]

# Model is built from the "model" / "model_config" sections and moved to the
# selected device.
model = get_model(config["model"], config["model_config"]).to(DEVICE)

loss_fn = get_loss_fn(training_cfg["loss"], DEVICE)
optimizer_config = training_cfg["optimizer"]
scheduler_config = training_cfg["scheduler"]

# get_metric returns a pair: the metric function and a second callable bound
# to is_better_metric_fn.
metric_fn, is_better_metric_fn = get_metric(training_cfg["metric"])

num_epochs_before_finetune = training_cfg["num_epochs_before_finetune"]

# Optional settings: None when the key is missing from the training section.
swa_config = training_cfg.get("swa")
validation_freq = training_cfg.get("validation_freq")

Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerModel: ['lm_head.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing LongformerModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


AttributeError: 'Longformer' object has no attribute 'bert'

In [9]:
# Load the bare pretrained Longformer directly from transformers to inspect
# its module structure.
#
# NOTE(review): this import sits in the middle of the notebook rather than in
# the import cell, and the assignment rebinds `model`, replacing whatever the
# training-setup cell assigned. Unlike that cell, the model here is not moved
# to DEVICE.
from transformers import LongformerModel

LONGFORMER_CHECKPOINT = "allenai/longformer-base-4096"

model = LongformerModel.from_pretrained(LONGFORMER_CHECKPOINT)
model

pytorch_model.bin: 100%|██████████| 597M/597M [00:05<00:00, 111MB/s] 
Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerModel: ['lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing LongformerModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


LongformerModel(
  (embeddings): LongformerEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(4098, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): LongformerEncoder(
    (layer): ModuleList(
      (0-11): 12 x LongformerLayer(
        (attention): LongformerAttention(
          (self): LongformerSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (query_global): Linear(in_features=768, out_features=768, bias=True)
            (key_global): Linear(in_features=768, out_features=768, bias=True)
            (value_global): Linear(in_features=768, out_features=768, bias=True)
          )
    