In [1]:
import requests
import boto3
import json
import os
import sys
import argparse
import importlib
import transformers
import torch
import pathlib
import wandb
import awswrangler as wr
from IPython.display import display
from sagemaker.huggingface.processing import HuggingFaceProcessor
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import FrameworkProcessor
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.workflow.steps import ProcessingStep
from sagemaker.workflow.pipeline_context import PipelineSession
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.session import get_execution_role


# Make the shared `01_modules` directory importable. Both the development
# location (../01_modules) and the production location (./01_modules) are
# probed, and every one that exists is appended to sys.path.
script_dir = (
    pathlib.Path(__file__).parent.resolve()
    if '__file__' in globals()
    # No __file__ is defined (e.g. inside a notebook kernel): use the working directory.
    else pathlib.Path().absolute()
)
modules_path_in_dev = os.path.abspath(os.path.join(script_dir, '..', '01_modules'))
modules_path_in_prod = os.path.abspath(os.path.join(script_dir, '01_modules'))
for candidate_path in (modules_path_in_dev, modules_path_in_prod):
    if os.path.exists(candidate_path):
        sys.path.append(candidate_path)


# Jupyter only reads a local module the first time it is imported after
# kernel start. Re-running an import cell does not pick up later edits
# to the module's source file.
# As a workaround, each local module is imported and then explicitly
# re-executed with importlib.reload so the current version is in use.
import utils
_ = importlib.reload(utils)
import config
_ = importlib.reload(config) 


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
config.py loaded: v0.1
utils.py loaded: v0.2.12
utils.py loaded: v0.2.12
config.py loaded: v0.1


In [2]:
# TODO: reorganize

from datasets import load_dataset, DatasetDict, Dataset

from transformers import (
    AutoTokenizer,
    AutoConfig, 
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer)

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
#import torch
import lighteval
import numpy as np

In [3]:
# PRETRAINED_MODEL_NAME = 'roberta-base'
# LABEL_TYPE = 'topic'

In [4]:
import torch


In [5]:
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33msteve-attila-kopias[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [6]:
# Command-line interface of the training run. Every option has a default, so
# the cell also works when no arguments are supplied (e.g. in a notebook).
parser = argparse.ArgumentParser()

# Model and dataset selection
parser.add_argument("--model_name", type=str, default='distilbert-base-uncased')
parser.add_argument("--hf_dataset_suffix", type=str, default='_Title_SubfieldIndex')
parser.add_argument("--label_type", type=str, default='subfield')
parser.add_argument("--text_key", type=str, default='title')
parser.add_argument("--text_key_rename_to", type=str, default='text')
parser.add_argument("--label_key_rename_to", type=str, default='label')
# Percentage of each split to load; 100 means the full split.
parser.add_argument("--sample", type=int, default=100)

# Optimisation hyper-parameters
parser.add_argument("--epochs", type=int, default=5)
parser.add_argument("--train_batch_size", type=int, default=32)
parser.add_argument("--eval_batch_size", type=int, default=64)
parser.add_argument("--warmup_steps", type=int, default=500)
# Parsed as float so a value given on the command line has the same type as the default.
parser.add_argument("--learning_rate", type=float, default=5e-5)

# Data, model, and output directories.
# The SM_* environment variables are used when they are set; otherwise the
# previous placeholder values are kept as a fallback.
parser.add_argument("--output_data_dir", type=str, default=os.environ.get('SM_OUTPUT_DATA_DIR', 'SM_OUTPUT_DATA_DIR'))
parser.add_argument("--model_dir", type=str, default=os.environ.get('SM_MODEL_DIR', 'SM_MODEL_DIR'))
# Parsed as int so a value given on the command line has the same type as the default.
parser.add_argument("--n_gpus", type=int, default=int(os.environ.get('SM_NUM_GPUS', torch.cuda.device_count())))

# parse_known_args ignores unrelated arguments (such as those the Jupyter kernel is started with).
args, _ = parser.parse_known_args()
# Name of the label column in the source dataset, e.g. 'subfield_index'.
LABEL_KEY = f'{args.label_type}_index'
# Split slice appended to 'train'/'test', e.g. '[:1%]'; empty when the full split is requested.
SAMPLE_SUFFIX = f'[:{args.sample}%]' if args.sample!=100 else ''
args

Namespace(model_name='distilbert-base-uncased', hf_dataset_suffix='_Title_SubfieldIndex', label_type='subfield', text_key='title', text_key_rename_to='text', label_key_rename_to='label', sample=100, epochs=5, train_batch_size=32, eval_batch_size=64, warmup_steps=500, learning_rate=5e-05, output_data_dir='SM_OUTPUT_DATA_DIR', model_dir='SM_MODEL_DIR', n_gpus=1)

In [7]:
SAMPLE_SUFFIX

''

In [8]:
# Hub repository that holds the splits for the selected dataset variant.
hf_dataset_id = 'SteveAKopias/SemanticScholarCSFullTextWithOpenAlexTopics' + args.hf_dataset_suffix

# SAMPLE_SUFFIX is either '' (full split) or a slice such as '[:1%]'.
dataset_train = load_dataset(hf_dataset_id, split=f'train{SAMPLE_SUFFIX}')
dataset_test = load_dataset(hf_dataset_id, split=f'test{SAMPLE_SUFFIX}')

# Bundle both splits and rename the text and label columns to the names
# configured via --text_key_rename_to / --label_key_rename_to.
dataset = (
    DatasetDict({'train': dataset_train, 'test': dataset_test})
    .rename_column(args.text_key, args.text_key_rename_to)
    .rename_column(LABEL_KEY, args.label_key_rename_to)
)
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 346344
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 43478
    })
})

In [9]:
# Fetch the (index, display name) pairs of the chosen label type from Athena.
# Table and column names are derived from --label_type.
label_query = f"""
SELECT 
    {args.label_type}_index AS index, 
    {args.label_type}_display_name AS display_name
FROM
    {args.label_type}s
"""
label_df = wr.athena.read_sql_query(label_query, '03_core')

# Build both lookup directions from the same integer-cast index column.
label_indices = label_df['index'].astype(int)
index2label = dict(zip(label_indices, label_df['display_name']))
label2index = dict(zip(label_df['display_name'], label_indices))
print('index2label', index2label)
print('label2index', label2index)

index2label {0: 'Artificial Intelligence', 3: 'Computer Networks and Communications', 4: 'Computational Theory and Mathematics', 5: 'Signal Processing', 6: 'Human-Computer Interaction', 1: 'Computer Vision and Pattern Recognition', 2: 'Information Systems', 7: 'Computer Science Applications', 8: 'Hardware and Architecture', 9: 'Computer Graphics and Computer-Aided Design', 10: 'Software'}
label2index {'Artificial Intelligence': 0, 'Computer Networks and Communications': 3, 'Computational Theory and Mathematics': 4, 'Signal Processing': 5, 'Human-Computer Interaction': 6, 'Computer Vision and Pattern Recognition': 1, 'Information Systems': 2, 'Computer Science Applications': 7, 'Hardware and Architecture': 8, 'Computer Graphics and Computer-Aided Design': 9, 'Software': 10}


In [10]:
label_df.shape[0]

11

In [11]:
# Load the pretrained checkpoint for sequence classification with one output
# per row of label_df, and pass the id <-> label mappings along to it.
model = AutoModelForSequenceClassification.from_pretrained(
    args.model_name,
    label2id=label2index,
    id2label=index2label,
    num_labels=label_df.shape[0],
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [13]:
# Load the tokenizer that belongs to the checkpoint selected via --model_name.
# NOTE(review): add_prefix_space is usually associated with byte-level BPE
# tokenizers (e.g. RoBERTa) — confirm it is needed for the model in use.
tokenizer = AutoTokenizer.from_pretrained(args.model_name, add_prefix_space=True)
tokenizer

DistilBertTokenizerFast(name_or_path='distilbert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [14]:
# if tokenizer.pad_token is None:
#    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
#    model.resize_token_embeddings(len(tokenizer))

In [15]:
def tokenize_function(example):
    """Tokenize the text column of a batch of examples.

    Reads the texts from ``example[args.text_key_rename_to]`` and runs the
    module-level ``tokenizer`` on them with right-side truncation.

    The length limit is 512 tokens, or the tokenizer's own
    ``model_max_length`` when that is smaller, so the limit never exceeds
    what the tokenizer reports for the model.

    No ``return_tensors`` is requested: the sequences are not padded here and
    therefore differ in length, so the tokenizer output is returned as is.

    Args:
        example: Mapping that contains the text(s) under
            ``args.text_key_rename_to``.

    Returns:
        The tokenizer output for the given text(s).
    """
    text = example[args.text_key_rename_to]
    tokenizer.truncation_side = 'right'
    # Some tokenizers report a huge sentinel as model_max_length; min() keeps 512 in that case.
    max_length = min(512, tokenizer.model_max_length)
    return tokenizer(text, truncation=True, max_length=max_length)


# Apply tokenize_function to every split; batched=True hands it batches of
# examples instead of single rows.
tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset

Map:   0%|          | 0/346344 [00:00<?, ? examples/s]

Map:   0%|          | 0/43478 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 346344
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 43478
    })
})

In [16]:
tokenized_dataset['train'][0:5]

{'text': ['Evaluation of Weighted Nuclear Norm Minimization Algorithm for Ultrasound Image Denoising',
  'Quality-Aware Memory Network for Interactive Volumetric Image Segmentation',
  'vol2Brain: A New Online Pipeline for Whole Brain MRI Analysis',
  'A Toolchain for Privacy-Preserving Distributed Aggregation on Edge-Devices',
  'Hierarchical and Decentralised Federated Learning'],
 'label': [1, 1, 1, 0, 0],
 'input_ids': [[101,
   9312,
   1997,
   18215,
   4517,
   13373,
   7163,
   4328,
   9276,
   9896,
   2005,
   27312,
   3746,
   7939,
   10054,
   2075,
   102],
  [101,
   3737,
   1011,
   5204,
   3638,
   2897,
   2005,
   9123,
   3872,
   12412,
   3746,
   6903,
   3370,
   102],
  [101,
   5285,
   2475,
   10024,
   2378,
   1024,
   1037,
   2047,
   3784,
   13117,
   2005,
   2878,
   4167,
   27011,
   4106,
   102],
  [101,
   1037,
   6994,
   24925,
   2078,
   2005,
   9394,
   1011,
   15224,
   5500,
   28041,
   2006,
   3341,
   1011,
   5733,
   102],


In [17]:
# Collator used by the Trainer to assemble batches. Padding is left to this
# collator because tokenize_function does not pad the sequences itself.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
data_collator

DataCollatorWithPadding(tokenizer=DistilBertTokenizerFast(name_or_path='distilbert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
), padding=True, max_length=None, pad_to_multiple_of=None, return_tensors='pt')

In [18]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
# TODO: replace it with lighteval?
# accuracy = evaluate.load("accuracy")

def compute_metrics(pred):
    """Turn a prediction object into a dict of classification metrics.

    Args:
        pred: Object exposing ``label_ids`` (true class ids) and
            ``predictions`` (scores; the arg-max over the last axis is taken
            as the predicted class id).

    Returns:
        Dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
        Precision, recall and F1 are micro-averaged.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    # TODO: weighted when not using 1% sample that skips some labels
    # NOTE(review): for single-label multi-class data, micro-averaged
    # precision/recall/F1 are expected to equal accuracy — confirm this is intended.
    prf_scores = precision_recall_fscore_support(y_true, y_pred, average='micro')
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': prf_scores[2],
        'precision': prf_scores[0],
        'recall': prf_scores[1],
    }

In [19]:
from datetime import datetime

# Compact local timestamp (YYYYMMDDHHMMSS) taken when this cell runs.
now = format(datetime.now(), "%Y%m%d%H%M%S")
now

'20250906003511'

In [20]:
# Run name: model, dataset variant, start timestamp, sample percentage and epoch count.
run_name = f'{args.model_name}-{args.hf_dataset_suffix}-{now}_sample-{args.sample}_epochs-{args.epochs}'

training_args = TrainingArguments(
    # Bookkeeping: where checkpoints and logs go, and which tracker receives them.
    run_name=run_name,
    output_dir=args.model_dir,
    logging_dir=f'{args.output_data_dir}/logs',
    report_to='wandb',
    # Optimisation settings taken from the command-line arguments.
    num_train_epochs=args.epochs,
    learning_rate=float(args.learning_rate),
    warmup_steps=args.warmup_steps,
    per_device_train_batch_size=args.train_batch_size,
    per_device_eval_batch_size=args.eval_batch_size,
    # Evaluate once per epoch. Options previously tried for short runs:
    # logging_steps=5, eval_strategy='steps', eval_steps=20, max_steps=100, save_steps=100
    eval_strategy='epoch',
)
training_args

TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=None,
eval_strategy=IntervalStrategy.EPOCH,
eval_

In [21]:
# Configure Weights & Biases through environment variables.
os.environ.update({
    # project where this run will be logged
    "WANDB_PROJECT": "research_methodology_extraction",
    # save the trained model checkpoints to wandb
    "WANDB_LOG_MODEL": "checkpoint",
    # turn off watch to log faster
    "WANDB_WATCH": "false",
})

In [22]:
# Wire model, data, batching and metrics together. The test split doubles as
# the evaluation set used during training.
trainer = Trainer(
    model=model,
    args=training_args,
    # data
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    # batching / preprocessing
    processing_class=tokenizer,
    data_collator=data_collator,
    # evaluation
    compute_metrics=compute_metrics,
)

In [23]:
# Train the model and record how long it took.
timelogger = utils.TimeLogger()
trainer.train()
timelogger.log('Train Finished')
# The W&B run is deliberately NOT finished here: trainer.evaluate() and
# trainer.predict() further down still report through the W&B callback, which
# requires an active run. Call wandb.finish() only after the last cell that
# uses the trainer (an open run is also closed when the process exits).

 :: :: TIMELOGGER STARTED :: | since_start: 0.00 seconds | since_last: 0.00 seconds :: 




Epoch,Training Loss,Validation Loss


[34m[1mwandb[0m: Adding directory to artifact (SM_MODEL_DIR/checkpoint-500)... Done. 0.9s


In [None]:
# Evaluate the trained model on the test split and display the returned metrics.
eval_result = trainer.evaluate(eval_dataset=tokenized_dataset['test'])
eval_result

In [None]:
# Take test rows 200..299 (100 examples) as a small set for inspecting
# individual predictions. Requires the test split to have at least 300 rows.
tokenized_dataset_predict = tokenized_dataset['test'].select(range(200, 300))
tokenized_dataset_predict

In [None]:
# Run inference on the inspection subset. The result is unpacked into the raw
# model outputs, the true label ids and the metrics.
# presumably raw_predictions holds one score per class for each example — TODO confirm
raw_predictions, label_ids, _metrics = trainer.predict(tokenized_dataset_predict)
raw_predictions

In [None]:
# Predicted class id per example: index of the largest value along axis 1.
predictions = np.argmax(raw_predictions, axis=1)
predictions

In [None]:
# Print one line per inspected example: position, text, true label and
# predicted label, each label as "display name (index)".
prediction_rows = zip(
    tokenized_dataset_predict['text'],
    tokenized_dataset_predict['label'],
    predictions,
)
for index, (text, label_id_truth, label_id_pred) in enumerate(prediction_rows):
    truth_repr = f'{index2label[label_id_truth]} ({label_id_truth})'
    pred_repr = f'{index2label[label_id_pred]} ({label_id_pred})'
    print(index, text, truth_repr, pred_repr)
    

In [None]:
# True label ids of the inspected rows, as unpacked from trainer.predict(...)
# earlier in the notebook. (`preds` was never defined and raised a NameError.)
label_ids

In [None]:
tokenized_dataset_predict['text']