In [21]:
import requests
import boto3
import json
import os
import sys
import argparse
import importlib
import transformers
import torch
import pathlib
import awswrangler as wr
from IPython.display import display
from sagemaker.huggingface.processing import HuggingFaceProcessor
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import FrameworkProcessor
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.workflow.steps import ProcessingStep
from sagemaker.workflow.pipeline_context import PipelineSession
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.session import get_execution_role


# Make the shared helper modules importable. Two candidate locations are
# probed: ../01_modules (the "dev" layout) and ./01_modules (the "prod"
# layout), both relative to the directory this code runs from.
if '__file__' in globals():
    # Running as a script: anchor on the file's own directory.
    script_dir = pathlib.Path(__file__).parent.resolve()
else:
    # Running in a notebook kernel, where __file__ is undefined: anchor on the
    # current working directory instead.
    script_dir = pathlib.Path().absolute()
modules_path_in_dev = os.path.abspath(os.path.join(script_dir, '..', '01_modules'))
modules_path_in_prod = os.path.abspath(os.path.join(script_dir, '01_modules'))
for modules_path in (modules_path_in_dev, modules_path_in_prod):
    # Only register directories that exist, and skip ones that are already on
    # sys.path so that re-running this cell does not pile up duplicate entries.
    if os.path.exists(modules_path) and modules_path not in sys.path:
        sys.path.append(modules_path)


# Jupyter only reads a local module the first time it is imported after
# kernel start. Re-running a cell with
# "from mymodulename import *" would not change
# anything, even if the imported module has since changed.
# As a workaround, import the module itself and call importlib.reload on it,
# so that edits to the module are picked up every time this cell runs.
# The reload results are bound to `_`, presumably to keep the module repr out
# of the cell output.
import utils
_ = importlib.reload(utils)
import config
_ = importlib.reload(config) 


utils.py loaded: v0.2.12
config.py loaded: v0.1


In [22]:
# TODO: reorganize

from datasets import load_dataset, DatasetDict, Dataset

from transformers import (
    AutoTokenizer,
    AutoConfig, 
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer)

from peft import PeftModel, PeftConfig, get_peft_model, LoraConfig
import evaluate
#import torch
import lighteval
import numpy as np

In [23]:
# PRETRAINED_MODEL_NAME = 'roberta-base'
# LABEL_TYPE = 'topic'

In [24]:
import torch


In [25]:
# Hyperparameters and I/O locations, exposed as command-line flags.
# parse_known_args() is used so that unrecognised argv entries (for example
# the ones a notebook kernel is started with) are tolerated and the defaults
# below apply.
parser = argparse.ArgumentParser()

# Model and dataset selection
parser.add_argument("--model_name", type=str, default='distilbert-base-uncased')
parser.add_argument("--hf_dataset_suffix", type=str, default='_Title_SubfieldIndex')
parser.add_argument("--label_type", type=str, default='subfield')
parser.add_argument("--text_key", type=str, default='title')
parser.add_argument("--text_key_rename_to", type=str, default='text')
parser.add_argument("--label_key_rename_to", type=str, default='label')

# Optimisation settings
parser.add_argument("--epochs", type=int, default=3)
parser.add_argument("--train_batch_size", type=int, default=32)
parser.add_argument("--eval_batch_size", type=int, default=64)
parser.add_argument("--warmup_steps", type=int, default=500)
# Parsed as float so a value given on the command line has the same type as
# the default (it used to come back as a string).
parser.add_argument("--learning_rate", type=float, default=5e-5)

# Data, model, and output directories.
# The SageMaker environment variables are used when they are set; otherwise
# the previous literal placeholder names remain the defaults, so behaviour
# outside a SageMaker job is unchanged.
parser.add_argument("--output_data_dir", type=str, default=os.environ.get('SM_OUTPUT_DATA_DIR', 'SM_OUTPUT_DATA_DIR'))
parser.add_argument("--model_dir", type=str, default=os.environ.get('SM_MODEL_DIR', 'SM_MODEL_DIR'))
# Parsed as int so a command-line value matches the type of the integer default.
parser.add_argument("--n_gpus", type=int, default=torch.cuda.device_count()) # os.environ['SM_NUM_GPUS'])

args, _ = parser.parse_known_args()
# Name of the label column in the raw dataset, e.g. 'subfield_index'
LABEL_KEY = f'{args.label_type}_index'
args

Namespace(model_name='distilbert-base-uncased', hf_dataset_suffix='_Title_SubfieldIndex', label_type='subfield', text_key='title', text_key_rename_to='text', label_key_rename_to='label', epochs=3, train_batch_size=32, eval_batch_size=64, warmup_steps=500, learning_rate=5e-05, output_data_dir='SM_OUTPUT_DATA_DIR', model_dir='SM_MODEL_DIR', n_gpus=0)

In [26]:
# Load the first 1% of the train and test splits of the selected dataset
# variant, then rename the text and label columns to the configured names.
hf_repo_id = 'SteveAKopias/SemanticScholarCSFullTextWithOpenAlexTopics' + args.hf_dataset_suffix

dataset_train = load_dataset(hf_repo_id, split='train[:1%]')
dataset_test = load_dataset(hf_repo_id, split='test[:1%]')

dataset = (
    DatasetDict(train=dataset_train, test=dataset_test)
    .rename_column(args.text_key, args.text_key_rename_to)
    .rename_column(LABEL_KEY, args.label_key_rename_to)
)
dataset

README.md:   0%|          | 0.00/540 [00:00<?, ?B/s]

data/train-00000-of-00008.parquet:   0%|          | 0.00/2.24M [00:00<?, ?B/s]

data/train-00001-of-00008.parquet:   0%|          | 0.00/2.25M [00:00<?, ?B/s]

data/train-00002-of-00008.parquet:   0%|          | 0.00/2.25M [00:00<?, ?B/s]

data/train-00003-of-00008.parquet:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

data/train-00004-of-00008.parquet:   0%|          | 0.00/2.24M [00:00<?, ?B/s]

data/train-00005-of-00008.parquet:   0%|          | 0.00/2.24M [00:00<?, ?B/s]

data/train-00006-of-00008.parquet:   0%|          | 0.00/2.23M [00:00<?, ?B/s]

data/train-00007-of-00008.parquet:   0%|          | 0.00/2.22M [00:00<?, ?B/s]

data/validation-00000-of-00008.parquet:   0%|          | 0.00/281k [00:00<?, ?B/s]

data/validation-00001-of-00008.parquet:   0%|          | 0.00/290k [00:00<?, ?B/s]

data/validation-00002-of-00008.parquet:   0%|          | 0.00/279k [00:00<?, ?B/s]

data/validation-00003-of-00008.parquet:   0%|          | 0.00/282k [00:00<?, ?B/s]

data/validation-00004-of-00008.parquet:   0%|          | 0.00/292k [00:00<?, ?B/s]

data/validation-00005-of-00008.parquet:   0%|          | 0.00/297k [00:00<?, ?B/s]

data/validation-00006-of-00008.parquet:   0%|          | 0.00/289k [00:00<?, ?B/s]

data/validation-00007-of-00008.parquet:   0%|          | 0.00/293k [00:00<?, ?B/s]

data/test-00000-of-00008.parquet:   0%|          | 0.00/283k [00:00<?, ?B/s]

data/test-00001-of-00008.parquet:   0%|          | 0.00/284k [00:00<?, ?B/s]

data/test-00002-of-00008.parquet:   0%|          | 0.00/286k [00:00<?, ?B/s]

data/test-00003-of-00008.parquet:   0%|          | 0.00/292k [00:00<?, ?B/s]

data/test-00004-of-00008.parquet:   0%|          | 0.00/289k [00:00<?, ?B/s]

data/test-00005-of-00008.parquet:   0%|          | 0.00/292k [00:00<?, ?B/s]

data/test-00006-of-00008.parquet:   0%|          | 0.00/293k [00:00<?, ?B/s]

data/test-00007-of-00008.parquet:   0%|          | 0.00/284k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/346344 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/43439 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/43478 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 3463
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 435
    })
})

In [27]:
# Fetch the (index, display name) pairs for the configured label type.
# NOTE(review): '03_core' is passed as the second positional argument,
# presumably the Athena database name — confirm.
label_query = f"""
SELECT 
    {args.label_type}_index AS index, 
    {args.label_type}_display_name AS display_name
FROM
    {args.label_type}s
"""
label_df = wr.athena.read_sql_query(label_query, '03_core')

# Build both lookup directions from the same two columns.
label_indices = label_df['index'].astype(int)
label_names = label_df['display_name']
index2label = dict(zip(label_indices, label_names))
label2index = dict(zip(label_names, label_indices))
print('index2label', index2label)
print('label2index', label2index)

index2label {0: 'Artificial Intelligence', 3: 'Computer Networks and Communications', 4: 'Computational Theory and Mathematics', 7: 'Computer Science Applications', 8: 'Hardware and Architecture', 9: 'Computer Graphics and Computer-Aided Design', 10: 'Software', 5: 'Signal Processing', 6: 'Human-Computer Interaction', 1: 'Computer Vision and Pattern Recognition', 2: 'Information Systems'}
label2index {'Artificial Intelligence': 0, 'Computer Networks and Communications': 3, 'Computational Theory and Mathematics': 4, 'Computer Science Applications': 7, 'Hardware and Architecture': 8, 'Computer Graphics and Computer-Aided Design': 9, 'Software': 10, 'Signal Processing': 5, 'Human-Computer Interaction': 6, 'Computer Vision and Pattern Recognition': 1, 'Information Systems': 2}


In [28]:
# Number of rows (one per label) in the label table
len(label_df.index)

11

In [29]:
# Load the pretrained checkpoint named by --model_name with a
# sequence-classification head. The head gets one output per row of label_df,
# and the index <-> display-name maps are passed along as id2label / label2id.
model = AutoModelForSequenceClassification.from_pretrained(
    args.model_name,
    num_labels=label_df.shape[0],
    id2label=index2label,
    label2id=label2index
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [30]:
# Display the module tree of the loaded model
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [31]:
# Load the tokenizer that belongs to the same checkpoint as the model.
# NOTE(review): add_prefix_space looks like a leftover from the earlier
# 'roberta-base' setup (see the commented-out PRETRAINED_MODEL_NAME) — confirm
# it is still wanted for the current default checkpoint.
tokenizer = AutoTokenizer.from_pretrained(args.model_name, add_prefix_space=True)
tokenizer

DistilBertTokenizerFast(name_or_path='distilbert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
)

In [32]:
# if tokenizer.pad_token is None:
#    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
#    model.resize_token_embeddings(len(tokenizer))

In [33]:
def tokenize_function(example, max_length=512):
    """Tokenize the text column of one batch of examples.

    Written to be passed to ``dataset.map(..., batched=True)``.

    Args:
        example: Mapping that holds the texts under the column named by
            ``args.text_key_rename_to``.
        max_length: Upper bound on the number of tokens kept per text;
            longer inputs are truncated. Defaults to 512, the value that
            was previously hard-coded.

    Returns:
        The tokenizer output for the texts. Sequences are left unpadded;
        padding is expected to happen later, per batch.
    """
    text = example[args.text_key_rename_to]
    # Over-long inputs are cut at the end, not at the beginning.
    tokenizer.truncation_side = 'right'
    # return_tensors is deliberately not set: without padding the sequences in
    # a batch have different lengths, so they cannot form a rectangular array.
    # Plain Python lists are what the mapped dataset stores anyway.
    tokenized_inputs = tokenizer(
        text,
        truncation=True,
        max_length=max_length
    )

    return tokenized_inputs


# Tokenize every split; batched=True hands tokenize_function a batch of
# examples per call instead of a single example.
tokenized_dataset = dataset.map(tokenize_function, batched=True)
tokenized_dataset

Map:   0%|          | 0/3463 [00:00<?, ? examples/s]

Map:   0%|          | 0/435 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 3463
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 435
    })
})

In [34]:
# Peek at the first five tokenized training examples
tokenized_dataset['train'][:5]

{'text': ['Evaluation of Weighted Nuclear Norm Minimization Algorithm for Ultrasound Image Denoising',
  'Quality-Aware Memory Network for Interactive Volumetric Image Segmentation',
  'vol2Brain: A New Online Pipeline for Whole Brain MRI Analysis',
  'A Toolchain for Privacy-Preserving Distributed Aggregation on Edge-Devices',
  'Hierarchical and Decentralised Federated Learning'],
 'label': [1, 1, 1, 0, 0],
 'input_ids': [[101,
   9312,
   1997,
   18215,
   4517,
   13373,
   7163,
   4328,
   9276,
   9896,
   2005,
   27312,
   3746,
   7939,
   10054,
   2075,
   102],
  [101,
   3737,
   1011,
   5204,
   3638,
   2897,
   2005,
   9123,
   3872,
   12412,
   3746,
   6903,
   3370,
   102],
  [101,
   5285,
   2475,
   10024,
   2378,
   1024,
   1037,
   2047,
   3784,
   13117,
   2005,
   2878,
   4167,
   27011,
   4106,
   102],
  [101,
   1037,
   6994,
   24925,
   2078,
   2005,
   9394,
   1011,
   15224,
   5500,
   28041,
   2006,
   3341,
   1011,
   5733,
   102],


In [35]:
# Collator that pads the examples of each batch with the tokenizer's padding
# settings; the tokenized dataset itself stores unpadded sequences.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
data_collator

DataCollatorWithPadding(tokenizer=DistilBertTokenizerFast(name_or_path='distilbert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}
), padding=True, max_length=None, pad_to_multiple_of=None, return_tensors='pt')

In [48]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
# TODO: replace it with lighteval?
# accuracy = evaluate.load("accuracy")

def compute_metrics(pred, average='micro'):
    """Compute accuracy, F1, precision and recall for an evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (true class indices) and
            ``predictions`` (per-class scores, or a tuple whose first
            element holds them).
        average: Averaging mode forwarded to
            ``precision_recall_fscore_support``. Defaults to ``'micro'``,
            the previous hard-coded value. Note that for single-label
            classification micro-averaged precision, recall and F1 all
            equal accuracy, so the four reported numbers are identical;
            pass ``'macro'`` or ``'weighted'`` for per-class-sensitive
            scores.

    Returns:
        dict with the keys ``'accuracy'``, ``'f1'``, ``'precision'`` and
        ``'recall'``.
    """
    labels = pred.label_ids
    scores = pred.predictions
    # Some models return a tuple of outputs; the class scores come first.
    if isinstance(scores, tuple):
        scores = scores[0]
    predictions = scores.argmax(-1)
    # TODO: weighted when not using 1% sample that skips some labels
    # zero_division=0 keeps the score at 0 (the library's fallback value) for
    # classes without predictions or samples, without emitting a warning.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, predictions, average=average, zero_division=0
    )
    accuracy = accuracy_score(labels, predictions)
    return {
        'accuracy': accuracy, 
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [49]:
# Training configuration: checkpoints go to --model_dir, logs to
# <--output_data_dir>/logs, and evaluation runs at the end of every epoch.
training_args = TrainingArguments(
    output_dir=args.model_dir,
    num_train_epochs=args.epochs,
    # NOTE(review): while the next three lines stay commented out, the
    # --train_batch_size, --eval_batch_size and --warmup_steps flags are parsed
    # but never forwarded, so the library defaults apply instead.
    # per_device_train_batch_size=args.train_batch_size,
    # per_device_eval_batch_size=args.eval_batch_size,
    # warmup_steps=args.warmup_steps,
    eval_strategy='epoch',
    logging_dir=f'{args.output_data_dir}/logs',
    # float() tolerates a string-valued args.learning_rate
    learning_rate=float(args.learning_rate),
)
training_args

TrainingArguments(
_n_gpu=0,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=True,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=None,
eval_strategy=IntervalStrategy.EPOCH,
eval_

In [50]:
# Wire model, data and metric computation into a single Trainer.
# NOTE(review): the 'test' split is the per-epoch evaluation set here.
trainer = Trainer(
    # what to train, and with which settings
    model=model,
    args=training_args,
    # data
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    # batching / preprocessing helpers
    processing_class=tokenizer,
    data_collator=data_collator,
    # evaluation metrics
    compute_metrics=compute_metrics,
)

In [51]:
# Run fine-tuning; the returned TrainOutput is displayed as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,2.222072,0.618391,0.618391,0.618391,0.618391
2,0.182900,2.147113,0.664368,0.664368,0.664368,0.664368
3,0.180000,2.17043,0.666667,0.666667,0.666667,0.666667


TrainOutput(global_step=1299, training_loss=0.15539860743756842, metrics={'train_runtime': 743.4036, 'train_samples_per_second': 13.975, 'train_steps_per_second': 1.747, 'total_flos': 70568842650768.0, 'train_loss': 0.15539860743756842, 'epoch': 3.0})

In [53]:
# Final evaluation pass on the 'test' split; the resulting dict holds the loss
# and the compute_metrics values under 'eval_'-prefixed keys.
eval_result = trainer.evaluate(eval_dataset=tokenized_dataset['test'])
eval_result

{'eval_loss': 2.1704301834106445,
 'eval_accuracy': 0.6666666666666666,
 'eval_f1': 0.6666666666666666,
 'eval_precision': 0.6666666666666666,
 'eval_recall': 0.6666666666666666,
 'eval_runtime': 4.9615,
 'eval_samples_per_second': 87.675,
 'eval_steps_per_second': 11.085,
 'epoch': 3.0}