In [1]:
import os

# os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
# os.environ["CUDA_VISIBLE_DEVICES"] = ""
# os.environ['TORCH_CUDA_ALLOC_SYNC'] = '1'

from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, BertTokenizerFast, TrainerCallback
from torch.utils.data import Dataset
from torch import tensor, cuda
from torch.nn.functional import softmax
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import evaluate
import optuna
from optuna.pruners import MedianPruner
from optuna.integration import TensorBoardCallback
import math

In [2]:
class CSVDataset(Dataset):
    def __init__(self, bindings, labels=None, device='cpu'):
        self.bindings = bindings
        self.labels = labels
        self.device = device

    def __len__(self):
        return len(self.bindings["input_ids"])

    def __getitem__(self, idx):
        item = {k: tensor(v[idx]).to(self.device) for k, v in self.bindings.items()}
        if self.labels:
            item['labels'] = tensor(self.labels[idx] - 1).to(self.device)
        return item

class OptunaPruningCallback(TrainerCallback):
    def __init__(self, trial, metric_name):
        self.trial = trial
        self.metric_name = metric_name

    def on_evaluate(self, args, state, control, metrics, **kwargs):
        epoch = math.ceil(state.epoch)
        value = metrics.get(self.metric_name)
        self.trial.report(value, epoch)
        if self.trial.should_prune():
            raise optuna.TrialPruned()

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [46]:
model_name = "onlplab/alephbert-base"
num_labels = 20
output_dir = "test_trainer"
eval_strategy = "epoch"
csv_file = "updated_final.csv"
device = 'cuda:0' if cuda.is_available() else 'cpu'
metric = evaluate.load("accuracy")
tokenizer = BertTokenizerFast.from_pretrained(model_name)

ds = pd.read_csv(csv_file, sep='\t')
ds = ds.dropna()

X = list("[CLS]" + ds['sentence'] + "[SEP]" + ds['part'] + "[SEP]")
y = list(ds['category'].astype(int))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)
X_train_tokenized = tokenizer(X_train, padding=True, truncation=True, max_length=512)
X_test_tokenized = tokenizer(X_test, padding=True, truncation=True, max_length=512)

train_ds = CSVDataset(X_train_tokenized, y_train, device=device)
test_ds = CSVDataset(X_test_tokenized, y_test, device=device)

print(f'{ds.shape[0]} entries in dataset')
ds.head()

987 entries in dataset


Unnamed: 0,sentence,part,category
0,"החקלאי יצא לרסס את מטע הזיתים שלו , ומשלא שב ל...",מטע הזיתים,12.0
1,"החקלאי יצא לרסס את מטע הזיתים שלו , ומשלא שב ל...",קריאות הטלפון,11.0
2,"החקלאי יצא לרסס את מטע הזיתים שלו , ומשלא שב ל...",רוח חיים,18.0
3,"גופתו של חקלאי בן חמישים ושלוש , תושב כפר סבא ...",בן חמישים ושלוש,17.0
4,"גופתו של חקלאי בן חמישים ושלוש , תושב כפר סבא ...",תושב כפר סבא,16.0


In [48]:
def optimize_hyperparameters():
    print(f"Optimizing hyperparameters for {model_name} on {device} with {csv_file} dataset")

    def objective(trial):
        model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=20)
        learning_rate=trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True)
        weight_decay=trial.suggest_float("weight_decay", 1e-4, 1e-1)
        per_device_train_batch_size=trial.suggest_categorical("per_device_train_batch_size", [8, 16, 32, 64, 128])
        warmup_steps=trial.suggest_int("warmup_steps", 0, 500)
        warmup_ratio=trial.suggest_float("warmup_ratio", 0.0, 0.2)
        num_train_epochs=trial.suggest_int("num_train_epochs", 2, 10)
        
        training_args = TrainingArguments(
            output_dir="./results",
            learning_rate=learning_rate,
            weight_decay=weight_decay,
            per_device_train_batch_size=per_device_train_batch_size,
            warmup_steps=warmup_steps,
            warmup_ratio=warmup_ratio,
            num_train_epochs=num_train_epochs,
            eval_strategy="epoch",
            save_strategy="epoch",
            logging_strategy="epoch"
        )

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_ds,
            eval_dataset=test_ds,
            tokenizer=tokenizer,
            compute_metrics=compute_metrics,
            callbacks=[OptunaPruningCallback(trial, "eval_accuracy")],
        )

        trainer.train()
        out = trainer.evaluate()
        return out['eval_accuracy']

    study = optuna.create_study(direction="maximize", pruner=MedianPruner())
    study.optimize(objective, n_trials=300)
    print("Best Hyperparameters:\n", study.best_params)
    return study

study = optimize_hyperparameters()

[I 2024-08-27 19:36:21,075] A new study created in memory with name: no-name-94eeecc8-3575-4ccb-a056-76483c7d9697


Optimizing hyperparameters for onlplab/alephbert-base on cpu with updated_final.csv dataset


[W 2024-08-27 19:36:22,118] Trial 0 failed with parameters: {} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "c:\Users\liora\anaconda3\envs\projectenv\lib\site-packages\optuna\study\_optimize.py", line 196, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\liora\AppData\Local\Temp\ipykernel_26732\2085749705.py", line 5, in objective
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=20)
  File "c:\Users\liora\anaconda3\envs\projectenv\lib\site-packages\transformers\models\auto\auto_factory.py", line 484, in from_pretrained
    resolved_config_file = cached_file(
  File "c:\Users\liora\anaconda3\envs\projectenv\lib\site-packages\transformers\utils\hub.py", line 399, in cached_file
    resolved_file = hf_hub_download(
  File "c:\Users\liora\anaconda3\envs\projectenv\lib\site-packages\huggingface_hub\utils\_validators.py", line 114, in _inner_fn
    return fn(*args, **kwargs)
  File "c:\User

KeyboardInterrupt: 

In [5]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=20)

training_args = TrainingArguments(
    output_dir="output",
    learning_rate=study.best_params['learning_rate'],
    weight_decay=study.best_params['weight_decay'],
    per_device_train_batch_size=study.best_params['per_device_train_batch_size'],
    warmup_steps=study.best_params['warmup_steps'],
    warmup_ratio=study.best_params['warmup_ratio'],
    num_train_epochs=study.best_params['num_train_epochs'],
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    compute_metrics=compute_metrics,
)

trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at onlplab/alephbert-base and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RuntimeError: CUDA error: an illegal memory access was encountered
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [37]:
def load_model(save=False):
    if save:
        trainer.save_model('./trained_model')
    return AutoModelForSequenceClassification.from_pretrained('./trained_model')

trained_model = load_model(save=True)
trained_model.evaluate()

  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 2.4165635108947754, 'eval_accuracy': 0.3465346534653465, 'eval_runtime': 4.9265, 'eval_samples_per_second': 20.502, 'eval_steps_per_second': 2.639, 'epoch': 4.0}


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(52000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [41]:
sentence = "קובי נסע לכיכר הרצל בלוד"
# sentence = "זרקו שם מטען חבלה"
input = tokenizer(sentence, padding=True, truncation=True, return_tensors='pt')
output = trained_model(**input)
print(output)
pred = softmax(output.logits, dim=-1).detach().numpy()
print(pred)
print(np.argmax(pred) + 1)

SequenceClassifierOutput(loss=None, logits=tensor([[ 0.1365, -0.1568, -0.4376,  0.1788, -0.5550, -0.3552, -0.6783,  0.0173,
         -1.0358,  0.6358, -0.6998, -1.8162, -0.4686,  2.8654,  4.7651,  0.1125,
         -0.6668, -1.7059,  0.1414, -0.9894]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)
[[0.00772378 0.00575987 0.00435009 0.00805722 0.00386803 0.0047237
  0.00341949 0.00685559 0.00239168 0.01272504 0.00334669 0.00109589
  0.00421709 0.11828709 0.79068893 0.00754044 0.00345895 0.00122368
  0.00776161 0.00250517]]
15
