In [0]:
import numpy as np
import pandas as pd
import evaluate
from transformers import BertTokenizerFast, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
from transformers import AutoModelForSequenceClassification
from transformers import pipeline
from datasets import Dataset, Features

In [0]:
# choose model
# Hugging Face checkpoint name; the tokenizer and the classifier are both loaded from it.
model_name = "bert-base-uncased"
max_length = 512  # max sequence length for each document/sentence sample (512 is the position limit of bert-base-uncased)

In [0]:
# load data
# Both CSVs are read with their first column as the index; only the
# "text" and "label" columns are carried into the datasets.
merged_df = pd.read_csv("data/software_citation_intent_merged.csv", index_col=0)
czi_df = pd.read_csv("data/software_citation_intent_czi.csv", index_col=0)

# NOTE(review): if the merged file already contains the CZI rows, eval_dataset
# overlaps the training data -- confirm before reporting numbers on it.
train_dataset = Dataset.from_pandas(merged_df[["text", "label"]])
eval_dataset = Dataset.from_pandas(czi_df[["text", "label"]])

# Hold out 20% for validation. The seed is fixed so that the split (and the
# metrics computed on it) is identical after Restart Kernel -> Run All.
split_dataset = train_dataset.train_test_split(test_size=0.2, seed=42)

In [0]:
# Size of the held-out validation split.
split_dataset["test"].num_rows

838

In [0]:
# tokenize datasets

tokenizer = BertTokenizerFast.from_pretrained(model_name, do_lower_case=True)

def tokenize_function(examples):
    """Tokenize a batch of examples for BERT.

    Reads the "text" field of ``examples`` and pads/truncates every entry to
    exactly ``max_length`` tokens (the notebook-level setting), so that the
    configured limit is actually the one applied.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )

# Train/validation splits and the separate CZI set go through the same tokenization.
tokenized_datasets = split_dataset.map(tokenize_function, batched=True)
tokenized_eval_dataset = eval_dataset.map(tokenize_function, batched=True)

Map:   0%|          | 0/3350 [00:00<?, ? examples/s]

Map:   0%|          | 0/838 [00:00<?, ? examples/s]

Map:   0%|          | 0/374 [00:00<?, ? examples/s]

In [0]:
# init
# Pretrained encoder with a freshly initialised 4-way classification head.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=4,
)
model.to("cuda:0")

# Evaluate once per epoch; never write checkpoints.
training_args = TrainingArguments(
    output_dir="./tmp/sentence_intent",
    evaluation_strategy="epoch",
    save_strategy="no",
)

# Metric objects are cached so that `evaluate.load` runs once per metric
# instead of four times on every evaluation pass.
_METRIC_CACHE = {}


def _get_metric(name):
    """Return the `evaluate` metric called ``name``, loading it on first use only."""
    if name not in _METRIC_CACHE:
        _METRIC_CACHE[name] = evaluate.load(name)
    return _METRIC_CACHE[name]


def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged precision, recall and F1.

    Args:
        eval_pred: a pair that unpacks to ``(logits, labels)``. The predicted
            class of each sample is the argmax of its logits over the last axis.

    Returns:
        dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    avg_strategy = "macro"  # alternatives: "micro", "weighted"
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    result = {
        "accuracy": _get_metric("accuracy").compute(
            predictions=predictions, references=labels
        )["accuracy"]
    }
    # The remaining metrics all take the same averaging strategy.
    for name in ("precision", "recall", "f1"):
        result[name] = _get_metric(name).compute(
            predictions=predictions, references=labels, average=avg_strategy
        )[name]
    return result

# Trains on the "train" split and evaluates on the held-out "test" split of
# tokenized_datasets; the separately tokenized CZI set (tokenized_eval_dataset)
# is not passed to the trainer here.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    compute_metrics=compute_metrics,  # adds accuracy/precision/recall/f1 to each evaluation
)

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [0]:
# Fine-tune the model; with evaluation_strategy="epoch" the metrics table gets one row per epoch.
trainer.train()



Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.332384,0.904535,0.86028,0.873238,0.864831
2,0.486200,0.304749,0.920048,0.874051,0.89653,0.884466
3,0.199200,0.431962,0.912888,0.862457,0.882746,0.870745


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.55k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

TrainOutput(global_step=1257, training_loss=0.2901861696083961, metrics={'train_runtime': 330.1789, 'train_samples_per_second': 30.438, 'train_steps_per_second': 3.807, 'total_flos': 2644313586532352.0, 'train_loss': 0.2901861696083961, 'epoch': 3.0})

In [0]:
# Wrap the fine-tuned model for inference. The pipeline runs on whichever
# device the model already lives on, rather than repeating a hard-coded
# device string that could drift out of sync with the model's placement.
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer, device=model.device)

Using /root/.cache/torch_extensions/py310_cu118 as PyTorch extensions root...
Creating extension directory /root/.cache/torch_extensions/py310_cu118/cuda_kernel...
Detected CUDA files, patching ldflags
Emitting ninja build file /root/.cache/torch_extensions/py310_cu118/cuda_kernel/build.ninja...
Building extension module cuda_kernel...
Allowing ninja to set a default number of workers... (overridable by setting the environment variable MAX_JOBS=N)
Loading extension module cuda_kernel...
Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.


In [0]:
# Sanity check: prediction for the first CZI row next to its gold label.
first_row = czi_df.iloc[0]
(pipe(first_row["text"]), first_row["label"])

([{'label': 'LABEL_1', 'score': 0.9991515874862671}], 1)