# Finetuning DistilBERT for citation intent classification

## Data preprocessing

In [None]:
from datasets import Dataset
import pandas as pd

# Read the merged software-citation-intent corpus. The path is relative to
# the kernel's working directory.
merged_csv_path = '../data/software_citation_intent_merged.csv'
df = pd.read_csv(merged_csv_path)
df

Unnamed: 0.1,Unnamed: 0,id,sentence,used,created,mention,context,label,text
0,0,PMC5189946,All of this analysis was implemented using Mat...,False,True,False,,0,All of this analysis was implemented using Mat...
1,1,PMC4511233,"Code for calculating partition similarity, obt...",False,True,False,Since the probability of getting a given MI is...,0,"Code for calculating partition similarity, obt..."
2,2,PMC4186879,All behavioral statistical analyses were perfo...,False,False,True,All behavioral statistical analyses were perfo...,2,All behavioral statistical analyses were perfo...
3,3,PMC5026371,"M-Track was written using Python 2.7, OpenCV 3...",True,False,False,,1,"M-Track was written using Python 2.7, OpenCV 3..."
4,4,PMC1283974,"Mindboggle is a freely downloadable, open sour...",False,True,False,"Mindboggle is a freely downloadable, open sour...",0,"Mindboggle is a freely downloadable, open sour..."
...,...,...,...,...,...,...,...,...,...
4183,995,PMC2936424,Study sample\n,False,False,False,,3,Study sample\n
4184,996,PMC3660501,U0216 and Wortmannin were dissolved in PBS whe...,False,False,False,,3,U0216 and Wortmannin were dissolved in PBS whe...
4185,997,PMC4213368,The linker unit serves as an acceptor for the ...,False,False,False,,3,The linker unit serves as an acceptor for the ...
4186,998,PMC4451992,The Training and Recourse Center of CIDI in Be...,False,False,False,,3,The Training and Recourse Center of CIDI in Be...


In [None]:
# Keep only the model input (`text`) and target (`label`), then shuffle all
# rows with a fixed random state and renumber the index from zero.
text_label_df = df[['text', 'label']]
dataset = (
    text_label_df
    .sample(n=len(text_label_df), random_state=42)
    .reset_index(drop=True)
)
dataset

Unnamed: 0,text,label
0,All statistical analyses were performed with S...,1
1,"The subjects underwent prenatal follow-up, and...",3
2,The data were analyzed with SPSS software (IBM...,1
3,We developed an automated image analysis algor...,0
4,Categorizing the free response answers involve...,3
...,...,...
4183,These studies have not been included in the pr...,3
4184,Hierarchical Bayesian parameter estimation usi...,1
4185,°C and tools offered by all providers through ...,2
4186,It measures land use intensity only of the ind...,3


In [None]:
# Class index <-> class name maps handed to the model config below.
# `label2id` is derived from `id2label` so the two cannot drift apart.
id2label = {0: "created", 1: "used", 2: "mention", 3: "none"}
label2id = {name: idx for idx, name in id2label.items()}

In [None]:
# Convert the shuffled pandas frame into a Hugging Face `Dataset`. Note that
# this rebinds the name `dataset` from a DataFrame to a Dataset.
dataset = Dataset.from_pandas(dataset)
dataset

Dataset({
    features: ['text', 'label'],
    num_rows: 4188
})

In [None]:
# Sanity check: show the type `dataset` has before the train/test split.
type(dataset)

datasets.arrow_dataset.Dataset

In [None]:
# Hold out 20% of the corpus as the evaluation split. The fixed seed keeps
# the split, and therefore every score reported further down, reproducible
# across kernel restarts.
dataset = dataset.train_test_split(test_size=0.2, seed=42)

In [None]:
# Sanity check: show the type `dataset` has after the train/test split.
type(dataset)

datasets.dataset_dict.DatasetDict

In [None]:
# Peek at the first training example (a dict with `text` and `label`).
dataset['train'][0]

{'text': 'Data linkage was completed using Matlab.', 'label': 2}

### Load tokenizer

In [None]:
# Import only the name this cell needs; a wildcard import hides where names
# come from and can silently shadow notebook variables.
from transformers import AutoTokenizer

# Tokenizer for the same checkpoint that is fine-tuned below.
tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')


Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/6cdc0aad91f5ae2e6712e91bc7b65d1cf5c05411/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.31.0",
  "vocab_size": 30522
}



Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

loading file vocab.txt from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/6cdc0aad91f5ae2e6712e91bc7b65d1cf5c05411/vocab.txt
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/6cdc0aad91f5ae2e6712e91bc7b65d1cf5c05411/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/6cdc0aad91f5ae2e6712e91bc7b65d1cf5c05411/tokenizer_config.json
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/6cdc0aad91f5ae2e6712e91bc7b65d1cf5c05411/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropo

### Preprocessing - truncation

In [None]:
def preprocess_function(examples):
    """Tokenize the ``text`` field of a batch of examples.

    Args:
        examples: Mapping with a ``"text"`` entry; passed to ``dataset.map``
            in batched mode, so the entry holds several texts at once.

    Returns:
        The output of the module-level ``tokenizer`` called with
        ``truncation=True``.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

In [None]:
# Apply `preprocess_function` to every split of the DatasetDict in batched
# mode; the result holds the tokenized train and test splits.
tokenizer_dataset = dataset.map(preprocess_function, batched=True)

Map:   0%|          | 0/3350 [00:00<?, ? examples/s]

Map:   0%|          | 0/838 [00:00<?, ? examples/s]

### Preprocessing - padding

In [None]:
from transformers import DataCollatorWithPadding

# Collator built from the tokenizer; it is handed to the Trainer below so
# that tokenized examples are padded when batches are assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Train Model

In [None]:
import evaluate

# Metric object from the `evaluate` library; `compute_metrics` below calls
# `accuracy.compute(...)`, so this name must keep pointing at the metric.
accuracy = evaluate.load("accuracy")

In [None]:
import numpy as np


def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level ``accuracy`` metric.

    Args:
        eval_pred: Two-item sequence ``(model outputs, reference labels)``.
            The model outputs are reduced with ``argmax`` over axis 1.

    Returns:
        The result of ``accuracy.compute`` for the predicted class ids
        against the reference labels.
    """
    logits, references = eval_pred
    predicted_ids = np.argmax(logits, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
# model = BertForSequenceClassification.from_pretrained('allenai/scibert_scivocab_uncased', num_labels=4,)
# DistilBERT with a sequence-classification head. The number of labels is
# read from the label map instead of being repeated as a literal.
model = AutoModelForSequenceClassification.from_pretrained(
    'distilbert-base-uncased',
    id2label=id2label,
    label2id=label2id,
    num_labels=len(id2label),
)


loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/6cdc0aad91f5ae2e6712e91bc7b65d1cf5c05411/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "created",
    "1": "used",
    "2": "mention",
    "3": "none"
  },
  "initializer_range": 0.02,
  "label2id": {
    "created": 0,
    "mention": 2,
    "none": 3,
    "used": 1
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.31.0",
  "vocab_size": 30522
}



Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/6cdc0aad91f5ae2e6712e91bc7b65d1cf5c05411/model.safetensors
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification 

In [None]:
# Display the module tree of the freshly loaded classifier.
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [None]:
# Show which device the model parameters live on before moving them.
model.device

device(type='cpu')

In [None]:
import torch

# Use the first GPU when one is available and fall back to the CPU
# otherwise, so the notebook still runs on machines without CUDA.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
model.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [None]:
# Confirm where the model parameters live after the `.to(device)` call.
model.device

device(type='cuda', index=0)

In [None]:
# Fine-tuning configuration: evaluate once per epoch and write no checkpoints.
training_args = TrainingArguments(
    output_dir="./tmp/",
    num_train_epochs=20,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=10,
    per_device_eval_batch_size=10,
    evaluation_strategy="epoch",
    save_strategy="no",
)

# The held-out "test" split doubles as the per-epoch evaluation set.
trainer_kwargs = dict(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenizer_dataset["train"],
    eval_dataset=tokenizer_dataset["test"],
)
trainer = Trainer(**trainer_kwargs)

trainer.train()

Found safetensors installation, but --save_safetensors=False. Safetensors should be a preferred weights saving format due to security and performance reasons. If your model cannot be saved by safetensors please feel free to open an issue at https://github.com/huggingface/safetensors!
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 3,350
  Num Epochs = 20
  Instantaneous batch size per device = 10
  Total train batch size (w. parallel,

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.353816,0.871122
2,0.483800,0.320744,0.896181
3,0.179300,0.375301,0.899761
4,0.179300,0.527703,0.880668
5,0.069600,0.526273,0.899761
6,0.032000,0.581543,0.894988
7,0.032000,0.601787,0.898568
8,0.008300,0.667022,0.892601
9,0.007000,0.655201,0.903341
10,0.007000,0.653594,0.903341


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 838
  Batch size = 10
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 838
  Batch size = 10
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: text. If text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 83

TrainOutput(global_step=6700, training_loss=0.05862445879546898, metrics={'train_runtime': 527.0027, 'train_samples_per_second': 127.134, 'train_steps_per_second': 12.713, 'total_flos': 1353200145793024.0, 'train_loss': 0.05862445879546898, 'epoch': 20.0})

## Validation on CZI dataset

In [None]:
from datasets import Dataset
import pandas as pd

# Read the external CZI validation set. The path is relative to the
# kernel's working directory.
czi_csv_path = '../data/czi_val_merged.csv'
df_evaluate = pd.read_csv(czi_csv_path)
df_evaluate

Unnamed: 0,doi_link,source,text,software,intent,label
0,https://doi.org/10.1371/journal.pone.0043845,10. Confocal microscopy,Confocal laser scanning was performed on a Oly...,Fluoview,used,usage
1,https://doi.org/10.1155/2014/395212,5. Simulation Results and Analysis,"In Figure 11, p miss of SVM-SMP is nearly equa...",SVM,used,usage
2,https://doi.org/10.3389/fphys.2020.587057,Network Architecture and Training Parameters,A pattern recognition network was created usin...,MATLAB,used,usage
3,https://doi.org/10.1186/s13046-018-0997-7,paper_abstract,"Finally, we investigated the potential molecul...",analysis (GSEA),used,usage
4,https://doi.org/10.7717/peerj.9470,Conclusions,The GitHub repository for this study has been ...,Python,used,usage
...,...,...,...,...,...,...
405,https://joss.theoj.org/papers/10.21105/joss.05556,title,PASCal Python: A Principal Axis Strain Calculator,PASCal Python,creation,creation
406,https://joss.theoj.org/papers/10.21105/joss.05313,title,QuaC: A Pipeline Implementing Quality Control ...,QuaC,creation,creation
407,https://joss.theoj.org/papers/10.21105/joss.05251,title,ReSurfEMG: A Python library for preprocessing ...,ReSurfEMG,creation,creation
408,https://joss.theoj.org/papers/10.21105/joss.05562,title,DARTS: The Data Analysis Remote Treatment Service,DARTS,creation,creation


In [None]:
# Plain Python list of the CZI sentences, in row order, for the pipeline.
text = df_evaluate['text'].tolist()

In [None]:
from transformers import pipeline

# Inference is run on the CPU copy of the fine-tuned model.
model = model.to('cpu')

# "text-classification" is the canonical task name ("sentiment-analysis" is
# only an alias for it) and describes this 4-class intent model correctly.
# `truncation=True` mirrors the tokenization used for training, so inputs
# longer than the model's maximum sequence length are cut instead of failing.
classifier = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    truncation=True,
)
classifier(text)

Disabling tokenizer parallelism, we're using DataLoader multithreading already


[{'label': 'used', 'score': 0.9999872446060181},
 {'label': 'none', 'score': 0.999983549118042},
 {'label': 'mention', 'score': 0.9999443292617798},
 {'label': 'none', 'score': 0.839496374130249},
 {'label': 'mention', 'score': 0.9994470477104187},
 {'label': 'used', 'score': 0.9999864101409912},
 {'label': 'used', 'score': 0.9999853372573853},
 {'label': 'mention', 'score': 0.999943733215332},
 {'label': 'used', 'score': 0.9999845027923584},
 {'label': 'used', 'score': 0.9999880790710449},
 {'label': 'used', 'score': 0.9999847412109375},
 {'label': 'used', 'score': 0.9988715052604675},
 {'label': 'used', 'score': 0.9999755620956421},
 {'label': 'used', 'score': 0.9999852180480957},
 {'label': 'used', 'score': 0.9999881982803345},
 {'label': 'used', 'score': 0.9999865293502808},
 {'label': 'used', 'score': 0.9999847412109375},
 {'label': 'used', 'score': 0.9999810457229614},
 {'label': 'used', 'score': 0.9999873638153076},
 {'label': 'used', 'score': 0.9999884366989136},
 {'label': 'us

In [None]:
# Keep only the columns needed for scoring. The explicit copy makes the
# result an independent frame, so the label remapping in a later cell does
# not trigger pandas' SettingWithCopyWarning.
df_evaluate = df_evaluate[['text', 'label']].copy()
df_evaluate

Unnamed: 0,text,label
0,Confocal laser scanning was performed on a Oly...,usage
1,"In Figure 11, p miss of SVM-SMP is nearly equa...",usage
2,A pattern recognition network was created usin...,usage
3,"Finally, we investigated the potential molecul...",usage
4,The GitHub repository for this study has been ...,usage
...,...,...
405,PASCal Python: A Principal Axis Strain Calculator,creation
406,QuaC: A Pipeline Implementing Quality Control ...,creation
407,ReSurfEMG: A Python library for preprocessing ...,creation
408,DARTS: The Data Analysis Remote Treatment Service,creation


In [None]:
# List the distinct label strings used by the CZI file.
set(df_evaluate['label'].values.tolist())

{'creation', 'mention', 'none', 'usage'}

In [None]:
# Translate the CZI label vocabulary into the names used by `label2id`.
# Labels already in the target vocabulary are left alone so the cell can be
# re-run safely; any unknown label still raises KeyError. Building a new
# frame with `assign` avoids writing into a column slice
# (SettingWithCopyWarning).
map_label = {'usage':'used', 'creation':'created', 'none': 'none', 'mention': 'mention'}
model_label_names = set(map_label.values())
df_evaluate = df_evaluate.assign(
    label=[
        name if name in model_label_names else map_label[name]
        for name in df_evaluate['label']
    ]
)

In [None]:
# Display the validation frame after the label names were remapped.
df_evaluate

Unnamed: 0,text,label
0,Confocal laser scanning was performed on a Oly...,used
1,"In Figure 11, p miss of SVM-SMP is nearly equa...",used
2,A pattern recognition network was created usin...,used
3,"Finally, we investigated the potential molecul...",used
4,The GitHub repository for this study has been ...,used
...,...,...
405,PASCal Python: A Principal Axis Strain Calculator,created
406,QuaC: A Pipeline Implementing Quality Control ...,created
407,ReSurfEMG: A Python library for preprocessing ...,created
408,DARTS: The Data Analysis Remote Treatment Service,created


In [None]:
import tqdm

# Classify the CZI sentences, count exact label matches and print every
# misclassified sentence together with the predicted label and its score.
classifier_list = classifier(text)
gold_labels = df_evaluate['label'].tolist()
gold_texts = df_evaluate['text'].tolist()

cnt = 0
rows = zip(classifier_list, gold_labels, gold_texts)
for pred, gold, sentence in tqdm.tqdm(rows, total=len(classifier_list)):
    if pred['label'] == gold:
        cnt += 1
        continue
    print('-'*80)
    print('[text]', sentence, '[label]:', gold)
    print('[model label]:', pred['label'], '[pred score]:', pred['score'])
print('-'*80)
print('correct:', cnt, 'total:', len(df_evaluate))
print('accuracy:', cnt/len(df_evaluate))


  0%|          | 0/410 [00:00<?, ?it/s]100%|██████████| 410/410 [00:00<00:00, 37047.37it/s]

--------------------------------------------------------------------------------
[text] In Figure 11, p miss of SVM-SMP is nearly equal to 0, which is much better than SVM-LA [label]: used
[model label]: none [pred score]: 0.999983549118042
--------------------------------------------------------------------------------
[text] A pattern recognition network was created using MATLAB r2017b to study the performance of the extracted gait parameters [label]: used
[model label]: mention [pred score]: 0.9999443292617798
--------------------------------------------------------------------------------
[text] Finally, we investigated the potential molecular mechanism of ARNTL by gene set enrichment analysis (GSEA), dual Luciferase reporter assay and chromatin immunoprecipitation assay [label]: used
[model label]: none [pred score]: 0.839496374130249
--------------------------------------------------------------------------------
[text] The GitHub repository for this study has been created, inclu




In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, multilabel_confusion_matrix, accuracy_score

In [None]:
# Macro-averaged precision/recall/F1 and plain accuracy on the CZI set.
# Gold and predicted label names are both converted to class ids first.
true_labels = [label2id[name] for name in df_evaluate['label'].values.tolist()]
predicted_labels = [label2id[pred['label']] for pred in classifier_list]

precision = precision_score(true_labels, predicted_labels, average='macro')
recall = recall_score(true_labels, predicted_labels, average='macro')
f1 = f1_score(true_labels, predicted_labels, average='macro')
# Stored under its own name: rebinding `accuracy` to a float would clobber
# the `evaluate` metric object that `compute_metrics` calls, breaking any
# later re-run of training.
czi_accuracy = accuracy_score(true_labels, predicted_labels)
print('precision:', precision)
print('recall:', recall)
print('f1:', f1)
print('accuracy:', czi_accuracy)

precision: 0.4703534913212332
recall: 0.4986077481840194
f1: 0.4821389042612064
accuracy: 0.8292682926829268


## Evaluate on test dataset

In [None]:
# Pull the raw sentences of the held-out split as a list for the pipeline.
test = dataset['test']
test_text = list(test['text'])
test_text

['The Statistical Package for the Social Sciences (SPSS Inc., Chicago, IL) was used for statistical calculations. ',
 'Data were analysed using STATA 12.0 (Stata Corp, College Station, TX, USA). ',
 'The Matlab statistics toolbox was used for statistical analysis of the intensity data of the 25 hybridizations from five different treatments (SC, F, B, I, F+I or B+I). ',
 'Between updates these target Q-values remain unchanged and provide some much needed stability.',
 'The physicians were asked to rate each alarm code.',
 'All interactions with p-values<0.0001 were included along with all predictors in the regression models of the SL library.',
 'A moderated hierarchical regression was performed using SPSS (IBM) to estimate the amount of variance in child birthweight explained by stressor and distress variables separately and by the interaction among stressors and maternal characteristics.',
 'Similar questions to ascertain asthma diagnosis have been used within the International Study 

In [None]:
# Classify the held-out sentences; this rebinds `classifier_list`, which
# previously held the CZI predictions.
classifier_list = classifier(test_text)

In [None]:
# Macro-averaged precision/recall/F1 and plain accuracy on the held-out
# split. Gold labels are already class ids; predictions are label names and
# are converted through `label2id`.
true_labels = [example['label'] for example in test]
predicted_labels = [label2id[pred['label']] for pred in classifier_list]

precision = precision_score(true_labels, predicted_labels, average='macro')
recall = recall_score(true_labels, predicted_labels, average='macro')
f1 = f1_score(true_labels, predicted_labels, average='macro')
# Stored under its own name: rebinding `accuracy` to a float would clobber
# the `evaluate` metric object that `compute_metrics` calls, breaking any
# later re-run of training.
test_accuracy = accuracy_score(true_labels, predicted_labels)
print('precision:', precision)
print('recall:', recall)
print('f1:', f1)
print('accuracy:', test_accuracy)

precision: 0.8463252028696164
recall: 0.851756134954961
f1: 0.8475591954341168
accuracy: 0.8973747016706444
