In [1]:
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
from datasets import load_dataset
from evaluate import load

In [2]:
# model_path = "roberta-large"
model_path = "microsoft/deberta-v3-small"
data_path = "knowledgator/events_classification_biotech"

In [3]:
events_class = load_dataset(data_path)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [4]:
events_class

DatasetDict({
    train: Dataset({
        features: ['title', 'content', 'target organization', 'all_labels', 'all_labels_concat', 'label 1', 'label 2', 'label 3', 'label 4', 'label 5'],
        num_rows: 2759
    })
    test: Dataset({
        features: ['title', 'content', 'target organization', 'all_labels', 'all_labels_concat', 'label 1', 'label 2', 'label 3', 'label 4', 'label 5'],
        num_rows: 381
    })
})

In [5]:
events_class['train'][0]

{'title': "Sarah Polley's Book Recommendations",
 'content': 'Drive Your Plow Over the Bones of The Dead\nby Olga Tokarczuk. I am an incredibly slow reader, but the tone and specificity of the world she creates in this book was something I couldnt leave behind until it was done. Also: All We Sawby Anne Michaels, Fight Nightby Miriam Toews, and The Summer Before the Darkby Doris Lessing.\nId like turned into a Netflix show:\nby Amia Srinivasan. One of the most brain-shattering books Ive ever read. Her thinking is so electrically rigorous and fearless. (I double DARE them to make this into a Netflix show!)\n...I last bought:\n. I rediscovered her poetry lately, and I feel like I dont want to read anything else for a while. She owns desire and submerged things.\n...has the greatest ending:\nby J.D. Salinger. The last page always leaves me breathless. The intimacy and truth of that final page is so arresting and almost painful to read.\nshould be on every college syllabus:\nby Anton Piatig

In [6]:
set([event['label 5'] for event in events_class['train']])

{1, 16, 18, 22, 27, 3, None}

In [7]:
classes = [class_ for class_ in events_class['train'].features['label 1'].names if class_]
classes

['event organization',
 'executive statement',
 'regulatory approval',
 'hiring',
 'foundation',
 'closing',
 'partnerships & alliances',
 'expanding industry',
 'new initiatives or programs',
 'm&a',
 'service & product providing',
 'event organisation',
 'new initiatives & programs',
 'subsidiary establishment',
 'product launching & presentation',
 'product updates',
 'executive appointment',
 'alliance & partnership',
 'ipo exit',
 'article publication',
 'clinical trial sponsorship',
 'company description',
 'investment in public company',
 'other',
 'expanding geography',
 'participation in an event',
 'support & philanthropy',
 'department establishment',
 'funding round',
 'patent publication']

In [8]:
class2id = {class_:id for id, class_ in enumerate(classes)}
id2class = {id:class_ for class_, id in class2id.items()}

In [21]:
tokenizer = AutoTokenizer.from_pretrained(model_path)



In [22]:
from transformers import AutoConfig
model_config = AutoConfig.from_pretrained(model_path)
model_config

DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-small",
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 256,
  "relative_attention": true,
  "share_att_key": true,
  "transformers_version": "4.37.2",
  "type_vocab_size": 0,
  "vocab_size": 128100
}

In [23]:
def process(row):
    text = f"""{row['title']}.
            {row['content']}"""
    all_labels = row['all_labels_concat'].split(', ')
    labels = [0. for i in range(len(classes))]
    for label in all_labels:
        label_id = class2id[label]
        labels[label_id] = 1  # this is similar to 1-hot encoding
    row_tokened = tokenizer(text, truncation=True, max_length=512)
    row_tokened['labels'] = labels
    return row_tokened

In [24]:
events_processed = events_class.map(process)

Map:   0%|          | 0/2759 [00:00<?, ? examples/s]

Map:   0%|          | 0/381 [00:00<?, ? examples/s]

In [12]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Implementing metrics during training is super helpful for monitoring model performance over time. It can help avoid over-fitting and build a more general model.

In [13]:
import evaluate
import numpy as np

clf_metrics = evaluate.combine(["accuracy", "f1", "precision", "recall"])

def sigmoid(x):
   return 1/(1 + np.exp(-x))

def compute_metrics(eval_pred):
   predictions, labels = eval_pred
   predictions = sigmoid(predictions)
   predictions = (predictions > 0.5).astype(int).reshape(-1)
   return clf_metrics.compute(predictions=predictions,
                              references=labels.astype(int).reshape(-1))

In [14]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained(
    model_path, num_labels=len(classes),
    id2label=id2class, label2id=class2id,
    problem_type = "multi_label_classification"
)

  return self.fget.__get__(instance, owner)()
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [26]:
targs = TrainingArguments(
    output_dir="~/training_files/multi_class",
    num_train_epochs=2,
    per_device_train_batch_size=6,
    per_device_eval_batch_size=6,
    learning_rate=2e-5,
    weight_decay=0.01,
    save_strategy="epoch",
    evaluation_strategy="epoch",
    push_to_hub=False,
    report_to="none",
    load_best_model_at_end=True
)

In [27]:
trainer = Trainer(
    model=model,
    args=targs,
    eval_dataset=events_processed['test'],
    train_dataset=events_processed['train'],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer
)

In [28]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.148101,0.949081,0.338636,0.665179,0.227134
2,0.165900,0.139258,0.950831,0.386463,0.680769,0.269817


TrainOutput(global_step=920, training_loss=0.15845647480176842, metrics={'train_runtime': 166.6283, 'train_samples_per_second': 33.116, 'train_steps_per_second': 5.521, 'total_flos': 730412675431560.0, 'train_loss': 0.15845647480176842, 'epoch': 2.0})

In [29]:
trained_model_path = "/home/kamal/training_files/multi_class/checkpoint-920"
trained_model = AutoModelForSequenceClassification.from_pretrained(trained_model_path,
                                                                  num_labels=len(classes))

In [30]:
test_content = """The UK says it will provide thousands of "first-person view" drones, which give operators the situational awareness to target positions, armoured vehicles, and ships.
That type of UAV has proven highly effective on the battlefield since Russia's full-scale invasion of its neighbour two years ago, Britain's defence ministry said.
"The UK continues to do all we can to give Ukraine what it needs," Defence Secretary Grant Shapps said in a statement, ahead of meeting NATO counterparts in Brussels on Thursday.
Referring to the joint project with Latvia, he added: "Together, we will give Ukraine the capabilities it needs to defend itself and win this war."
At Thursday's NATO ministerial meeting, Britain, its 13 allies, and prospective member Sweden, will sign an agreement on two new procurement initiatives for munitions and missiles, London said.
Spearheaded by the UK, they aim to increase defence industrial capacity across the Euro-Atlantic area and replenish stockpiles significantly depleted by the war in Ukraine.
Other members involved in the procurement plans include France, Germany and Turkey, said defence ministry statement.
Britain is also working with alliance members Canada, Denmark, the Netherlands and the United States to deliver air-defence equipment to Ukraine, the ministry added."""

In [32]:
tokened_content = tokenizer(test_content, truncation=True, padding=True,
                           max_length=512, return_tensors='pt').to('cuda')
tokened_content

{'input_ids': tensor([[     1,    279,   1222,    652,    278,    296,    531,   2113,    265,
            307,   9150,    271,   8148,    866,    309,  13909,    261,    319,
            527,   6100,    262,  39427,   3250,    264,   1782,   3129,    261,
          47605,   2500,    261,    263,   5512,    260,    512,    810,    265,
          52305,    303,   3813,   1344,   1287,    277,    262,  17572,    515,
           2425,    280,    268,    540,    271,   5609,   9631,    265,    359,
          17704,    375,    388,    824,    261,   3491,    280,    268,   6506,
           4969,    357,    260,    307,    635,   1222,   2240,    264,    333,
            305,    301,    295,    264,    527,   7116,    339,    278,    634,
            261,    309,  11972,   3777,   6020,  12239,  40837,    357,    267,
            266,   1548,    261,   1645,    265,   1122,  10915,  11912,    267,
          10908,    277,   1561,    260,  30783,    264,    262,   3199,    663,
            27

In [33]:
trained_model.to('cuda')

DebertaV2ForSequenceClassification(
  (deberta): DebertaV2Model(
    (embeddings): DebertaV2Embeddings(
      (word_embeddings): Embedding(128100, 768, padding_idx=0)
      (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
      (dropout): StableDropout()
    )
    (encoder): DebertaV2Encoder(
      (layer): ModuleList(
        (0-5): 6 x DebertaV2Layer(
          (attention): DebertaV2Attention(
            (self): DisentangledSelfAttention(
              (query_proj): Linear(in_features=768, out_features=768, bias=True)
              (key_proj): Linear(in_features=768, out_features=768, bias=True)
              (value_proj): Linear(in_features=768, out_features=768, bias=True)
              (pos_dropout): StableDropout()
              (dropout): StableDropout()
            )
            (output): DebertaV2SelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=T

In [34]:
output = trained_model(**tokened_content)

In [59]:
preds = output.logits.to('cpu').detach().numpy()
predictions = sigmoid(preds)
predictions = (predictions > 0.5).astype(int).reshape(-1)

In [63]:
classes[predictions.argmax()]

'executive statement'