In [1]:
# Required packages
import torch
import re
import csv

In [2]:
# Sanity check: confirm that PyTorch imports and can create a tensor.
# NOTE(review): this does not verify GPU availability (the tensor is created on
# the default device); torch.cuda.is_available() would be the direct check.
print(torch.rand(5, 3))

tensor([[0.7061, 0.3341, 0.8402],
        [0.1260, 0.6928, 0.2567],
        [0.3885, 0.8542, 0.7658],
        [0.6511, 0.6868, 0.5815],
        [0.7998, 0.4384, 0.0408]])


In [3]:
# CSV files holding the annotated sentences; the loading cells read column index 1 of each row.
train_path = "/DEVEL/code/data/ner_trainsets/v1.0.0.9024/allo/cpes_rasa_vpv_2k.csv"
test_path = "/DEVEL/code/data/ner_trainsets/v1.0.0.9024/allo/cpes_rasa_vpv_100.csv"
# Training hyperparameters, passed to TrainingArguments as num_train_epochs and weight_decay.
num_epochs = 10
num_decay = 0.05

#### Load training and test sets

In [4]:
# Read the annotated training sentences: one example per CSV row, taken from
# the second column (index 1). Rows are collected directly into a list rather
# than concatenated into one big string and re-split on "\n", which was
# quadratic and split any field containing an embedded newline into several
# examples.
with open(train_path, newline="") as csvfile:  # newline="" is what the csv module expects
    raw_csv = csv.reader(csvfile, delimiter=',')
    # csv.reader yields [] for completely blank lines; skip them instead of
    # failing with IndexError on row[1].
    train_text = [row[1] for row in raw_csv if row]

In [5]:
# Read the annotated evaluation sentences: one example per CSV row, taken from
# the second column (index 1). Rows are collected directly into a list rather
# than concatenated into one big string and re-split on "\n", which was
# quadratic and split any field containing an embedded newline into several
# examples.
with open(test_path, newline="") as csvfile:  # newline="" is what the csv module expects
    raw_csv = csv.reader(csvfile, delimiter=',')
    # csv.reader yields [] for completely blank lines; skip them instead of
    # failing with IndexError on row[1].
    test_text = [row[1] for row in raw_csv if row]

In [6]:

def get_tokens_with_entities(raw_text: str) -> list:
    """Split an annotated sentence into ``(token, tag)`` pairs.

    Annotations use the notation ``[entity value](entity_name)``. Tokens that
    are not annotated are tagged ``"O"``. The words of an annotated value are
    tagged ``"B-<entity_name>"`` for the first word and ``"I-<entity_name>"``
    for every following word.

    Args:
        raw_text: The annotated sentence.

    Returns:
        A list of ``(token, tag)`` tuples in sentence order.
    """
    # Split on whitespace, except when a "]" follows before any "[": that
    # whitespace sits inside a bracketed, multi-word entity value, which must
    # stay in one piece for now.
    raw_tokens = re.split(r"\s(?![^\[]*\])", raw_text)

    # Regex matching the annotation notation [entity_value](entity_name).
    entity_value_pattern = r"\[(?P<value>.+?)\]\((?P<entity>.+?)\)"
    entity_value_pattern_compiled = re.compile(entity_value_pattern, flags=re.I|re.M)

    tokens_with_entities = []

    for raw_token in raw_tokens:
        # match() anchors at the start of the token; anything after the
        # closing parenthesis of the annotation is not kept.
        match = entity_value_pattern_compiled.match(raw_token)
        if match:
            raw_entity_name, raw_entity_value = match.group("entity"), match.group("value")

            # B- marks the beginning of an entity, I- marks a token that
            # continues the entity started by the preceding token.
            # The pattern is a raw string: "\s" in a plain string literal is
            # an invalid escape sequence and triggers a warning.
            for i, raw_entity_token in enumerate(re.split(r"\s", raw_entity_value)):
                entity_prefix = "B" if i == 0 else "I"
                entity_name = f"{entity_prefix}-{raw_entity_name}"
                tokens_with_entities.append((raw_entity_token, entity_name))
        else:
            tokens_with_entities.append((raw_token, "O"))

    return tokens_with_entities

In [7]:
# Spot-check the parser on two hand-written sentences and on the first two test-set sentences.
print(get_tokens_with_entities("adobe [acrobat](cpe_product) x (10.1)"))
print(get_tokens_with_entities("red hat [wildfly core](cpe_product) 2.0.0 alpha 2"))
print(get_tokens_with_entities(test_text[0]))
print(get_tokens_with_entities(test_text[1]))


[('adobe', 'O'), ('acrobat', 'B-cpe_product'), ('x', 'O'), ('(10.1)', 'O')]
[('red', 'O'), ('hat', 'O'), ('wildfly', 'B-cpe_product'), ('core', 'I-cpe_product'), ('2.0.0', 'O'), ('alpha', 'O'), ('2', 'O')]
[('tumbleweed', 'B-cpe_vendor'), ('server', 'B-cpe_product'), ('validator', 'I-cpe_product'), ('suite', 'I-cpe_product'), ('4.10', 'B-cpe_version')]
[('gfi', 'B-cpe_vendor'), ('archiver', 'B-cpe_product'), ('15.2', 'B-cpe_version')]


In [8]:
# Load the tokenizer that matches the distilbert-base-uncased checkpoint.
# NOTE(review): the same tokenizer is created again in the model-setup cell further down.
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [9]:
class NERDataMaker:
    """Turn bracket-annotated sentences into integer-labelled token sequences.

    Each text is parsed with ``get_tokens_with_entities``. The distinct tags
    seen across all texts are collected in ``unique_entities`` (sorted, with
    ``"O"`` first) and every tag is replaced by its index in that list.
    """

    def __init__(self, texts):
        """Parse ``texts`` (an iterable of annotated strings) and encode their tags."""
        # Sorted list of distinct tag names; a tag's position is its label id.
        self.unique_entities = []
        # One list of (token, label_id) pairs per input text.
        self.processed_texts = []

        temp_processed_texts = []
        for text in texts:
            tokens_with_entities = get_tokens_with_entities(text)
            for _, ent in tokens_with_entities:
                if ent not in self.unique_entities:
                    self.unique_entities.append(ent)
            temp_processed_texts.append(tokens_with_entities)

        # "O" is sorted with the key "", so it comes first and gets id 0
        # whenever it is present.
        self.unique_entities.sort(key=lambda ent: ent if ent != "O" else "")

        # Build the tag -> id lookup once instead of calling list.index per token.
        label_ids = {ent: i for i, ent in enumerate(self.unique_entities)}
        for tokens_with_entities in temp_processed_texts:
            self.processed_texts.append([(t, label_ids[ent]) for t, ent in tokens_with_entities])

    @property
    def id2label(self):
        """Mapping from label id to tag name."""
        return dict(enumerate(self.unique_entities))

    @property
    def label2id(self):
        """Mapping from tag name to label id."""
        return {v: k for k, v in self.id2label.items()}

    def __len__(self):
        """Number of processed texts."""
        return len(self.processed_texts)

    def __getitem__(self, idx):
        """Return one example dict for an index, or a list of them for a slice.

        Each example has the keys ``"id"``, ``"ner_tags"`` and ``"tokens"``.
        For slices, ``"id"`` is the position of the example in
        ``processed_texts``; open-ended, stepped and negative slices are
        supported.
        """
        def _process_tokens_for_one_text(text_id, tokens_with_encoded_entities):
            ner_tags = []
            tokens = []
            for t, ent in tokens_with_encoded_entities:
                ner_tags.append(ent)
                tokens.append(t)

            return {
                "id": text_id,
                "ner_tags": ner_tags,
                "tokens": tokens
            }

        if isinstance(idx, slice):
            # slice.indices resolves None/negative bounds and the step, so the
            # reported ids are correct for dm[:3], dm[-2:] and dm[::2] as well
            # (previously a missing start raised TypeError).
            positions = range(*idx.indices(len(self.processed_texts)))
            return [_process_tokens_for_one_text(i, self.processed_texts[i]) for i in positions]
        return _process_tokens_for_one_text(idx, self.processed_texts[idx])

    def as_hf_dataset(self, tokenizer):
        """Build a tokenized ``datasets.Dataset`` from the processed texts.

        Args:
            tokenizer: Callable that accepts pre-split words
                (``is_split_into_words=True``) and whose result exposes
                ``word_ids(batch_index=...)``.

        Returns:
            The dataset produced by ``Dataset.map`` after adding a ``labels``
            column aligned with the tokenizer's output tokens.
        """
        from datasets import Dataset, Features, Value, ClassLabel, Sequence

        def tokenize_and_align_labels(examples):
            tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

            labels = []
            for i, label in enumerate(examples["ner_tags"]):
                word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
                previous_word_idx = None
                label_ids = []
                for word_idx in word_ids:  # Set the special tokens to -100.
                    if word_idx is None:
                        label_ids.append(-100)
                    elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                        label_ids.append(label[word_idx])
                    else:
                        label_ids.append(-100)
                    previous_word_idx = word_idx
                labels.append(label_ids)

            tokenized_inputs["labels"] = labels
            return tokenized_inputs

        ids, ner_tags, tokens = [], [], []
        for i, pt in enumerate(self.processed_texts):
            ids.append(i)
            tokens.append([t for t, _ in pt])
            ner_tags.append([tag for _, tag in pt])
        data = {
            "id": ids,
            "ner_tags": ner_tags,
            "tokens": tokens
        }
        features = Features({
            "tokens": Sequence(Value("string")),
            # Use this instance's own label names; the previous code read the
            # notebook-global `dm`, which only worked after that variable had
            # been created and applied the training labels to every instance.
            "ner_tags": Sequence(ClassLabel(names=self.unique_entities)),
            "id": Value("int32")
        })
        ds = Dataset.from_dict(data, features=features)
        tokenized_ds = ds.map(tokenize_and_align_labels, batched=True)
        return tokenized_ds

In [10]:
# Build the training NER data object, then print its size and the first three encoded examples.
dm = NERDataMaker(train_text)
print(f"total examples = {len(dm)}")
print(dm[0:3])

total examples = 2000
[{'id': 0, 'ner_tags': [2, 1, 3, 0, 0, 0], 'tokens': ['suse', 'rancher', '1.2.2', 'release', 'candidate', '3']}, {'id': 1, 'ner_tags': [2, 1, 3, 0, 0], 'tokens': ['extensis', 'mrsid', '1.99', 'for', 'irfanview']}, {'id': 2, 'ner_tags': [2, 1, 3], 'tokens': ['facebook', 'folly', '2018.05.14.00']}]


In [11]:
# Build the evaluation NER data object from the test sentences, then print its size and first three examples.
# NOTE(review): label ids are derived from the tags found in test_text alone; they line up
# with the training ids only if both sets contain the same set of tags - verify.
dm_test = NERDataMaker(test_text)
print(f"total examples = {len(dm_test)}")
print(dm_test[0:3])

total examples = 100
[{'id': 0, 'ner_tags': [2, 1, 4, 4, 3], 'tokens': ['tumbleweed', 'server', 'validator', 'suite', '4.10']}, {'id': 1, 'ner_tags': [2, 1, 3], 'tokens': ['gfi', 'archiver', '15.2']}, {'id': 2, 'ner_tags': [2, 1, 3], 'tokens': ['apperta', 'openeyes', '1.5.5']}]


## Model training
For this demo, I'll use the `distilbert-base-uncased` model. The `dm` object exposes a few properties (`unique_entities`, `id2label`, `label2id`) that we pass to the `AutoModelForTokenClassification.from_pretrained` method.

In [12]:
# Set up the tokenizer, the batch collator and a token-classification model for the
# distilbert-base-uncased checkpoint. The number of labels and the id<->label mappings
# come from the training data object `dm`.
from transformers import AutoTokenizer, DataCollatorForTokenClassification, AutoModelForTokenClassification, TrainingArguments, Trainer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
model = AutoModelForTokenClassification.from_pretrained("distilbert-base-uncased", num_labels=len(dm.unique_entities), id2label=dm.id2label, label2id=dm.label2id)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForTokenClassification: ['vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN t

In [13]:
# Convert both data objects into tokenized datasets with aligned `labels` columns.
train_ds = dm.as_hf_dataset(tokenizer=tokenizer)
test_ds = dm_test.as_hf_dataset(tokenizer=tokenizer)

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [14]:
# Training configuration: evaluate once per epoch; epochs and weight decay come from the config cell.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=num_epochs,
    weight_decay=num_decay,
)

# The test set is used as the evaluation set during training.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds, 
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# NOTE(review): no random seed is set explicitly in this notebook.
trainer.train()

The following columns in the training set  don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: ner_tags, tokens, id. If ner_tags, tokens, id are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 2000
  Num Epochs = 10
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 1250


  0%|          | 0/1250 [00:00<?, ?it/s]

The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: ner_tags, tokens, id. If ner_tags, tokens, id are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 100
  Batch size = 16


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.11310423165559769, 'eval_runtime': 0.5156, 'eval_samples_per_second': 193.942, 'eval_steps_per_second': 13.576, 'epoch': 1.0}


The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: ner_tags, tokens, id. If ner_tags, tokens, id are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 100
  Batch size = 16


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.06771220266819, 'eval_runtime': 0.5148, 'eval_samples_per_second': 194.245, 'eval_steps_per_second': 13.597, 'epoch': 2.0}


The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: ner_tags, tokens, id. If ner_tags, tokens, id are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 100
  Batch size = 16


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.062498826533555984, 'eval_runtime': 0.508, 'eval_samples_per_second': 196.858, 'eval_steps_per_second': 13.78, 'epoch': 3.0}


Saving model checkpoint to ./results\checkpoint-500
Configuration saved in ./results\checkpoint-500\config.json


{'loss': 0.153, 'learning_rate': 1.2e-05, 'epoch': 4.0}


Model weights saved in ./results\checkpoint-500\pytorch_model.bin
tokenizer config file saved in ./results\checkpoint-500\tokenizer_config.json
Special tokens file saved in ./results\checkpoint-500\special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: ner_tags, tokens, id. If ner_tags, tokens, id are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 100
  Batch size = 16


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.034930676221847534, 'eval_runtime': 0.3215, 'eval_samples_per_second': 311.027, 'eval_steps_per_second': 21.772, 'epoch': 4.0}


The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: ner_tags, tokens, id. If ner_tags, tokens, id are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 100
  Batch size = 16


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.06904372572898865, 'eval_runtime': 0.5101, 'eval_samples_per_second': 196.055, 'eval_steps_per_second': 13.724, 'epoch': 5.0}


The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: ner_tags, tokens, id. If ner_tags, tokens, id are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 100
  Batch size = 16


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.048532530665397644, 'eval_runtime': 0.5119, 'eval_samples_per_second': 195.366, 'eval_steps_per_second': 13.676, 'epoch': 6.0}


The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: ner_tags, tokens, id. If ner_tags, tokens, id are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 100
  Batch size = 16


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.049744125455617905, 'eval_runtime': 0.561, 'eval_samples_per_second': 178.253, 'eval_steps_per_second': 12.478, 'epoch': 7.0}


Saving model checkpoint to ./results\checkpoint-1000
Configuration saved in ./results\checkpoint-1000\config.json


{'loss': 0.0118, 'learning_rate': 4.000000000000001e-06, 'epoch': 8.0}


Model weights saved in ./results\checkpoint-1000\pytorch_model.bin
tokenizer config file saved in ./results\checkpoint-1000\tokenizer_config.json
Special tokens file saved in ./results\checkpoint-1000\special_tokens_map.json
The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: ner_tags, tokens, id. If ner_tags, tokens, id are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 100
  Batch size = 16


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.05985697731375694, 'eval_runtime': 0.35, 'eval_samples_per_second': 285.69, 'eval_steps_per_second': 19.998, 'epoch': 8.0}


The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: ner_tags, tokens, id. If ner_tags, tokens, id are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 100
  Batch size = 16


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.05336456745862961, 'eval_runtime': 0.561, 'eval_samples_per_second': 178.254, 'eval_steps_per_second': 12.478, 'epoch': 9.0}


The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForTokenClassification.forward` and have been ignored: ner_tags, tokens, id. If ner_tags, tokens, id are not expected by `DistilBertForTokenClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 100
  Batch size = 16


  0%|          | 0/7 [00:00<?, ?it/s]



Training completed. Do not forget to share your model on huggingface.co/models =)




{'eval_loss': 0.05234888195991516, 'eval_runtime': 0.152, 'eval_samples_per_second': 657.879, 'eval_steps_per_second': 46.052, 'epoch': 10.0}
{'train_runtime': 436.3893, 'train_samples_per_second': 45.831, 'train_steps_per_second': 2.864, 'train_loss': 0.06708817844390869, 'epoch': 10.0}


TrainOutput(global_step=1250, training_loss=0.06708817844390869, metrics={'train_runtime': 436.3893, 'train_samples_per_second': 45.831, 'train_steps_per_second': 2.864, 'train_loss': 0.06708817844390869, 'epoch': 10.0})

In [15]:
# Persist the fine-tuned model; the tokenizer files go into a "tokenizer" subdirectory of the model folder.
model.save_pretrained("models/ner_rasa_vpv_v2")
tokenizer.save_pretrained("models/ner_rasa_vpv_v2/tokenizer")

Configuration saved in models/ner_rasa_vpv_v2\config.json
Model weights saved in models/ner_rasa_vpv_v2\pytorch_model.bin
tokenizer config file saved in models/ner_rasa_vpv_v2/tokenizer\tokenizer_config.json
Special tokens file saved in models/ner_rasa_vpv_v2/tokenizer\special_tokens_map.json


('models/ner_rasa_vpv_v2/tokenizer\\tokenizer_config.json',
 'models/ner_rasa_vpv_v2/tokenizer\\special_tokens_map.json',
 'models/ner_rasa_vpv_v2/tokenizer\\vocab.txt',
 'models/ner_rasa_vpv_v2/tokenizer\\added_tokens.json',
 'models/ner_rasa_vpv_v2/tokenizer\\tokenizer.json')

# INFERENCE

In [16]:
from transformers import pipeline
# Build an NER pipeline around the in-memory model and tokenizer.
# Use the first CUDA GPU when one is available and fall back to CPU (device=-1)
# otherwise; a hard-coded device=0 fails on machines without a GPU.
pipe = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple", device=0 if torch.cuda.is_available() else -1)
pipe("""softing uagates 1.73 for wordpress""")

[{'entity_group': 'cpe_vendor',
  'score': 0.9995116,
  'word': 'soft',
  'start': 0,
  'end': 4},
 {'entity_group': 'cpe_product',
  'score': 0.6682099,
  'word': '##ing',
  'start': 4,
  'end': 7},
 {'entity_group': 'cpe_product',
  'score': 0.99949306,
  'word': 'uagates',
  'start': 8,
  'end': 15},
 {'entity_group': 'cpe_version',
  'score': 0.9995697,
  'word': '1',
  'start': 16,
  'end': 17}]

In [17]:
# Input whose product name contains punctuation ("c++", "(x64)") and whose version follows a dash.
pipe("""microsoft visual c++ 2013 redistributable (x64) - 12.0.30501""")

[{'entity_group': 'cpe_vendor',
  'score': 0.99955875,
  'word': 'microsoft',
  'start': 0,
  'end': 9},
 {'entity_group': 'cpe_product',
  'score': 0.95530385,
  'word': 'visual c + + 2013 redistribu',
  'start': 10,
  'end': 36},
 {'entity_group': 'cpe_version',
  'score': 0.9993456,
  'word': '12',
  'start': 50,
  'end': 52},
 {'entity_group': 'cpe_version',
  'score': 0.99857056,
  'word': '0',
  'start': 53,
  'end': 54}]

In [18]:
# Short three-word input ending in a four-part dotted version number.
pipe("""google chrome 32.0.1670.5""")

[{'entity_group': 'cpe_vendor',
  'score': 0.99949896,
  'word': 'google',
  'start': 0,
  'end': 6},
 {'entity_group': 'cpe_product',
  'score': 0.99968445,
  'word': 'chrome',
  'start': 7,
  'end': 13},
 {'entity_group': 'cpe_version',
  'score': 0.9995409,
  'word': '32',
  'start': 14,
  'end': 16},
 {'entity_group': 'cpe_version',
  'score': 0.99672794,
  'word': '.',
  'start': 16,
  'end': 17},
 {'entity_group': 'cpe_version',
  'score': 0.9993685,
  'word': '0',
  'start': 17,
  'end': 18},
 {'entity_group': 'cpe_version',
  'score': 0.99852043,
  'word': '1670',
  'start': 19,
  'end': 23}]

In [19]:
# Input that starts with a multi-word name and ends with a "for android" suffix.
pipe("cool house technology ewelink 4.3.0 for android")

[{'entity_group': 'cpe_vendor',
  'score': 0.9875686,
  'word': 'cool house technology',
  'start': 0,
  'end': 21},
 {'entity_group': 'cpe_product',
  'score': 0.9962328,
  'word': 'ewelink',
  'start': 22,
  'end': 29},
 {'entity_group': 'cpe_version',
  'score': 0.99957496,
  'word': '4',
  'start': 30,
  'end': 31},
 {'entity_group': 'cpe_version',
  'score': 0.9994165,
  'word': '3',
  'start': 32,
  'end': 33}]

In [20]:
# Input in which the word "fastball" appears twice.
pipe("fastball productions fastball 2.5.3 for joomla")

[{'entity_group': 'cpe_vendor',
  'score': 0.9991911,
  'word': 'fast',
  'start': 0,
  'end': 4},
 {'entity_group': 'cpe_product',
  'score': 0.99947006,
  'word': 'fastball',
  'start': 21,
  'end': 29},
 {'entity_group': 'cpe_version',
  'score': 0.99958426,
  'word': '2',
  'start': 30,
  'end': 31},
 {'entity_group': 'cpe_version',
  'score': 0.99943036,
  'word': '5',
  'start': 32,
  'end': 33}]

In [21]:

# Mixed-case variant of the earlier lowercase example; the tokenizer checkpoint is "uncased".
pipe("""Microsoft Visual C++ 2013 Redistributable (x64) - 12.0.30501""")
# Alternative inputs kept for manual experimentation (uncomment one to run it):
# pipe("""microsoft visual c++ 2013 redistributable (x64) - 12.0.30501""")

# pipe("""google chrome 32.0.1670.5""")
# pipe("""draw.io 2.6.3 for confluence""")


# pipe("""progress sitefinity 9.2""")
# pipe("bitnami containers 7.30.1-debian-10-r40 for laravel")

# pipe("cool house technology ewelink 4.3.0 for android")
# pipe("fastball productions fastball 2.5.3 for joomla")

[{'entity_group': 'cpe_vendor',
  'score': 0.99955875,
  'word': 'microsoft',
  'start': 0,
  'end': 9},
 {'entity_group': 'cpe_product',
  'score': 0.95530385,
  'word': 'visual c + + 2013 redistribu',
  'start': 10,
  'end': 36},
 {'entity_group': 'cpe_version',
  'score': 0.9993456,
  'word': '12',
  'start': 50,
  'end': 52},
 {'entity_group': 'cpe_version',
  'score': 0.99857056,
  'word': '0',
  'start': 53,
  'end': 54}]