In [1]:
# https://towardsdatascience.com/custom-named-entity-recognition-using-spacy-7140ebbb3718
# https://spacy.io/usage/training#api

In [15]:
import spacy
from spacy import displacy
from spacy.training import Example
from spacy.lang.en import English
import random
import json

# labeled_data_path = r"/home/zhenyuan/AttacKG/NLP/Doccano/admin.jsonl"
labeled_data_path = r"C:\Users\xiaowan\Documents\GitHub\AttacKG\NLP\Doccano\admin.jsonl"
labeled_data = []
with open(labeled_data_path, "r") as read_file:
    for line in read_file:
        data = json.loads(line)
        labeled_data.append(data)

print('---Read Labeled Data(%d)!---' % len(labeled_data))

# self.nlp.initialize(lambda: self.spacy_data)
# split training and testing set
# training_set = spacy_data[0:19]
# testing_set = spacy_data[20:-1]

---Read Labeled Data(27)!---


In [8]:
# nlp = spacy.blank("en")
# nlp = spacy.load("en_core_web_sm") # python -m spacy download en_core_web_sm
model_location = None
# model_location = "/home/zhenyuan/AttacKG/NLP/cti.model"
ner_labels = ["NetLoc", "APTFamily", "ExeFile", "ScriptsFile", "DocumentFile", "E-mail", "Registry", "File", "Vulnerability", "C2C", "SensInfo", "Service"]
# ner_labels = ["FilePath", "NetLoc", "FileName", "Vulnerability", "Registry", "Attacker", "ExeFile", "DocFIle", "Service"]

if model_location is None:
    nlp = spacy.blank('en')
    print("---Created Blank 'en' Model!---")
else:
    nlp = spacy.load(model_location)
    print("---Load Model: %s!---" % model_location)

if 'ner' not in nlp.pipe_names:
    ner = nlp.add_pipe("ner")
else:
    ner = nlp.get_pipe("ner")
print("---Add Pipe 'ner'!---")

for label in ner_labels:
    ner.add_label(label)

if model_location is None:
    optimizer = nlp.begin_training()
else:
    optimizer = ner.create_optimizer()
print("---Created Optimizer!---")

---Created Blank 'en' Model!---
---Add Pipe 'ner'!---
---Created Optimizer!---


In [16]:
# Data format converting
spacy_data = []
for entry in labeled_data:
    entities = []
    for e in entry['label']:
        entities.append((e[0], e[1], e[2]))
    try:
        spacy_data.append(Example.from_dict(nlp.make_doc(entry['data']), {"entities": entities}))
    except:
        print("Wrong format: %s!" % entry['data'])
print(spacy_data)

[{'doc_annotation': {'cats': {}, 'entities': ['B-APTFamily', 'L-APTFamily', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], 'links': {}}, 'token_annotation': {'ORTH': ['Cobalt', 'Strike', 'can', 'use', 'known', 'credentials', 'to', 'run', 'commands', 'and', 'spawn', 'processes', 'as', 'a', 'domain', 'user', 'account', '.'], 'SPACY': [True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, True, False, False], 'TAG': ['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', ''], 'LEMMA': ['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', ''], 'POS': ['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', ''], 'MORPH': ['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', ''], 'HEAD': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17], 'DEP': ['', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', ''], 'SENT_START': [1, 0, 0, 0, 0, 0, 0, 0, 



In [18]:
len(spacy_data)

27

In [17]:
# Start training
print("---Start Training!---")
new_model_location = "./new_cti.model"

# Loop
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
with nlp.disable_pipes(*other_pipes):
    for itn in range(4):
        random.shuffle(spacy_data)
        losses = ()

        # Batch the examples
        for batch in spacy.util.minibatch(spacy_data, size=2):
            # Update the model
            nlp.update(batch, sgd=optimizer) #, drop=0.35, losses=losses
            # print('Losses', losses)

nlp.to_disk(new_model_location)
print("---Save Model to %s!---" % new_model_location)

---Start Training!---
---Save Model to ./new_cti.model!---


In [19]:
sample = "APT3 has used PowerShell on victim systems to download and run payloads after exploitation."

#def test_model(sample = sample):
doc = nlp(sample)
displacy.render(doc, style='ent')

In [20]:
sample = "MCMD can use Registry Run Keys for persistence."
nlp = spacy.load(new_model_location)
# nlp = spacy.load("en_core_web_sm")
doc = nlp(sample)
displacy.render(doc, style = 'ent')

In [2]:
import spacy
from spacy.pipeline import EntityRuler
import import_ipynb
import nlp_parser

nlp = spacy.blank('en')
doc = nlp("APT3 has used PowerShell on victim systems to download and run payloads after exploitation.")
print([(ent.text, ent.label_) for ent in doc.ents])

# https://stackoverflow.com/questions/57667710/using-regex-for-phrase-pattern-in-entityruler
# patterns = [{"label": "ExeFile", "pattern": "payloads"}]
patterns = [
    {"label": "ExeFile", "pattern":[{"TEXT": {"REGEX": "payloads"}}]}

]

config = {
   "phrase_matcher_attr": None,
   "validate": True,
   "overwrite_ents": False,
   "ent_id_sep": "||",
}

ruler = nlp.add_pipe("entity_ruler", config=config)
ruler.add_patterns(patterns)

doc = nlp("APT3 has used PowerShell on victim systems to download and run payloads after exploitation.")
print([(ent.text, ent.label_) for ent in doc.ents])
spacy.displacy.render(doc, style = 'ent')

importing Jupyter notebook from nlp_parser.ipynb





Using the 'dep' visualizer
Serving on http://0.0.0.0:5000 ...

Shutting down server on port 5000.


NameError: name 'to_nltk_formatted_tree' is not defined