In [None]:
import spacy
from spacy.training.example import Example
import random
import json

# --- Train a blank English NER model and save it to disk ---------------------
# The loops below consume TRAIN_DATA as a sequence of (text, annotations) pairs
# where annotations["entities"] holds (start, end, label) triples.
# Explicit UTF-8: the default encoding is platform-dependent and the texts
# contain non-ASCII characters.
with open("output_complete.json", "r", encoding="utf-8") as f:
    TRAIN_DATA = json.load(f)

SEED = 42        # fixes the shuffle order so re-runs visit examples identically
N_EPOCHS = 30
DROPOUT = 0.3

# Seeds Python's `random` only (i.e. the shuffle below).
# NOTE(review): weight initialisation may use other RNGs — seed those as well
# if bit-for-bit reproducibility is required.
random.seed(SEED)

nlp = spacy.blank("en")

if "ner" not in nlp.pipe_names:
    ner = nlp.add_pipe("ner")
else:
    ner = nlp.get_pipe("ner")

# Register every label that occurs in the annotations.
for _, annotations in TRAIN_DATA:
    for _start, _end, label in annotations["entities"]:
        ner.add_label(label)

# Build the Example objects once instead of re-tokenising every text on each
# epoch; shuffling this list also leaves TRAIN_DATA itself untouched.
train_examples = [
    Example.from_dict(nlp.make_doc(text), annotations)
    for text, annotations in TRAIN_DATA
]

# select_pipes / initialize are the spaCy v3 replacements for the deprecated
# disable_pipes / begin_training calls.
with nlp.select_pipes(enable=["ner"]):
    optimizer = nlp.initialize(lambda: train_examples)
    for epoch in range(N_EPOCHS):
        random.shuffle(train_examples)
        losses = {}
        for example in train_examples:
            nlp.update([example], sgd=optimizer, losses=losses, drop=DROPOUT)
        # Surface the per-epoch loss so convergence can be inspected.
        print(f"Epoch {epoch + 1}/{N_EPOCHS} - losses: {losses}")

output_dir = "ner_small_model"
nlp.to_disk(output_dir)
print("Model saved to", output_dir)

Model saved to ner_small_model


In [None]:
import spacy
# Load the pipeline from the "ner_small_model" directory (the path the training
# cell writes with nlp.to_disk).
nlp = spacy.load("ner_small_model")

# Passage(s) run through the model below so its predicted entities can be
# inspected by eye.
test_texts = [
                "The threat actor known as Patchwork has been attributed to a new spear-phishing campaign targeting Turkish defense contractors with the goal of gathering strategic intelligence. The campaign employs a five-stage execution chain delivered via malicious LNK files disguised as conference invitations sent to targets interested in learning more about unmanned vehicle systems, Arctic Wolf Labs said in a technical report published this week. The activity, which also singled out an unnamed manufacturer of precision-guided missile systems, appears to be geopolitically motivated as the timing coincides amid deepening defense cooperation between Pakistan and Türkiye, and the recent India-Pakistan military skirmishes. Patchwork, also called APT-C-09, APT-Q-36, Chinastrats, Dropping Elephant, Operation Hangover, Quilted Tiger, and Zinc Emerson, is assessed to be a state-sponsored actor of Indian origin. Known to be active since at least 2009, the hacking group has a track record of striking entities in China, Pakistan, and other countries in South Asia. xactly a year ago, the Knownsec 404 Team documented Patchwork's targeting entities with ties to Bhutan to deliver the Brute Ratel C4 framework and an updated version of a backdoor called PGoShell. Since the start of 2025, the threat actor has been linked to various campaigns aimed at Chinese universities, with recent attacks using baits related to power grids in the country to deliver a Rust-based loader that, in turn, decrypts and launches a C# trojan called Protego to harvest a wide range of information from compromised Windows systems. Another report published by Chinese cybersecurity firm QiAnXin back in May said it identified infrastructure overlaps between Patchwork and DoNot Team (aka APT-Q-38 or Bellyworm), suggesting potential operational connections between the two threat clusters."
             ]

# Run each sample through the pipeline and list every predicted entity as
# "<surface text> -> <label>", preceded by the sample itself.
for sample in test_texts:
    parsed = nlp(sample)
    print(f"\nText: {sample}")
    for span in parsed.ents:
        print(f"{span.text} -> {span.label_}")


Text: The threat actor known as Patchwork has been attributed to a new spear-phishing campaign targeting Turkish defense contractors with the goal of gathering strategic intelligence. The campaign employs a five-stage execution chain delivered via malicious LNK files disguised as conference invitations sent to targets interested in learning more about unmanned vehicle systems, Arctic Wolf Labs said in a technical report published this week. The activity, which also singled out an unnamed manufacturer of precision-guided missile systems, appears to be geopolitically motivated as the timing coincides amid deepening defense cooperation between Pakistan and Türkiye, and the recent India-Pakistan military skirmishes. Patchwork, also called APT-C-09, APT-Q-36, Chinastrats, Dropping Elephant, Operation Hangover, Quilted Tiger, and Zinc Emerson, is assessed to be a state-sponsored actor of Indian origin. Known to be active since at least 2009, the hacking group has a track record of striking 

In [None]:
import spacy
import json

# --- Span-level precision / recall / F1 of the saved model -------------------
# NOTE(review): this scores the model on output_complete.json, the same file the
# training cell reads, so the number measures fit to the training data rather
# than generalisation. Use a held-out split for a real quality estimate.
nlp = spacy.load("ner_small_model")

# Explicit UTF-8: the default encoding is platform-dependent and the texts
# contain non-ASCII characters.
with open("output_complete.json", "r", encoding="utf-8") as f:
    TRAIN_DATA = json.load(f)

correct = 0
total_gold = 0
total_pred = 0

for text, ann in TRAIN_DATA:
    doc = nlp(text)
    # Compare entities by character offsets + label. Comparing surface text
    # instead would merge repeated mentions of the same string into one item
    # and would credit a prediction made at the wrong position.
    gold_ents = {(start, end, label) for start, end, label in ann["entities"]}
    pred_ents = {(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents}
    correct += len(gold_ents & pred_ents)
    total_gold += len(gold_ents)
    total_pred += len(pred_ents)

# All three metrics are expressed as percentages; the guards avoid division by
# zero when there are no predictions or no gold entities.
precision = (correct / total_pred * 100) if total_pred else 0
recall = (correct / total_gold * 100) if total_gold else 0
f1 = (2 * precision * recall / (precision + recall)) if (precision + recall) else 0

print(f"Precision: {precision:.2f}%  Recall: {recall:.2f}%")
print(f"NER Model Accuracy (F1): {f1:.2f}%")

NER Model Accuracy (F1): 95.93%


In [None]:
# Colab-only cell: asks the user to upload file(s) into the runtime; the recorded
# output below shows output_complete.json being saved.
# NOTE(review): the cells that open "output_complete.json" appear earlier in the
# notebook — presumably this upload must run before them on a fresh runtime.
from google.colab import files
uploaded = files.upload()

Saving output_complete.json to output_complete.json


In [None]:
# Colab-only cell: asks the user to upload file(s) into the runtime.
# NOTE(review): this repeats the upload call of the preceding cell and its
# recorded output saves the same output_complete.json — likely redundant.
from google.colab import files
uploaded = files.upload()

Saving output_complete.json to output_complete.json
