In [2]:
import re 
import string 
import nltk 
import spacy 
import pandas as pd 
import numpy as np 
import math 
from tqdm import tqdm 

from spacy.matcher import Matcher 
from spacy.tokens import Span 
from spacy import displacy

# pd.set_option('display.max_colwidth', 200)

# load spaCy model
# nlp = spacy.load("en_core_web_sm")

In [6]:
import spacy
from nltk import Tree

def to_nltk_tree(node):
    """Recursively convert a dependency-parse node into an nltk ``Tree``.

    Args:
        node: An object exposing ``n_lefts``, ``n_rights``, ``orth_`` and
            ``children`` (callers in this notebook pass a spaCy sentence root).

    Returns:
        ``Tree(node.orth_, [...])`` with one converted entry per child when
        the node has children; otherwise the bare ``node.orth_`` value.
    """
    has_children = (node.n_lefts + node.n_rights) > 0
    if not has_children:
        # Leaves are represented by their text alone, not a childless Tree.
        return node.orth_

    subtrees = [to_nltk_tree(kid) for kid in node.children]
    return Tree(node.orth_, subtrees)

def tok_format(tok):
    """Return the node label for ``tok``: its ``orth_`` and ``tag_`` joined by ``_``.

    Only the text and the tag are included; the dependency label (``dep_``)
    is not part of the label.
    """
    parts = (tok.orth_, tok.tag_)
    return "_".join(parts)

def to_nltk_formatted_tree(node):
    """Recursively convert a dependency-parse node into a labelled nltk ``Tree``.

    Identical in shape to ``to_nltk_tree``, except that every node and leaf is
    labelled with ``tok_format(node)`` instead of the bare ``orth_`` text.

    Args:
        node: An object exposing ``n_lefts``, ``n_rights`` and ``children``,
            and accepted by ``tok_format``.

    Returns:
        A ``Tree`` when the node has children; otherwise the label string
        produced by ``tok_format``.
    """
    label = tok_format(node)
    has_children = (node.n_lefts + node.n_rights) > 0
    if not has_children:
        # A childless node collapses to its label string.
        return label

    subtrees = [to_nltk_formatted_tree(kid) for kid in node.children]
    return Tree(label, subtrees)

In [42]:
# Load the small English pipeline and parse a single example sentence.
en_nlp = spacy.load('en_core_web_sm')

doc = en_nlp("APT19 sent spearphishing emails with malicious attachments in RTF and XLSM formats to deliver initial exploits")

# Build one labelled dependency tree per sentence, exactly once.
# Other ways to view the parse: displacy.render(doc, style='dep', jupyter=True)
# or tree[0].draw().
tree = [to_nltk_formatted_tree(sent.root) for sent in doc.sents]

# Print with a plain loop: pretty_print() returns None, so wrapping it in a
# list comprehension only produced a list of None values as the cell result.
for sent_tree in tree:
    if isinstance(sent_tree, Tree):
        sent_tree.pretty_print()
    else:
        # A sentence whose root has no children comes back as a plain string,
        # which has no pretty_print() method.
        print(sent_tree)

                          sent_VBD                                                          
    _________________________|_______________________                                        
   |                                          spearphishing_VB                              
   |                                                 G                                      
   |                          _______________________|___________________                    
   |                     emails_NNS                                      |                  
   |                         |                                           |                   
   |                      with_IN                                        |                  
   |                         |                                           |                   
   |                  attachments_NNS                                    |                  
   |           ______________|___________                         

[None]

In [50]:
# Text: The original word text.
# Lemma: The base form of the word.
# POS: The simple UPOS part-of-speech tag.
# Tag: The detailed part-of-speech tag.
# Dep: Syntactic dependency, i.e. the relation between tokens.
# Shape: The word shape – capitalization, punctuation, digits.
# is alpha: Is the token an alpha character?
# is stop: Is the token part of a stop list, i.e. the most common words of the language?

# Print one space-separated line of linguistic attributes per token in `doc`.
for token in doc:
    fields = (
        token.text,
        token.lemma_,
        token.pos_,
        token.tag_,
        token.dep_,
        token.shape_,
        token.is_alpha,
        token.is_stop,
    )
    print(*fields)

APT19 apt19 NOUN NN nsubj XXXdd False False
sent send VERB VBD ROOT xxxx True False
spearphishing spearphishe VERB VBG xcomp xxxx True False
emails email NOUN NNS dobj xxxx True False
with with ADP IN prep xxxx True True
malicious malicious ADJ JJ amod xxxx True False
attachments attachment NOUN NNS pobj xxxx True False
in in ADP IN prep xx True True
RTF RTF PROPN NNP pobj XXX True False
and and CCONJ CC cc xxx True True
XLSM XLSM PROPN NNP compound XXXX True False
formats format NOUN NNS conj xxxx True False
to to PART TO aux xx True True
deliver deliver VERB VB advcl xxxx True False
initial initial ADJ JJ amod xxxx True False
exploits exploit NOUN NNS dobj xxxx True False


In [36]:
import networkx as nx

# Read graph
G = nx.read_gml('Tactic_Technique_Reference_Example.gml')

# Collect the neighbours of the technique node whose 'types' attribute is
# 'examples'. The node names themselves are what gets collected.
examples = [
    n for n in G.neighbors('/techniques/T1566/001')
    if G.nodes[n]['types'] == 'examples'
]

# Remove runs of bracketed numbers such as "[12][13]" from each example.
# The pattern is a raw string: "\[" and "\d" are not valid escapes in a normal
# string literal and trigger an invalid-escape warning on current Python.
examples_stripped = [re.sub(r"(\[\d+\])+", '', example) for example in examples]

In [41]:
import spacy

nlp = spacy.load("en_core_web_sm")

# Parse every stripped example, echo its text, then print one labelled
# dependency tree per sentence.
for doc in nlp.pipe(examples_stripped):
    print(doc)

    # Build the trees once and reuse them for printing.
    tree = [to_nltk_formatted_tree(sent.root) for sent in doc.sents]
    for sent_tree in tree:
        if isinstance(sent_tree, Tree):
            sent_tree.pretty_print()
        else:
            # A sentence whose root has no children comes back as a plain
            # string, which has no pretty_print() method.
            print(sent_tree)

admin@338 has sent emails with malicious Microsoft Office documents attached.
                                 sent_VBN                                                   
       _____________________________|________________________                                
      |          |        |         |                     with_IN                           
      |          |        |         |                        |                               
      |          |        |         |                  documents_NNS                        
      |          |        |         |           _____________|_______________________        
admin@338_NNP has_VBZ emails_NNS   ._.    malicious_JJ Microsoft_NNP Office_NNP attached_VBN

APT-C-36 has used spearphishing emails with password protected RAR attachment to avoid being detected by the email gateway. 
                                                      used_VBN                                                                                