In [28]:
'''
This file gathers all previously selected categories:
1. ActionName
    File
    Network
    Other
2. Capability
    infection_propagation
    ...something
    other
into one DataFrame, instead of gathering binary data into files 
(for example ActionName - NoActionName)
'''

'\nThis file gathers all previously selected categories:\n1. ActionName\n    File\n    Network\n    Other\n2. Capability\n    infection_propagation\n    ...something\n    other\ninto one DataFrame, instead of gathering binary data into files \n(for example ActionName - NoActionName)\n'

In [22]:
import re
import os

In [76]:
root = '/home/jells123/Documents/ENGINEER/DATA/1-Raw data/data/'
ann = '/annotations'

sets = {
    'Train' : 'train',
    'Dev' : 'dev',
    'Test' : 'test_3'
}
set_type = 'Test'
path = root + sets[set_type] + ann

rel_pattern = r'(?P<r_id>R[0-9]+)\s+[A-Za-z]+\s+(?P<r_first>[A-Za-z]+:T[0-9]+)\s+(?P<r_second>[A-Za-z]+:T[0-9]+)'
ann_pattern = r'(?P<a_id>A[0-9]+)\s+(?P<cat>[A-Za-z]+)\s+(?P<t_id>T[0-9]+)\s+(?P<text>[^\n]+)'
token_pattern = r'(?P<t_id>T[0-9]+)\s+(?P<t_type>[A-Z][a-z]+) (?P<indices>[0-9]+ [0-9]+[^\t]+)\t(?P<t_text>.+)\n'

In [67]:
def init_data():
    data = {
        'Text' : [],
        'ActionName' : [],
        'Capability' : []
    }
    undefined = None
    return data, undefined

In [68]:
def process_annotation(anns, words, token):
    
    data['Text'].append(words)
    
    current_annotations = {}
    for ann in anns:
        m = re.search(r'(?P<id>[0-9]{3}):(?P<type>[A-Za-z]+)-(?P<name>.+)\s*', ann[3])
        typ, name = m.group('type'), m.group('name')
        typ, name = re.sub(r'\/', '_', typ), re.sub(r'\/', '_', name)
        main_cat = ann[1]
        if main_cat == 'Capability':
            # special treatment for Capability because there are no sub-categories
            current_annotations[main_cat] = name
        else:
            current_annotations[main_cat] = typ
    
    for c in [
        ['ActionName', ['File', 'Network']], 
        ['Capability', ['infection_propagation', 'command_and_control']]
    ]:
        category = c[0]
        sub_cats = c[1]
        if category in current_annotations.keys():
            sub_cat = current_annotations[category]
            if sub_cat in sub_cats:
                data[category].append(sub_cat)
            else:
                data[category].append("Other")
        else:
            data[category].append("-")
    

In [69]:
def get_token_words_relation(t, tokens, rels, anns):
    
    global tokens_count
    tokens_count += 1
    
    relations = list(filter(lambda x : x[1] == 'Action:'+t[0] , rels))
    if relations:
        mod_relations = list(filter(lambda x : 'Modifier' in x[2], relations))
        if mod_relations:
            for mr in mod_relations:
                relations += list(filter(lambda x : x[1] == mr[2], rels))
    
    if include_subject:
        subject = list(filter(lambda x : 'Subject' in x[1] and x[2] == 'Action:'+t[0], rels))
        if subject:
            subject = subject[0]
            relations.append(subject)

    if not relations:
        print("Token {} has no relation.".format(t))
        global relation_missing
        relation_missing += 1
    
    words = []
    words.append(t[3])
    
    for r in relations:
        
        if include_subject and 'Subject' in r[1]: 
            # subject -> action
            token_id = r[1].split(':')[1]
            token_word = list(filter(lambda x : x[0] == token_id, tokens))
            if token_word and token_word[0][3] not in words:
                words.insert(0, token_word[0][3])
            
        else:
            # action -> object, action -> mod -> object
            token_id = r[2].split(':')[1]
            token_word = list(filter(lambda x : x[0] == token_id, tokens))
            if token_word and token_word[0][3] not in words:
                if token_word[0][3][0].isupper():
                    words.insert(0, token_word[0][3])
                else:
                    words.append(token_word[0][3])
    
    # relation gathered and here we create the dataset
    # id prefer it in anotehr function but whatever
    annotations = list(filter(lambda x : x[2] == t[0], anns))
    process_annotation(annotations, words, t)
    
    if not annotations:
        print("Token {} has no annotation.".format(t))
        global annotation_missing
        annotation_missing += 1
                


In [70]:
def get_token_words_coords(t, text, off=0):
    indices = t[2]
    indices = indices.split(';') if ';' in indices else [indices]
    real_token = ''
    indices = [[int(idx) for idx in i.split()] for i in indices]
    for i in indices:
        real_token += text[i[0]+off : i[1]+off] + ' '
        
    while '- ' in real_token:
        real_token = real_token.replace('- ', '')
    return real_token.strip(), (indices[0][0]+off, indices[-1][1]+off)

def get_token_neighbors(token, coords, text, neighbors):
    
    left = (neighbors-1)/2
    right = left
    
    l_idx = coords[0] - 1
    r_idx = coords[1] + 1
    
    words = []
    while left:
        while text[l_idx].isspace():
            l_idx -= 1
        word = ''
        while not text[l_idx].isspace():
            word += text[l_idx]
            l_idx -= 1
        
        word = word[::-1]
        if word[0] == '<' or word[-1] == '>': #html tag
            l_idx -=1
            word = ''
        else:
            words.insert(0, word)
            left -= 1
    
    words.append(token)
    while right:
        while text[r_idx].isspace():
            r_idx += 1
        word = ''
        while not text[r_idx].isspace():
            word += text[r_idx]
            r_idx += 1
            
        if word[0] == '<' or word[-1] == '>':
            r_idx += 1
            word = ''
        else:
            words.append(word)
            right -= 1
            
    return words
        
def gather_data(words, anns, t):
    annotations = list(filter(lambda x : x[2] == t[0], anns))
    process_annotation(annotations, words, t)
    
    if not annotations:
        print("Token {} has no annotation.".format(t))
        global annotation_missing
        annotation_missing += 1

In [77]:
data, undefined = init_data()
print(data)

relation_mode = True

relation_missing = 0
annotation_missing = 0
tokens_count = 0

include_subject = True
action_tokens = []

for filename in os.listdir(path):
    
    if filename.endswith('.ann'):
        print("\n>>> {}".format(filename))
        with open(path + '/' + filename) as ann_file:
            content = ann_file.read()
            
            # match lines with a token            
            tokens = re.findall(token_pattern, content)
            relations = re.findall(rel_pattern, content)
            annotations = re.findall(ann_pattern, content)
            
            if not relation_mode:
            # read both .ann and .txt file
                with open(path + "/" + filename.replace(".ann", ".txt")) as txt_file:
                    text = txt_file.read()
                    text = re.sub(r'[^\x00-\x7F]+',' ', text)
                    offset = 0
                    for t in tokens:
                        if t[1] == 'Action':
                            # get token from annotations + text and compare them, are they equal?
                            ann_token = t[3]
                            tokens_count += 1
                            while '- ' in ann_token:
                                ann_token = ann_token.replace('- ', '')

                            idx_token, coords = get_token_words_coords(t, text, offset)
                            # oops, we got a mismatch!
                            if ann_token != idx_token:
                                # probably something went wrong! - we need to deal with it as always :)
                                where_is = text.find(ann_token)
                                offset = where_is - int(t[2].split()[0])
                                idx_token, coords = get_token_words_coords(t, text, offset)

                            words = get_token_neighbors(ann_token, coords, text, 9)
                            gather_data(words, annotations, t)
                            
            else:
            # older version -> finds token groups by relation
                for t in tokens:
                    if t[1] == 'Action':
                        get_token_words_relation(t, tokens, relations, annotations)         
                        action_tokens.append(t[3])


{'Text': [], 'ActionName': [], 'Capability': []}

>>> butterfly-corporate-spies-out-for-financial-gain.ann

>>> Equation_group_questions_and_answers.ann

>>> Dissecting-LinuxMoose.ann
Token ('T136', 'Action', '52031 52036', 'reach') has no annotation.

>>> Dissecting-the-Kraken.ann
Token ('T16', 'Action', '10699 10705', 'update') has no relation.
Token ('T17', 'Action', '10710 10719', 'uninstall') has no relation.

>>> DEEP_PANDA_Sakula.ann


In [72]:
data['Text']

[['Carbanak', 'designed', 'for', 'espionage'],
 ['Carbanak', 'designed', 'for', 'data exfiltration'],
 ['Carbanak', 'designed', 'to', 'provide remote access to infected machines'],
 ['They',
  'install',
  'additional software such as the Ammyy Remote Administration Tool'],
 ['They', 'SSH servers', 'compromise'],
 ['The videos', 'sent', 'to', 'the C2 server'],
 ['made', 'video recordings of the activities of bank employees'],
 ['Carbanak', 'copies', 'itself', 'into', '“%system32%\\com”'],
 ['the malware', 'creates', 'a new service'],
 ['Carbanak',
  'determines',
  'if',
  'either the avp.exe or avpui.exe processes (components of Kaspersky Internet Security) is running'],
 ['Carbanak',
  'creates',
  'a file with a random name and a .bin extension',
  'in',
  '%COMMON_APPDATA%\\Mozilla'],
 ['the malware',
  'gets',
  'the proxy configuration',
  'from',
  'the registry entry'],
 ['The malware', 'saves', 'files', 'in'],
 ['The malware', 'sent', 'to', 'files', 'the C2 server'],
 ['Carban

In [78]:
import pandas as pd

dataset = pd.DataFrame({
    'text-rel' : data['Text'],
    'token' : action_tokens,
    'ActionName' : data['ActionName'],
    'Capability' : data['Capability']
})
    
dataset.head()

Unnamed: 0,text-rel,token,ActionName,Capability
0,"[The attackers, obtaining, access, to, specifi...",obtaining,-,Other
1,"[they, eavesdropped, on, email conversations]",eavesdropped,-,Other
2,"[they, insert, fraudulent emails]",insert,-,Other
3,"[used, a command-and-control (C&C) server, in,...",used,-,command_and_control
4,"[Bda9.tmp, executed]",executed,File,-


In [79]:
dataset['text-rel'] = dataset['text-rel'].apply(' '.join)
dataset['text-rel'] = dataset['text-rel'].apply(lambda x : re.sub(r"[“”\[\]]", "", x))
dataset.head()

Unnamed: 0,text-rel,token,ActionName,Capability
0,The attackers obtaining access to specific sys...,obtaining,-,Other
1,they eavesdropped on email conversations,eavesdropped,-,Other
2,they insert fraudulent emails,insert,-,Other
3,used a command-and-control (C&C) server in an ...,used,-,command_and_control
4,Bda9.tmp executed,executed,File,-


In [80]:
# dataset.to_csv('neighbours/{}-{}.csv'.format(main_cat, set_type))
dataset.to_csv('All-{}.csv'.format(set_type))
print(dataset.ActionName.value_counts(), "\n")
print(dataset.Capability.value_counts(), "\n")

-          76
File       15
Other      12
Network     7
Name: ActionName, dtype: int64 

Other                    63
-                        22
command_and_control      15
infection_propagation    10
Name: Capability, dtype: int64 

