In [5]:
'''
This file gathers all previously selected categories:
1. ActionName
    File
    Network
    Other
2. Capability
    infection_propagation
    ...something
    other
into one DataFrame, instead of gathering binary data into files 
(for example ActionName - NoActionName)
'''

'\nThis file gathers all previously selected categories:\n1. ActionName\n    File\n    Network\n    Other\n2. Capability\n    infection_propagation\n    ...something\n    other\ninto one DataFrame, instead of gathering binary data into files \n(for example ActionName - NoActionName)\n'

In [6]:
import re
import os

In [17]:
root = '/home/jells123/Documents/ENGINEER/DATA/1-Raw data/data/'
ann = '/annotations'

sets = {
    'Train' : 'train',
    'Dev' : 'dev',
    'Test' : 'test_3'
}

rel_pattern = r'(?P<r_id>R[0-9]+)\s+[A-Za-z]+\s+(?P<r_first>[A-Za-z]+:T[0-9]+)\s+(?P<r_second>[A-Za-z]+:T[0-9]+)'
ann_pattern = r'(?P<a_id>A[0-9]+)\s+(?P<cat>[A-Za-z]+)\s+(?P<t_id>T[0-9]+)\s+(?P<text>[^\n]+)'
token_pattern = r'(?P<t_id>T[0-9]+)\s+(?P<t_type>[A-Z][a-z]+) (?P<indices>[0-9]+ [0-9]+[^\t]+)\t(?P<t_text>.+)\n'

In [8]:
def init_data():
    data = {
        'ActionName' : [],
        'Capability' : [],
        
        'token' : [],
        'text-rel-subj' : [],
        'text-rel' : [],
        'text-neigh' : []
    }
    undefined = None
    return data, undefined

In [9]:
def process_annotation(anns, words, token):
    
#     data['Text'].append(words)
    
    current_annotations = {}
    for ann in anns:
        m = re.search(r'(?P<id>[0-9]{3}):(?P<type>[A-Za-z]+)-(?P<name>.+)\s*', ann[3])
        typ, name = m.group('type'), m.group('name')
        typ, name = re.sub(r'\/', '_', typ), re.sub(r'\/', '_', name)
        main_cat = ann[1]
        if main_cat == 'Capability':
            # special treatment for Capability because there are no sub-categories
            current_annotations[main_cat] = name
        else:
            current_annotations[main_cat] = typ
    
    ann_result = []
    for c in [
        ['ActionName', ['File', 'Network']], 
        ['Capability', ['infection_propagation', 'command_and_control']]
    ]:
        category = c[0]
        sub_cats = c[1]
        if category in current_annotations.keys():
            sub_cat = current_annotations[category]
            if sub_cat in sub_cats:
                ann_result.append(sub_cat)
            else:
                ann_result.append("Other")
        else:
            ann_result.append("-")
            
    return ann_result
    

In [10]:
def get_token_words_relation(t, tokens, rels, anns, include_subject=False):
    
    global tokens_count
    tokens_count += 1
    
    relations = list(filter(lambda x : x[1] == 'Action:'+t[0] , rels))
    if relations:
        mod_relations = list(filter(lambda x : 'Modifier' in x[2], relations))
        if mod_relations:
            for mr in mod_relations:
                relations += list(filter(lambda x : x[1] == mr[2], rels))
    
    if include_subject:
        subject = list(filter(lambda x : 'Subject' in x[1] and x[2] == 'Action:'+t[0], rels))
        if subject:
            relations.append(subject[0])
            
        modifier = list(filter(lambda x : 'Modifier' in x[1] and x[2] == 'Action:'+t[0], rels))
        if modifier:
            relations.append(modifier[0])
            # it never happens...

    if not relations:
        print("Token {} has no relation.".format(t))
        global relation_missing
        relation_missing += 1
    
    words = [t[3]]
    
    for r in relations:
        
        if include_subject and 'Subject' in r[1]: 
            # subject -> action
            token_id = r[1].split(':')[1]
            token_word = list(filter(lambda x : x[0] == token_id, tokens))
                        
            if token_word and token_word[0][3] not in words:
                words.insert(0, token_word[0][3])
            
        else:
            # action -> object, action -> mod -> object
            token_id = r[2].split(':')[1]
            token_word = list(filter(lambda x : x[0] == token_id, tokens))
            if token_word and token_word[0][3] not in words:
                if token_word[0][3][0].isupper():
                    words.insert(0, token_word[0][3])
                else:
                    words.append(token_word[0][3])
    
    # relation gathered and here we create the dataset
    # id prefer it in anotehr function but whatever
    annotations = list(filter(lambda x : x[2] == t[0], anns))
    annotation_result = process_annotation(annotations, words, t)
    
    if not annotations:
        print("Token {} has no annotation.".format(t))
        global annotation_missing
        annotation_missing += 1
        print(words)
    
    return words, annotation_result             


In [11]:
def get_token_words_coords(t, text, off=0):
    indices = t[2]
    indices = indices.split(';') if ';' in indices else [indices]
    real_token = ''
    indices = [[int(idx) for idx in i.split()] for i in indices]
    for i in indices:
        real_token += text[i[0]+off : i[1]+off] + ' '
        
    while '- ' in real_token:
        real_token = real_token.replace('- ', '')
    return real_token.strip(), (indices[0][0]+off, indices[-1][1]+off)

def get_token_neighbors(token, coords, text, neighbors):
    
    left = (neighbors-1)/2
    right = left
    
    l_idx = coords[0] - 1
    r_idx = coords[1] + 1
    
    words = []
    while left:
        while text[l_idx].isspace():
            l_idx -= 1
        word = ''
        while not text[l_idx].isspace():
            word += text[l_idx]
            l_idx -= 1
        
        word = word[::-1]
        if word[0] == '<' or word[-1] == '>': #html tag
            l_idx -=1
            word = ''
        else:
            words.insert(0, word)
            left -= 1
    
    words.append(token)
    while right:
        while text[r_idx].isspace():
            r_idx += 1
        word = ''
        while not text[r_idx].isspace():
            word += text[r_idx]
            r_idx += 1
            
        if word[0] == '<' or word[-1] == '>':
            r_idx += 1
            word = ''
        else:
            words.append(word)
            right -= 1
            
    return words
        
def gather_data(words, anns, t):
    annotations = list(filter(lambda x : x[2] == t[0], anns))
    annotation_result = process_annotation(annotations, words, t)
    
    if not annotations:
        print("Token {} has no annotation.".format(t))
        global annotation_missing
        annotation_missing += 1
        
    return annotation_result

In [22]:
# set_type = 'Test'

data, undefined = init_data()

relation_mode = True
include_subject = True

if relation_mode:
    if include_subject:
        label = 'text-rel-subj'
    else:
        label = 'text-rel'
else:
    label = 'text-neigh'
    
relation_missing = 0
annotation_missing = 0
tokens_count = 0

rel_pattern = r'(?P<r_id>R[0-9]+)\s+(SubjAction)\s+(?P<r_first>[A-Za-z]+:T[0-9]+)\s+(?P<r_second>[A-Za-z]+:T[0-9]+)'

path = root + 'dev' + ann
print(path)

count_relations = 0
for filename in os.listdir(path):

    if filename.endswith('.ann'):
        print("\n>>> {}".format(filename))

        with open(path + '/' + filename) as ann_file:
            content = ann_file.read()

            # match lines with a token            
            tokens = re.findall(token_pattern, content)
            relations = re.findall(rel_pattern, content)
            annotations = re.findall(ann_pattern, content)
            
            print(len(relations))
            count_relations += len(relations)

#             with open(path + "/" + filename.replace(".ann", ".txt")) as txt_file:
#                 text = txt_file.read()
#                 text = re.sub(r'[^\x00-\x7F]+',' ', text)
#                 offset = 0
#                 for t in tokens:
#                     if t[1] == 'Action':

#                         ann_token = t[3]
#                         tokens_count += 1

#                         # WHICH TOKEN?
#                         data['token'].append(ann_token)

#                         idx_token, coords = get_token_words_coords(t, text, offset)

#                         if ann_token != idx_token:
#                             # this fixes errors occuring in one file - indices are invalid
#                             where_is = text.find(ann_token)
#                             offset = where_is - int(t[2].split()[0])
#                             idx_token, coords = get_token_words_coords(t, text, offset)

#                         # NEIGHBOURS
#                         words = get_token_neighbors(ann_token, coords, text, 9)
#                         annotation_result = gather_data(words, annotations, t)
#                         if not words or not annotation_result:
#                             print("NEIGHBOURS error")
#                         else:
#                             data['text-neigh'].append(words)

#                         # BASIC RELATIONS
#                         words, annotation_result = get_token_words_relation(t, tokens, 
#                                                                         relations, annotations,
#                                                                        include_subject=False)
#                         if not words or not annotation_result:
#                             print("RELATIONS error")
#                         else:
#                             data['text-rel'].append(words)

#                         # BASIC RELATIONS + SUBJECT->ACTION RELATION
#                         words, annotation_result = get_token_words_relation(t, tokens, 
#                                                                         relations, annotations,
#                                                                        include_subject=True)
#                         if not words or not annotation_result:
#                             print("RELATIONS+SUBJ error")
#                         else:
#                             data['text-rel-subj'].append(words)


#                         # WHICH CLASSES?
#                         data['ActionName'].append(annotation_result[0])
#                         data['Capability'].append(annotation_result[1])

print(count_relations)                   

/home/jells123/Documents/ENGINEER/DATA/1-Raw data/data/dev/annotations

>>> Carbanak_APT_eng.ann
21

>>> Anunak_APT_against_financial_institutions.ann
13

>>> Agent.BTZ_to_ComRAT.ann
3

>>> Dragonfly_Threat_Against_Western_Energy_Suppliers.ann
13

>>> GlobalThreatIntelReport.ann
38
88


In [129]:
import pandas as pd
import numpy as np

[print(key, len(data[key])) for key in data.keys()]

dataset = pd.DataFrame({
    key : data[key] for key in data.keys()
})

# dataset['text-rel-lengths'] = dataset['text-rel'].apply(lambda x : len((' '.join(x)).split()))
# dataset['text-rel-subj-lengths'] = dataset['text-rel-subj'].apply(lambda x : len((' '.join(x)).split()))

# print(np.mean(dataset['text-rel-lengths']))
# print(np.median(dataset['text-rel-lengths']))

# print(np.mean(dataset['text-rel-subj-lengths']))
# print(np.median(dataset['text-rel-subj-lengths']))

for column in ['text-rel', 'text-neigh', 'text-rel-subj']:
    dataset[column] = dataset[column].apply(' '.join)
    dataset[column] = dataset[column].apply(lambda x : re.sub(r"[“”\[\]]", "", x))  
dataset.head()

ActionName 344
Capability 344
token 344
text-rel-subj 344
text-rel 344
text-neigh 344


Unnamed: 0,ActionName,Capability,token,text-rel-subj,text-rel,text-neigh
0,Network,command_and_control,upload,Seaduke operators upload files to the command-...,upload files to the command- and-control (C&C)...,to infected machines by upload ng tasks to a
1,-,Other,retrieve,They retrieve detailed bot/system information,retrieve detailed bot/system information,contact these websites to retrieve task inform...
2,-,command_and_control,update,They update bot configuration,update bot configuration,"retrieve detailed bot/system information, upda..."
3,Network,-,upload,They upload files,upload files,"information, update bot configuration, upload ..."
4,Network,-,download,They download files,download files,"bot configuration, upload files, download file..."


In [130]:
print("_" * 10 + set_type + "_" * 10, '\n')
dataset.to_csv('All-{}.csv'.format(set_type))
print(dataset.ActionName.value_counts(), "\n")
print(dataset.Capability.value_counts(), "\n")

__________Test__________ 

-          229
Other       51
File        45
Network     19
Name: ActionName, dtype: int64 

Other                    214
-                         65
command_and_control       46
infection_propagation     19
Name: Capability, dtype: int64 

