In [4]:
import numpy as np

def load_embeddings(path):
    mapping = dict()
    
    with open(path, 'r', encoding='utf8') as f:
        for line in f:
            line = line.strip()
            if len(line) == 0:
                continue
            splitted = line.split(" ")
            if len(splitted) <= 2:
                continue
            mapping[splitted[0]] = np.array(splitted[1:], dtype=float) # stwórz słownik słowo -> wektor 
    return mapping

vec_size = 300
glove_mapping = load_embeddings('glove/glove.6B.{}d.txt'.format(vec_size)) 
my_mapping = load_embeddings('Embedding-Models/size{}-window10.txt'.format(vec_size)) 
my_mapping = load_embeddings('Embedding-Models/super_model.txt'.format(vec_size)) 


In [11]:
def join_embeddings(glove_mapping, my_mapping):
    glove_keys = np.array(list(glove_mapping.keys()))
    glove_vals = np.array(list(glove_mapping.values()))
    print(glove_keys.shape, glove_vals.shape)

    glove_avg, glove_std = np.average(glove_vals), np.std(glove_vals)

    my_keys = np.array(list(my_mapping.keys()))
    my_vals = np.array(list(my_mapping.values()))
    print(my_keys.shape, my_vals.shape)
    
    my_avg, my_std = np.average(my_vals), np.std(my_vals)

    glove_keys_count = glove_keys.shape[0]
    my_keys_count = my_keys.shape[0]
    all_words_count = np.unique(np.concatenate((glove_keys, my_keys))).shape[0]

    overhead = all_words_count - glove_keys_count

    new_mapping_values = np.zeros(shape = (all_words_count, glove_vals.shape[1] + my_vals.shape[1]), dtype=np.float64)
    new_mapping_keys = np.empty(all_words_count, dtype="<U30")

    # put glove mapping at the beginning of new super-mapping
    # indices 0 : 4 000 000, 0 : 300
    new_mapping_values[:glove_vals.shape[0], :glove_vals.shape[1]] = glove_vals
    new_mapping_keys[:glove_keys.shape[0]] = glove_keys

    vec_size = glove_vals.shape[1]

    # join glove mapping with own mapping 
    # or if it's missing, fill with random normal
    for idx, key in enumerate(glove_keys):
        if key in my_mapping.keys():
            new_mapping_values[idx, glove_vals.shape[1]:] = my_mapping[key]
        else:
            new_mapping_values[idx, glove_vals.shape[1]:] = np.random.normal(loc=glove_avg, scale=glove_std, size=vec_size)

    counter = 0
    for idx, key in enumerate(my_keys):
        if key not in glove_mapping.keys():
            new_mapping_values[glove_vals.shape[0] + counter, :glove_vals.shape[1]] = np.random.normal(loc=my_avg, scale=my_std, size=vec_size)
            new_mapping_values[glove_vals.shape[0] + counter, glove_vals.shape[1]:] = my_mapping[key]
            new_mapping_keys[glove_vals.shape[0] + counter] = key
            counter += 1            

    super_mapping = dict(zip(new_mapping_keys, new_mapping_values))
    return super_mapping

super_mapping = join_embeddings(glove_mapping, my_mapping)
print("\n", len(super_mapping.keys()), len(super_mapping['the']))

(400000,) (400000, 300)
(21055,) (21055, 300)
406548 600


In [12]:
str(super_mapping['the'][0])

'0.04656'

In [73]:
with open("Embedding-Models/super-model.txt", "w") as file:
    for key, value in super_mapping.items():
        file.write(key)
        file.write(" ")
        for v in value:
            file.write(str(v) + " ")
        file.write("\n")

In [13]:
dataframes = {}

In [14]:
'''
Data loader.
'''

import data_helpers
import os
import pandas as pd

load_main = False

main_cats = ['ActionName', 'Capability']
# dataframes = {cat : {} for cat in main_cats} if load_main else {}

for cat in main_cats:
    path = os.getcwd() + "/Dataframes/" + cat + "/Processed/"
    ls = os.listdir(path)
    for file in ls:
        if (load_main and cat in file) or (not load_main and cat not in file):
            print(file)
            df = pd.read_csv(path + file, encoding='utf-8')

            df.rename({"Unnamed: 0":"a"}, axis="columns", inplace=True)
            df.drop(["a"], axis=1, inplace=True)

            df.rename({"Unnamed: 0.1":"a"}, axis="columns", inplace=True)
            df.drop(["a"], axis=1, inplace=True)

            df_type = file.split('-')[1]
            if load_main:
                try:
                    dataframes[cat][df_type] = df
                except KeyError:
                    print("\t? KeyError exception, did you mean it ?")
                    dataframes[cat] = {}
                    dataframes[cat][df_type] = df
            else:
                sub_cat = file.split('-')[0]
                if sub_cat not in dataframes.keys():
                    dataframes[sub_cat] = {}
                dataframes[sub_cat][df_type] = df

if not load_main:
    main_cats = list(dataframes.keys())
main_cats

File-Train-P.csv
Other-Test-P.csv
Network-Test-P.csv
Other-Train-P.csv
Network-Train-P.csv
File-Test-P.csv
other-Test-P.csv
infection_propagation-Train-P.csv
command_and_control-Train-P.csv
command_and_control-Test-P.csv
infection_propagation-Test-P.csv
other-Train-P.csv


['File',
 'Other',
 'Network',
 'other',
 'infection_propagation',
 'command_and_control']

In [38]:
'''
Calculate simple average embeddings for given sentences,
based on file loaded (for example vectors of length 50).

Skips words, that don't exist in mapping!
Therefore some possibly meaningful words (in security context) are skipped.
'''

from sklearn.svm import SVC
from nltk import word_tokenize

from data_helpers import clean_sentence

import re
word_pattern = re.compile(r'[^a-zA-Z0-9-]')

from nltk.corpus import stopwords
stopwords = set(stopwords.words('english'))

def documents_to_ave_embeddings(docs, embeddings, slice_size=None, bin_features=False):
    result = []
    for idx, doc in enumerate(docs):
        vectors = []
        
        if bin_features:
            new_doc, features = clean_sentence(doc, get_features=True)   
            f_to_bin = np.array(list(map(int, features.values())), dtype=np.float64)
        else:
            new_doc = clean_sentence(doc, get_features=False)   
                    
        new_doc = re.sub(word_pattern, " ", new_doc)
        
        words = word_tokenize(new_doc)
        words = [word.lower() for word in words if len(word) > 1 and len(word) <= 25]
        
        for word in words:
#         for word in (doc.lower()).split():
            
            global tokens_count
            tokens_count += 1
            if word in embeddings.keys():
                vec = embeddings[word]
                
                if slice_size is not None:
                    first_slice = vec[:slice_size]
                    second_slice = vec[vec_size : vec_size+slice_size]
                    vec = np.concatenate((first_slice, second_slice))
                    
                vectors.append(vec)
            else:
                global missing_tokens
                missing_tokens += 1
          
        if vectors:
            avg_vec = np.average(vectors, axis=0)
        else:
            avg_vec = np.zeros(shape=result[0].shape)

        if bin_features:
            avg_vec = np.concatenate((avg_vec, f_to_bin))
            
        result.append(avg_vec)
    
    print(result)
    return result


In [26]:
new_doc, features = clean_sentence('zhCat TCP port 443 XORed communication connect back to with using on srv01.microsoftwindowsupdate(dot)net (a deceptive domain owned by this group with falsified Whois data attributing to Microsoft Investor Relations) the HTTP protocol', get_features=True)        
np.array(list(map(int, features.values())), dtype=np.float64)

array([0., 0., 1., 0., 0., 0., 0., 1., 0.])

In [6]:
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report

In [7]:
models = {}

In [15]:
tokens_count = 0
missing_tokens = 0

cat = 'ActionName'
column = 'text-rel'
mapping = super_mapping

print(">>> {}\n".format(cat))

train = dataframes[cat]['Train']
test = dataframes[cat]['Test']

train[column] = train[column].apply(lambda x : re.sub(r'[^\x00-\x7F]+',' ', x))
test[column] = test[column].apply(lambda x : re.sub(r'[^\x00-\x7F]+',' ', x))

ss = 200
bf = True # bin features

train_transformed = documents_to_ave_embeddings(train[column], mapping, slice_size=ss, bin_features=bf)
test_transformed = documents_to_ave_embeddings(test[column], mapping, slice_size=ss, bin_features=bf)

print("There are {} tokens.".format(tokens_count))
print(" - {:2g} are missing :(".format(missing_tokens/tokens_count))
print()

clf = LogisticRegression(
    solver='newton-cg',
    class_weight='balanced',
    C=1.0
)
clf.fit(train_transformed, train['label_num']) # zwektoryzujmy dane i wytrenujmy klasyfikator na zbiorze treningowym

train_acc = clf.score(train_transformed, train['label_num'])
models[cat] = clf

accuracy = clf.score(test_transformed, test['label_num'])
y_pred = clf.predict(test_transformed)

print("Train Acc: {}".format(train_acc))
print("Test Acc: {}".format(accuracy))
print(classification_report(test['label_num'], y_pred))
c_m = confusion_matrix(test['label_num'], y_pred)
tn, fp, fn, tp = c_m.ravel()
print("TN: {}, FP: {}, FN: {}, TP: {}".format(tn, fp, fn, tp))

train_transformed

>>> ActionName



KeyError: 'ActionName'

In [36]:
dataframes['ActionName']['Test']


Unnamed: 0,label,label_num,text-neigh,text-rel,token,text-neigh-processed,text-rel-processed,text-neigh-tokens,text-rel-tokens
0,ActionName,1,Explorer. Bda9.tmp was then executed and went ...,Bda9.tmp executed,obtaining,explorer. tmp-file wa execut went,tmp-file execut,"['explorer.', 'tmp-file', 'wa', 'execut', 'went']","['tmp-file', 'execut']"
1,ActionName,1,application was used to create copies of Backd...,create copies of Backdoor.Jiripbot on the comp...,eavesdropped,applic wa use creat copi jir-file,creat copi jir-file compromis comput,"['applic', 'wa', 'use', 'creat', 'copi', 'jir-...","['creat', 'copi', 'jir-file', 'compromis', 'co..."
2,ActionName,1,"It hides activity by editing events logs, dump...",It editing events logs,insert,hide activ edit event log dump password,edit event log,"['hide', 'activ', 'edit', 'event', 'log', 'dum...","['edit', 'event', 'log']"
3,ActionName,1,"by editing events logs, dumping passwords, sec...",It dumping passwords,used,edit event log dump password secur delet file,dump password,"['edit', 'event', 'log', 'dump', 'password', '...","['dump', 'password']"
4,ActionName,1,"logs, dumping passwords, securely deleting fil...",It deleting files,executed,log dump password secur delet file encrypt file,delet file,"['log', 'dump', 'password', 'secur', 'delet', ...","['delet', 'file']"
5,ActionName,1,also end processes and perform a secure self-d...,The tool perform a secure self-delete,create,also end process perform secur self-delete.,tool perform secur self-delet,"['also', 'end', 'process', 'perform', 'secur',...","['tool', 'perform', 'secur', 'self-delet']"
6,ActionName,1,"to parse event logs, dumping out ones of inter...",Hacktool.Eventlog dumping out ones of interest,spread,pars event log dump one interest,eve-file dump one interest,"['pars', 'event', 'log', 'dump', 'one', 'inter...","['eve-file', 'dump', 'one', 'interest']"
7,ActionName,1,"primary functionality is to parse event logs, ...",Hacktool.Eventlog parse event logs,create,primari function pars event log dump,eve-file pars event log,"['primari', 'function', 'pars', 'event', 'log'...","['eve-file', 'pars', 'event', 'log']"
8,ActionName,1,"of interest, and to delete entries. The tool will",Hacktool.Eventlog delete entries,create,interest delet entries. tool,eve-file delet entri,"['interest', 'delet', 'entries.', 'tool']","['eve-file', 'delet', 'entri']"
9,ActionName,1,"Similarly, event logs are modified to remove a...",modified event logs to remove any evidence of ...,remove,similarli event log modifi remov ani evid,modifi event log remov ani evid attack activ,"['similarli', 'event', 'log', 'modifi', 'remov...","['modifi', 'event', 'log', 'remov', 'ani', 'ev..."


In [163]:
test_transformed = documents_to_ave_embeddings(dataframes['ActionName']['Test']['text-rel'], mapping)

predictions = {}
for model_name in models.keys():
    predictions[model_name] = models[model_name].predict_proba(test_transformed)

In [327]:
'''
Gets actual tokens-list from data.
Some tokens annotated as 'Action Token' are not a single word.

Words' pos tags are checked and we add word to a token list 
if it's a VERB, or eventually a NOUN 
(it's usually NNS - noun plural - which was actually a VERB)

Skipping stopwords, because of such phrases as 'were installed' etc.
'''

from nltk.corpus import stopwords
from nltk import word_tokenize, pos_tag       

stopwords = set(stopwords.words('english'))
tokens_lists = {
    0 : [],
    1 : []
}

train = dataframes['ActionName']['Train']
test = dataframes['ActionName']['Test']

import re
pattern = re.compile('\W')

for i in range(train.shape[0]):
    token = train.at[i, 'token']
    label = train.at[i, 'label_num']
    tokens_lists[label] += get_verb_token(token)

print(len(tokens_lists[0]), len(tokens_lists[1]), train.shape)

2144 1147 (3348, 10)


In [284]:
def get_verb_token(token):
    result = []
    token = token.lower().split()
    
    token = [t for t in token if t not in stopwords]
    token = [re.sub(pattern, '', t) for t in token]
        
    if len(token) > 1:
        tagged_tokens = [pos_tag([t]) for t in token]
        verb_tokens = [t for t in tagged_tokens if t[0][1].startswith("V")]
        if len(verb_tokens) == 1:
            token = verb_tokens[0][0][0]
            result += [token]
        elif len(verb_tokens) > 1:
            for tt in verb_tokens:
                 result += [tt[0][0]]
        else:
            noun_tokens = [t for t in tagged_tokens if t[0][1].startswith("N")]
            for tt in noun_tokens:
                result += [tt[0][0]]
    else:
        result += token
        
    return result

In [328]:
map_vectors = {}
for cat in tokens_lists.keys():
    vectors = [mapping[word] for word in tokens_lists[cat] if word in mapping.keys()]
    map_vectors[cat] = np.average(vectors, axis=0)

In [286]:
def cosine(vec1, vec2):
    vec1, vec2 = np.asarray(vec1), np.asarray(vec2)
    v1_sqrt = np.sqrt(np.sum(vec1 ** 2))
    v2_sqrt = np.sqrt(np.sum(vec2 ** 2))
    return np.sum((np.multiply(vec1, vec2))) / ( v1_sqrt * v2_sqrt )

In [329]:
all_count = 0
correct_count = 0

ds = train

train_data = []
train_labels = []

for i in range(ds.shape[0]):
    token = get_verb_token(ds.at[i, 'token'])
    label = ds.at[i, 'label_num']
    if token:
        token = token[0]
        
        if token in mapping.keys():
            
            all_count += 1
            vector = mapping[token]
            scores = [cosine(vector, map_vectors[0]), cosine(vector, map_vectors[1])]
#             print("Predicted: {}, real: {}".format(np.argmax(scores), label))
            choice = np.argmax(scores)
        
            if label == choice:
                correct_count += 1
               
            train_data.append(scores)
            train_labels.append(label)

print(correct_count/all_count)

0.5078922934076138


In [330]:
clf = SVC()

clf.fit(train_data, train_labels) # zwektoryzujmy dane i wytrenujmy klasyfikator na zbiorze treningowym
accuracy = clf.score(train_data, train_labels)

accuracy



0.6511915815536985