In [6]:
import os
import numpy as np
import pandas as pd

In [29]:
import math

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

class HierarchicalModel:
    def __init__(self):
        self.train = None
        self.test = None
        
        self.train_results = None
        self.test_results = None
        
        self.main_class_predictor = lambda x: 1 if x[1] > x[0] else 1
        self.sub_class_predictor = None
    
    def loadData(self, path, main_cats_list):
        dataframes = {}

        print("Loading data...\n")
        ls = os.listdir(path)

        for file in ls:
            if '.csv' in file and 'old' not in file:

                print(path + file)
                df = pd.read_csv(path + file, encoding='utf-8')

                for cname in ["Unnamed: 0", "Unnamed: 0.1", "label"]:
                    if cname in df.columns:
                        df.rename({cname:"a"}, axis="columns", inplace=True)
                        df.drop(["a"], axis=1, inplace=True)
                df_type = file.split('-')[1]

                dataframes[df_type] = df

        self.train = dataframes['Train']
        self.test = dataframes['Test']
        
        categories, mapper = {}, {}
        for mc in main_cats_list:
            categories[mc] = list(sorted(set(filter(lambda x : x != '-', self.train[mc]))))
            mapper[mc] = {}
            for cat in np.unique(self.train[mc]):
                mapper[mc][cat] = np.unique(self.train[self.train[mc] == cat]["{}_num".format(mc)])[0]
        
        self.mapper = mapper
        self.categories = categories
        self.main_cats = main_cats_list
        
        print("\nLoaded categories:\n")
        print(categories)
        
    
        
    def predictMainClass(self, data_c, pipeline, report=False, undefined="-"):
        
        if not self.train_results:
            self.train_results = pd.DataFrame()
        if not self.test_results:
            self.test_results = pd.DataFrame()
        
        if report:
            print("\n > > > MAIN CLASS CLASSIFICATION > > >\n")
        else:
            print("\nClassifying main classes...")
            
        for main_cat in self.main_cats:
            
            if report:
                print("-" * 52)

            train_labels = self.train[main_cat].map(lambda x : 0 if x == undefined else 1)
            test_labels = self.test[main_cat].map(lambda x : 0 if x == undefined else 1)

            # train pipeline
            train_data = list(self.train[data_c])
#             pipeline.fit(self.train[data_c], train_labels)
            pipeline.fit(train_data, train_labels)

            # store train set results
            self.train_results['{}-def_prediction'.format(main_cat)] = [list(p) for p in pipeline.predict_proba(train_data)]
            self.train_results['{}-def_true'.format(main_cat)] = train_labels

            train_pred = pipeline.predict(train_data)
            if report:
                print("\n{} -> TRAIN results:".format(main_cat))
                print(classification_report(train_labels, train_pred))
    
            # test
            test_data = list(self.test[data_c])
                                          
            # store test set results
            self.test_results['{}-def_prediction'.format(main_cat)] = [list(p) for p in pipeline.predict_proba(test_data)]
            self.test_results['{}-def_true'.format(main_cat)] = test_labels

            test_pred = pipeline.predict(test_data)
            if report:
                print("\n{} -> TEST results:".format(main_cat))
                print(classification_report(test_labels, test_pred))

            # prepare columns to store our final predictions
            self.train_results['{}_PREDICTION'.format(main_cat)] = '?'    
            self.test_results['{}_PREDICTION'.format(main_cat)] = '?'

            # additional column for test-set
            # replace '?' with '-' according to pipeline's prediction
            # - if it's very certain about answering NO, it won't be taken into consideration
            # in next level of classification

            self.test['{}_PREDICTION'.format(main_cat)] = '?'

            for i in range(self.test_results.shape[0]):
                row = self.test_results.loc[i]
                def_pred = row['{}-def_prediction'.format(main_cat)]
                pred = self.main_class_predictor(def_pred)
                if pred == 0:
                    self.test.at[i, "{}_PREDICTION".format(main_cat)] = undefined
                    
        if not report:
            print("Done!")
                    
    def predictSubClass(self, data_c, pipeline, undefined="-"):

        print("\nClassifying subclasses [binary classifiers!]...")
        
        for main_cat in self.main_cats:
            print('\n' + main_cat + ":")
            cats = self.categories[main_cat]

            for cat in cats:
                print("- " + cat)

                # subset of train set where main_cat is defined
                train_subset = self.train.loc[self.train[main_cat] != undefined]
                # subset of test set where main_cat is defined, according to previous classification stage
                test_subset = self.test.loc[self.test["{}_PREDICTION".format(main_cat)] != undefined] # !!!

                # prepare binary labels: 1 for this class, 0 for any other
                train_labels = train_subset[main_cat].map(lambda x : 0 if x != cat else 1)
                test_labels = test_subset[main_cat].map(lambda x : 0 if x != cat else 1)

                # train pipeline
                train_data = list(train_subset[data_c])
                pipeline.fit(train_data, train_labels)

                pred_label = '{}_prediction'.format(cat)
                true_label = '{}_true'.format(cat)

                # store results 
                self.train_results[pred_label] = [list(p) for p in pipeline.predict_proba(list(self.train[data_c]))]
                self.train_results[true_label] = train_labels

                self.test_results[pred_label] = [list(p) for p in pipeline.predict_proba(list(self.test[data_c]))]
                self.test_results[true_label] = test_labels

                # fix NaN issues...
                for df in [self.train_results, self.test_results]:
                    df[true_label] = df[true_label].map(lambda x : '-' if math.isnan(x) else int(x))
          
        print("\nMaking decision...")
        
        for df in [self.train_results, self.test_results]:
            # iterate over rows
            for i in range(df.shape[0]):
                row = df.loc[i]

                for main_cat in self.main_cats:
                    # get main class prediction
                    def_pred = row['{}-def_prediction'.format(main_cat)]
                    # make decision :)
                    pred = self.main_class_predictor(def_pred)
                    # get truth (mostly for debugging)
                    truth = row['{}-def_true'.format(main_cat)]

                    if pred:
                        # if predicted as defined, get sub classes
                        cats = self.categories[main_cat]

                        positive_scores = []
                        negative_scores = []

                        # gather negative and positive votes
                        for cat in cats:
                            cat_prob = row['{}_prediction'.format(cat)]
                            negative_scores.append(cat_prob[0] * def_pred[0])
                            positive_scores.append(cat_prob[1] * def_pred[1])

                        # get best scores for negative and positive answer
                        pos_idx = np.argmax(positive_scores)
                        neg_idx = np.argmax(negative_scores)

                        if positive_scores[pos_idx] > negative_scores[neg_idx]:
                            best_class = cats[pos_idx]   
                        else:
                            best_class = '-'

                        df.at[i, "{}_PREDICTION".format(main_cat)] = best_class
                    else:
                        df.at[i, "{}_PREDICTION".format(main_cat)] = "-"
                        
        print("Done!")
      
    def transformDocsToEmbeddings(self, docs, embeddings, slice_size=None, bin_features=False):
        from nltk import word_tokenize
        from data_helpers import clean_sentence
        
        import re
        word_pattern = re.compile(r'[^a-zA-Z0-9-]')

        from nltk.corpus import stopwords
        stopwords = set(stopwords.words('english'))

        result = []
        for idx, doc in enumerate(docs):
            vectors = []

#             if bin_features:
#                 new_doc, features = clean_sentence(doc, get_features=True)   
#                 f_to_bin = np.array(list(map(int, features.values())), dtype=np.float64)
#             else:
#                 new_doc = clean_sentence(doc, get_features=False)   

            new_doc = doc
    
            new_doc = re.sub(word_pattern, " ", new_doc)

            words = word_tokenize(new_doc)
            words = [word.lower() for word in words if len(word) > 1 and len(word) <= 25]

            for word in words:

                if word in embeddings.keys():
                    vec = embeddings[word]

                    if slice_size is not None:
                        first_slice = vec[:slice_size]
                        second_slice = vec[vec_size : vec_size+slice_size]
                        vec = np.concatenate((first_slice, second_slice))

                    vectors.append(vec)
                else:
                    pass
#                     global missing_tokens
#                     missing_tokens += 1

            if vectors:
                avg_vec = np.average(vectors, axis=0)
            else:
                avg_vec = np.zeros(shape=result[0].shape)

            if bin_features:
                avg_vec = np.concatenate((avg_vec, f_to_bin))

            result.append(avg_vec)

        return result

    def addEmbeddings(self, in_column, out_column, mapping, ss=100, get_binary_features=False):
        
        train_clean = self.train[in_column].apply(lambda x : re.sub(r'[^\x00-\x7F]+',' ', x))
        test_clean = self.test[in_column].apply(lambda x : re.sub(r'[^\x00-\x7F]+',' ', x))

        train_transformed = self.transformDocsToEmbeddings(train_clean, mapping, slice_size=ss, bin_features=get_binary_features)
        test_transformed = self.transformDocsToEmbeddings(test_clean, mapping, slice_size=ss, bin_features=get_binary_features)
    
        self.train[out_column] = list(train_transformed)
        self.test[out_column] = list(test_transformed)
    
    def showResults(self):
        
        for main_cat in self.main_cats:
            self.train["{}-PREDICTION_num".format(main_cat)] = self.train_results["{}_PREDICTION".format(main_cat)].map(self.mapper[main_cat])
            self.test["{}-PREDICTION_num".format(main_cat)] = self.test_results["{}_PREDICTION".format(main_cat)].map(self.mapper[main_cat])

        for main_cat in self.main_cats:
            print()
            print("-"*20 + " " + main_cat + " " + "-"*20)
            print(self.mapper[main_cat])
            
            print("\n>>> TRAIN:")
            print(classification_report(self.train['{}_num'.format(main_cat)], self.train['{}-PREDICTION_num'.format(main_cat)]),
#                  target_names=self.mapper[main_cat]
                 )
            
            print("\n>>> TEST:")
            print(classification_report(self.test['{}_num'.format(main_cat)], self.test['{}-PREDICTION_num'.format(main_cat)]))


In [3]:
def load_embeddings(path):
    mapping = dict()
    
    with open(path, 'r', encoding='utf8') as f:
        for line in f:
            line = line.strip()
            if len(line) == 0:
                continue
            splitted = line.split(" ")
            if len(splitted) <= 2:
                continue
            mapping[splitted[0]] = np.array(splitted[1:], dtype=float) # stwórz słownik słowo -> wektor 
    return mapping

vec_size = 300

# glove_mapping = load_embeddings('glove/glove.6B.{}d.txt'.format(vec_size)) 
# my_mapping = load_embeddings('Embedding-Models/size{}-window10.txt'.format(vec_size)) 
super_mapping = load_embeddings('Embedding-Models/super-model.txt') 


In [31]:
'''
Prepare simple pipeline
'''

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report
from sklearn.metrics import classification_report, confusion_matrix

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

def dummy(doc):
    return doc

pipeline = Pipeline([          
#     ('vectorizer',
#      CountVectorizer(
#          preprocessor=dummy, 
#          ngram_range=(1, 4),
#          analyzer='word',
#          binary=False
#      )),
    ('clf', 
     LogisticRegression(
#          class_weight='balanced',
#          solver='liblinear'
     )
    )
])

In [25]:
hm.train

Unnamed: 0,ActionName,Capability,token,text-rel-subj,text-rel,text-neigh,text-neigh-word-tokenized,text-neigh-processed,text-rel-word-tokenized,text-rel-processed,text-rel-subj-word-tokenized,text-rel-subj-processed,text-neigh-tokens,text-rel-tokens,text-rel-subj-tokens,ActionName_num,Capability_num,text-embedding,ActionName-PREDICTION_num,Capability-PREDICTION_num
0,-,Other,performs,ScanBox performs keylogging of users when they...,performs keylogging of users when they visit a...,on the game. ScanBox performs keylogging of us...,"['game', '.', 'scanbox', 'perform', 'keylog', ...",game. scanbox perform keylog user,"['perform', 'keylog', 'user', 'visit', 'compro...",perform keylog user visit compromis websit,"['scanbox', 'perform', 'keylog', 'user', 'visi...",scanbox perform keylog user visit compromis we...,"['game.', 'scanbox', 'perform', 'keylog', 'user']","['perform', 'keylog', 'user', 'visit', 'compro...","['scanbox', 'perform', 'keylog', 'user', 'visi...",0,3,"[-0.25121911111111106, 0.12480377777777776, 0....",0,3
1,-,Other,perform,an attacker perform reconnaissance and key log...,perform reconnaissance and key logging of visi...,allows an attacker to perform reconnaissance a...,"['allow', 'attack', 'perform', 'reconnaiss', '...",allow attack perform reconnaiss key log,"['perform', 'reconnaiss', 'key', 'log', 'visit...",perform reconnaiss key log visitor compromis w...,"['attack', 'perform', 'reconnaiss', 'key', 'lo...",attack perform reconnaiss key log visitor comp...,"['allow', 'attack', 'perform', 'reconnaiss', '...","['perform', 'reconnaiss', 'key', 'log', 'visit...","['attack', 'perform', 'reconnaiss', 'key', 'lo...",0,3,"[-0.1613737, 0.09372370000000001, -0.2297578, ...",0,3
2,-,Other,tracking,tracking files and drives present on the victi...,tracking files and drives present on the victi...,a different method of tracking files and drive...,"['differ', 'method', 'track', 'file', 'drive',...",differ method track file drive present,"['track', 'file', 'drive', 'present', 'victim'...",track file drive present victim machin,"['track', 'file', 'drive', 'present', 'victim'...",track file drive present victim machin,"['differ', 'method', 'track', 'file', 'drive',...","['track', 'file', 'drive', 'present', 'victim'...","['track', 'file', 'drive', 'present', 'victim'...",0,3,"[-0.22681733333333332, 0.17845211111111114, -0...",0,3
3,-,command_and_control,being,it being a command and control destination for...,being a command and control destination for ma...,also been observed as being a command and control,"['also', 'observ', 'command', 'control']",also observ command control,"['command', 'control', 'destin', 'malwar']",command control destin malwar,"['command', 'control', 'destin', 'malwar']",command control destin malwar,"['also', 'observ', 'command', 'control']","['command', 'control', 'destin', 'malwar']","['command', 'control', 'destin', 'malwar']",0,1,"[0.005214000000000001, -0.020905571428571437, ...",0,1
4,-,infection_propagation,contain,the e-mails contain links to websites hosting ...,contain links to websites hosting ScanBox,"victims, where the e-mails contain links to we...","['victim', 'e-mail', 'contain', 'link', 'websi...",victim e-mail contain link websit host,"['contain', 'link', 'websit', 'host', 'scanbox']",contain link websit host scanbox,"['e-mail', 'contain', 'link', 'websit', 'host'...",e-mail contain link websit host scanbox,"['victim', 'e-mail', 'contain', 'link', 'websi...","['contain', 'link', 'websit', 'host', 'scanbox']","['e-mail', 'contain', 'link', 'websit', 'host'...",0,2,"[-0.2388474480814257, 0.16502939653891083, -0....",0,2
5,Network,Other,were dropped,Both backdoors were dropped from malicious doc...,Both backdoors were dropped from malicious doc...,and WATERSPOUT. Both backdoors were dropped fr...,"['waterspout', '.', 'backdoor', 'drop', 'malic...",waterspout. backdoor drop malici document built,"['backdoor', 'drop', 'malici', 'document', 'bu...",backdoor drop malici document built util tran ...,"['backdoor', 'drop', 'malici', 'document', 'bu...",backdoor drop malici document built util tran ...,"['waterspout.', 'backdoor', 'drop', 'malici', ...","['backdoor', 'drop', 'malici', 'document', 'bu...","['backdoor', 'drop', 'malici', 'document', 'bu...",2,3,"[-0.05097333333333332, -0.16647399999999998, -...",0,3
6,-,Other,communicates,backdoor HTTP communicates via to a hard-coded...,HTTP communicates via to a hard-coded command ...,a proxy-aware backdoor that communicates via H...,"['proxy-awar', 'backdoor', 'commun', 'via', 'h...",proxy-awar backdoor commun via http,"['http', 'commun', 'via', 'hard-cod', 'command...",http commun via hard-cod command control c2 se...,"['backdoor', 'http', 'commun', 'via', 'hard-co...",backdoor http commun via hard-cod command cont...,"['proxy-awar', 'backdoor', 'commun', 'via', 'h...","['http', 'commun', 'via', 'hard-cod', 'command...","['backdoor', 'http', 'commun', 'via', 'hard-co...",0,3,"[-0.3650246, 0.03408679999999999, 0.2694957, -...",0,1
7,-,Other,decrease,this change decrease detection,decrease detection,post in order to decrease detection of RIPTIDE by,"['post', 'order', 'decreas', 'detect', 'riptid']",post order decreas detect riptid,"['decreas', 'detect']",decreas detect,"['chang', 'decreas', 'detect']",thi chang decreas detect,"['post', 'order', 'decreas', 'detect', 'riptid']","['decreas', 'detect']","['thi', 'chang', 'decreas', 'detect']",0,3,"[-0.0989085, 0.539335, 0.45611, 0.1742305, -0....",0,3
8,Network,infection_propagation,sent,sent a spear phish email to a Taiwanese govern...,sent a spear phish email to a Taiwanese govern...,a spear phish email sent to a Taiwanese govern...,"['spear', 'phish', 'email', 'sent', 'taiwanes'...",spear phish email sent taiwanes govern,"['sent', 'spear', 'phish', 'email', 'taiwanes'...",sent spear phish email taiwanes govern ministri,"['sent', 'spear', 'phish', 'email', 'taiwanes'...",sent spear phish email taiwanes govern ministri,"['spear', 'phish', 'email', 'sent', 'taiwanes'...","['sent', 'spear', 'phish', 'email', 'taiwanes'...","['sent', 'spear', 'phish', 'email', 'taiwanes'...",2,2,"[-0.17065832499999997, -0.07323550000000001, -...",0,2
9,-,-,dropped,The exploit document dropped the HIGHTIDE back...,dropped the HIGHTIDE backdoor,Spearphishing The exploit document dropped the...,"['spearphish', 'exploit', 'document', 'drop', ...",spearphish exploit document drop hightid backdoor,"['drop', 'hightid', 'backdoor']",drop hightid backdoor,"['exploit', 'document', 'drop', 'hightid', 'ba...",exploit document drop hightid backdoor,"['spearphish', 'exploit', 'document', 'drop', ...","['drop', 'hightid', 'backdoor']","['exploit', 'document', 'drop', 'hightid', 'ba...",0,0,"[0.053436226780741135, -0.015233134858650785, ...",0,3


In [32]:
import re

hm = HierarchicalModel()
hm.loadData(os.getcwd() + "/Dataframes/All/Processed/", ["ActionName", "Capability"])
hm.main_class_predictor = lambda predictions: 0 if predictions[0] > predictions[1] else 1

hm.addEmbeddings("text-rel", "text-embedding", super_mapping)

# pipeline.fit(list(hm.train['text-embedding']), hm.train["Capability"].map(lambda x : 0 if x == '-' else 1))
# trzeba przerobić na LISTę

hm.predictMainClass('text-embedding', pipeline, report=False)
hm.predictSubClass('text-embedding', pipeline)
hm.showResults()

Loading data...

/home/jells123/Documents/ENGINEER/Secure-NLP/Dataframes/All/Processed/All-Train-P.csv
/home/jells123/Documents/ENGINEER/Secure-NLP/Dataframes/All/Processed/All-Test-P.csv

Loaded categories:

{'ActionName': ['File', 'Network', 'Other'], 'Capability': ['Other', 'command_and_control', 'infection_propagation']}

Classifying main classes...
Done!

Classifying subclasses [binary classifiers!]...

ActionName:
- File
- Network
- Other

Capability:
- Other
- command_and_control
- infection_propagation

Making decision...
Done!

-------------------- ActionName --------------------
{'-': 0, 'File': 1, 'Network': 2, 'Other': 3}

>>> TRAIN:
              precision    recall  f1-score   support

           0       0.85      0.80      0.83      2194
           1       0.57      0.57      0.57       325
           2       0.52      0.59      0.55       373
           3       0.47      0.55      0.51       456

   micro avg       0.72      0.72      0.72      3348
   macro avg       0