In [4]:
import os
import numpy as np
import pandas as pd

In [32]:
import math

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

class HierarchicalModel:
    def __init__(self):
        self.train = None
        self.test = None
        
        self.train_results = None
        self.test_results = None
        
        self.main_class_predictor = lambda x: 1 if x[1] > x[0] else 1
        self.sub_class_predictor = None
    
    def loadData(self, path, main_cats_list):
        dataframes = {}

        print("Loading data...\n")
        ls = os.listdir(path)

        for file in ls:
            if '.csv' in file and 'old' not in file:

                print(path + file)
                df = pd.read_csv(path + file, encoding='utf-8')

                for cname in ["Unnamed: 0", "Unnamed: 0.1", "label"]:
                    if cname in df.columns:
                        df.rename({cname:"a"}, axis="columns", inplace=True)
                        df.drop(["a"], axis=1, inplace=True)
                df_type = file.split('-')[1]

                dataframes[df_type] = df

        self.train = dataframes['Train']
        self.test = dataframes['Test']
        
        categories, mapper = {}, {}
        for mc in main_cats_list:
            categories[mc] = list(sorted(set(filter(lambda x : x != '-', self.train[mc]))))
            mapper[mc] = {}
            for cat in np.unique(self.train[mc]):
                mapper[mc][cat] = np.unique(self.train[self.train[mc] == cat]["{}_num".format(mc)])[0]
        
        self.mapper = mapper
        self.categories = categories
        self.main_cats = main_cats_list
        
        print("\nLoaded categories:\n")
        print(categories)
        
    
        
    def predictMainClass(self, data_c, pipeline, report=False, undefined="-"):
        
        if not self.train_results:
            self.train_results = pd.DataFrame()
        if not self.test_results:
            self.test_results = pd.DataFrame()
        
        if report:
            print("\n > > > MAIN CLASS CLASSIFICATION > > >\n")
        else:
            print("\nClassifying main classes...")
            
        for main_cat in self.main_cats:
            
            if report:
                print("-" * 52)

            train_labels = self.train[main_cat].map(lambda x : 0 if x == undefined else 1)
            test_labels = self.test[main_cat].map(lambda x : 0 if x == undefined else 1)

            # train pipeline
            pipeline.fit(self.train[data_c], train_labels)

            # store train set results
            self.train_results['{}-def_prediction'.format(main_cat)] = [list(p) for p in pipeline.predict_proba(self.train[data_c])]
            self.train_results['{}-def_true'.format(main_cat)] = train_labels

            train_pred = pipeline.predict(self.train[data_c])
            if report:
                print("\n{} -> TRAIN results:".format(main_cat))
                print(classification_report(train_labels, train_pred))

            # store test set results
            self.test_results['{}-def_prediction'.format(main_cat)] = [list(p) for p in pipeline.predict_proba(self.test[data_c])]
            self.test_results['{}-def_true'.format(main_cat)] = test_labels

            test_pred = pipeline.predict(self.test[data_c])
            if report:
                print("\n{} -> TEST results:".format(main_cat))
                print(classification_report(test_labels, test_pred))

            # prepare columns to store our final predictions
            self.train_results['{}_PREDICTION'.format(main_cat)] = '?'    
            self.test_results['{}_PREDICTION'.format(main_cat)] = '?'

            # additional column for test-set
            # replace '?' with '-' according to pipeline's prediction
            # - if it's very certain about answering NO, it won't be taken into consideration
            # in next level of classification

            self.test['{}_PREDICTION'.format(main_cat)] = '?'

            for i in range(self.test_results.shape[0]):
                row = self.test_results.loc[i]
                def_pred = row['{}-def_prediction'.format(main_cat)]
                pred = self.main_class_predictor(def_pred)
                if pred == 0:
                    self.test.at[i, "{}_PREDICTION".format(main_cat)] = undefined
                    
        if not report:
            print("Done!")
                    
    def predictSubClass(self, data_c, pipeline, undefined="-"):

        print("\nClassifying subclasses [binary classifiers!]...")
        
        for main_cat in self.main_cats:
            print('\n' + main_cat + ":")
            cats = self.categories[main_cat]

            for cat in cats:
                print("- " + cat)

                # subset of train set where main_cat is defined
                train_subset = self.train.loc[self.train[main_cat] != undefined]
                # subset of test set where main_cat is defined, according to previous classification stage
                test_subset = self.test.loc[self.test["{}_PREDICTION".format(main_cat)] != undefined] # !!!

                # prepare binary labels: 1 for this class, 0 for any other
                train_labels = train_subset[main_cat].map(lambda x : 0 if x != cat else 1)
                test_labels = test_subset[main_cat].map(lambda x : 0 if x != cat else 1)

                # train pipeline
                pipeline.fit(train_subset[data_c], train_labels)

                pred_label = '{}_prediction'.format(cat)
                true_label = '{}_true'.format(cat)

                # store results 
                self.train_results[pred_label] = [list(p) for p in pipeline.predict_proba(self.train[data_c])]
                self.train_results[true_label] = train_labels

                self.test_results[pred_label] = [list(p) for p in pipeline.predict_proba(self.test[data_c])]
                self.test_results[true_label] = test_labels

                # fix NaN issues...
                for df in [self.train_results, self.test_results]:
                    df[true_label] = df[true_label].map(lambda x : '-' if math.isnan(x) else int(x))
          
        print("\nMaking decision...")
        
        for df in [self.train_results, self.test_results]:
            # iterate over rows
            for i in range(df.shape[0]):
                row = df.loc[i]

                for main_cat in self.main_cats:
                    # get main class prediction
                    def_pred = row['{}-def_prediction'.format(main_cat)]
                    # make decision :)
                    pred = self.main_class_predictor(def_pred)
                    # get truth (mostly for debugging)
                    truth = row['{}-def_true'.format(main_cat)]

                    if pred:
                        # if predicted as defined, get sub classes
                        cats = self.categories[main_cat]

                        positive_scores = []
                        negative_scores = []

                        # gather negative and positive votes
                        for cat in cats:
                            cat_prob = row['{}_prediction'.format(cat)]
                            negative_scores.append(cat_prob[0] * def_pred[0])
                            positive_scores.append(cat_prob[1] * def_pred[1])

                        # get best scores for negative and positive answer
                        pos_idx = np.argmax(positive_scores)
                        neg_idx = np.argmax(negative_scores)

                        if positive_scores[pos_idx] > negative_scores[neg_idx]:
                            best_class = cats[pos_idx]   
                        else:
                            best_class = '-'

                        df.at[i, "{}_PREDICTION".format(main_cat)] = best_class
                    else:
                        df.at[i, "{}_PREDICTION".format(main_cat)] = "-"
                        
        print("Done!")
      
    def transformDocsToEmbeddings(self, docs, embeddings, slice_size=None, bin_features=False):
        from nltk import word_tokenize
        from data_helpers import clean_sentence
        
        import re
        word_pattern = re.compile(r'[^a-zA-Z0-9-]')

        from nltk.corpus import stopwords
        stopwords = set(stopwords.words('english'))

        result = []
        for idx, doc in enumerate(docs):
            vectors = []

            if bin_features:
                new_doc, features = clean_sentence(doc, get_features=True)   
                f_to_bin = np.array(list(map(int, features.values())), dtype=np.float64)
            else:
                new_doc = clean_sentence(doc, get_features=False)   

            new_doc = re.sub(word_pattern, " ", new_doc)

            words = word_tokenize(new_doc)
            words = [word.lower() for word in words if len(word) > 1 and len(word) <= 25]

            for word in words:

                if word in embeddings.keys():
                    vec = embeddings[word]

                    if slice_size is not None:
                        first_slice = vec[:slice_size]
                        second_slice = vec[vec_size : vec_size+slice_size]
                        vec = np.concatenate((first_slice, second_slice))

                    vectors.append(vec)
                else:
                    pass
#                     global missing_tokens
#                     missing_tokens += 1

            if vectors:
                avg_vec = np.average(vectors, axis=0)
            else:
                avg_vec = np.zeros(shape=result[0].shape)

            if bin_features:
                avg_vec = np.concatenate((avg_vec, f_to_bin))

            result.append(avg_vec)

        return result

    def addEmbeddings(self, in_column, out_column, mapping, ss=100, get_binary_features=False):
        
        train_clean = self.train[in_column].apply(lambda x : re.sub(r'[^\x00-\x7F]+',' ', x))
        test_clean = self.test[in_column].apply(lambda x : re.sub(r'[^\x00-\x7F]+',' ', x))

        train_transformed = self.transformDocsToEmbeddings(train_clean, mapping, slice_size=ss, bin_features=get_binary_features)
        test_transformed = self.transformDocsToEmbeddings(test_clean, mapping, slice_size=ss, bin_features=get_binary_features)
    
        self.train[out_column] = list(train_transformed)
        self.test[out_column] = list(test_transformed)
    
    def showResults(self):
        
        for main_cat in self.main_cats:
            self.train["{}-PREDICTION_num".format(main_cat)] = self.train_results["{}_PREDICTION".format(main_cat)].map(self.mapper[main_cat])
            self.test["{}-PREDICTION_num".format(main_cat)] = self.test_results["{}_PREDICTION".format(main_cat)].map(self.mapper[main_cat])

        for main_cat in self.main_cats:
            print()
            print("-"*20 + " " + main_cat + " " + "-"*20)
            print(self.mapper[main_cat])
            
            print("\n>>> TRAIN:")
            print(classification_report(self.train['{}_num'.format(main_cat)], self.train['{}-PREDICTION_num'.format(main_cat)]),
#                  target_names=self.mapper[main_cat]
                 )
            
            print("\n>>> TEST:")
            print(classification_report(self.test['{}_num'.format(main_cat)], self.test['{}-PREDICTION_num'.format(main_cat)]))


In [6]:
def load_embeddings(path):
    mapping = dict()
    
    with open(path, 'r', encoding='utf8') as f:
        for line in f:
            line = line.strip()
            if len(line) == 0:
                continue
            splitted = line.split(" ")
            if len(splitted) <= 2:
                continue
            mapping[splitted[0]] = np.array(splitted[1:], dtype=float) # stwórz słownik słowo -> wektor 
    return mapping

vec_size = 300

# glove_mapping = load_embeddings('glove/glove.6B.{}d.txt'.format(vec_size)) 
# my_mapping = load_embeddings('Embedding-Models/size{}-window10.txt'.format(vec_size)) 
super_mapping = load_embeddings('Embedding-Models/super-model.txt') 


In [12]:
'''
Prepare simple pipeline
'''

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report
from sklearn.metrics import classification_report, confusion_matrix

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

def dummy(doc):
    return doc

pipeline = Pipeline([          
#     ('vectorizer',
#      CountVectorizer(
#          preprocessor=dummy, 
#          ngram_range=(1, 4),
#          analyzer='word',
#          binary=False
#      )),
    ('clf', 
     LogisticRegression(
         class_weight='balanced',
         solver='liblinear'
     )
    )
])

In [39]:
hm = HierarchicalModel()
hm.loadData(os.getcwd() + "/Dataframes/All/Processed/", ["ActionName", "Capability"])
hm.main_class_predictor = lambda predictions: 0 if predictions[0] - predictions[1] > 0.5 else 1

hm.addEmbeddings("text-rel", "text-embedding", super_mapping)

# pipeline.fit(list(hm.train['text-embedding']), hm.train["Capability"].map(lambda x : 0 if x == '-' else 1))
# trzeba przerobić na LISTę

hm.predictMainClass('text-embedding', pipeline, report=False)
# hm.predictSubClass('text-embedding', pipeline)
# hm.showResults()

Loading data...

/home/jells123/Documents/ENGINEER/Secure-NLP/Dataframes/All/Processed/All-Train-P.csv
/home/jells123/Documents/ENGINEER/Secure-NLP/Dataframes/All/Processed/All-Test-P.csv

Loaded categories:

{'ActionName': ['File', 'Network', 'Other'], 'Capability': ['command_and_control', 'infection_propagation', 'other']}

Classifying main classes...


ValueError: setting an array element with a sequence.