In [50]:
import numpy as np
import matplotlib.pyplot as plt
import joblib
import pickle
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import matthews_corrcoef, cohen_kappa_score
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split, KFold
import sys
import os
from os import listdir
from os.path import isfile, join
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix
from tabulate import tabulate
from tqdm import tqdm
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from deslib.dcs import OLA
# from deslib.dcs import KNORA_U, KNORA_E
from deslib.des import  KNOP, METADES
from deslib.static import SingleBest, StackedClassifier
from deslib.static import StaticSelection
from imblearn.metrics import geometric_mean_score
import json

In [51]:
# Maps the prefix of a feature line ("<prefix>::<value>") to the 1-based index
# of the feature set it is counted under. Four prefixes (activity,
# service_receiver, provider, service) deliberately share index 3.
FEATURES_SET = {
    "feature": 1,
    "permission": 2,
    "activity": 3,
    "service_receiver": 3,
    "provider": 3,
    "service": 3,
    "intent": 4,
    "api_call": 5,
    "real_permission": 6,
    "call": 7,
    "url": 8
}


def count_feature_set(lines):
    """
    Count how many features belong to each feature set.

    :param lines: iterable of text lines of the form "<prefix>::<value>";
                  empty and whitespace-only lines are ignored
    :return: list of counts, one entry per set index from 1 up to the largest
             index in FEATURES_SET, in ascending index order
    :raises KeyError: if a non-blank line starts with a prefix that is not a
                      key of FEATURES_SET
    """
    # Derive the vector length from the mapping so the two cannot drift apart.
    n_sets = max(FEATURES_SET.values())
    counts = [0] * n_sets
    for line in lines:
        entry = line.strip()
        if not entry:
            # Blank line (including "", "\r\n" or spaces only): nothing to count.
            continue
        prefix = entry.split("::", 1)[0]
        counts[FEATURES_SET[prefix] - 1] += 1
    return counts


In [52]:
def read(LOAD_DATA=False):
    """
    Return the feature matrix and labels, either rebuilt from the raw text
    files or loaded from the cached .npy files.

    :param LOAD_DATA: if True, parse every file in the feature-vector
        directory with count_feature_set, label a file 1 when its name appears
        in the first column of sha256_family.csv and 0 otherwise, and save the
        arrays to x_all.npy / y_all.npy in the current working directory.
        If False, load those two files instead.
    :return: tuple (x, y) of numpy arrays; x holds one row of feature-set
        counts per file (label-1 files first, then label-0 files), y the
        matching 0/1 labels
    """
    if LOAD_DATA:
        print("Previous data not loaded. Attempt to read data ...")
        # Build paths with join so they resolve on every OS, not only Windows.
        mypath = join("Drebin", "MetaData", "feature_vectors", "feature_vectors")
        onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]

        print("Reading csv file for ground truth ...")
        # ndmin=2 keeps the result two-dimensional even for a single data row,
        # so the column slice below never fails.
        ground_truth = np.loadtxt(
            join("Drebin", "MetaData", "sha256_family.csv"),
            delimiter=",", skiprows=1, dtype=str, ndmin=2)
        # A set gives constant-time membership tests; scanning the numpy
        # column once per file is quadratic overall.
        labelled_names = set(ground_truth[:, 0])

        print("Reading positive and negative texts ...")
        pos = []
        neg = []
        for virus in tqdm(onlyfiles):
            if virus in labelled_names:
                pos.append(virus)
            else:
                neg.append(virus)

        print("Extracting features ...")
        x = []
        y = []
        # Positives first, then negatives, to keep the original row order.
        for label, group in ((1, pos), (0, neg)):
            for text_file in tqdm(group):
                # Context manager closes each file; sys.stdin is left untouched.
                with open(join(mypath, text_file)) as handle:
                    features = handle.readlines()
                x.append(count_feature_set(features))
                y.append(label)

        print("Data is read successfully:")
        x = np.array(x)
        y = np.array(y)
        print(x.shape, y.shape)

        # NOTE: despite the message, the files are written to the current
        # working directory, which is also where the load branch reads them.
        print("Saving data under data_numpy directory ...")
        np.save(r"x_all.npy", x)
        np.save(r"y_all.npy", y)

        return x, y
    else:
        print("Loading previous data ...")
        x_ = np.load(r"x_all.npy")
        y_ = np.load(r"y_all.npy")
        print(x_.shape, y_.shape)
        return x_, y_


def map_family_to_category(families):
    """
    Give every family an integer category following iteration order.

    :param families: iterable of hashable family identifiers
    :return: dict mapping each family to its 1-based position in `families`;
             when a family occurs more than once its last position is kept
    """
    return {family: position for position, family in enumerate(families, start=1)}


if __name__ == "__main__":
    # Use the cached arrays; switch to read(LOAD_DATA=True) to rebuild them
    # from the raw feature files.
    x, y = read(LOAD_DATA=False)

In [53]:
# Selection strategies to evaluate, keyed by the display name that is passed
# on as modelName. The KNORA-U / KNORA-E variants are intentionally left out
# (their import is commented out as well).
DES = dict(
    SingleBest=SingleBest,
    StaticSelection=StaticSelection,
    OLA=OLA,
    KNOP=KNOP,
    METADES=METADES,
)

In [54]:
# Função para criar o gráfico cumulativo para uma seed específica
def plot_cumulative_histogram_single_seed(hardness_table, seed, title):
    """
    Plot the empirical cumulative distribution of hardness scores for one seed.

    Four curves are drawn on a single figure: hits (solid) and misses (dashed)
    for 'label_0' and for 'label_1'.

    :param hardness_table: nested mapping indexed as
        hardness_table[label][outcome][seed], where label is 'label_0' or
        'label_1' and outcome is 'Hit' or 'Miss'; each leaf is a sequence of
        scores
    :param seed: key selecting which seed's scores to plot
    :param title: text appended to the figure title
    :return: None; the figure is shown with plt.show()
    """
    plt.figure(figsize=(10, 6))

    # Hits use the default line style, misses are dashed.
    outcome_styles = (("Hit", {}), ("Miss", {"linestyle": "--"}))

    for label in ("label_0", "label_1"):
        for outcome, line_kwargs in outcome_styles:
            ordered_scores = np.sort(hardness_table[label][outcome][seed])
            n_scores = len(ordered_scores)
            # Fraction of samples with a score <= each sorted value.
            cumulative = np.arange(1, n_scores + 1) / n_scores
            plt.plot(ordered_scores, cumulative, label=f"{label} - {outcome}", **line_kwargs)

    # Figure decoration
    plt.title(f'Cumulative Hardness Histogram for {title}')
    plt.xlabel('Hardness Score')
    plt.ylabel('Cumulative Probability')
    plt.legend()
    plt.grid(True)
    plt.show()



def CreateDESModel(x_all, y_all, model, modelName, balance, num_rep=30):
    """
    Train a random-forest pool plus a selection model for each seed and plot
    how the precomputed hardness scores distribute over hits and misses.

    For every seed i in range(num_rep): make a stratified 80/20 split with
    random_state=i, optionally oversample the training part with SMOTE, fit a
    100-tree random forest, wrap it with model(ensemble), fit that on the same
    training data, predict the test part, then group the hardness scores by
    true label and by hit/miss and plot them.

    :param x_all: feature matrix, one row per sample
    :param y_all: labels; the grouping below only considers the values 0 and 1
    :param model: callable that takes the fitted ensemble and returns an
        object exposing fit(X, y) and predict(X)
    :param modelName: display name used in the plot title
    :param balance: if True, apply SMOTE to the training data and read scores
        from test_kdn_scores_balanced.json; otherwise read
        test_kdn_scores_unbalanced.json
    :param num_rep: number of seeds to run; the JSON file must contain a
        "seed-<i>" entry for each of them
    :return: dict indexed as table['label_<0|1>']['Hit'|'Miss']['seed-<i>'],
        holding the score arrays for every processed seed
    :raises ValueError: if the number of scores for a seed differs from the
        size of the test split
    """
    # Only the file that matches the requested setting is needed.
    scores_path = 'test_kdn_scores_balanced.json' if balance else 'test_kdn_scores_unbalanced.json'
    with open(scores_path, 'r') as f:
        kdn_scores = json.load(f)

    title_prefix = 'Balanced' if balance else 'Unbalanced'

    # Created once so the results of all seeds accumulate and can be returned.
    hardness_table = {'label_0': {'Hit': {}, 'Miss': {}}, 'label_1': {'Hit': {}, 'Miss': {}}}

    for i in tqdm(range(num_rep)):
        seed_key = f"seed-{i}"
        x_train, x_test, y_train, y_test = train_test_split(
            x_all, y_all, test_size=0.2, stratify=y_all, random_state=i)
        # The test labels come straight from this split. Re-splitting the row
        # indices with the same seed and stratification would reproduce the
        # same y_test, so no second split is performed.
        y_test = np.asarray(y_test)

        if balance:
            smote = SMOTE(sampling_strategy='auto', k_neighbors=5, random_state=i)
            x_train, y_train = smote.fit_resample(x_train, y_train)

        # Train the pool of classifiers; seeded so every repetition is reproducible.
        ensemble = RandomForestClassifier(n_estimators=100, random_state=i)
        ensemble.fit(x_train, y_train)

        # Build and fit the selection model on the training set.
        # NOTE(review): `model` is called without a random_state, so strategies
        # with internal randomness may still vary between runs.
        des_model = model(ensemble)
        des_model.fit(x_train, y_train)

        # Predict the held-out test set.
        y_pred = des_model.predict(x_test)

        # Assumes the values stored under "seed-<i>" are ordered like the test
        # split produced above -- TODO confirm against the script that wrote the JSON.
        hardness_score = np.array(list(kdn_scores[seed_key].values()))
        if len(hardness_score) != len(y_test):
            raise ValueError(
                f"{scores_path}: {len(hardness_score)} scores for {seed_key}, "
                f"but the test split has {len(y_test)} samples")

        correct = (y_test == y_pred)
        for label in [0, 1]:
            is_label = (y_test == label)
            hardness_table[f'label_{label}']['Hit'][seed_key] = hardness_score[correct & is_label]
            hardness_table[f'label_{label}']['Miss'][seed_key] = hardness_score[~correct & is_label]

        print(f"Hardness médio para seed-{i}: {np.mean(hardness_score):.4f}")

        # One figure per seed.
        plot_cumulative_histogram_single_seed(hardness_table, seed_key, f'{title_prefix} - {modelName}')

    return hardness_table
            


In [55]:
# Load the cached feature matrix and labels from x_all.npy / y_all.npy.
x_all, y_all = read(LOAD_DATA=False)

In [56]:
# Unbalanced setting: a single repetition (seed 0) for every selection strategy.
for name, classifier in DES.items():
    CreateDESModel(
        x_all,
        y_all,
        model=classifier,
        modelName=name,
        balance=False,
        num_rep=1,
    )

In [57]:
# Balanced setting (SMOTE on the training split): a single repetition (seed 0)
# for every selection strategy.
for name, classifier in DES.items():
    CreateDESModel(
        x_all,
        y_all,
        model=classifier,
        modelName=name,
        balance=True,
        num_rep=1,
    )