In [7]:
# REQUIRED IMPORTS
import os                                               # Directory walking for file loading
import math                                             # Logarithm function
import pandas as pd                                     # Dataframe managment
import pefile                                           # Header feature extraction
import pickle                                           # Model saving
from sklearn.feature_selection import SelectFromModel   # Feature dimensionality reduction
from sklearn.ensemble import RandomForestClassifier     # Random Forest Classifier
from collections import Counter                         # Entropy calculations

# EVALUATION IMPORTS
import matplotlib as plt                                # Output plotting
import seaborn as sns                                   # Heatmap of confusion matrix
from sklearn.metrics import confusion_matrix            # Confusion matrix
from sklearn.metrics import classification_report       # Classification report

In [4]:
# MODEL OUTPUT LOCATION
model_file = "model.sav"

# TRAINING SAMPLE LOCATION
train_dir = ".\samples\\training"

# TESTING SAMPLE LOCATION
test_dir = ".\samples\\validation"

In [8]:
def entropy(self, data):
    """Calculate the entropy of a chunk of data."""

    if not data:
        return 0.0

    occurences = Counter(bytearray(data))

    entropy = 0
    for x in occurences.values():
        p_x = float(x) / len(data)
        entropy -= p_x * math.log(p_x, 2)

    return entropy

In [9]:
def create_feature_vectors(sample_dir):
    # Creating initial feature dataframe
    feature_df = pd.DataFrame()

    # Creating initial byte bi-gram dataframe
    byte_bi_gram_features = pd.DataFrame(columns=["SAMPLE"])

    # Creating initial opcode bi-gram dataframe
    opcode_bi_gram_features = pd.DataFrame(columns=["SAMPLE"])

    # Creating initial opcode tri-gram dataframe
    opcode_tri_gram_features = pd.DataFrame(columns=["SAMPLE"])

    # Creating the feature selector model
    selector = SelectFromModel(RandomForestClassifier(n_estimators=1000))

    # Iterating through all samples in the samples subdirectory
    for root, dirs, files in os.walk(sample_dir):
        for file in files:
            # Collecting PE Header features from current sample
            features = {}
            try:
                pe = pefile.PE(root+file)
                features["SAMPLE"] = (root+file)
                features["CLASSIFICATION"] = [1 if ("malicious" in root) else 0]
                features["FILE_HEADER.MACHINE"] = [pe.FILE_HEADER.Machine if (pe.FILE_HEADER != None) else 0]
                features["FILE_HEADER.SIZEOFOPTIONALHEADER"] = [pe.FILE_HEADER.SizeOfOptionalHeader if (pe.FILE_HEADER != None) else 0]
                features["FILE_HEADER.CHARACTERISTICS"] = [pe.FILE_HEADER.Characteristics if (pe.FILE_HEADER != None) else 0]
                features["OPTIONAL_HEADER.IMAGEBASE"] = [pe.OPTIONAL_HEADER.ImageBase if (pe.OPTIONAL_HEADER != None) else 0]
                features["OPTIONAL_HEADER.MAJOROPERATINGSYSTEM"] = [pe.OPTIONAL_HEADER.MajorOperatingSystemVersion if (pe.OPTIONAL_HEADER != None) else 0]
                features["OPTIONAL_HEADER.MAJORSUBSYSTEMVERSION"] = [pe.OPTIONAL_HEADER.MajorSubsystemVersion if (pe.OPTIONAL_HEADER != None) else 0]
                features["OPTIONAL_HEADER.DLLCHARACTERISTICS"] = [pe.OPTIONAL_HEADER.DllCharacteristics if (pe.OPTIONAL_HEADER != None) else 0]
                features["OPTIONAL_HEADER.SUBSYSTEM"] = [pe.OPTIONAL_HEADER.Subsystem if (pe.OPTIONAL_HEADER != None) else 0]
                entropies = []
                if (pe.OPTIONAL_HEADER != None):
                    for section in pe.sections:
                        entropies.append(section.get_entropy())
                else:
                    entropies.append(0)
                features["PE_SECTIONS.MAXENTROPY"] = max(entropies)
                features["PE_SECTIONS.MINENTROPY"] = min(entropies)
                features["PE_SECTIONS.MEANENTROPY"] = sum(entropies) / len(entropies)
                # TODO: Compute the resource max and min entropy
                if (pe.OPTIONAL_HEADER != None):
                    for directory in pe.OPTIONAL_HEADER.DATA_DIRECTORY:
                        features["DATA_DIRECTORY."+str(directory.name)] = [1 if ((directory.VirtualAddress != 0) and (directory.Size != 0)) else 0]
                features["VS_VERSIONINFO.Length"] = [pe.VS_VERSIONINFO[0].Length if (pe.VS_VERSIONINFO != None) else 0]
                feature_df = feature_df.concat(features, ignore_index=True)
            except Exception as e:
                print(e)
            feature_df.filna(0)


            # Collecting all byte bi-gram features in current sample
            new_bi_grams = set()
            try:
                with open(root+file, "rb") as f:
                    byte_bi_gram_features = byte_bi_gram_features.append({"SAMPLE": (root+file)}, ignore_index=True)
                    cur_byte = f.read(1)
                    prev_byte = None

                    # While not the end of file
                    while (cur_byte != b""):
                        # Creating the bi-gram if enough history exists
                        if prev_byte != None:
                            bi_gram = prev_byte.hex() + " " + cur_byte.hex()
                            if not bi_gram in byte_bi_gram_features.columns:
                                new_bi_grams.add(bi_gram)
                            else:
                                byte_bi_gram_features.loc[byte_bi_gram_features["SAMPLE"]==(root+file) , [bi_gram]] = 1

                        # Moving the sliding window
                        prev_byte = cur_byte
                        cur_byte = f.read(1)
            except Exception as e:
                print(e)
            
            # One-hot-encoding every sample with the combination of all encountered features
            new_feature_array = []
            for index, row in byte_bi_gram_features.iterrows():
                if row["SAMPLE"] == (root+file):
                    new_feature_array.append([1 for x in new_bi_grams])
                else:
                    new_feature_array.append([0 for x in new_bi_grams])
            byte_bi_gram_features = pd.concat([byte_bi_gram_features, pd.DataFrame(new_feature_array, columns=new_bi_grams)], axis=1)


            # Collecting all opcode bi-gram and tri-gram features in current sample
            # new_bi_grams = set()
            # new_tri_grams = set()
            # try:
            #     with open(root+file, "rb") as f: # TODO: Convert this to open file in ASM
            #         opcode_bi_gram_features = opcode_bi_gram_features.append({"SAMPLE": (root+file)}, ignore_index=True)
            #         opcode_tri_gram_features = opcode_tri_gram_features.append({"SAMPLE": (root+file)}, ignore_index=True)
            #         cur_opcode = f.read(1) # TODO: Convert this to find next opcode
            #         prev_opcode1 = None
            #         prev_opcode2 = None
                    
            #         # While not the end of file
            #         while (cur_opcode != b""):
            #             # Creating the bi-gram if enough history exists
            #             if prev_opcode1 != None:
            #                 bi_gram = prev_opcode1.hex() + " " + cur_opcode.hex()
            #                 if not bi_gram in opcode_bi_gram_features.columns:
            #                     new_bi_grams.add(bi_gram)
            #                 else:
            #                     opcode_bi_gram_features.loc[opcode_bi_gram_features["SAMPLE"]==(root+file), [bi_gram]] = 1

            #             # Creating the tri-gram if enough history exists
            #             if prev_opcode2 != None:
            #                 tri_gram = prev_opcode2.hex() + " " + prev_opcode1.hex() + " " + cur_opcode.hex()
            #                 if not tri_gram in opcode_tri_gram_features.columns:
            #                     new_tri_grams.add(tri_gram)
            #                 else:
            #                     opcode_tri_gram_features.loc[opcode_tri_gram_features["SAMPLE"]==(root+file), [tri_gram]] = 1

            #             # Moving the sliding window
            #             prev_opcode2 = prev_opcode1
            #             prev_opcode1 = cur_opcode
            #             cur_opcode = f.read(1) # TODO: Convert this to find next opcode
            # except Exception as e:
            #     print(e)

            # # One-hot-encoding every sample with the combination of all encountered features
            # new_feature_array = []
            # for index, row in opcode_bi_gram_features.iterrows():
            #     if row["SAMPLE"] == (root+file):
            #         new_feature_array.append([1 for x in new_bi_grams])
            #     else:
            #         new_feature_array.append([0 for x in new_bi_grams])
            # opcode_bi_gram_features = pd.concat([opcode_bi_gram_features, pd.DataFrame(new_feature_array, columns=new_bi_grams)], axis=1)
            # new_feature_array = []
            # for index, row in opcode_tri_gram_features.iterrows():
            #     if row["SAMPLE"] == (root+file):
            #         new_feature_array.append([1 for x in new_tri_grams])
            #     else:
            #         new_feature_array.append([0 for x in new_tri_grams])
            # opcode_tri_gram_features = pd.concat([opcode_tri_gram_features, pd.DataFrame(new_feature_array, columns=new_tri_grams)], axis=1)

    # Selecting top 200 byte bi-gram features
    selector.fit(byte_bi_gram_features.iloc[:, 1:].to_numpy(), list(feature_df["CLASSIFICATION"])) # TODO: Debug this
    selected_byte_features = byte_bi_gram_features.columns[selector.get_support()]

    # Selecting top 100 opcode bi-gram features
    # selector.fit(opcode_bi_gram_features.iloc[:, 1:], list(feature_df["CLASSIFICATION"])) # TODO: Copy the debugged version above
    # selected_opcode_features_1 = opcode_bi_gram_features.columns[selector.get_support()]

    # Selecting top 100 opcode tri-gram features
    # selector.fit(opcode_tri_gram_features.iloc[:, 1:], list(feature_df["CLASSIFICATION"])) # TODO: Copy the debugged version above
    # selected_opcode_features_2 = opcode_tri_gram_features.columns[selector.get_support()]

    # Creating final dataset with full feature matrix
    #sample_df = pd.concat([feature_df, byte_bi_gram_features.loc[:, selected_byte_features], opcode_bi_gram_features.loc[:, selected_opcode_features_1], opcode_tri_gram_features.loc[:, selected_opcode_features_2]])
    sample_df = pd.concat([feature_df, byte_bi_gram_features.loc[:, selected_byte_features]])

    return sample_df

In [1]:
def evaluate_model(model, x_test, y_test):
    y_pred = model.predict(x_test)
    print('Classification Report')
    print(classification_report(y_test, y_pred))
    print('Confusion Matrix')
    confused = confusion_matrix(y_test, y_pred)
    f = plt.figure(figsize=(15,15))
    ax = f.add_subplot()
    sns.heatmap(confused, annot=True, fmt='g', ax=ax)
    ax.set_xlabel('Predicted Labels')
    ax.set_ylabel('True Labels')
    ax.set_title('Confusion Matrix')
    ax.xaxis.set_ticklabels(["Malicious", "Benign"])
    ax.yaxis.set_ticklabels(["Malicious", "Benign"])
    plt.show()
    print('True Negative: ' + str(confused[0][0]))
    print('True Positive: ' + str(confused[1][1]))
    print('False Negative: ' + str(confused[0][1]))
    print('False Positive: ' + str(confused[1][0]))

In [11]:
# CREATING THE TRAINING DATASET
train = create_feature_vectors(train_dir)
x_train = train.loc[:, train.columns != "CLASSIFICATION"]
y_train = train["CLASSIFICATION"]

KeyError: 'CLASSIFICATION'

In [None]:
# CREATING THE TESTING DATASET
test = create_feature_vectors(test_dir)
x_test = test.loc[:, test.columns != "CLASSIFICATION"]
y_test = test["CLASSIFICATION"]

In [None]:
# CREATING AND TRAINING THE RFC CLASSIFIER
model = RandomForestClassifier(n_estimators=1000).fit(x_train, y_train)
evaluate_model(model, x_test, y_test)

In [None]:
# SAVING THE TRAINED MODEL
pickle.dump(model, open(model_file, 'wb'))