In [1]:
import json
import argparse
import os
import itertools
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from datetime import datetime
from sklearn.compose import make_column_transformer
from sklearn.decomposition import PCA
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.feature_selection import SelectKBest, SelectFromModel, mutual_info_regression, chi2, f_classif, mutual_info_classif, SequentialFeatureSelector
from sklearn.impute import SimpleImputer
from sklearn.metrics import f1_score, make_scorer, precision_score, recall_score, accuracy_score, confusion_matrix, ConfusionMatrixDisplay, PrecisionRecallDisplay, precision_recall_curve, roc_curve, auc
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.svm import SVC

In [2]:
def load_json(filename: str) -> str:
    with open(filename, "r") as f:
        data = json.loads(f.read())
    return data


def get_files(path: str) -> list:
    filelist = []
    for root, dirs, files in os.walk(path):
        for filename in files:
            fullpath = os.path.join(root, filename)
            filelist.append(fullpath)
    return filelist


def parse_segment(load_command: object):
    segment = {}
    lname = load_command["name"]
    segment["name"] = lname
    segment[f"segment.{lname}.vmsize"] = load_command["vmsize"]
    segment[f"segment.{lname}.size"] = load_command["size"]
    segment[f"segment.{lname}.initprot"] = load_command["initprot"]
    segment[f"segment.{lname}.maxprot"] = load_command["maxprot"]
    segment[f"segment.{lname}.nsects"] = load_command["nsects"]
    segment[f"segment.{lname}.entropy"] = load_command["entropy"]
    for sect in load_command["sects"]:
        sectname = sect["name"]
        segment[f"segment.{lname}.{sectname}"] = sect
    return segment


def parse_loaddylib(load_command: object, mach: object):
    dylib = {}
    dname = load_command["name"]
    dylib["name"] = dname
    if "imports" in mach["macho"].keys():
        impcount = 0
        for imp in mach["macho"]["imports"]:
            if imp[1] == dname:
                impcount += 1
        dylib[f"dylib.{dname}.imports"] =  impcount
    return dylib


def parse_json(data: object, filename: str):
    mach = {}
    mach["name"] = data["name"]
    mach["size"] = data["size"]
    mach["entropy"] = data["entropy"]
    mach["nlcs"] = data["macho"]["nlcs"]
    mach["slcs"] = data["macho"]["slcs"]
 
    for flag in data["macho"]["flags"]:
        fname = f"flag_{flag}"
        mach[fname] = 1
    if "packed" in filename:
        mach["packed"] = 1
    else:
        mach["packed"] = 0
    if data["malware"] == 1:
        mach["malware"] = 1
    else:
        mach["malware"] = 0
    mach['packed_malware'] = mach["packed"] & mach["malware"]
    try:
        mach['family'] = data['vtscan']['McAfee']['result']
    except:
        try:
            mach['family'] = data['vtscan']['Kaspersky']['result']
        except:
            mach['family'] = "No family"

    num_segments = 0
    num_imports = 0
    for load_command in data["macho"]["lcs"]:
        lc_type = load_command["cmd"]
        if lc_type == "SEGMENT" or lc_type == "SEGMENT_64":
            num_segments += 1
            segment = parse_segment(load_command)
            sname = segment["name"]
            mach[f"{sname}"] = 1
            for k,v in segment.items():
                mach[f"{k}"] = v
        if lc_type == "LOAD_DYLIB":
            num_imports += 1
            dylib = parse_loaddylib(load_command, data)
            dname = dylib["name"]
            mach[f"{dname}"] = 1
            for k,v in dylib.items():
                mach[f"{k}"] = v
    mach["num_segments"] = num_segments
    mach["num_imports"] = num_imports
    return mach

In [3]:
def call_parse(files, malware=0):
    global machos 
    global max_len 
    global keys 

    for file in files:
        toparse = []
        with open(file, "r") as f:
            jsondata = json.loads(f.read())
            if "universal" in jsondata.keys():
                for arch in jsondata["universal"]["machos"]:
                    jsondata["macho"] = arch
                    jsondata['malware'] = malware
                    #print(jsondata)
                    toparse.append(jsondata)
            else:
                jsondata['malware'] = malware
                toparse.append(jsondata)
        for macho in toparse:
            mach = parse_json(macho, file)
            if not mach:
                print(f"failed {file}")
                continue
            machos.append(mach)
            [keys.append(x) for x in mach.keys()]
            cur_len = len(mach)
            if cur_len > max_len:
                max_len = cur_len

machos = []
max_len = 0
keys = []

files = get_files("json_benign_vt")
call_parse(files)
# Repeat, but with malware. Mark malware in JSON
files = get_files("json_malware_vt")
call_parse(files, malware=1)

In [4]:
df = pd.DataFrame(columns=set(keys))
for col in df.columns:
    if "imports" in col:
        df[col] = df[col].astype('object')

count=0
for mach in tqdm(machos, bar_format="{l_bar}{bar}"):
    """
    df.loc[count] = 0  # Initializes all values for the ID to zero.
    df.loc[count, mach] = 1  # Sets relevant features to a value of one.
    df.loc[count]['name'] = mach['name']
    df.loc[count]['size'] = mach['size'] # update non-binary columns
    df.loc[count]['entropy'] = mach['entropy']
    df.loc[count]['nlcs'] = mach['nlcs']
    df.loc[count]['slcs'] = mach['slcs']
    """
    for col in df.columns:
        if col in mach.keys():
            if isinstance(mach[col], dict):
                df.at[count, col] = 1
            else:
                df.at[count, col] = mach[col]
        else:
            if "imports" in col:
                df.at[count, col] = -1

    count+=1
df = df.convert_dtypes()

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████


In [5]:
df

Unnamed: 0,@rpath/ProGL.framework/Versions/A/ProGL,@rpath/MicrosoftConversionLibrary.framework/Versions/A/MicrosoftConversionLibrary,dylib.@rpath/LOZIPFileWrapper.framework/Versions/A/LOZIPFileWrapper.imports,dylib.@executable_path/../MonoBundle/libUsbDevAccs.dylib.imports,dylib.@executable_path/../Frameworks/YukiDesktop.framework/Versions/A/YukiDesktop.imports,dylib.@executable_path/libnss3.dylib.imports,dylib./usr/lib/libDiagnosticMessagesClient.dylib.imports,@rpath/PermissionsKit.framework/Versions/A/PermissionsKit,@rpath/OpenEmuXPCCommunicator.framework/Versions/A/OpenEmuXPCCommunicator,@rpath/UALAppCore.framework/UALAppCore,...,segment.__DATA_CONST.__auth_ptr,@executable_path/../MonoBundle/libUsbDevAccs.dylib,dylib.@rpath/MSP2P.framework/Versions/A/MSP2P.imports,dylib.@rpath/libswiftCoreMedia.dylib.imports,dylib.@rpath/Runes.framework/Versions/A/Runes.imports,@rpath/AFAnalyticsKit.framework/Versions/A/AFAnalyticsKit,segment.__DATA__.nsects,dylib.@executable_path/libtesseract.4.dylib.imports,dylib.@rpath/BAMCheckout.framework/BAMCheckout.imports,dylib.@executable_path/../Frameworks/libicucore.A.dylib.imports
0,,,-1,-1,-1,-1,-1,,,,...,,,-1,-1,-1,,,-1,-1,-1
1,,,-1,-1,-1,-1,-1,,,,...,,,-1,-1,-1,,,-1,-1,-1
2,,,-1,-1,-1,-1,-1,,,,...,,,-1,-1,-1,,,-1,-1,-1
3,,,-1,-1,-1,-1,-1,,,,...,,,-1,-1,-1,,,-1,-1,-1
4,,,-1,-1,-1,-1,-1,,,,...,,,-1,-1,-1,,,-1,-1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3504,,,-1,-1,-1,-1,-1,,,,...,,,-1,-1,-1,,,-1,-1,-1
3505,,,-1,-1,-1,-1,-1,,,,...,,,-1,-1,-1,,,-1,-1,-1
3506,,,-1,-1,-1,-1,-1,,,,...,,,-1,-1,-1,,,-1,-1,-1
3507,,,-1,-1,-1,-1,-1,,,,...,,,-1,-1,-1,,,-1,-1,-1


In [6]:
df.describe()

Unnamed: 0,@rpath/ProGL.framework/Versions/A/ProGL,@rpath/MicrosoftConversionLibrary.framework/Versions/A/MicrosoftConversionLibrary,dylib.@rpath/LOZIPFileWrapper.framework/Versions/A/LOZIPFileWrapper.imports,dylib.@executable_path/../MonoBundle/libUsbDevAccs.dylib.imports,dylib.@executable_path/../Frameworks/YukiDesktop.framework/Versions/A/YukiDesktop.imports,dylib.@executable_path/libnss3.dylib.imports,dylib./usr/lib/libDiagnosticMessagesClient.dylib.imports,@rpath/PermissionsKit.framework/Versions/A/PermissionsKit,@rpath/OpenEmuXPCCommunicator.framework/Versions/A/OpenEmuXPCCommunicator,@rpath/UALAppCore.framework/UALAppCore,...,segment.__DATA_CONST.__auth_ptr,@executable_path/../MonoBundle/libUsbDevAccs.dylib,dylib.@rpath/MSP2P.framework/Versions/A/MSP2P.imports,dylib.@rpath/libswiftCoreMedia.dylib.imports,dylib.@rpath/Runes.framework/Versions/A/Runes.imports,@rpath/AFAnalyticsKit.framework/Versions/A/AFAnalyticsKit,segment.__DATA__.nsects,dylib.@executable_path/libtesseract.4.dylib.imports,dylib.@rpath/BAMCheckout.framework/BAMCheckout.imports,dylib.@executable_path/../Frameworks/libicucore.A.dylib.imports
count,2.0,6.0,3509.0,3509.0,3509.0,3509.0,3509.0,6.0,1.0,2.0,...,2.0,1.0,3509.0,3509.0,3509.0,1.0,5.0,3509.0,3509.0,3509.0
mean,1.0,1.0,-0.99886,-0.99715,-0.99943,-0.955828,-0.9943,1.0,1.0,1.0,...,1.0,1.0,-0.986321,-0.999145,-0.999145,1.0,1.0,-0.972072,-0.955258,-0.999715
std,0.0,0.0,0.033748,0.168814,0.02387,2.616616,0.075291,0.0,,0.0,...,0.0,,0.516364,0.029231,0.050644,,0.0,1.169654,2.650379,0.016881
min,1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,1.0,1.0,...,1.0,1.0,-1.0,-1.0,-1.0,1.0,1.0,-1.0,-1.0,-1.0
25%,1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,1.0,1.0,...,1.0,1.0,-1.0,-1.0,-1.0,1.0,1.0,-1.0,-1.0,-1.0
50%,1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,1.0,1.0,...,1.0,1.0,-1.0,-1.0,-1.0,1.0,1.0,-1.0,-1.0,-1.0
75%,1.0,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,1.0,1.0,...,1.0,1.0,-1.0,-1.0,-1.0,1.0,1.0,-1.0,-1.0,-1.0
max,1.0,1.0,0.0,9.0,0.0,154.0,0.0,1.0,1.0,1.0,...,1.0,1.0,21.0,0.0,2.0,1.0,1.0,48.0,156.0,0.0


In [10]:
df['dylib.@rpath/Runes.framework/Versions/A/Runes.imports'].value_counts()

-1    3508
2        1
Name: dylib.@rpath/Runes.framework/Versions/A/Runes.imports, dtype: Int64