In [3]:
import json
import argparse
import os
import pandas as pd

"""
parser = argparse.ArgumentParser(
    description="Opens JSON files of Mach-O data and aggregates into a single pandas dataframe, saving as a CSV."
)
parser.add_argument(
    "--path",
    default="./json_data",
    type=str,
    help="Where to look for parsed JSON files.",
)
parser.add_argument(
    "--outdir",
    default="/tmp/json_data",
    type=str,
    help="Where to output the CSV.",
)
parser.add_argument(
    "--outfile",
    default="macho_feature_vector.csv",
    type=str,
    help="What to call the CSV file.",
)
args = parser.parse_args()
"""

def load_json(filename: str) -> object:
    """Read ``filename`` and return its decoded JSON content.

    Args:
        filename: Path of the JSON file to read.

    Returns:
        The value produced by ``json.load`` for the document (a ``dict`` for a
        JSON object, a ``list`` for a JSON array, and so on) -- not a ``str``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default text encoding.
    with open(filename, "r", encoding="utf-8") as f:
        return json.load(f)


def get_files(path: str) -> list:
    """Return the path of every file found anywhere beneath ``path``.

    The tree is traversed with ``os.walk``; each returned entry is the
    directory being visited joined with the file name, in the order the walk
    yields them. Directories themselves are not included.
    """
    return [
        os.path.join(directory, entry)
        for directory, _subdirs, entries in os.walk(path)
        for entry in entries
    ]


def parse_segment(load_command: object):
    """Flatten one segment load command into a flat feature dict.

    The result holds the segment name under ``"name"``, one
    ``segment_<name>_<field>`` entry for each scalar field copied from the
    load command, and one ``segment_<name>_<section name>`` entry per element
    of ``load_command["sects"]`` whose value is that section record itself.
    """
    seg_name = load_command["name"]
    prefix = f"segment_{seg_name}"

    features = {"name": seg_name}
    for field in ("vmsize", "size", "initprot", "maxprot", "nsects", "entropy"):
        features[f"{prefix}_{field}"] = load_command[field]

    # Sections are keyed by their own name; the whole section record is kept.
    features.update(
        (f"{prefix}_{section['name']}", section) for section in load_command["sects"]
    )
    return features


def parse_loaddylib(load_command: object, mach: object):
    """Flatten one dylib load command into a flat feature dict.

    The result holds the library name under ``"name"``, a
    ``dylib_<name>_<symbol>`` flag set to 1 for every entry of
    ``mach["macho"]["imports"]`` whose second element equals the library name
    (the first element is used as the symbol), and the command's
    ``cmd_size``, ``current_version`` and ``timestamp`` values.
    """
    lib_name = load_command["name"]
    prefix = f"dylib_{lib_name}"
    features = {"name": lib_name}

    macho = mach["macho"]
    if "imports" in macho:
        # Keep only the imports attributed to this library.
        features.update(
            (f"{prefix}_{entry[0]}", 1)
            for entry in macho["imports"]
            if entry[1] == lib_name
        )

    features[f"{prefix}_cmdsize"] = load_command["cmd_size"]
    features[f"{prefix}_version"] = load_command["current_version"]
    features[f"{prefix}_timestamp"] = load_command["timestamp"]
    return features


def parse_json(data: object, filename: str):
    """Flatten one parsed Mach-O JSON record into a single feature dict.

    Args:
        data: Decoded JSON record. It must provide ``name``, ``size``,
            ``entropy``, ``malware`` and a ``macho`` mapping with ``nlcs``,
            ``slcs``, ``flags`` and ``lcs``.
        filename: Path the record was read from. The sample is marked as
            packed when the substring ``"packed"`` occurs anywhere in it.

    Returns:
        A dict with the top-level values, one ``flag_<flag>`` entry per header
        flag, the ``packed`` / ``malware`` labels (0 or 1), and the features
        produced by ``parse_segment`` / ``parse_loaddylib`` for each
        ``SEGMENT`` / ``SEGMENT_64`` / ``LOAD_DYLIB`` load command. Other load
        command types are ignored.

    Raises:
        KeyError: If a required key is missing from ``data``.
    """
    macho = data["macho"]
    mach = {
        "name": data["name"],
        "size": data["size"],
        "entropy": data["entropy"],
        "nlcs": macho["nlcs"],
        "slcs": macho["slcs"],
    }

    for flag in macho["flags"]:
        mach[f"flag_{flag}"] = 1

    mach["packed"] = 1 if "packed" in filename else 0
    mach["malware"] = 1 if data["malware"] == 1 else 0

    for load_command in macho["lcs"]:
        lc_type = load_command["cmd"]
        if lc_type in ("SEGMENT", "SEGMENT_64"):
            prefix, parsed = "segment", parse_segment(load_command)
        elif lc_type == "LOAD_DYLIB":
            prefix, parsed = "dylib", parse_loaddylib(load_command, data)
        else:
            continue

        # Presence flag for the segment / library itself.
        mach[f"{prefix}_{parsed['name']}"] = 1
        for key, value in parsed.items():
            # The helper's "name" entry is the segment / library name; copying
            # it would overwrite the sample's own "name" set above.
            if key != "name":
                mach[key] = value
    return mach

In [4]:
def call_parse(files, malware=0):
    """Parse every JSON file in ``files`` and accumulate the results globally.

    Each file is decoded and passed to ``parse_json``. A file with a
    ``"universal"`` key yields one record per entry of
    ``jsondata["universal"]["machos"]``; any other file yields one record.

    Args:
        files: Iterable of paths to JSON files.
        malware: Label stored under ``"malware"`` in every record before it is
            parsed (default 0).

    Side effects:
        Appends each parsed dict to the module-level ``machos`` list, extends
        the module-level ``keys`` list with its keys, and raises the
        module-level ``max_len`` to the largest number of keys seen.
    """
    global machos
    global max_len
    global keys

    for file in files:
        with open(file, "r", encoding="utf-8") as f:
            jsondata = json.load(f)

        if "universal" in jsondata:
            # Build an independent record per architecture. Re-using and
            # mutating one dict would leave every queued record pointing at
            # the last architecture once the loop finishes.
            toparse = [
                {**jsondata, "macho": arch, "malware": malware}
                for arch in jsondata["universal"]["machos"]
            ]
        else:
            toparse = [{**jsondata, "malware": malware}]

        for macho in toparse:
            mach = parse_json(macho, file)
            if not mach:
                print(f"failed {file}")
                continue
            machos.append(mach)
            keys.extend(mach)  # iterating a dict yields its keys
            max_len = max(max_len, len(mach))
        
        


# Accumulators that call_parse() fills in through its `global` declarations.
machos, keys = [], []
max_len = 0

# Benign samples first, labelled malware=0.
files = get_files("json_benign")
call_parse(files, malware=0)

# Then the malware samples, labelled malware=1. `files` is left holding the
# malware listing afterwards.
files = get_files("json_malware")
call_parse(files, malware=1)


In [None]:
# Build the feature matrix: one row per parsed sample, one column per feature
# name seen in any sample.
#
# Columns listed here keep the value stored in the parsed record. Every other
# feature is encoded only as present (1) / absent (0). `packed` and `malware`
# are included because they are set on every record, so treating them as
# presence flags would make both labels 1 on every row.
VALUE_COLUMNS = ("name", "size", "entropy", "nlcs", "slcs", "packed", "malware")

# Sorted so the column order is the same on every run (set iteration order is not).
feature_columns = sorted(set(keys))

rows = []
for mach in machos:
    row = dict.fromkeys(feature_columns, 0)  # features absent from this sample
    row.update(dict.fromkeys(mach, 1))       # features present in this sample
    for value_column in VALUE_COLUMNS:
        row[value_column] = mach[value_column]
    rows.append(row)

# Construct the frame once from the finished records. This avoids growing it
# row by row, avoids chained `df.loc[i][col] = ...` assignment (which may write
# to a temporary), and avoids indexing `.loc` with a dict.
df = pd.DataFrame(rows, columns=feature_columns)
count = len(df)  # kept in case a later cell reads the row count

In [None]:
# Give object-typed columns a concrete dtype where their contents allow it.
# Casting each column to `describe().dtype` does not achieve this: describing
# an object column yields an object summary (so the cast is a no-op), and
# describing a numeric column yields floats (so integer columns become float).
# `infer_objects` converts object columns that hold only numbers and leaves
# string columns and already-typed columns unchanged.
df = df.infer_objects()

In [None]:
# How many rows share each sample name.
df["name"].value_counts()

# Feature Selection
Rank the features against the `packed` label with scikit-learn's `SelectKBest`.

In [None]:
# `mutual_info_regression` stays imported in case a later cell uses it.
from sklearn.feature_selection import (
    SelectKBest,
    mutual_info_classif,
    mutual_info_regression,
)

# Keep (at most) this many of the features that share the most mutual
# information with the `packed` label.
N_SELECTED_FEATURES = 200

X = df.drop(['packed', 'name', 'malware'], axis=1)
# `packed` is a 0/1 class label; make sure it is an integer column.
y = df['packed'].astype(int)


def packed_mutual_info(features, target):
    """Score features against a class label with a fixed seed.

    The target is a discrete label, so the classification estimator is used
    rather than the regression one. The seed makes the ranking repeatable
    between runs.
    """
    return mutual_info_classif(features, target, random_state=0)


# Cap k at the number of available columns so a small feature set does not
# ask the selector for more features than exist.
selector = SelectKBest(packed_mutual_info, k=min(N_SELECTED_FEATURES, X.shape[1]))
selector.fit(X, y)
X.columns[selector.get_support()]

In [None]:
# Print each selected feature name on its own line.
selected_features = X.columns[selector.get_support()]
for feature in selected_features:
    print(feature)

In [None]:
# List every feature column whose name contains "upx".
upx_columns = [label for label in df.columns if "upx" in label]
for label in upx_columns:
    print(label)

In [None]:
# Rows where the `segment_upxTEXT` feature is set, showing just that column.
df.loc[df["segment_upxTEXT"] == 1, "segment_upxTEXT"]

In [None]:
# Hand-written sample of a flattened `__TEXT` segment record, used for
# scratch inspection in the next cell.
test = dict(
    [
        ('name', '__TEXT'),
        ('segment___TEXT_vmsize', 167936),
        ('segment___TEXT_size', 632),
        ('segment___TEXT_initprot', 'r-x'),
        ('segment___TEXT_maxprot', 'r-x'),
        ('segment___TEXT_nsects', 7),
        ('segment___TEXT_entropy', 0.7824717353762201),
    ]
)

In [None]:
# Print only the values of the sample record, one per line.
for value in test.values():
    print(value)

In [None]:
# Display the `df` DataFrame as this cell's output.
df

In [None]:
# When the cells are run in order, `files` was last assigned from
# get_files("json_malware"), so this counts the malware JSON files only.
len(files)