In [47]:
import json
import argparse
import os
import pandas as pd
from tqdm import tqdm
import dask.dataframe as dd

"""
parser = argparse.ArgumentParser(
    description="Opens JSON files of Mach-O data and aggregates into a single pandas dataframe, saving as a CSV."
)
parser.add_argument(
    "--path",
    default="./json_data",
    type=str,
    help="Where to look for parsed JSON files.",
)
parser.add_argument(
    "--outdir",
    default="/tmp/json_data",
    type=str,
    help="Where to output the CSV.",
)
parser.add_argument(
    "--outfile",
    default="macho_feature_vector.csv",
    type=str,
    help="What to call the CSV file.",
)
args = parser.parse_args()
"""

def load_json(filename: str) -> str:
    with open(filename, "r") as f:
        data = json.loads(f.read())
    return data


def get_files(path: str) -> list:
    filelist = []
    for root, dirs, files in os.walk(path):
        for filename in files:
            fullpath = os.path.join(root, filename)
            filelist.append(fullpath)
    return filelist


def parse_segment(load_command: object):
    segment = {}
    lname = load_command["name"]
    segment["name"] = lname
    segment[f"segment_{lname}_vmsize"] = load_command["vmsize"]
    segment[f"segment_{lname}_size"] = load_command["size"]
    segment[f"segment_{lname}_initprot"] = load_command["initprot"]
    segment[f"segment_{lname}_maxprot"] = load_command["maxprot"]
    segment[f"segment_{lname}_nsects"] = load_command["nsects"]
    segment[f"segment_{lname}_entropy"] = load_command["entropy"]
    for sect in load_command["sects"]:
        sectname = sect["name"]
        segment[f"segment_{lname}_{sectname}"] = sect
    return segment


def parse_loaddylib(load_command: object, mach: object):
    dylib = {}
    dname = load_command["name"]
    dylib["name"] = dname
    if "imports" in mach["macho"].keys():
        impcount = 0
        for imp in mach["macho"]["imports"]:
            if imp[1] == dname:
                impcount += 1
        dylib[f"dylib_{dname}_imports"] =  impcount

    #dylib[f"dylib_{dname}_cmdsize"] = load_command["cmd_size"]
    #dylib[f"dylib_{dname}_version"] = load_command["current_version"]
    #dylib[f"dylib_{dname}_timestamp"] = load_command["timestamp"]
    return dylib


def parse_json(data: object, filename: str):
    mach = {}
    mach["name"] = data["name"]
    
    mach["size"] = data["size"]
    mach["entropy"] = data["entropy"]
    mach["nlcs"] = data["macho"]["nlcs"]
    mach["slcs"] = data["macho"]["slcs"]
 
    for flag in data["macho"]["flags"]:
        fname = f"flag_{flag}"
        mach[fname] = 1
    if "packed" in filename:
        mach["packed"] = 1
    else:
        mach["packed"] = 0
    if data["malware"] == 1:
        mach["malware"] = 1
    else:
        mach["malware"] = 0
    
    for load_command in data["macho"]["lcs"]:
        lc_type = load_command["cmd"]
        if lc_type == "SEGMENT" or lc_type == "SEGMENT_64":
            segment = parse_segment(load_command)
            sname = segment["name"]
            mach[f"{sname}"] = 1
            for k,v in segment.items():
                mach[f"{k}"] = v
        if lc_type == "LOAD_DYLIB":
            dylib = parse_loaddylib(load_command, data)
            dname = dylib["name"]
            mach[f"{dname}"] = 1
            for k,v in dylib.items():
                mach[f"{k}"] = v
    
    return mach

In [48]:
def call_parse(files, malware=0):
    global machos 
    global max_len 
    global keys 

    for file in files:
        toparse = []
        with open(file, "r") as f:
            jsondata = json.loads(f.read())
            if "universal" in jsondata.keys():
                for arch in jsondata["universal"]["machos"]:
                    jsondata["macho"] = arch
                    jsondata['malware'] = malware
                    #print(jsondata)
                    toparse.append(jsondata)
            else:
                jsondata['malware'] = malware
                toparse.append(jsondata)
        for macho in toparse:
            mach = parse_json(macho, file)
            if not mach:
                print(f"failed {file}")
                continue
            machos.append(mach)
            [keys.append(x) for x in mach.keys()]
            cur_len = len(mach)
            if cur_len > max_len:
                max_len = cur_len

machos = []
max_len = 0
keys = []

files = get_files("json_benign")
call_parse(files)
# Repeat, but with malware. Mark malware in JSON
files = get_files("json_malware")
call_parse(files, malware=1)


In [None]:
df = pd.DataFrame(columns=set(keys))
for col in df.columns:
    if "imports" in col:
        df[col] = df[col].astype('object')

count=0
for mach in tqdm(machos, bar_format="{l_bar}{bar}"):
    """
    df.loc[count] = 0  # Initializes all values for the ID to zero.
    df.loc[count, mach] = 1  # Sets relevant features to a value of one.
    df.loc[count]['name'] = mach['name']
    df.loc[count]['size'] = mach['size'] # update non-binary columns
    df.loc[count]['entropy'] = mach['entropy']
    df.loc[count]['nlcs'] = mach['nlcs']
    df.loc[count]['slcs'] = mach['slcs']
    """
    for col in df.columns:
        if col in mach.keys():
            df.at[count, col] = mach[col]
        else:
            if "segment" in col:
                df.at[count, col] = 0
            elif "imports" in col:
                df.at[count, col] = ['none']
            else:
                df.at[count, col] = 0
    count+=1
df = df.convert_dtypes()

 77%|███████▋  

In [24]:
df.to_csv('macho.csv')

In [28]:
df = pd.read_csv('macho.csv', low_memory=False)

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3664 entries, 0 to 3663
Columns: 2831 entries, Unnamed: 0 to dylib_@rpath/RxSwift.framework/Versions/A/RxSwift_imports
dtypes: float64(12), int64(1326), object(1493)
memory usage: 79.1+ MB


In [23]:
for col in df.columns:
    if "imports" in col:
        df[col] = df[col].apply(lambda x: len(x))

In [29]:
df['dylib_@rpath/libssl.1.0.0.dylib_imports'].unique()

array(['[]',
       "['__ZNKSt3__112basic_stringIcNS_11char_traitsIcEENS_9allocatorIcEEE4findEcm', '__ZNKSt3__112basic_stringIcNS_11char_traitsIcEENS_9allocatorIcEEE7compareEmmPKcm', '__ZNKSt3__120__vector_base_commonILb1EE20__throw_length_errorEv', '__ZNKSt3__121__basic_string_commonILb1EE20__throw_length_errorEv', '__ZNKSt3__16locale9use_facetERNS0_2idE', '__ZNKSt3__18ios_base6getlocEv', '__ZNSt11logic_errorC2EPKc', '__ZNSt12length_errorD1Ev', '__ZNSt3__112basic_stringIcNS_11char_traitsIcEENS_9allocatorIcEEE5eraseEmm', '__ZNSt3__112basic_stringIcNS_11char_traitsIcEENS_9allocatorIcEEE6appendEPKc', '__ZNSt3__112basic_stringIcNS_11char_traitsIcEENS_9allocatorIcEEE6appendEPKcm', '__ZNSt3__112basic_stringIcNS_11char_traitsIcEENS_9allocatorIcEEE6assignEPKc', '__ZNSt3__112basic_stringIcNS_11char_traitsIcEENS_9allocatorIcEEE6resizeEmc', '__ZNSt3__112basic_stringIcNS_11char_traitsIcEENS_9allocatorIcEEE7replaceEmmPKcm', '__ZNSt3__112basic_stringIcNS_11char_traitsIcEENS_9allocatorIcEEE7reserveE

In [44]:
df['dylib_@rpath/libssl.1.0.0.dylib_imports'].convert_dtypes()

0       []
1       []
2       []
3       []
4       []
        ..
3659    []
3660    []
3661    []
3662    []
3663    []
Name: dylib_@rpath/libssl.1.0.0.dylib_imports, Length: 3664, dtype: string

In [46]:
type(df['dylib_@rpath/libssl.1.0.0.dylib_imports'][0])

str

# Feature Selection
SelectKBest

In [None]:
from sklearn.feature_selection import SelectKBest, mutual_info_regression
#Select top 2 features based on mutual info regression
X = ddf.drop(['packed','name','malware'], axis=1)
y = ddf['packed']
selector = SelectKBest(mutual_info_regression, k=200)
selector.fit(X, y)
X.columns[selector.get_support()]

In [1]:
X.columns[selector.get_support()]

NameError: name 'X' is not defined

In [2]:
df

NameError: name 'df' is not defined

In [30]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


In [28]:
X = df.drop(['packed','name','malware'], axis=1)
y = df['packed']

In [34]:
scaler = StandardScaler()
X_scale = scaler.fit_transform(X)

ValueError: setting an array element with a sequence.

In [39]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3664 entries, 0 to 3663
Columns: 2830 entries, @rpath/CocoaLumberjackSwift.framework/Versions/A/CocoaLumberjackSwift to dylib_@rpath/RxSwift.framework/Versions/A/RxSwift_imports
dtypes: Float64(12), Int64(1325), object(1488), string(5)
memory usage: 83.9+ MB


# SVM with forward selection

In [62]:
from sklearn.svm import SVC
from sklearn.feature_selection import SequentialFeatureSelector

In [None]:
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder()
enc.fit(df.select_dtypes(include=['object']).toarray())

In [None]:
svm = SVC()
X = df.drop(['packed','name','malware'], axis=1)
y = df['packed']
sfs = SequentialFeatureSelector(svm, n_features_to_select=50)
sfs.fit(X, y)
sfs.get_support()

In [76]:
df['dylib_@rpath/RxSwift.framework/Versions/A/RxSwift_imports'].values

array([list(['none']), list(['none']), list(['none']), ...,
       list(['none']), list(['none']), list(['none'])], dtype=object)

In [79]:
dfl = df.select_dtypes(include=['object'])

In [102]:
enc.fit([dfl[dfl.columns[100]], dfl[dfl.columns[200]]])

TypeError: Encoders require their input to be uniformly strings or numbers. Got ['list']

In [106]:
set(dfl[dfl.columns[150]])

TypeError: unhashable type: 'list'

In [127]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
X_mlb = mlb.fit_transform([dfl])

In [128]:
mlb.classes_

array(['dylib_/Library/Frameworks/GLib.framework/Libraries/libglib-2.0.0.dylib_imports',
       'dylib_/Library/Frameworks/GLib.framework/Libraries/libgthread-2.0.0.dylib_imports',
       'dylib_/System/Library/Frameworks/AGL.framework/Versions/A/AGL_imports',
       ..., 'segment___TEXT_qtmetadata', 'segment___TEXT_text_env',
       'segment___TEXT_upxTEXT'], dtype=object)

In [129]:
svm = SVC()
y = df['packed']
sfs = SequentialFeatureSelector(svm, n_features_to_select=50)
sfs.fit(X_mlb, y)
sfs.get_support()

ValueError: Found input variables with inconsistent numbers of samples: [1, 3664]