In [1]:
import os
from collections import Counter
try:
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET
import numpy as np
from scipy import sparse

import util

TRAIN_DIR = "train"

call_set = set([])

def add_to_set(tree):
    for el in tree.iter():
        call = el.tag
        call_set.add(call)

def create_data_matrix(start_index, end_index, direc="train"):
    X = None
    classes = []
    ids = [] 
    i = -1
    for datafile in os.listdir(direc):
        if datafile == '.DS_Store':
            continue

        i += 1
        if i < start_index:
            continue 
        if i >= end_index:
            break

        # extract id and true class (if available) from filename
        id_str, clazz = datafile.split('.')[:2]
        ids.append(id_str)
        # add target class if this is training data
        try:
            classes.append(util.malware_classes.index(clazz))

        except ValueError:
            # we should only fail to find the label in our list of malware classes
            # if this is test data, which always has an "X" label
            assert clazz == "X"
            classes.append(-1)

        # parse file as an xml document
        tree = ET.parse(os.path.join(direc,datafile))
        add_to_set(tree)
        this_row = call_feats(tree)
        if X is None:
            X = this_row 
        else:
            X = np.vstack((X, this_row))

    return X, np.array(classes), ids

def call_feats(tree):
    good_calls = ['sleep', 'dump_line']

    call_counter = {}
    for el in tree.iter():
        #print el
        call = el.tag
        if call not in call_counter:
            call_counter[call] = 0
        else:
            call_counter[call] += 1
    print call_counter
    call_feat_array = np.zeros(len(good_calls))
    for i in range(len(good_calls)):
        call = good_calls[i]
        call_feat_array[i] = 0
        if call in call_counter:
            call_feat_array[i] = call_counter[call]

    return call_feat_array

## Feature extraction
def main():
    X_train, t_train, train_ids = create_data_matrix(0, 5, TRAIN_DIR)
    X_valid, t_valid, valid_ids = create_data_matrix(10, 15, TRAIN_DIR)

    print 'Data matrix (training set):'
    print X_train
    print 'Classes (training set):'
    print t_train


In [4]:
def main():
    X_train, t_train, train_ids = create_data_matrix(0, 5, TRAIN_DIR)
    X_valid, t_valid, valid_ids = create_data_matrix(10, 15, TRAIN_DIR)

    print 'Data matrix (training set):'
    print X_train
    print 'Classes (training set):'
    print t_train


In [3]:
if __name__ == "__main__":
    main()
    

Data matrix (training set):
[[  2.00000000e+00   0.00000000e+00]
 [  2.53000000e+02   3.43300000e+03]
 [  0.00000000e+00   0.00000000e+00]
 [  0.00000000e+00   0.00000000e+00]
 [  3.00000000e+00   6.35000000e+02]]
Classes (training set):
[ 8  6 12  8 10]


In [22]:
features=[]

In [20]:
x=list(set([1,2,2,3]))
x.extend([4,5])
x

[1, 2, 3, 4, 5]

In [32]:
def call_feats(tree):
    global features
    good_calls = ['process','sleep', 'dump_line']

    call_counter = {}
    for el in tree.iter():
        call = el.tag
        if call not in call_counter:
            call_counter[call] = 0
        else:
            call_counter[call] += 1
    features.extend(call_counter.keys())
    features=list(set(features))
    call_feat_array = np.zeros(len(good_calls))
    for i in range(len(good_calls)):
        call = good_calls[i]
        call_feat_array[i] = 0
        if call in call_counter:
            call_feat_array[i] = call_counter[call]

    return call_feat_array

In [37]:
def call_feats(tree):
    global features
    good_calls = features

    call_counter = {}
    for el in tree.iter():
        call = el.tag
        if call not in call_counter:
            call_counter[call] = 0
        else:
            call_counter[call] += 1
    call_feat_array = np.zeros(len(good_calls))
    for i in range(len(good_calls)):
        call = good_calls[i]
        call_feat_array[i] = 0
        if call in call_counter:
            call_feat_array[i] = call_counter[call]

    return call_feat_array

In [3]:
def create_data_matrix(start_index, end_index, direc="test"):
    X = None
    classes = []
    ids = [] 
    i = -1
    for datafile in os.listdir(direc):
        if datafile == '.DS_Store':
            continue

        i += 1
        if i < start_index:
            continue 
        if i >= end_index:
            break

        # extract id and true class (if available) from filename
        id_str, clazz = datafile.split('.')[:2]

        # add target class if this is training data
        

        # parse file as an xml document
        tree = ET.parse(os.path.join(direc,datafile))
        add_to_set(tree)
        for sub_tree in tree.iter():
            this_row = call_feats(tree)
            if X is None:
                X = this_row 
            else:
                X = np.vstack((X, this_row))
            ids.append(id_str)
            try:
                classes.append(util.malware_classes.index(clazz))

            except ValueError:
                # we should only fail to find the label in our list of malware classes
                # if this is test data, which always has an "X" label
                assert clazz == "X"
                classes.append(-1)

    return X, np.array(classes), ids


In [2]:
def call_feats(el):
    global features
    global ts
    ts=el
    good_calls = ['process','sleep', 'dump_line']

    call_counter = {}
    call = el.tag
    if call not in call_counter:
        call_counter[call] = 0
    else:
        call_counter[call] += 1
    huh = el.keys()
    for call in huh:
        if call not in call_counter:
            call_counter[call] = 0
        else:
            call_counter[call] += 1
    features.extend(call_counter.keys())
    features=list(set(features))
    call_feat_array = np.zeros(len(good_calls))
    for i in range(len(good_calls)):
        call = good_calls[i]
        call_feat_array[i] = 0
        if call in call_counter:
            call_feat_array[i] = call_counter[call]

    return call_feat_array

In [4]:
features=[]

In [5]:
features=[]
X_train, t_train, train_ids = create_data_matrix(0, 1, TRAIN_DIR)

AttributeError: 'ElementTree' object has no attribute 'tag'

In [253]:
def call_feats(tree):
    global features
    global ts
    good_calls = ['process','sleep', 'dump_line']

    call_counter = {}
    for el in tree.iter():
        call = el.tag
        if call not in call_counter:
            call_counter[call] = 0
        else:
            call_counter[call] += 1
        huh = el.items()
        for call in huh:
            if call not in call_counter:
                call_counter[call] = 0
            else:
                call_counter[call] += 1
    features.extend(call_counter.keys())
    features=list(set(features))
    call_feat_array = np.zeros(len(good_calls))
    for i in range(len(good_calls)):
        call = good_calls[i]
        call_feat_array[i] = 0
        if call in call_counter:
            call_feat_array[i] = call_counter[call]

    return call_feat_array

In [251]:
ts.items()

[('apifunction', 'NtTerminateProcess'), ('targetpid', '2224')]

In [254]:
features=[]
X_train, t_train, train_ids = create_data_matrix(0, 3086, TRAIN_DIR)

In [255]:
len(features)

1977618

In [256]:
def call_feats(tree):
    global features
    good_calls = features

    call_counter = {}
    for el in tree.iter():
        call = el.tag
        if call not in call_counter:
            call_counter[call] = 0
        else:
            call_counter[call] += 1
    call_feat_array = np.zeros(len(good_calls))
    for i in range(len(good_calls)):
        call = good_calls[i]
        call_feat_array[i] = 0
        if call in call_counter:
            call_feat_array[i] = call_counter[call]

    return call_feat_array

In [None]:
X_train, t_train, train_ids = create_data_matrix(0, 2300, TRAIN_DIR)
X_valid, t_valid, valid_ids = create_data_matrix(2300, 3086, TRAIN_DIR)

In [163]:
X_test, t_test, test_ids = create_data_matrix(0, 3724, "test")

In [226]:
import numpy as np

from time import time
from operator import itemgetter
from scipy.stats import randint as sp_randint

from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from sklearn.datasets import load_digits
from sklearn.ensemble import RandomForestClassifier

In [227]:
clf = RandomForestClassifier(n_estimators=100)

In [228]:
def report(grid_scores, n_top=3):
    top_scores = sorted(grid_scores, key=itemgetter(1), reverse=True)[:n_top]
    for i, score in enumerate(top_scores):
        print("Model with rank: {0}".format(i + 1))
        print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
              score.mean_validation_score,
              np.std(score.cv_validation_scores)))
        print("Parameters: {0}".format(score.parameters))
        print("")

In [244]:
param_grid = {"max_features": [None, 10,50,5,2,1,3, "log2"],
              "max_depth": [None, 10, 100],
              "min_samples_split": [1,5, 10],
              "min_samples_leaf": [1, 5,10]
}

In [245]:
grid_search = GridSearchCV(clf, param_grid=param_grid,cv=5)
start = time()
grid_search.fit(X_train, t_train)

print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
      % (time() - start, len(grid_search.grid_scores_)))

GridSearchCV took 65.57 seconds for 216 candidate parameter settings.


In [246]:
report(grid_search.grid_scores_)

Model with rank: 1
Mean validation score: 0.889 (std: 0.007)
Parameters: {'max_features': 'log2', 'min_samples_split': 5, 'max_depth': 100, 'min_samples_leaf': 1}

Model with rank: 2
Mean validation score: 0.889 (std: 0.011)
Parameters: {'max_features': 1, 'min_samples_split': 1, 'max_depth': 100, 'min_samples_leaf': 1}

Model with rank: 3
Mean validation score: 0.889 (std: 0.005)
Parameters: {'max_features': 3, 'min_samples_split': 5, 'max_depth': 100, 'min_samples_leaf': 1}



In [238]:
model=RFC(n_estimators=100,max_features=1,min_samples_split=1,max_depth=100,min_samples_leaf=1).fit(X_train,t_train)

In [241]:
huh = model.predict(X_test)

In [217]:
import matplotlib.pyplot as plt

In [219]:
plt.matshow(pd.DataFrame(X_train).corr())
plt.show()

In [220]:
from string import letters
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [225]:
corr = pd.DataFrame(X_train).corr()
sns.set(style="white")
# Generate a mask for the upper triangle
mask = np.zeros_like(corr, dtype=np.bool)
mask[np.triu_indices_from(mask)] = True

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3,
            square=True, xticklabels=5, yticklabels=5,
            linewidths=.5, cbar_kws={"shrink": .5}, ax=ax)
plt.savefig("test.png")

  if self._edgecolors == str('face'):


In [240]:
float(sum([1 if x==y else 0 for x,y in zip(huh,t_valid)]))/len(huh)

0.8893129770992366

In [166]:
import csv
resultFile = open("output.csv",'wb')
wr = csv.writer(resultFile, dialect='excel')
wr.writerow(test_ids)

In [167]:
with open('output.csv', 'wb') as f:
    writer = csv.writer(f)
    for val in ['Id']+test_ids:
        writer.writerow([val])

In [58]:
import pandas as pd

In [193]:
test=pd.read_csv("output.csv")

In [242]:
test['Prediction'] = huh

In [243]:
test.to_csv('output1.csv')

In [None]:
X = None
    classes = []
    ids = [] 
    i = -1

In [155]:
X = None
classes = []
ids = [] 
i = -1
start_index=0
end_index=1
direc='train'
for datafile in os.listdir(direc):
    print datafile
    if datafile == '.DS_Store':
        continue

    i += 1
    if i < start_index:
        continue 
    if i >= end_index:
        break

    # extract id and true class (if available) from filename
    id_str, clazz = datafile.split('.')[:2]
    ids.append(id_str)
    # add target class if this is training data
    try:
        classes.append(util.malware_classes.index(clazz))

    except ValueError:
        # we should only fail to find the label in our list of malware classes
        # if this is test data, which always has an "X" label
        assert clazz == "X"
        classes.append(-1)

    # parse file as an xml document
    tree = ET.parse(os.path.join(direc,datafile))
    add_to_set(tree)
    call_feats(tree)
    print(tree)

00269ea50001a6c699d0222032d45b74b2e7e8be9.None.xml
<ElementTree object at 0x1089ecf50>
00278ec420236020d6121dffe0cc20034422e7228.Lipler.xml


In [140]:
ts.items()

[('apifunction', 'NtTerminateProcess'), ('targetpid', '2224')]

In [160]:
len(features)

216

In [172]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-0.4a30.tar.gz (753kB)
[K    100% |████████████████████████████████| 753kB 140kB/s 
Building wheels for collected packages: xgboost
  Running setup.py bdist_wheel for xgboost ... [?25l- \ | / - \ | / - \ | / - \ | / - \ | / - \ | / - done
[?25h  Stored in directory: /Users/harrisonchase/Library/Caches/pip/wheels/24/4f/7d/95352d4cf7b2a0350462332fbd8838d666e17762fe0c05f276
Successfully built xgboost
Installing collected packages: xgboost
Successfully installed xgboost-0.4a30
[33mYou are using pip version 7.1.2, however version 8.0.3 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [177]:
import xgboost as xgb

In [186]:
set(t_train)

{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14}

In [178]:
dtrain = xgb.DMatrix(X_train, label=t_train)

In [188]:
param = {'bst:max_depth':2, 'bst:eta':1, 'silent':1, 'objective':'multi:softmax','num_class':15 }
param['nthread'] = 4
param['eval_metric'] = 'auc'
plst = param.items()
plst += [('eval_metric', 'ams@0')]

In [189]:
num_round = 10
bst = xgb.train( plst,dtrain)

In [190]:
dtest = xgb.DMatrix(X_test)

In [191]:
huh=bst.predict(dtest)