In [48]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import csv
from os import system
import keras
import scipy.io

In [16]:
import xml.etree.ElementTree as ET
import classification_starter as start

In [32]:
import os
from collections import Counter
try:
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET
import numpy as np
from scipy import sparse

import util

In [28]:
def extract_feats(ffs, direc="train", global_feat_dict=None):

    fds = [] # list of feature dicts
    classes = []
    ids = [] 
    for datafile in os.listdir(direc):
        # extract id and true class (if available) from filename
        id_str,clazz = datafile.split('.')[:2]
        ids.append(id_str)
        # add target class if this is training data
        try:
            classes.append(util.malware_classes.index(clazz))
        except ValueError:
            # we should only fail to find the label in our list of malware classes
            # if this is test data, which always has an "X" label
            assert clazz == "X"
            classes.append(-1)
        rowfd = {}
        # parse file as an xml document
        tree = ET.parse(os.path.join(direc,datafile))
        # accumulate features
        [rowfd.update(ff(tree)) for ff in ffs]
        fds.append(rowfd)
        
    X,feat_dict = make_design_mat(fds,global_feat_dict)
    return X, feat_dict, np.array(classes), ids

In [29]:
def make_design_mat(fds, global_feat_dict=None):

    if global_feat_dict is None:
        all_feats = set()
        [all_feats.update(fd.keys()) for fd in fds]
        feat_dict = dict([(feat, i) for i, feat in enumerate(sorted(all_feats))])
    else:
        feat_dict = global_feat_dict
        
    cols = []
    rows = []
    data = []        
    for i in xrange(len(fds)):
        temp_cols = []
        temp_data = []
        for feat,val in fds[i].iteritems():
            try:
                # update temp_cols iff update temp_data
                temp_cols.append(feat_dict[feat])
                temp_data.append(val)
            except KeyError as ex:
                if global_feat_dict is not None:
                    pass  # new feature in test data; nbd
                else:
                    raise ex

        # all fd's features in the same row
        k = len(temp_cols)
        cols.extend(temp_cols)
        data.extend(temp_data)
        rows.extend([i]*k)

    assert len(cols) == len(rows) and len(rows) == len(data)
   

    X = sparse.csr_matrix((np.array(data),
                   (np.array(rows), np.array(cols))),
                   shape=(len(fds), len(feat_dict)))
    return X, feat_dict
    

## Here are two example feature-functions. They each take an xml.etree.ElementTree object, 
# (i.e., the result of parsing an xml file) and returns a dictionary mapping 
# feature-names to numeric values.
## TODO: modify these functions, and/or add new ones.

In [26]:
def first_last_system_call_feats(tree):

    c = Counter()
    in_all_section = False
    first = True # is this the first system call
    last_call = None # keep track of last call we've seen
    for el in tree.iter():
        # ignore everything outside the "all_section" element
        if el.tag == "all_section" and not in_all_section:
            in_all_section = True
        elif el.tag == "all_section" and in_all_section:
            in_all_section = False
        elif in_all_section:
            if first:
                c["first_call-"+el.tag] = 1
                first = False
            last_call = el.tag  # update last call seen
            
    # finally, mark last call seen
    c["last_call-"+last_call] = 1
    return c

In [27]:
def system_call_count_feats(tree):

    c = Counter()
    in_all_section = False
    for el in tree.iter():
        # ignore everything outside the "all_section" element
        if el.tag == "all_section" and not in_all_section:
            in_all_section = True
        elif el.tag == "all_section" and in_all_section:
            in_all_section = False
        elif in_all_section:
            c['num_system_calls'] += 1
    return c

In [13]:
## MAIN ------------------------------
##The following function does the feature extraction, learning, and prediction
def main():
    train_dir = "train"
    test_dir = "test"
    outputfile = "mypredictions.csv"  # feel free to change this or take it as an argument
    
    # TODO put the names of the feature functions you've defined above in this list
    ffs = [first_last_system_call_feats, system_call_count_feats]
    
    # extract features
    print "extracting training features..."
    X_train,global_feat_dict,t_train,train_ids = extract_feats(ffs, train_dir)
    print "done extracting training features"
    print
    
    # TODO train here, and learn your classification parameters
    print "learning..."
    learned_W = np.random.random((len(global_feat_dict),len(util.malware_classes)))
    print "done learning"
    print
    
#     # get rid of training data and load test data
#     del X_train
#     del t_train
#     del train_ids
#     print "extracting test features..."
#     X_test,_,t_ignore,test_ids = extract_feats(ffs, test_dir, global_feat_dict=global_feat_dict)
#     print "done extracting test features"
#     print
    
#     # TODO make predictions on text data and write them out
#     print "making predictions..."
#     preds = np.argmax(X_test.dot(learned_W),axis=1)
#     print "done making predictions"
#     print
    
#     print "writing predictions..."
#     util.write_predictions(preds, test_ids, outputfile)
#     print "done!"

if __name__ == "__main__":
    main()
    

/Users/hikarisorensen/Dropbox/00_Pika/CS 181/cs181-s17-hixor/p2


In [34]:
train_dir = "train"
test_dir = "test"

ffs = [first_last_system_call_feats, system_call_count_feats]
X_train,global_feat_dict,t_train,train_ids = extract_feats(ffs, train_dir)

In [42]:
with open('t_train_starter.csv', 'wb') as starter_t:
    wr = csv.writer(starter_t)
    wr.writerow(t_train)

In [43]:
with open('train_ids_starter.csv', 'wb') as starter_ids:
    wr = csv.writer(starter_ids)
    wr.writerow(train_ids)

In [44]:
df_X_train = pd.DataFrame(X_train)

PandasError: DataFrame constructor not properly called!

In [49]:
scipy.io.mmwrite("X_train_starter.csv", X_train)

In [10]:
tree = ET.parse("./train/00269ea50001a6c699d0222032d45b74b2e7e8be9.None.xml")
root = tree.getroot()
for child in root:
    print child.tag, child.attrib

process {'username': 'Administrator', 'index': '1', 'sha1': '0069ea50001a6c699d0222032d45b74b2e7e8be9', 'parentindex': '0', 'terminationtime': '00:10.547', 'filename_hash': 'hash_error', 'pid': '1952', 'filename': 'c:\\1025be1934a50c3355adb359507f2862.EX', 'filesize': '149270', 'starttime': '00:01.704', 'terminationreason': 'NormalTermination', 'executionstatus': 'OK', 'startreason': 'AnalysisTarget', 'md5': '1025be1934a50c3355adb359507f2862', 'applicationtype': 'Win32Application'}
process {'username': 'SYSTEM', 'index': '2', 'sha1': '97c7c354c12b89c797740b35ed81879be58f3deb', 'parentindex': '0', 'terminationtime': '00:15.000', 'filename_hash': '49083ae3725a0488e0a8fbbe1335c745f70c4667', 'pid': '984', 'filename': 'C:\\WINDOWS\\system32\\svchost.exe', 'filesize': '14336', 'starttime': '00:08.579', 'terminationreason': 'Timeout', 'executionstatus': 'OK', 'startreason': 'DCOMService', 'md5': '4fbc75b74479c7a6f829e0ca19df3366'}
process {'username': 'Administrator', 'index': '3', 'parentind

In [23]:
mypreds = pd.read_csv('mypredictions.csv',nrows=10)

In [24]:
mypreds

Unnamed: 0,Id,Prediction
0,0015c8c9ff02fea9d0f45692b9eebfb4abff4e42f,6
1,001f298a534ae4b0db7f2707169250aa215c3b5f2,6
2,001f5fdaaa8bbe20303527198d09a30bb7ca3eb50,6
3,002ca2c41b649f85c05ae30013436781a932fecc6,6
4,003e109543b4ea22d2bcc1ec309bf2fd34e9a1a1d,6
5,004070b468d6bb29848c76cfCd5887849c7bb648d,6
6,00461dd05c981edde167a5947c365472141e04bb1,6
7,005b95d2520C8621171566f5803437b0c443778e1,6
8,0071a3b818ed06d3865a24fdb31d4147c67fabfc5,6
9,007436715ec13cedd38344772a2144a3d79f3ea68,6
