In [1]:
import os
from collections import Counter
try:
    import xml.etree.cElementTree as ET
except ImportError:
    import xml.etree.ElementTree as ET
import numpy as np
from scipy import sparse

import util

# Directory that holds the training XML files.
TRAIN_DIR = "train"

# Every distinct XML tag seen so far; add_to_set() fills this in.
call_set = set()

def add_to_set(tree):
    """Record the tag of every element in ``tree`` in the module-level ``call_set``.

    ``tree`` only needs an ``iter()`` method that yields objects with a
    ``tag`` attribute (e.g. an ``ElementTree``). Nothing is returned; the
    set is updated in place.
    """
    call_set.update(node.tag for node in tree.iter())

def create_data_matrix(start_index, end_index, direc="train"):
    """Build a feature matrix, label vector and id list from the XML files in ``direc``.

    Entries of ``direc`` are visited in sorted filename order, ignoring
    ``.DS_Store``. Only entries whose zero-based position ``i`` in that order
    satisfies ``start_index <= i < end_index`` are processed. Each filename is
    split on ``.``; the first piece is the id and the second the class label.

    Every processed file is parsed as XML, passed to ``add_to_set`` (which
    records its tags) and to ``call_feats`` (which returns its feature row).

    Parameters
    ----------
    start_index : int
        Position of the first file to process (inclusive).
    end_index : int
        Position at which processing stops (exclusive).
    direc : str
        Directory containing the XML files.

    Returns
    -------
    X : numpy.ndarray or None
        2-D array with one row per processed file, or ``None`` when no file
        fell inside the requested window.
    classes : numpy.ndarray
        For each processed file, the index of its label in
        ``util.malware_classes``, or ``-1`` when the label is ``"X"``.
    ids : list of str
        The id part of each processed filename, in row order.

    Raises
    ------
    AssertionError
        If a label is neither in ``util.malware_classes`` nor ``"X"``.
    """
    rows = []
    classes = []
    ids = []

    # os.listdir() returns entries in arbitrary order; sorting makes the
    # [start_index, end_index) window and the row order reproducible.
    datafiles = sorted(name for name in os.listdir(direc) if name != '.DS_Store')

    for i, datafile in enumerate(datafiles):
        if i < start_index:
            continue
        if i >= end_index:
            break

        # extract id and true class (if available) from filename
        id_str, clazz = datafile.split('.')[:2]
        ids.append(id_str)
        # add target class if this is training data
        try:
            classes.append(util.malware_classes.index(clazz))
        except ValueError:
            # we should only fail to find the label in our list of malware classes
            # if this is test data, which always has an "X" label
            assert clazz == "X"
            classes.append(-1)

        # parse file as an xml document
        tree = ET.parse(os.path.join(direc, datafile))
        add_to_set(tree)
        rows.append(call_feats(tree))

    # Stack once at the end: stacking inside the loop copies the whole matrix
    # for every file, and a single row would otherwise stay 1-D.
    X = np.vstack(rows) if rows else None

    return X, np.array(classes), ids


In [2]:
def call_feats(tree):
    """Count tags and attribute names in ``tree`` and grow the global vocabulary.

    This is the vocabulary-building version of ``call_feats``; a later cell
    redefines the function to emit one column per entry of ``features``.

    Side effect: the module-level ``features`` list is rebound to the sorted
    union of its previous contents and every tag / attribute name found in
    ``tree``. Sorting keeps the resulting column order stable between runs.

    Parameters
    ----------
    tree : object with an ``iter()`` method yielding XML elements
        Typically an ``xml.etree.ElementTree.ElementTree``.

    Returns
    -------
    numpy.ndarray
        Float array of length 3 holding the number of occurrences of the
        names ``'process'``, ``'sleep'`` and ``'dump_line'`` (0 when absent).
    """
    global features
    good_calls = ['process', 'sleep', 'dump_line']

    # Tags and attribute names share one counter. Counter starts every key at
    # its true count, so a name seen once is distinguishable from an absent one.
    call_counter = Counter()
    for el in tree.iter():
        call_counter[el.tag] += 1
        call_counter.update(el.keys())

    features = sorted(set(features).union(call_counter))

    # Counter returns 0 for missing keys without inserting them.
    return np.array([call_counter[name] for name in good_calls], dtype=float)

In [3]:
# Start with an empty vocabulary; the call_feats defined above extends
# `features` as a side effect while the training files are parsed.
features = []
X_train, t_train, train_ids = create_data_matrix(
    start_index=0, end_index=3086, direc=TRAIN_DIR
)

In [None]:
def call_feats(tree):
    """Return one occurrence count per entry of the global ``features`` list.

    This redefinition replaces the earlier vocabulary-building ``call_feats``.
    It only reads ``features``; it does not modify it.

    Both element tags and attribute names are counted, because the vocabulary
    in ``features`` was collected from both; counting tags alone would leave
    every attribute-name column at zero.

    Parameters
    ----------
    tree : object with an ``iter()`` method yielding XML elements
        Typically an ``xml.etree.ElementTree.ElementTree``.

    Returns
    -------
    numpy.ndarray
        Float array of length ``len(features)``; position ``i`` holds the
        number of times ``features[i]`` occurs in ``tree`` as a tag or an
        attribute name (0 when absent).
    """
    good_calls = features

    # Counter starts each key at its true count, so a name that occurs once
    # is reported as 1 rather than 0.
    call_counter = Counter()
    for el in tree.iter():
        call_counter[el.tag] += 1
        call_counter.update(el.keys())

    # Counter returns 0 for missing keys without inserting them.
    return np.array([call_counter[name] for name in good_calls], dtype=float)

In [None]:
# Second pass over the training files, now using the call_feats redefined
# above so that each row has one column per entry of `features`.
X_train, t_train, train_ids = create_data_matrix(
    start_index=0, end_index=3086, direc=TRAIN_DIR
)

In [None]:
# Build the matrix for the files in the "test" directory with the same
# `features` vocabulary, so its columns line up with X_train.
X_test, t_test, test_ids = create_data_matrix(
    start_index=0, end_index=3724, direc="test"
)

In [None]:
import pandas as pd

In [None]:
# Label the training matrix with the vocabulary, attach ids and targets,
# and write the result to train_data.csv.
huh = pd.DataFrame(X_train)
huh.columns = features
huh = huh.assign(Id=train_ids, t=t_train)
huh.to_csv('train_data.csv', index=False)

In [None]:
# Label the test matrix with the same vocabulary, attach ids, and write
# the result to test_data.csv (test rows carry no target column).
huh = pd.DataFrame(X_test)
huh.columns = features
huh = huh.assign(Id=test_ids)
huh.to_csv('test_data.csv', index=False)