In [37]:
import os
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor
import sklearn
from collections import Counter
try:
	import xml.etree.cElementTree as ET
except ImportError:
	import xml.etree.ElementTree as ET
import numpy as np
from scipy import sparse
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers.embeddings import Embedding
import keras
import util
from keras.preprocessing import sequence


def extract_feats(ffs, direc="train", global_feat_dict=None):
	"""
	arguments:
	  ffs are a list of feature-functions.
	  direc is a directory containing xml files (expected to be train or test).
	  global_feat_dict is a dictionary mapping feature_names to column-numbers; it
	  should only be provided when extracting features from test data, so that 
	  the columns of the test matrix align correctly.

	returns: 
	  a sparse design matrix, a dict mapping features to column-numbers,
	  a vector of target classes, and a list of system-call-history ids in order 
	  of their rows in the design matrix.
	  
	  Note: the vector of target classes returned will contain the true indices of the
	  target classes on the training data, but will contain only -1's on the test
	  data
	"""
	fds = [] # list of feature dicts
	classes = []
	ids = [] 
	for datafile in os.listdir(direc):
		# extract id and true class (if available) from filename
		id_str,clazz = datafile.split('.')[:2]
		ids.append(id_str)
		# add target class if this is training data
		try:
			classes.append(util.malware_classes.index(clazz))
		except ValueError:
			# we should only fail to find the label in our list of malware classes
			# if this is test data, which always has an "X" label
			assert clazz == "X"
			classes.append(-1)
		rowfd = {}
		# parse file as an xml document
		tree = ET.parse(os.path.join(direc,datafile))
		# accumulate features
		[rowfd.update(ff(tree)) for ff in ffs]
		fds.append(rowfd)
		
	X,feat_dict = make_design_mat(fds,global_feat_dict)
	return X, feat_dict, np.array(classes), ids

def extractText(ff,direc="train", testDict = None):
    ids = []
    classes = []
    classesDict = {}
    fds = []
    secondTime = False
    if testDict == None:
        globalDict = dict()
        globalDict["counter"] = 0
    else:
        globalDict = testDict
        secondTime = True
    for datafile in os.listdir(direc):
        id_str,clazz = datafile.split('.')[:2]
        if clazz in classesDict:
            if classesDict[clazz] < 62:
                classesDict[clazz] += 1
                ids.append(id_str)
                try:
                    classes.append(util.malware_classes.index(clazz))
                except ValueError:
                    assert clazz == "X"
                    classes.append(-1)
                tree = ET.parse(os.path.join(direc,datafile))
                fds.append(ff(tree, globalDict,secondTime))
                del tree
        else:
            classesDict[clazz] = 0
            ids.append(id_str)
            try:
                classes.append(util.malware_classes.index(clazz))
            except ValueError:
                assert clazz == "X"
                classes.append(-1)
            tree = ET.parse(os.path.join(direc,datafile))
            fds.append(ff(tree, globalDict,secondTime))
            del tree
            
    if testDict == None:
        return np.array(fds), np.array(classes), ids, globalDict
    return np.array(fds), np.array(classes), ids
            
        
        


def make_design_mat(fds, global_feat_dict=None):
	"""
	arguments:
	  fds is a list of feature dicts (one for each row).
	  global_feat_dict is a dictionary mapping feature_names to column-numbers; it
	  should only be provided when extracting features from test data, so that 
	  the columns of the test matrix align correctly.
	   
	returns: 
		a sparse NxD design matrix, where N == len(fds) and D is the number of
		the union of features defined in any of the fds 
	"""
	if global_feat_dict is None:
		all_feats = set()
		[all_feats.update(fd.keys()) for fd in fds]
		feat_dict = dict([(feat, i) for i, feat in enumerate(sorted(all_feats))])
	else:
		feat_dict = global_feat_dict
		
	cols = []
	rows = []
	data = []		
	for i in range(len(fds)):
		temp_cols = []
		temp_data = []
		for feat,val in fds[i].items():
			try:
				# update temp_cols iff update temp_data
				temp_cols.append(feat_dict[feat])
				temp_data.append(val)
			except KeyError as ex:
				if global_feat_dict is not None:
					pass  # new feature in test data; nbd
				else:
					raise ex

		# all fd's features in the same row
		k = len(temp_cols)
		cols.extend(temp_cols)
		data.extend(temp_data)
		rows.extend([i]*k)

	assert len(cols) == len(rows) and len(rows) == len(data)
   

	X = sparse.csr_matrix((np.array(data),
				   (np.array(rows), np.array(cols))),
				   shape=(len(fds), len(feat_dict)))
	return X, feat_dict
	

## Here are two example feature-functions. They each take an xml.etree.ElementTree object, 
# (i.e., the result of parsing an xml file) and returns a dictionary mapping 
# feature-names to numeric values.
## TODO: modify these functions, and/or add new ones.
def first_last_system_call_feats(tree):
	"""
	arguments:
	  tree is an xml.etree.ElementTree object
	returns:
	  a dictionary mapping 'first_call-x' to 1 if x was the first system call
	  made, and 'last_call-y' to 1 if y was the last system call made. 
	  (in other words, it returns a dictionary indicating what the first and 
	  last system calls made by an executable were.)
	"""
	c = Counter()
	in_all_section = False
	first = True # is this the first system call
	last_call = None # keep track of last call we've seen
	for el in tree.iter():
		# ignore everything outside the "all_section" element
		if el.tag == "all_section" and not in_all_section:
			in_all_section = True
		elif el.tag == "all_section" and in_all_section:
			in_all_section = False
		elif in_all_section:
			if first:
				c["first_call-"+el.tag] = 1
				first = False
			last_call = el.tag  # update last call seen
			
	# finally, mark last call seen
	c["last_call-"+last_call] = 1
	return c

def system_call_count_feats(tree):
	"""
	arguments:
	  tree is an xml.etree.ElementTree object
	returns:
	  a dictionary mapping 'num_system_calls' to the number of system_calls
	  made by an executable (summed over all processes)
	"""
	c = Counter()
	in_all_section = False
	for el in tree.iter():
		# ignore everything outside the "all_section" element
		if el.tag == "all_section" and not in_all_section:
			in_all_section = True
		elif el.tag == "all_section" and in_all_section:
			in_all_section = False
		elif in_all_section:
			c['num_system_calls'] += 1
	return c

def full_Dictionary(tree):
	c = Counter()
	n=5
	in_all_section = False
	prev = []
	for el in tree.iter():
		# ignore everything outside the "all_section" element
		if el.tag == "all_section" and not in_all_section:
			in_all_section = True
		elif el.tag == "all_section" and in_all_section:
			in_all_section = False
		elif in_all_section:
			'''stringIfied = str(el)
			wordList = stringIfied.split()
			for i in range(len(wordList)):
				if prev != [] and i < n:
					c[str(prev[len(prev)-n+i : ]+wordList[0:i])] += 1
				else:
					c[str(wordList[i-n : i])] += 1
			prev = wordList'''
			#for (a,b) in el.attrib.items(): c[b] += 1
            
	return c

def fullText(tree,wordDict,secondTime=False):
    c = []
    in_all_section = False
    for el in tree.iter():
        if el.tag == "all_section" and not in_all_section:
            in_all_section = True
        elif el.tag == "all_section" and in_all_section:
            in_all_section = False
        elif in_all_section:
            1 == 1
            words = list(str(el))
            for word in words:
                if word in wordDict:
                    c.append(wordDict[word])
                elif wordDict["counter"] < 20000 and secondTime == False:
                    wordDict[word] = wordDict["counter"]
                    wordDict["counter"] += 1
                    c.append(wordDict[word])
            del words
    return c

In [38]:
train_dir = "train"
test_dir = "test"
outputfile = "sample_predictions.csv"  # feel free to change this or take it as an argument

# TODO put the names of the feature functions you've defined above in this list
ffs = [first_last_system_call_feats, system_call_count_feats,full_Dictionary]

# extract features
print("extracting training features...")
X_train, t_train, train_ids,dictionary = extractText(fullText)
#X_train,global_feat_dict,t_train,train_ids = extract_feats(ffs, train_dir)
print ("done extracting training features")

extracting training features...
done extracting training features


In [39]:
print(dictionary["counter"])

41


In [32]:
print("extracting test features...")
X_test,t_test,test_ids = extractText(fullText,direc=test_dir,testDict=dictionary)
print("done extracting test features")

extracting test features...
done extracting test features


In [43]:
maxi = 0
maxiAvg = 0
for i in range(len(X_train)):
    maxiAvg += len(X_train[i])
    if (len(X_train[i])) > maxi:
        maxi = len(X_train[i])
print(maxi)
print(maxiAvg/(len(X_train)))
maxi = 30000
#X_test1 = sequence.pad_sequences(X_test, maxlen=maxi)
X_train1 = sequence.pad_sequences(X_train, maxlen=maxi)

2345715
90405.75035868006


In [35]:
print(len(X_train))

697


In [None]:
embedding_vecor_length = 32
model = Sequential()
model.add(Embedding(41, embedding_vecor_length, input_length=maxi))
#model.add(LSTM(50,return_sequences=True))
model.add(LSTM(600))
model.add(Dense(units=15, activation='sigmoid'))
model.compile(loss='sparse_categorical_crossentropy', optimizer='sgd', metrics=['accuracy'])
model.fit(X_train1,t_train,epochs = 15,batch_size=128)
print("fuck")

Epoch 1/15


In [15]:
preds = model.predict(X_test1, batch_size=128)

In [16]:
print(preds)
for i in range(len(preds)):
    print(np.argmax(preds[i]))

[[ 0.48927093  0.49535623  0.48675019 ...,  0.51220775  0.47708943
   0.48735875]
 [ 0.48986587  0.49811772  0.48863852 ...,  0.51436949  0.47883123
   0.48840865]
 [ 0.48995483  0.49918237  0.48966318 ...,  0.51508254  0.47816429
   0.49179909]
 ..., 
 [ 0.48754254  0.49525422  0.48759383 ...,  0.51625836  0.4792057
   0.48702675]
 [ 0.48973992  0.49833071  0.48865083 ...,  0.51472598  0.47915244
   0.48834142]
 [ 0.48593065  0.49710554  0.48955482 ...,  0.51358432  0.47499725
   0.48550877]]
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8
8

In [42]:
'''X_trainNormalized=sklearn.preprocessing.normalize(X_train)
model = Sequential()
#model.add(Dense(units=64, activation='relu', input_dim=1219080))
model.add(Embedding(1000, 64, input_length=))
model.add(LSTM(100))
model.add(Dense(units=15, activation='softmax'))
model.compile(loss='categorical_crossentropy',
              optimizer='sgd',
              metrics=['accuracy'])
#learned_W = logReg.fit(X_train, np.array(t_train))
y_train = keras.utils.to_categorical(np.array(t_train), num_classes=15)
print(y_train)
model.fit(X_trainNormalized, y_train, epochs=2, batch_size=32)
#print(learned_W.get_params())
print("done learning")
print()'''

'X_trainNormalized=sklearn.preprocessing.normalize(X_train)\nmodel = Sequential()\n#model.add(Dense(units=64, activation=\'relu\', input_dim=1219080))\nmodel.add(Embedding(1000, 64, input_length=))\nmodel.add(LSTM(100))\nmodel.add(Dense(units=15, activation=\'softmax\'))\nmodel.compile(loss=\'categorical_crossentropy\',\n              optimizer=\'sgd\',\n              metrics=[\'accuracy\'])\n#learned_W = logReg.fit(X_train, np.array(t_train))\ny_train = keras.utils.to_categorical(np.array(t_train), num_classes=15)\nprint(y_train)\nmodel.fit(X_trainNormalized, y_train, epochs=2, batch_size=32)\n#print(learned_W.get_params())\nprint("done learning")\nprint()'

In [4]:
print("extracting test features...")
X_test,_,t_ignore,test_ids = extract_feats(ffs, test_dir, global_feat_dict=global_feat_dict)
print("done extracting test features")
print()

extracting test features...
done extracting test features



In [27]:
print("making predictions...")
#preds = np.argmax(X_test.dot(learned_W),axis=1)
#preds = logReg.predict(X_test)
preds = model.predict(sklearn.preprocessing.normalize(X_test), batch_size=128)
preds1 = []
print(len(preds))
print(len(preds[0]))

making predictions...
3724
15


In [41]:
()

()

In [29]:
logReg = linear_model.LogisticRegression()

In [30]:
learned_W = logReg.fit(X_train, np.array(t_train))

ValueError: setting an array element with a sequence.

In [17]:
preds = logReg.predict(X_test)

In [19]:
print(preds)
print("writing predictions...")
util.write_predictions(preds, test_ids, outputfile)
print("done!")

[10  8  8 ...,  8  8 13]
writing predictions...
done!


In [35]:
rnn=keras.layers.SimpleRNN(15, activation='tanh', use_bias=True, kernel_initializer='glorot_uniform', recurrent_initializer='orthogonal', bias_initializer='zeros', kernel_regularizer=None, recurrent_regularizer=None, bias_regularizer=None, activity_regularizer=None, kernel_constraint=None, recurrent_constraint=None, bias_constraint=None, dropout=0.0, recurrent_dropout=0.0, return_sequences=False, return_state=False, go_backwards=False, stateful=False, unroll=False)