Commit cc01c3a
some polishing in MultiClassAnalysis
Hugo Gascon committed Jan 18, 2015
1 parent 8a0e1e2 commit cc01c3a
Showing 1 changed file with 49 additions and 23 deletions.
72 changes: 49 additions & 23 deletions adagio/core/analysis.py
@@ -21,7 +21,18 @@ class MultiClassAnalysis:
""" A class to run a multiclass classification experiment """

def __init__(self, dataset_dir, families, split, precomputed_matrix="", y="", fnames=""):

"""
Args:
dataset_dir: a dir for pz objects
families: a dictionary with file names as keys and classes
(families for malware) as values
split: The proportion of training and testing data
precomputed_matrix: Data matrix in the case that it has been
previously computed
y: precomputed class labels
fnames: file names for precomputed data
"""
self.split = split
self.families = families
self.X = []
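The new docstring makes the constructor's contract explicit. A minimal sketch of how it might be invoked, assuming a directory of pickled call graphs; the path and family names below are illustrative, not taken from the repository:

    # illustrative values only: a dir of *.fcg.pz files and a
    # {file name -> family} mapping as described in the docstring
    families = {"sample_a.fcg.pz": "family_1",
                "sample_b.fcg.pz": "family_2"}
    mca = MultiClassAnalysis("/data/malware_fcgs", families, split=0.8)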
@@ -52,12 +63,14 @@ def __init__(self, dataset_dir, families, split, precomputed_matrix="", y="", fnames=""):
self.fnames = pz.load(fnames)
print "[*] file names loaded"

- else:
-     files = self.read_files(dataset_dir, "fcgnx.pz")
+ else:
+     files = self.read_files(dataset_dir, "fcg.pz")
if len(files) > 0:
print "Loading {0} samples".format(len(files))
- widgets = ['Unpickling... : ', Percentage(), ' ', Bar(marker='#',left='[',right=']'),
-            ' ', ETA(), ' ']
+ widgets = ['Unpickling... : ',
+            Percentage(), ' ',
+            Bar(marker='#', left='[', right=']'), ' ',
+            ETA(), ' ']
pbar = ProgressBar(widgets=widgets, maxval=len(files))
pbar.start()
progress = 0
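The reflowed widgets list is the standard pattern of the progressbar package used here. A self-contained sketch of that loop, independent of the surrounding class:

    from progressbar import Bar, ETA, Percentage, ProgressBar

    files = range(100)  # stand-in for the real file list
    widgets = ['Unpickling... : ',
               Percentage(), ' ',
               Bar(marker='#', left='[', right=']'), ' ',
               ETA(), ' ']
    pbar = ProgressBar(widgets=widgets, maxval=len(files))
    pbar.start()
    for progress, f in enumerate(files, 1):
        # ... unpickle and process f here ...
        pbar.update(progress)
    pbar.finish()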
@@ -73,12 +86,19 @@ def __init__(self, dataset_dir, families, split, precomputed_matrix="", y="", fnames=""):

t0 = time.time()
x_i = self.compute_feature_vector(g)
- self.feature_vector_times.append( time.time() - t0 )
- self.label_dist = np.sum([self.label_dist, x_i], axis=0)
+ # save feature vector computing time for
+ # performance evaluation
+ self.feature_vector_times.append(time.time() - t0)
+ # save distribution of generated labels
+ self.label_dist = np.sum([self.label_dist,
+                           x_i], axis=0)
+ # save sizes of the sample for further analysis
+ # of the dataset properties
self.sample_sizes.append(size)
self.neighborhood_sizes += ml.neighborhood_sizes(g)
for n, l in g.node.iteritems():
- self.class_dist = np.sum([self.class_dist, l["label"]], axis=0)
+ self.class_dist = np.sum([self.class_dist,
+                           l["label"]], axis=0)

# delete nx object to free memory
del g
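The np.sum([..., ...], axis=0) idiom that now spans two lines simply accumulates histogram vectors elementwise; a worked example:

    import numpy as np

    label_dist = np.array([3, 0, 1])
    x_i = np.array([1, 2, 0])    # histogram of the current sample
    label_dist = np.sum([label_dist, x_i], axis=0)
    # label_dist is now array([4, 2, 1])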
@@ -97,11 +117,15 @@ def __init__(self, dataset_dir, families, split, precomputed_matrix="", y="", fnames=""):
print "[*] Stacking feature vectors..."
self.X = np.vstack(self.X)
print "[*] Converting features vectors to binary..."
- self.X, self.b = ml.make_binary(self.X)
+ self.X, self.b = ml.make_binary(self.X)

+ ################################
+ # Data Preprocessing functions #
+ ################################

def read_files(self, d, file_extension, max_files=0):
""" Return a random list of N files with a certain extension in dir d
Args:
d: directory to read files from
file_extension: consider files only with this extension
@@ -113,16 +137,16 @@ def read_files(self, d, file_extension, max_files=0):
"""

files = []
- for fn in os.listdir(d):
-     if fn.lower().endswith(file_extension):
-         files.append(os.path.join(d, fn))
+ for f in os.listdir(d):
+     if f.lower().endswith(file_extension):
+         files.append(os.path.join(d, f))
shuffle(files)

- #if max_files is 0, return all the files in dir
+ # if max_files is 0, return all the files in dir
if max_files == 0:
max_files = len(files)
files = files[:max_files]

return files

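A quick sketch of read_files on its own, with mca as in the earlier constructor sketch and a hypothetical directory path:

    all_files = mca.read_files("/data/malware_fcgs", "fcg.pz")       # all files, shuffled
    some_files = mca.read_files("/data/malware_fcgs", "fcg.pz", 10)  # at most 10 of them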
def compute_feature_vector(self, g):
@@ -177,17 +201,21 @@ def randomize_dataset_closed_world(self):
self.Y_train = self.Y[ train_index ]
self.Y_test = self.Y[ test_index ]

+ ###################################
+ # Learning & Evaluation functions #
+ ###################################

def run_closed_experiment(self, iterations=10):
""" Train a classifier on test data, obtain the best combination of
paramters through a grid search cross-validation and test the classifier
parameters through a grid search cross-validation and test the classifier
using a closed-world split of the dataset. The results from the number
of iterations are saved as pz files.
"""
self.true_labels = np.array([])
self.predictions = np.array([])
for i in xrange(iterations):
self.randomize_dataset_closed_world()
- clf = GridSearchCV(svm.LinearSVC(), {'C':np.logspace(-3,3,7)})
+ clf = GridSearchCV(svm.LinearSVC(), {'C':np.logspace(-3, 3, 7)})
clf.fit(self.X_train, self.Y_train)
out = clf.best_estimator_.predict(self.X_test)
self.predictions = np.append(self.predictions, out)
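np.logspace(-3, 3, 7) sweeps C over 0.001, 0.01, ..., 1000. A standalone sketch of the same grid-search pattern on toy data (at the time of this commit GridSearchCV lived in sklearn.grid_search; newer releases moved it to sklearn.model_selection):

    import numpy as np
    from sklearn import svm
    from sklearn.grid_search import GridSearchCV

    X_train = np.random.rand(40, 5)        # toy stand-ins for the real split
    Y_train = np.random.randint(0, 3, 40)
    clf = GridSearchCV(svm.LinearSVC(), {'C': np.logspace(-3, 3, 7)})
    clf.fit(X_train, Y_train)
    print clf.best_estimator_.predict(X_train[:5])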
@@ -198,7 +226,7 @@ def run_closed_experiment(self, iterations=10):

def run_open_experiment(self, iterations=10):
""" Train a classifier on test data, obtain the best combination of
paramters through a grid search cross-validation and test the classifier
parameters through a grid search cross-validation and test the classifier
using a open-world split of the dataset. The results from the number
of iterations are saved as pz files.
"""
@@ -218,7 +246,7 @@ def run_open_experiment(self, iterations=10):
p = classes[np.where(scores==m)]
self.predictions = np.append(self.predictions, p)
self.true_labels = np.append(self.true_labels, self.Y_test)

pz.save(self.predictions, "mca_predictions_open.pz")
pz.save(self.true_labels, "mca_true_labels_open.pz")

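Because both experiments persist predictions and true labels with pz.save, the results can be scored offline later; a sketch assuming pz.load mirrors pz.save (as it does for the precomputed matrix earlier in this file), with pz imported as in this module:

    from sklearn.metrics import confusion_matrix

    predictions = pz.load("mca_predictions_open.pz")
    true_labels = pz.load("mca_true_labels_open.pz")
    print confusion_matrix(true_labels, predictions)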
@@ -356,7 +384,7 @@ def compute_label_histogram(self, g):
g_hash = ml.neighborhood_hash(g)
g_x = ml.label_histogram(g_hash)
return g_x

def randomize_dataset(self):
""" Randomly split the dataset in training and testing sets
"""
@@ -409,7 +437,6 @@ def run_linear_experiment(self, rocs_filename, iterations=10):
# Plotting functions #
########################


def plot_average_roc(self, filename, boundary=0.1):
""" Plot an average roc curve up to boundary using the rocs object of the the
Analysis object. It can be called after run_linear_experiment.
@@ -465,7 +492,6 @@ def plot_average_rocs(self, filename, roc_pickles, boundary=0.1):
plt.grid(True)
plt.savefig( filename, format='png' )


def get_high_ranked_neighborhoods(self, fcgnx_file, sorted_weights_idx, n_weights=3):
""" Retrieve the neigborhoods in a hashed graph with maximum weights.
@@ -482,7 +508,7 @@ def get_high_ranked_neighborhoods(self, fcgnx_file, sorted_weights_idx, n_weights=3):
g = pz.load(fcgnx_file)
g_hash = ml.neighborhood_hash(g)
bits = len(instructionSet.INSTRUCTION_CLASS_COLOR)

neighborhoods = []
remaining_weights = n_weights

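The sorted_weights_idx argument is presumably derived from a trained linear model's weight vector; a hedged sketch with a stand-in array (the actual provenance lies outside this hunk):

    import numpy as np

    w = np.array([0.2, -1.5, 0.7, 0.0])  # stand-in for clf.best_estimator_.coef_[0]
    sorted_weights_idx = np.argsort(np.absolute(w))[::-1]
    # array([1, 2, 0, 3]): indices of weights by descending magnitude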
