In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import StratifiedKFold
from scipy import interp
from itertools import cycle
import lib.helpers as jcfg_helpers

# All input CSVs sit in one folder, addressed relative to the notebook.
data_folder = '../data/'
dlls_file, malware_file, goodware_file = (
    data_folder + csv_name
    for csv_name in ('dlls.csv.gz',
                     'malware_samples.csv.gz',
                     'goodware_samples.csv.gz')
)

In [2]:
# Load the three CSVs, each keyed by its 'link' column.
dlls = pd.read_csv(dlls_file).set_index('link')
# Tag every sample with its class label: 1 for malware, 0 for goodware.
malware = pd.read_csv(malware_file).set_index('link').assign(malware=1)
goodware = pd.read_csv(goodware_file).set_index('link').assign(malware=0)

In [3]:
# Create the test and train sets (without temporal consistency).
# Fixed seed so that Restart & Run All always yields the same split (and the
# same downstream metrics).
RANDOM_STATE = 42

# Balance the classes by capping both at the size of the smaller one.
upper_limit = min(len(goodware), len(malware))
# Use the first n samples of each class. This probably has to change to
# something better, e.g. malware.sample(n=upper_limit, random_state=RANDOM_STATE).
dataset = pd.concat([malware[:upper_limit], goodware[:upper_limit]])
# Attach the DLL list of each sample (joined on the shared 'link' index), keep
# only the label and the feature column, and drop samples with a missing value.
# Chaining dropna() avoids mutating a column-sliced frame in place.
dataset = dataset.join(dlls)[['malware', 'dlls']].dropna()
# Seeded and stratified: the partition is reproducible and both sides keep the
# same malware/goodware ratio even after rows were dropped above.
train, test = train_test_split(
    dataset, random_state=RANDOM_STATE, stratify=dataset['malware'])

In [4]:
# Count vectorizer setup: a token is any run of characters that are not ';'.
cv_token_pattern = u'[^;]+'
# The vocabulary is capped at 1000 features.
cv = CountVectorizer(max_features=1000, token_pattern=cv_token_pattern)

# Labels come straight from the 'malware' column of each split.
train_Y, test_Y = train['malware'], test['malware']
# The vocabulary is fitted on the training split only; the test split is then
# encoded with that same fitted vectorizer.
train_X = cv.fit_transform(train['dlls'])
test_X = cv.transform(test['dlls'])

In [None]:
%%time
# Train a logistic regression (constructor defaults) on the training matrix.
clf = LogisticRegression()
clf.fit(train_X, train_Y)
# fit() returns the estimator itself, so `lr` refers to the same fitted model.
lr = clf

In [None]:
# Headline rates on the held-out test set, derived from the confusion matrix.
score = confusion_matrix(test_Y, lr.predict(test_X))
fp_rate, fn_rate, cc_rate, det_rate = jcfg_helpers.calc_ratios(score)
print('CC:\t{:.4f}'.format(cc_rate))
print('DR:\t{:.4f}'.format(det_rate))
print('FP:\t{:.4f}'.format(fp_rate))
print('FN:\t{:.4f}'.format(fn_rate))

# Score the whole test set once, then split the rows by their true label.
# Boolean masks over the label values select rows positionally, independent of
# the Series index.
test_proba = lr.predict_proba(test_X)
is_malware = test_Y.values == 1
is_goodware = test_Y.values == 0
malware_only = test_proba[is_malware]
goodware_only = test_proba[is_goodware]

# Column 1 is the probability of label 1 (malware), column 0 of label 0
# (goodware), so both summaries describe the probability given to the true class.
print('\nMalware stats:')
display(pd.Series(malware_only[:, 1]).describe())
print('\nGoodware stats:')
display(pd.Series(goodware_only[:, 0]).describe())

# One labelled histogram per class so the two panels can be told apart.
fig, (ax_malware, ax_goodware) = plt.subplots(2, 1, figsize=(10, 4), dpi=100)
ax_malware.hist(malware_only[:, 1], bins=20, alpha=0.75)
ax_malware.set_title('Malware samples: predicted P(malware)')
ax_malware.set_ylabel('Samples')
ax_goodware.hist(goodware_only[:, 0], bins=20, alpha=0.75)
ax_goodware.set_title('Goodware samples: predicted P(goodware)')
ax_goodware.set_xlabel('Predicted probability of the true class')
ax_goodware.set_ylabel('Samples')
fig.tight_layout()  # keep the titles from overlapping the panel above
plt.show()

In [None]:
%%time
# Trying ROC: 10-fold stratified cross-validation over the whole dataset,
# drawing one ROC curve per fold plus the mean curve with a +/- 1 std. dev. band.
skf = StratifiedKFold(n_splits=10)
classifier = LogisticRegression()

tprs = []
aucs = []
mean_fpr = np.linspace(0, 1, 100)  # shared FPR grid used to average the folds
# Encode every sample with the already-fitted vectorizer `cv`.
X = cv.transform(dataset.dlls)
y = dataset.malware
plt.figure(figsize=(10, 4), dpi=100)

# The fold index arrays get their own names so they do not overwrite the
# `train` / `test` DataFrames, and `y` is indexed with .iloc because the
# splitter yields row positions, not index labels.
for i, (train_idx, test_idx) in enumerate(skf.split(X, y)):
    fold_model = classifier.fit(X[train_idx], y.iloc[train_idx])
    probas_ = fold_model.predict_proba(X[test_idx])
    # Compute ROC curve and area under the curve for this fold
    fpr, tpr, _ = roc_curve(y.iloc[test_idx], probas_[:, 1])
    # Resample the fold's curve onto the shared grid (np.interp replaces the
    # deprecated scipy.interp alias) and pin its start at TPR = 0.
    tprs.append(np.interp(mean_fpr, fpr, tpr))
    tprs[-1][0] = 0.0
    roc_auc = auc(fpr, tpr)
    aucs.append(roc_auc)
    plt.plot(fpr, tpr, lw=1, alpha=0.3,
             label='ROC fold %d (AUC = %0.2f)' % (i, roc_auc))

# Diagonal reference line: a classifier that guesses at random.
plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
         label='Luck', alpha=.8)

# Mean curve across folds, pinned to end at TPR = 1.
mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)
plt.plot(mean_fpr, mean_tpr, color='b',
         label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
         lw=2, alpha=.8)

# Shade one standard deviation around the mean, clipped to the [0, 1] range.
std_tpr = np.std(tprs, axis=0)
tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
                 label=r'$\pm$ 1 std. dev.')

plt.xlim([-0.05, 1.05])
plt.ylim([-0.05, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()

In [None]:
# Quick look at the training labels as a raw array of values.
display(train_Y.values)

In [16]:
%%time
import sklearn.model_selection
import sklearn.datasets
import sklearn.metrics

import autosklearn.classification


def main():
    """Run a cross-validated auto-sklearn search and print the test accuracy.

    Takes no arguments: the data comes from the notebook-level names
    ``train_X`` / ``train_Y`` (fitting) and ``test_X`` / ``test_Y`` (scoring).
    The search is limited to the ``liblinear_svc`` estimator with no
    preprocessing, uses 10-fold cross-validation and is fitted with ROC AUC as
    the metric. Prints the selected models and the accuracy on the test set;
    returns nothing.
    """
    # Labels are handed over as plain arrays; feature matrices are used as-is.
    fit_features, fit_labels = train_X, train_Y.values
    eval_features, eval_labels = test_X, test_Y.values

    search_settings = dict(
        include_estimators=['liblinear_svc'],
        include_preprocessors=['no_preprocessing'],
        time_left_for_this_task=600,
        per_run_time_limit=30,
        tmp_folder='/tmp/autoslearn_cv_example_tmp',
        output_folder='/tmp/autosklearn_cv_example_out',
        delete_tmp_folder_after_terminate=False,
        resampling_strategy='cv',
        resampling_strategy_arguments={'folds': 10},
        ensemble_size=1,
        initial_configurations_via_metalearning=0,
    )
    searcher = autosklearn.classification.AutoSklearnClassifier(**search_settings)

    # fit() modifies its inputs in place while refit() needs the untouched data,
    # so each call receives its own copy. In practice, one should reload the data.
    searcher.fit(
        fit_features.copy(),
        fit_labels.copy(),
        dataset_name='malwr',
        metric=autosklearn.metrics.roc_auc,
    )
    # fit() only trains models on the individual cross-validation folds;
    # refit() retrains every model of the final ensemble on the whole
    # training set.
    searcher.refit(fit_features.copy(), fit_labels.copy())

    print(searcher.show_models())

    test_predictions = searcher.predict(eval_features)
    test_accuracy = sklearn.metrics.accuracy_score(eval_labels, test_predictions)
    print("Accuracy score", test_accuracy)


if __name__ == '__main__':
    main()

You are already timing task: index_run3
You are already timing task: index_run3
You are already timing task: index_run3
You are already timing task: index_run3
You are already timing task: index_run3
You are already timing task: index_run4
You are already timing task: index_run4
You are already timing task: index_run5




You are already timing task: index_run5




You are already timing task: index_run5




You are already timing task: index_run5
You are already timing task: index_run5
You are already timing task: index_run5




You are already timing task: index_run6
You are already timing task: index_run6
You are already timing task: index_run6
You are already timing task: index_run6
You are already timing task: index_run6
You are already timing task: index_run6
You are already timing task: index_run7
You are already timing task: index_run7
You are already timing task: index_run7




You are already timing task: index_run7
You are already timing task: index_run7
You are already timing task: index_run7
You are already timing task: index_run7


[(1.000000, SimpleClassificationPipeline({'balancing:strategy': 'none', 'classifier:__choice__': 'liblinear_svc', 'imputation:strategy': 'most_frequent', 'one_hot_encoding:use_minimum_fraction': 'False', 'preprocessor:__choice__': 'no_preprocessing', 'rescaling:__choice__': 'none', 'classifier:liblinear_svc:C': 249.47479984292363, 'classifier:liblinear_svc:dual': 'False', 'classifier:liblinear_svc:fit_intercept': 'True', 'classifier:liblinear_svc:intercept_scaling': 1, 'classifier:liblinear_svc:loss': 'squared_hinge', 'classifier:liblinear_svc:multi_class': 'ovr', 'classifier:liblinear_svc:penalty': 'l2', 'classifier:liblinear_svc:tol': 0.016275217185384647},
dataset_properties={
  'task': 1,
  'sparse': True,
  'multilabel': False,
  'multiclass': False,
  'target_type': 'classification',
  'signed': False})),
]
Accuracy score 0.690997677701
CPU times: user 36 s, sys: 587 ms, total: 36.6 s
Wall time: 10min 18s
