# Malware classifier - Part 2

The following uses the previous datasets to create a Logistic Regression classifier to predict if a sample is goodware or badware. The target sample to test is **bbb445901d3ec280951ac12132afd87c**, which is considered malware by 2% (1/48) of vendors on the first submission, evolving to 80%+ by the last submission.

Each dataset includes samples split by submissions of the target sample (e.g. dataset0.csv includes submissions up to the first submission of **bbb445901d3ec280951ac12132afd87c**).

Load all the datasets

In [1]:
import lib.data_loading as jcfg_data_loading
import pandas as pd
from IPython.display import display
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix

dataset_name = "dataset"
numb_datasets = 6

datasets = []

# Read one CSV per submission of the target sample: position i holds the
# frame loaded from '<dataset_name><i>.csv', indexed by the sample md5.
for i in range(numb_datasets):
    csv_path = dataset_name + str(i) + '.csv'
    frame = pd.read_csv(csv_path).set_index('md5')
    # Rows without an 'imports' value cannot be vectorized later on
    frame = frame.dropna(subset=['imports'])
    datasets.append(frame)
    print('Final {0}th dataset size: {1}'.format(i, len(frame)))

Final 0th dataset size: 2798
Final 1th dataset size: 31
Final 2th dataset size: 22
Final 3th dataset size: 121
Final 4th dataset size: 286
Final 5th dataset size: 465


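Transform the frames into bag-of-words vectors, where each position counts how many times a given import appears in the sample (this is equivalent to a presence flag when every import is listed only once per sample)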

In [25]:
# Parameters for the bag of words
# Tokens are the pieces between semicolons
count_vec_pattern = u'[^;]+'
# A call must be present in at least x samples
count_vec_min_df = 2
# A call can't appear in more than x% of the samples
count_vec_max_df = 0.1

# Fraction of the samples used for training; the rest is for validation
training_size = 1.0
validation_size = 1 - training_size


def _split_train_validation(frame, validation_fraction):
    """Return a (train, validation) pair of frames taken from *frame*.

    When nothing is held out (validation_fraction <= 0) the split is
    skipped: recent scikit-learn releases raise ValueError for
    test_size=0.0. In that case every row goes to training, shuffled to
    mirror what train_test_split does by default, and the validation
    frame is empty.
    """
    if validation_fraction <= 0:
        return frame.sample(frac=1), frame.iloc[:0]
    train_frame, validation_frame = train_test_split(
        frame, test_size=validation_fraction)
    return train_frame, validation_frame


# Create one bag of words per submission
count_vec_array = [
    CountVectorizer(token_pattern=count_vec_pattern,
                    max_df=count_vec_max_df,
                    min_df=count_vec_min_df)
    for _ in range(numb_datasets)
]

# Create train and validation data
# For each submission the training includes all the
# previous submissions
final_datasets = []

for i in range(numb_datasets):
    # datasets[:i + 1] is just the first frame when i == 0, so the first
    # submission needs no special case
    cumulative = pd.concat(datasets[:i + 1])
    train_frame, validation_frame = _split_train_validation(
        cumulative, validation_size)
    final_datasets.append({
        # The vocabulary is learned from the training rows only
        'train_X': count_vec_array[i].fit_transform(train_frame.imports).toarray(),
        'train_Y': train_frame.malware.values,
        'validation_X': count_vec_array[i].transform(validation_frame.imports).toarray(),
        'validation_Y': validation_frame.malware.values
    })
    print('#Features for {0}th dataset: {1}'.format(i, len(count_vec_array[i].vocabulary_)))

#Features for 0th dataset: 10231
#Features for 1th dataset: 10262
#Features for 2th dataset: 10272
#Features for 3th dataset: 10752
#Features for 4th dataset: 11046
#Features for 5th dataset: 11598


Create the classifiers

In [26]:
# Logistic regression parameters
# C is the inverse of the regularization strength
c = 1000

lr_classifiers = []

# Train one classifier per cumulative dataset
for i in range(numb_datasets):
    model = LogisticRegression(C=c)
    lr_classifiers.append(model)
    training_data = final_datasets[i]
    model.fit(training_data['train_X'], training_data['train_Y'])

Score each classifier

In [27]:
# Validation metrics are only reported when part of the data was held out
if training_size != 1:
    for i in range(numb_datasets):
        classifier = lr_classifiers[i]
        held_out_X = final_datasets[i]['validation_X']
        held_out_Y = final_datasets[i]['validation_Y']

        score = classifier.score(held_out_X, held_out_Y)
        predictions = classifier.predict(held_out_X)
        # Rows are the true labels, columns the predicted labels
        matrix = confusion_matrix(held_out_Y, predictions)
        true_neg, false_pos = matrix[0][0], matrix[0][1]
        false_neg, true_pos = matrix[1][0], matrix[1][1]

        print('Score for classifier #{0}: {1}'.format(i, score))
        print('TN: {0}\tFN: {1}\tTP: {2}\tFP:{3}'.format(true_neg, false_neg, true_pos, false_pos))
        print()

Check how each classifier deals with the target sample

In [28]:
# Malware sample
target_link = 'MzIwZDgzMjY0YjQ5NGQxMjhkZjk1YjE0YTlkNGQ1OTE'
# Goodware sample
# target_link = 'ZmU4NWM2NDA2Y2VkNGU1YTljYzNkYjJhNmFhZGE1Mzg'

# Join the parsed imports with ';' so they match the token pattern
# used by the vectorizers
parsed_imports = jcfg_data_loading.parse_static_imports(target_link)
target_imports = ';'.join(parsed_imports)

# Print the class probabilities given by each submission's classifier,
# using the vectorizer that was fitted alongside it
for i in range(numb_datasets):
    vectorizer, classifier = count_vec_array[i], lr_classifiers[i]
    X = vectorizer.transform([target_imports]).toarray()
    print(classifier.predict_proba(X))

[[ 0.02391248  0.97608752]]
[[ 0.00406816  0.99593184]]
[[ 0.00108669  0.99891331]]
[[ 0.00124398  0.99875602]]
[[ 0.00627197  0.99372803]]
[[ 0.00416493  0.99583507]]


In [29]:
# Evaluate the first classifier on the samples of all later datasets,
# vectorized with the first vocabulary
later_samples = pd.concat(datasets[1:])
later_X = count_vec_array[0].transform(later_samples.imports).toarray()
later_Y = later_samples.malware.values

test = lr_classifiers[0].predict(later_X)
print(lr_classifiers[0].score(later_X, later_Y))
# Left as the last bare expression so the notebook displays the matrix
confusion_matrix(later_Y, test)

0.8


array([[371,  90],
       [ 95, 369]])