# Threshold

The purpose of this notebook is to check the confidence of the logistic regression (LR) classifier on both malware and goodware.

To test this, a classifier is trained on the first dataset, and the remaining datasets are used to check how confidently it classifies both malware and goodware.

In [1]:
import lib.data_loading as jcfg_data_loading
import pandas as pd
from IPython.display import display
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import numpy as np

dataset_name = "dataset"
numb_datasets = 6

# Read "<dataset_name><i>.csv" for every i in [0, numb_datasets) and keep the
# resulting frames in order. Per the original author's note, position i holds
# the samples up to the (i+1)-th submission of the target sample.
datasets = []
for i in range(numb_datasets):
    csv_path = dataset_name + str(i) + '.csv'
    # Index by md5 and discard rows that have no `imports` value.
    frame = pd.read_csv(csv_path).set_index('md5').dropna(subset=['imports'])
    datasets.append(frame)
    print('Final {0}th dataset size: {1}'.format(i, len(frame)))

Final 0th dataset size: 2798
Final 1th dataset size: 31
Final 2th dataset size: 22
Final 3th dataset size: 121
Final 4th dataset size: 286
Final 5th dataset size: 465


In [20]:
# Bag-of-words settings.
# A token is any run of characters that contains no semicolon.
count_vec_pattern = u'[^;]+'
# Integer min_df: a token must occur in at least this many samples.
min_df = 2
# Float max_df: a token may occur in at most this fraction of the samples.
max_df = 0.28

# The first dataset is used for training; all later ones are pooled together
# and kept for the confidence checks.
train_test = [datasets[0], pd.concat(datasets[1:])]

# Vectorizer that turns the `imports` strings into token counts.
cv = CountVectorizer(
    token_pattern=count_vec_pattern,
    min_df=min_df,
    max_df=max_df,
)

# Fit the vocabulary on the training imports and pair the dense count matrix
# with the `malware` labels.
train_features = cv.fit_transform(train_test[0].imports).toarray()
train_labels = train_test[0].malware.values
train_set = [train_features, train_labels]

# Train the logistic regression classifier; leaving the fit call as the last
# expression shows the fitted estimator as the cell output.
lr = LogisticRegression()
lr.fit(train_features, train_labels)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [18]:
from scipy import stats
# Vectorize the held-out rows labelled as malware, reusing the vocabulary
# already fitted by `cv`.
held_out = train_test[1]
malware = cv.transform(held_out[held_out.malware == 1].imports).toarray()

# Second predict_proba column is read as the malware probability
# (assumes lr.classes_ is ordered [0, 1] -- TODO confirm).
prob = lr.predict_proba(malware)
results = pd.DataFrame(data=prob[:, 1], columns=['malware_stats'])

# Summary statistics: all malware samples, then those scored at or above 0.5,
# then those scored below 0.5.
at_or_above_half = results[results.malware_stats >= .5]
below_half = results[results.malware_stats < .5]
for subset in (results, at_or_above_half, below_half):
    display(subset.describe())

Unnamed: 0,malware_stats
count,464.0
mean,0.77722
std,0.274033
min,0.000556
25%,0.7365
50%,0.873874
75%,0.968984
max,0.999997


Unnamed: 0,malware_stats
count,386.0
mean,0.888656
std,0.109443
min,0.502973
25%,0.865772
50%,0.912752
75%,0.978575
max,0.999997


Unnamed: 0,malware_stats
count,78.0
mean,0.225755
std,0.146076
min,0.000556
25%,0.09639
50%,0.227315
75%,0.368092
max,0.491232


In [19]:
# Vectorize the held-out rows labelled as goodware, reusing the vocabulary
# already fitted by `cv`.
held_out = train_test[1]
goodware = cv.transform(held_out[held_out.malware == 0].imports).toarray()

# First predict_proba column is read as the goodware probability
# (assumes lr.classes_ is ordered [0, 1] -- TODO confirm).
prob = lr.predict_proba(goodware)
results = pd.DataFrame(data=prob[:, 0], columns=['goodware_stats'])

# Summary statistics: all goodware samples, then those scored at or above 0.5,
# then those scored below 0.5.
at_or_above_half = results[results.goodware_stats >= .5]
below_half = results[results.goodware_stats < .5]
for subset in (results, at_or_above_half, below_half):
    display(subset.describe())

Unnamed: 0,goodware_stats
count,461.0
mean,0.797655
std,0.272774
min,0.00296
25%,0.746578
50%,0.943437
75%,0.993119
max,1.0


Unnamed: 0,goodware_stats
count,386.0
mean,0.904315
std,0.120187
min,0.504271
25%,0.817171
50%,0.961301
75%,0.99568
max,1.0


Unnamed: 0,goodware_stats
count,75.0
mean,0.248714
std,0.150402
min,0.00296
25%,0.125309
50%,0.252112
75%,0.385716
max,0.499699
