# LR Big Dataset

Trying an LR (logistic regression) classifier with the large import dataset.

In [1]:
import pandas as pd
import numpy as np
import math
from IPython.display import display
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import lib.helpers as jcfg_helpers


# Load the labeled import lists, parse the `date` column and use it as a
# chronologically sorted index.
samples = (
    pd.read_csv('data/pe32_imports_labeled.csv')
    .assign(date=lambda frame: pd.to_datetime(frame['date'], format='%Y/%m/%d'))
    .set_index('date')
    .sort_index()
)
# Optional filter (disabled): keep only samples with at least x imports
# samples = samples[samples.imports.str.count(';') > 19]

# Split by the `malware` label column (0 -> goodware, 1 -> malware).
goodware = samples.loc[samples.malware == 0]
malware = samples.loc[samples.malware == 1]
print('Total samples: {}'.format(len(samples)))

Total samples: 102382


In [144]:
# Dataset balancing
# Builds class-balanced training/validation sets by taking the same number of
# rows, by position, from `goodware` and from `malware`.
test_size = 0.1

# Sizing constants that none of the visible cells read; kept in case another
# cell still relies on them.
g_validation = 27000
# m_validation = 

g_training = 3000
m_training = .1 * g_training

# Find where to cut the array from the size of goodware, which is the limiting factor
start = math.floor(len(goodware) * (1 - test_size))
end = math.floor(len(goodware) * test_size)

# Positional slices truncate silently when a frame is too short, which would
# leave the classes unbalanced without any error. Fail loudly instead.
assert start + end <= len(goodware), 'goodware is too small for the requested split'
assert start + end <= len(malware), 'malware is too small to mirror the goodware split'

# First `start` rows of each class train the model; the next `end` rows validate it.
training = pd.concat([goodware[:start], malware[:start]])
validation = pd.concat([goodware[start:start + end], malware[start:start + end]])
# Alternative that was tried: training, validation = train_test_split(samples, test_size=test_size)

In [145]:
# Number of validation rows labeled as malware (sanity check of the split).
len(validation.loc[validation.malware == 1])

3450

## Raw LR, w/o Feature Selection

In [146]:
# Count Vectorizer Stuff
# A token is any run of characters other than ';', i.e. one entry of the
# ';'-separated `imports` string.
cv_token_pattern = u'[^;]+'
# As a float, min_df is a document-frequency proportion: tokens seen in fewer
# documents than this share are dropped from the vocabulary.
min_df=0.0001

cv = CountVectorizer(token_pattern=cv_token_pattern, min_df=min_df)
# The vocabulary is learned on the training set only and reused for validation.
train_X = cv.fit_transform(training.imports)
train_Y = training.malware
test_X = cv.transform(validation.imports)
test_Y = validation.malware

# len(cv.vocabulary_) is the number of features; it avoids get_feature_names(),
# which was removed in scikit-learn 1.2, and does not build a throw-away list.
print('Total features: {}'.format(len(cv.vocabulary_)))
print('Malware in training: {}\nGoodware in training: {}'.format(len(training[training.malware == 1]),
                                                                 len(training[training.malware == 0])))
print('Malware in validation: {}\nGoodware in validation: {}'.format(len(validation[validation.malware == 1]),
                                                                     len(validation[validation.malware == 0])))

Total features: 22141
Malware in training: 31058
Goodware in training: 31058
Malware in validation: 3450
Goodware in validation: 3450


In [147]:
# Logistic Regression Stuff
# Default hyper-parameters, fitted on whatever train_X / train_Y currently hold.
lr = LogisticRegression()
lr = lr.fit(train_X, train_Y)

In [148]:
# Scoring
# Confusion matrix of the validation predictions, plus the FP/FN ratios the
# project helper derives from it.
predicted_Y = lr.predict(test_X)
score = confusion_matrix(test_Y, predicted_Y)
fp_rate, fn_rate = jcfg_helpers.calc_ratios(score)

# Mean accuracy on the validation set, reported next to the two ratios.
accuracy = lr.score(test_X, test_Y)
report = 'Score:\t{:.2f}%\nFP:\t{:.2f}%\nFN:\t{:.2f}%'
print(report.format(accuracy * 100, fp_rate * 100, fn_rate * 100))

Score:	81.39%
FP:	14.56%
FN:	21.83%


## LR with Feature Selection

### Malware/Goodware only features

Taking the imports that are present only in goodware or only in malware (from 1.5) and using them as features for the LR.

In [153]:
# Getting the imports
# Series.from_csv was removed in pandas 1.0. read_csv with the same defaults
# (no header row, first column as index, date parsing attempted on the index),
# followed by taking the first data column, is the documented replacement.
malware_features = pd.read_csv('data/malware_only_features.csv',
                               header=None, index_col=0, parse_dates=True).iloc[:, 0]
goodware_features = pd.read_csv('data/goodware_only_features.csv',
                                header=None, index_col=0, parse_dates=True).iloc[:, 0]
features = pd.concat([malware_features, goodware_features])
# Keep only features that appear in the training vocabulary. The keys of
# cv.vocabulary_ are the same terms get_feature_names() used to return
# (that method was removed in scikit-learn 1.2).
features = features[features.isin(list(cv.vocabulary_))]
print('Goodware features: {}\nMalware features: {}'.format(
    len(goodware_features[goodware_features.isin(features)]),
    len(malware_features[malware_features.isin(features)])))

Goodware features: 7827
Malware features: 153


In [154]:
# Count Vectorizer Stuff
# Re-vectorize the same split, but restricted to the pre-selected `features`.
cv_token_pattern = u'[^;]+'
# NOTE: scikit-learn ignores min_df when a fixed vocabulary is supplied.
min_df=1

cv2 = CountVectorizer(token_pattern=cv_token_pattern, min_df=min_df, vocabulary=features)

# With a fixed vocabulary there is nothing to learn, so transform() is enough.
train_X = cv2.transform(training.imports)
train_Y = training.malware
test_X = cv2.transform(validation.imports)
test_Y = validation.malware

# vocabulary_ is populated by the transform() calls above; its length is the
# feature count (get_feature_names() was removed in scikit-learn 1.2).
print('Total features: {}'.format(len(cv2.vocabulary_)))
print('Malware in training: {}\nGoodware in training: {}'.format(len(training[training.malware == 1]),
                                                                 len(training[training.malware == 0])))
print('Malware in validation: {}\nGoodware in validation: {}'.format(len(validation[validation.malware == 1]),
                                                                     len(validation[validation.malware == 0])))

Total features: 7980
Malware in training: 31058
Goodware in training: 31058
Malware in validation: 3450
Goodware in validation: 3450


In [155]:
# Logistic Regression Stuff
# Same default model as before, fitted on the current train_X / train_Y.
lr2 = LogisticRegression()
lr2 = lr2.fit(train_X, train_Y)

In [156]:
# Scoring
score = confusion_matrix(test_Y, lr2.predict(test_X))
# NOTE(review): calc_ratios lives in lib.helpers and is not shown here; confirm
# which denominators it uses before reading fp_rate / fn_rate as classic rates.
fp_rate, fn_rate = jcfg_helpers.calc_ratios(score)

display(score)
# Ratio of caught to missed malware. Row 1 of the confusion matrix holds the
# samples whose true label is 1; column 1 are those predicted as 1.
tp, fn = int(score[1][1]), int(score[1][0])
if fn:
    tp_fn_ratio = tp / fn
else:
    # No missed malware: report the limit explicitly instead of dividing by
    # zero (which only yields inf/nan through a numpy RuntimeWarning).
    tp_fn_ratio = float('inf') if tp else float('nan')
display(tp_fn_ratio)
print('Score:\t{:.2f}%\nFP:\t{:.2f}%\nFN:\t{:.2f}%'.format(
    lr2.score(test_X, test_Y) * 100, fp_rate * 100, fn_rate * 100))

array([[ 319, 3131],
       [   0, 3450]])



inf

Score:	54.62%
FP:	47.58%
FN:	0.00%


### Common only features

Taking the imports that are common to both goodware and malware (from 1.5) and using them as features for the LR.

In [97]:
# Getting the imports
# Series.from_csv was removed in pandas 1.0. read_csv with the same defaults
# (no header row, first column as index, date parsing attempted on the index),
# followed by taking the first data column, is the documented replacement.
common_features = pd.read_csv('data/common_features.csv',
                              header=None, index_col=0, parse_dates=True).iloc[:, 0]
# Keep only features that appear in the training vocabulary. The keys of
# cv.vocabulary_ are the terms get_feature_names() used to return (that method
# was removed in scikit-learn 1.2).
features2 = common_features[common_features.isin(list(cv.vocabulary_))]

In [98]:
# Count Vectorizer Stuff
# Re-vectorize the same split, restricted to the common-feature list `features2`.
cv_token_pattern = u'[^;]+'
# NOTE: scikit-learn ignores min_df when a fixed vocabulary is supplied.
min_df=1

cv3 = CountVectorizer(token_pattern=cv_token_pattern, min_df=min_df, vocabulary=features2)

# With a fixed vocabulary there is nothing to learn, so transform() is enough.
train_X = cv3.transform(training.imports)
train_Y = training.malware
test_X = cv3.transform(validation.imports)
test_Y = validation.malware

# vocabulary_ is populated by the transform() calls above; its length is the
# feature count (get_feature_names() was removed in scikit-learn 1.2).
print('Total features: {}'.format(len(cv3.vocabulary_)))
print('Malware in training: {}\nGoodware in training: {}'.format(len(training[training.malware == 1]),
                                                                 len(training[training.malware == 0])))
print('Malware in validation: {}\nGoodware in validation: {}'.format(len(validation[validation.malware == 1]),
                                                                     len(validation[validation.malware == 0])))

Total features: 16190
Malware in training: 2700
Goodware in training: 27000
Malware in validation: 300
Goodware in validation: 3000


In [99]:
# Logistic Regression Stuff
# Default-configured model, fitted on the current train_X / train_Y.
lr3 = LogisticRegression()
lr3 = lr3.fit(train_X, train_Y)

In [100]:
# Scoring
# Confusion matrix on the validation set and the helper-derived FP/FN ratios.
predicted_Y = lr3.predict(test_X)
score = confusion_matrix(test_Y, predicted_Y)
fp_rate, fn_rate = jcfg_helpers.calc_ratios(score)

# Mean validation accuracy, printed alongside the two ratios as percentages.
accuracy = lr3.score(test_X, test_Y)
report = 'Score:\t{:.2f}%\nFP:\t{:.2f}%\nFN:\t{:.2f}%'
print(report.format(accuracy * 100, fp_rate * 100, fn_rate * 100))

Score:	92.97%
FP:	27.33%
FN:	6.06%


### One-sided Features

Taking imports whose presence is noticeably higher in one class (malware or goodware) than in the other (one-sided imports).

In [7]:
# Calculate one-sided features
cv_token_pattern = u'[^;]+'
min_df=1

# Learn the full vocabulary of the training imports (fit returns the vectorizer).
cv4 = CountVectorizer(token_pattern=cv_token_pattern, min_df=min_df).fit(training.imports)

# Get the occurrences of each feature, separately for each class
goodware_rows = training[training.malware == 0]
malware_rows = training[training.malware == 1]
gtotal = len(goodware_rows)
gcount = np.sum(cv4.transform(goodware_rows.imports), axis=0, dtype=int)
mtotal = len(malware_rows)
mcount = np.sum(cv4.transform(malware_rows.imports), axis=0, dtype=int)
# Scale each class's counts by that class's largest count.
# NOTE(review): this is relative to the most frequent feature, not a share of
# samples; gtotal / mtotal are computed above but not used here. Confirm
# that this normalization is the intended one.
gratio = gcount / np.amax(gcount)
mratio = mcount / np.amax(mcount)
# Absolute gap between the two classes, rescaled so the largest gap equals 1
diff_ratio = np.abs(gratio - mratio)
feature_ratio = diff_ratio / np.amax(diff_ratio)

In [13]:
# The relevant imports, where the ratio > x
# Terms ordered by their column index in cv4's matrices; this is the list
# get_feature_names() used to return (that method was removed in scikit-learn 1.2).
cv4_terms = sorted(cv4.vocabulary_, key=cv4.vocabulary_.get)
# feature_ratio is indexed as (row, column) here, so [1] picks the column
# (= feature) indices whose ratio exceeds the threshold.
features3 = np.array(cv4_terms, dtype=str)[np.where(feature_ratio > 0.01)[1]]
print('Relevant features: {}'.format(len(features3)))

Relevant features: 3118


In [14]:
# Count Vectorizer Stuff
# Re-vectorize the same split, restricted to the one-sided imports in `features3`.
cv_token_pattern = u'[^;]+'
# NOTE: scikit-learn ignores min_df when a fixed vocabulary is supplied.
min_df=1

# Bigrams of the selected features were tried and abandoned (almost 1M features):
# features3 = np.append(features3, [";".join(i) for i in zip(features3, features3[1:])])
cv5 = CountVectorizer(token_pattern=cv_token_pattern, min_df=min_df, vocabulary=features3)

# The vocabulary is fixed, so nothing is learned from the data: plain
# transform() yields the same matrix as fit_transform() and matches how the
# other fixed-vocabulary vectorizers in this notebook are used.
train_X = cv5.transform(training.imports)
train_Y = training.malware
test_X = cv5.transform(validation.imports)
test_Y = validation.malware

# vocabulary_ is populated by the transform() calls above; its length is the
# feature count (get_feature_names() was removed in scikit-learn 1.2).
print('Total features: {}'.format(len(cv5.vocabulary_)))
print('Malware in training: {}\nGoodware in training: {}'.format(len(training[training.malware == 1]),
                                                                 len(training[training.malware == 0])))
print('Malware in validation: {}\nGoodware in validation: {}'.format(len(validation[validation.malware == 1]),
                                                                     len(validation[validation.malware == 0])))

Total features: 3118
Malware in training: 31058
Goodware in training: 31058
Malware in validation: 3450
Goodware in validation: 3450


In [15]:
# Logistic Regression Stuff
# Default-configured model, fitted on the current train_X / train_Y.
lr4 = LogisticRegression()
lr4 = lr4.fit(train_X, train_Y)

In [16]:
# Scoring
# Confusion matrix on the validation set and the helper-derived FP/FN ratios.
predicted_Y = lr4.predict(test_X)
score = confusion_matrix(test_Y, predicted_Y)
fp_rate, fn_rate = jcfg_helpers.calc_ratios(score)

# Mean validation accuracy, printed alongside the two ratios as percentages.
accuracy = lr4.score(test_X, test_Y)
report = 'Score:\t{:.2f}%\nFP:\t{:.2f}%\nFN:\t{:.2f}%'
print(report.format(accuracy * 100, fp_rate * 100, fn_rate * 100))

Score:	81.70%
FP:	14.96%
FN:	21.06%
