In [1]:
import pandas as pd
import numpy as np

import sklearn.ensemble as ske
from sklearn.model_selection import train_test_split
from sklearn import tree, linear_model
from sklearn.feature_selection import SelectFromModel
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix

In [2]:
# Mount Google Drive into the Colab runtime at /content/drive so that
# files stored on Drive can be read through ordinary filesystem paths.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Location of the pipe-delimited malware dataset on the mounted Drive.
malware_simple_csv_path = '/content/drive/My Drive/MICS-CYBER-W207-Fall2022/For Students/Notebooks/malware-simple.csv'
data = pd.read_csv(malware_simple_csv_path, sep='|')

# Target vector: the `legitimate` column.
y = data['legitimate'].values

# Feature matrix: every remaining column except the two per-file identifiers
# (`Name`, `md5`) and the label itself.
X = data.drop(columns=['Name', 'md5', 'legitimate']).values

In [4]:
# Bare expression: the notebook renders a preview of the frame
# (first and last rows, with the middle rows and columns elided).
data

Unnamed: 0,Name,md5,Machine,SizeOfOptionalHeader,Characteristics,MajorLinkerVersion,MinorLinkerVersion,SizeOfCode,SizeOfInitializedData,SizeOfUninitializedData,...,ResourcesNb,ResourcesMeanEntropy,ResourcesMinEntropy,ResourcesMaxEntropy,ResourcesMeanSize,ResourcesMinSize,ResourcesMaxSize,LoadConfigurationSize,VersionInformationSize,legitimate
0,memtest.exe,631ea355665f28d4707448e442fbf5b8,332,224,258,9,0,361984,115712,0,...,4,3.262823,2.568844,3.537939,8797.000000,216,18032,0,16,1
1,ose.exe,9d10f99a6712e28f8acd5641e3a7ea6b,332,224,3330,9,0,130560,19968,0,...,2,4.250461,3.420744,5.080177,837.000000,518,1156,72,18,1
2,setup.exe,4d92f518527353c0db88a70fddcfd390,332,224,3330,9,0,517120,621568,0,...,11,4.426324,2.846449,5.271813,31102.272727,104,270376,72,18,1
3,DW20.EXE,a41e524f8d45f0074fd07805ff0c9b12,332,224,258,9,0,585728,369152,0,...,10,4.364291,2.669314,6.400720,1457.000000,90,4264,72,18,1
4,dwtrig20.exe,c87e561258f2f8650cef999bf643a731,332,224,258,9,0,294912,247296,0,...,2,4.306100,3.421598,5.190603,1074.500000,849,1300,72,18,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
138042,VirusShare_8e292b418568d6e7b87f2a32aee7074b,8e292b418568d6e7b87f2a32aee7074b,332,224,258,11,0,205824,223744,0,...,7,4.122736,1.370260,7.677091,14900.714286,16,81654,72,0,0
138043,VirusShare_260d9e2258aed4c8a3bbd703ec895822,260d9e2258aed4c8a3bbd703ec895822,332,224,33167,2,25,37888,185344,0,...,26,3.377663,2.031619,5.050074,6905.846154,44,67624,0,15,0
138044,VirusShare_8d088a51b7d225c9f5d11d239791ec3f,8d088a51b7d225c9f5d11d239791ec3f,332,224,258,10,0,118272,380416,0,...,22,6.825406,2.617026,7.990487,14981.909091,48,22648,72,14,0
138045,VirusShare_4286dccf67ca220fe67635388229a9f3,4286dccf67ca220fe67635388229a9f3,332,224,33166,2,25,49152,16896,0,...,10,3.421627,2.060964,4.739744,601.600000,16,2216,0,0,0


In [5]:
# Tree-based feature selection.
# An extra-trees ensemble is fitted to score every column; the seed keeps the
# importance ranking stable between runs.
# NOTE(review): the ensemble is fitted on the full X / y, i.e. before the
# train/test split performed in the next cell, so held-out rows influence
# which features are kept. Consider fitting on the training portion only.
fsel = ske.ExtraTreesClassifier(random_state=123)
fsel.fit(X, y)

# prefit=True reuses the already-fitted ensemble instead of refitting it.
# No explicit threshold is passed, so SelectFromModel's default cut-off applies.
model = SelectFromModel(fsel, prefit=True)

# Reduced feature matrix and the number of columns that survived selection.
X_new = model.transform(X)
nb_features = X_new.shape[1]

In [6]:
# Hold out 20% of the rows for evaluation.
# random_state pins the shuffle so the accuracy and confusion-matrix figures
# printed further down can be reproduced on a fresh Restart & Run All;
# 123 matches the seed already used for the feature-selection ensemble.
X_train, X_test, y_train, y_test = train_test_split(
    X_new, y, test_size=0.2, random_state=123
)

In [7]:
# Start with an empty container for the selected feature names;
# a later cell fills it in.
features = list()

# Report how many columns the selector kept.
print(f'{nb_features} features identified as important')

14 features identified as important


In [8]:
# Print the selected features from most to least important.
#
# Column names are taken from the same drop list that was used to build X,
# so position i in `feature_names` always corresponds to column i of X.
# (Previously the name was looked up as data.columns[2 + i], which silently
# depends on `Name` and `md5` being the first two CSV columns.)
feature_names = data.columns.drop(['Name', 'md5', 'legitimate'])
importances = fsel.feature_importances_

# argsort is ascending; reverse it and keep the top `nb_features` positions.
indices = np.argsort(importances)[::-1][:nb_features]
for rank, idx in enumerate(indices, start=1):
    print(f'{rank}. {feature_names[idx]} ({importances[idx]})')

1. DllCharacteristics (0.15455153297545743)
2. Characteristics (0.11955484507447357)
3. Machine (0.09485491184016995)
4. SectionsMaxEntropy (0.062438713361316636)
5. VersionInformationSize (0.05843576169934767)
6. Subsystem (0.057546065272329555)
7. ImageBase (0.054789317933551865)
8. ResourcesMaxEntropy (0.05194394109550367)
9. SizeOfOptionalHeader (0.04662988315739264)
10. MajorSubsystemVersion (0.04123564682039455)
11. SectionsMinEntropy (0.024154374842184873)
12. ResourcesMinEntropy (0.022715407415581503)
13. SectionsMeanEntropy (0.022550988494389846)
14. MajorOperatingSystemVersion (0.021238315268818073)


In [9]:
# Names of the selected features, in original column order.
#
# The list is rebuilt by assignment rather than appended to, so re-running
# this cell cannot duplicate entries. Names come from the same drop list used
# to build X, which removes the hard-coded "+2" column offset.
feature_names = data.columns.drop(['Name', 'md5', 'legitimate'])
top_indices = np.argsort(fsel.feature_importances_)[::-1][:nb_features]
features = [feature_names[i] for i in sorted(top_indices)]

In [10]:
# Candidate classifiers, keyed by the display name used in the results table.
# Every estimator that draws random numbers gets an explicit random_state so
# that the comparison (and therefore the "winner") is reproducible between
# runs; GaussianNB is deterministic and takes no seed.
algorithms = {
    "DecisionTree": tree.DecisionTreeClassifier(max_depth=10, random_state=123),
    "RandomForest": ske.RandomForestClassifier(n_estimators=50, random_state=123),
    "GradientBoosting": ske.GradientBoostingClassifier(n_estimators=50, random_state=123),
    "AdaBoost": ske.AdaBoostClassifier(n_estimators=100, random_state=123),
    "GNB": GaussianNB()
}

In [11]:
# Fit every candidate on the training split and record its accuracy on the
# held-out split, keyed by the candidate's display name.
results = {}
print('Now testing algorithms...\n')
for algo, clf in algorithms.items():
    # fit() returns the estimator itself, so fitting and scoring can be chained.
    score = clf.fit(X_train, y_train).score(X_test, y_test)
    results[algo] = score
    print("%s : %f %%" % (algo, score*100))

Now testing algorithms...

DecisionTree : 99.061934 %
RandomForest : 99.449475 %
GradientBoosting : 98.913437 %
AdaBoost : 98.670771 %
GNB : 70.300616 %


In [12]:
# The winner is the name with the highest recorded test accuracy
# (on a tie, the first one in insertion order is kept).
winner = max(results.keys(), key=lambda name: results[name])
print(f'Winning algorithm is {winner} with a {results[winner]*100}% accuracy')

Winning algorithm is RandomForest with a 99.44947482796088% accuracy


In [13]:
# Re-score the winning (already fitted) classifier on the held-out split and
# derive its error rates from the confusion matrix.
clf = algorithms[winner]
res = clf.predict(X_test)
mt = confusion_matrix(y_test, res)

# Rows of the matrix are true labels, columns are predictions, both ordered
# 0 then 1. With this indexing the "positive" class is label 1 (`legitimate`):
# a false positive is a label-0 sample predicted as 1, and a false negative is
# a label-1 sample predicted as 0.
true_zero_row, true_one_row = mt[0], mt[1]
fp_rate = true_zero_row[1] / float(sum(true_zero_row))
fn_rate = true_one_row[0] / float(sum(true_one_row))

print(f'False positive rate : {fp_rate*100}%')
print(f'False negative rate : {fn_rate*100}%')

False positive rate : 0.417353668590272%
False negative rate : 0.8656425262131187%


In [14]:
# Display the raw confusion matrix computed above
# (rows: true labels, columns: predicted labels).
mt

array([[19327,    81],
       [   71,  8131]])