In [8]:
import pandas as pd
malData = pd.read_csv("MalwareData.csv", sep = "|")
malData = malData.dropna()

legit = malData[0:41323].drop(["legitimate"], axis=1)
mal = malData[41323::].drop(["legitimate"], axis=1)

print("The shape of the legit dataset is: %s samples, %s features" %(legit.shape[0],legit.shape[1]))
print("The shape of the malware dataset is: %s samples, %s features" %(mal.shape[0],mal.shape[1]))

The shape of the legit dataset is: 41323 samples, 56 features
The shape of the malware dataset is: 93041 samples, 56 features


In [9]:
print(malData.columns)

Index(['Name', 'md5', 'Machine', 'SizeOfOptionalHeader', 'Characteristics',
       'MajorLinkerVersion', 'MinorLinkerVersion', 'SizeOfCode',
       'SizeOfInitializedData', 'SizeOfUninitializedData',
       'AddressOfEntryPoint', 'BaseOfCode', 'BaseOfData', 'ImageBase',
       'SectionAlignment', 'FileAlignment', 'MajorOperatingSystemVersion',
       'MinorOperatingSystemVersion', 'MajorImageVersion', 'MinorImageVersion',
       'MajorSubsystemVersion', 'MinorSubsystemVersion', 'SizeOfImage',
       'SizeOfHeaders', 'CheckSum', 'Subsystem', 'DllCharacteristics',
       'SizeOfStackReserve', 'SizeOfStackCommit', 'SizeOfHeapReserve',
       'SizeOfHeapCommit', 'LoaderFlags', 'NumberOfRvaAndSizes', 'SectionsNb',
       'SectionsMeanEntropy', 'SectionsMinEntropy', 'SectionsMaxEntropy',
       'SectionsMeanRawsize', 'SectionsMinRawsize', 'SectionMaxRawsize',
       'SectionsMeanVirtualsize', 'SectionsMinVirtualsize',
       'SectionMaxVirtualsize', 'ImportsNbDLL', 'ImportsNb',
       'Impor

In [10]:
print(malData.head(5))

           Name                               md5  Machine  \
0   memtest.exe  631ea355665f28d4707448e442fbf5b8      332   
1       ose.exe  9d10f99a6712e28f8acd5641e3a7ea6b      332   
2     setup.exe  4d92f518527353c0db88a70fddcfd390      332   
3      DW20.EXE  a41e524f8d45f0074fd07805ff0c9b12      332   
4  dwtrig20.exe  c87e561258f2f8650cef999bf643a731      332   

   SizeOfOptionalHeader  Characteristics  MajorLinkerVersion  \
0                   224              258                   9   
1                   224             3330                   9   
2                   224             3330                   9   
3                   224              258                   9   
4                   224              258                   9   

   MinorLinkerVersion  SizeOfCode  SizeOfInitializedData  \
0                   0      361984                 115712   
1                   0      130560                  19968   
2                   0      517120                 621568   
3 

In [11]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split


In [47]:
data_in = malData.drop(['Name', 'md5', 'legitimate'], axis=1).values
labels = malData['legitimate'].values
extratrees = ExtraTreesClassifier().fit(data_in,labels)
select = SelectFromModel(extratrees,prefit=True)
data_in_new = select.transform(data_in)
print(data_in.shape,data_in_new.shape)

(134364, 54) (134364, 12)


In [48]:
import numpy as np
features = data_in_new.shape[1]
importances = extratrees.feature_importances_
indices = np.argsort(importances)[::-1]

for f in range(features):
    print('%d'%(f+1),malData.columns[2+indices[f]],importances[indices[f]])

1 DllCharacteristics 0.14473435180644728
2 Machine 0.10716313814113616
3 Characteristics 0.09780991937911077
4 SectionsMaxEntropy 0.06714910120613722
5 Subsystem 0.06365696655300689
6 VersionInformationSize 0.059470476859460636
7 ImageBase 0.055158723793457834
8 ResourcesMaxEntropy 0.04649348970659714
9 SizeOfOptionalHeader 0.046053787992099866
10 MajorSubsystemVersion 0.03861427654352859
11 ResourcesMinEntropy 0.036653641913143034
12 MajorOperatingSystemVersion 0.026199835549471754


In [49]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
legit_train, legit_test, mal_train, mal_test = train_test_split(data_in_new, labels, test_size=0.2)
classif =RandomForestClassifier(n_estimators=50)

classif.fit(legit_train, mal_train)

In [50]:
print('The score of the algorithm: ', classif.score(legit_test,mal_test)*100)

The score of the algorithm:  99.38972202582518


In [15]:
from sklearn.metrics import confusion_matrix
result = classif.predict(legit_test)
conf_mat = confusion_matrix(mal_test,result)

In [51]:
print('false positives: ',conf_mat[0][1]/sum(conf_mat)[0]*100)
print('false negatives: ',conf_mat[1][0]/sum(conf_mat)[1]*100)

false positives:  0.5021598272138229
false negatives:  0.7781635340596194


In [52]:
from sklearn.ensemble import GradientBoostingClassifier
grad_boost = GradientBoostingClassifier(n_estimators=50)
grad_boost.fit(legit_train,mal_train)

In [53]:
print('The score of the Gradient Boosting Classifier is ', grad_boost.score(legit_test,mal_test)*100)

The score of the Gradient Boosting Classifier is  98.83898336620399
