Label samples with malware classes using vendors and AVClass output.

In [1]:
import pandas as pd
import numpy as np
import ast

In [2]:
# Location and file-name conventions for the CSV data used in this notebook.
data_folder = '../data/csv_data/'
extension = '.csv.gz'


def build_path(x):
    """Return the path of file name ``x`` inside ``data_folder``.

    ``x`` is appended to ``data_folder`` verbatim: no separator or extension
    is added, so callers pass the complete file name.
    """
    return data_folder + x


# Tab-separated file read into ``malware_verbose`` further down.
av_class_file = build_path('malware.verbose')

In [3]:
# The file is tab-separated and is read without a header row, so the
# columns are named explicitly once the frame is loaded.
malware_verbose = pd.read_csv(
    av_class_file,
    header=None,
    sep='\t',
)
malware_verbose.columns = [
    'md5',
    'classification',
    'is_pup',
]

In [4]:
# Keep only the rows whose is_pup flag is 0, then discard the flag column.
malware = malware_verbose.loc[malware_verbose['is_pup'] == 0].drop('is_pup', axis=1)
# Each classification cell is parsed with ast.literal_eval, i.e. it is
# presumably a Python literal stored as text in the input file.
malware['classification'] = malware['classification'].apply(ast.literal_eval)

In [5]:
# Show how many rows carry each is_pup value (before the is_pup == 0 filter).
is_pup_counts = malware_verbose['is_pup'].value_counts()
display(is_pup_counts)

0    127854
1     37539
Name: is_pup, dtype: int64

In [6]:
# Turn each sample's list of (class, count) pairs into one dict holding the
# sample's md5 plus, for every class, that class's share of the sample's
# total count (count / sum of all counts for the sample).
#
# The two needed columns are iterated directly with zip(); iterrows() would
# build a throwaway Series for every row and yields an index that is unused.
malware_labeled = []
for md5, class_counts in zip(malware['md5'], malware['classification']):
    total = sum(count for _, count in class_counts)
    sample = {'md5': md5}
    for label, count in class_counts:
        # An empty classification never reaches this division, so such a
        # sample keeps only its md5 key.
        sample[label] = count / total
    malware_labeled.append(sample)

In [7]:
# One row per labeled sample and one column per class key; classes missing
# from a sample's dict become NaN. The md5 column then becomes the index.
malware_classes = pd.DataFrame(malware_labeled)
malware_classes = malware_classes.set_index('md5')

In [8]:
# Load the filtered malwr file info, index it by 'link', and keep only the
# md5 column (as a one-column DataFrame).
malwr_info = pd.read_csv(build_path('malwr_file_info_filtered' + extension))
malwr_info = malwr_info.set_index('link')[['md5']]

In [9]:
# Match each malwr entry's md5 column against the index of malware_classes;
# the inner join keeps only entries that have a match.
joined = malwr_info.join(malware_classes, on='md5', how='inner')
# md5 served only as the join key, so it is removed; rows in which every
# remaining value is NaN are dropped as well.
malware_classes = joined.drop('md5', axis=1).dropna(how='all')

In [10]:
# Per-class summary of the labeled frame:
#   1. number of rows with a non-null value for the class,
#   2. that number as a fraction of all rows,
#   3. the average value of the class over the rows where it is non-null.
labeled_counts = malware_classes.count()
display(labeled_counts)
display(labeled_counts / len(malware_classes))
display(malware_classes.sum() / labeled_counts)

# Write the labeled frame out as a gzip-compressed CSV.
malware_classes.to_csv(
    build_path('malware_labeled.csv.gz'),
    compression='gzip',
)

other       70923
ransom      12489
spyware     29118
trojan     153578
virus       28565
worm        31286
dtype: int64

other      0.453626
ransom     0.079880
spyware    0.186240
trojan     0.982289
virus      0.182703
worm       0.200106
dtype: float64

other      0.230599
ransom     0.194829
spyware    0.155026
trojan     0.762199
virus      0.250492
worm       0.282318
dtype: float64