# Extract Classification

Take `md5_links.csv` and extract the antivirus classification.

Running this notebook will generate a CSV that contains the date, md5, link and classification from each vendor.

The value for each vendor can be empty (if the vendor was not present at the time of the scan), 'clean', or the malware name.

Takes at least 50min to run.

In [80]:
import numpy as np
import pandas as pd
import gevent
import datetime
from gevent import monkey; monkey.patch_all()
import lib.data_loading as jcfg_data_loading
from IPython.display import display

# Load the samples and parse the 'date' column into real timestamps.
samples = pd.read_csv('md5_links.csv')
samples['date'] = pd.to_datetime(samples['date'], format='%Y/%m/%d')
# Set date as index and drop the 'malware' column, which is not written out.
samples = samples.set_index('date').drop(['malware'], axis=1)


def _classify_links(links):
    """Run ``jcfg_data_loading.parse_av_classification`` on every link.

    Parameters
    ----------
    links : iterable
        Link values taken from the ``link`` column of ``samples``.

    Returns
    -------
    list of pandas.DataFrame
        One single-row frame per link, indexed by the link itself so the
        rows can be joined back onto ``samples`` with ``on='link'``.
    """
    return [
        pd.DataFrame(
            data=jcfg_data_loading.parse_av_classification(link), index=[link])
        for link in links
    ]


# Split the links into 100 buckets, one greenlet per bucket.
n_workers = 100
# Process each distinct link once: a repeated link would otherwise be parsed
# several times and produce a non-unique index on the right-hand side of the
# join below, multiplying the rows for that link.
unique_links = samples['link'].drop_duplicates().values
# Only the links are needed by the workers, so split the 1-D array rather
# than the whole DataFrame.
buckets = np.array_split(unique_links, n_workers)
print('[{0}] Spawning jobs...'.format(datetime.datetime.now()))
jobs = [gevent.spawn(_classify_links, bucket) for bucket in buckets]
print('[{0}] Done spawning.'.format(datetime.datetime.now()))
# raise_error=True re-raises a worker's exception here; without it a failed
# greenlet leaves ``value`` as None and the failure only shows up later as an
# unrelated TypeError while collecting the results.
gevent.joinall(jobs, raise_error=True)
print('[{0}] All jobs done.'.format(datetime.datetime.now()))
frames = [frame for job in jobs for frame in job.value]
if frames:
    result = samples.join(pd.concat(frames), on='link')
else:
    # pd.concat raises on an empty list; with no links there is nothing to join.
    result = samples.copy()
print('[{0}] Concat done.'.format(datetime.datetime.now()))
result.to_csv(path_or_buf='data/av_classes.csv')

[2017-03-08 18:17:10.692833] Spawning jobs...
[2017-03-08 18:17:10.693633] Done spawning.
[2017-03-08 18:17:10.915959] All jobs done.
[2017-03-08 18:17:10.947339] Concat done.
