# Generate AV Classification

Takes the samples generated in step 0 (`data/mined_data/pe32_samples.csv`) and creates a CSV that maps each sample link to every AV vendor and the classification that vendor assigned.

**This notebook uses celery**: start a worker before running it (`celery -A tasks worker --loglevel=info -Ofair`).

In [1]:
%%time
import pandas as pd
import lib.data_loading as jcfg_data_loading
import tasks as jcfg_celery_tasks
import numpy as np
from celery import group

samples = pd.read_csv('data/mined_data/pe32_samples.csv')
samples = jcfg_data_loading.set_pd_datetime_index(samples)

# Splitting in batches of ~10000 samples.
# Floor division keeps the same batch count as before for large inputs; the
# max() guard guarantees at least one batch, because np.array_split raises
# ValueError when asked for 0 sections (i.e. fewer than 10000 samples).
n_batches = max(1, len(samples) // 10000)
batches = np.array_split(samples.link, n_batches)

print('Total batches: {}'.format(n_batches))

# For each batch, use celery to inflate the gzip and extract the classification
for idx, batch in enumerate(batches):
    print('Starting batch {}...'.format(idx + 1))

    # Creates ~10000 jobs and wait for them to finish
    jobs = group([jcfg_celery_tasks.extract_av_classification.s(link) for link in batch])
    result = jobs.apply_async()
    # join() blocks until every task has finished and returns the results in
    # submission order, so a second result.get() round-trip is not needed.
    classifications = result.join()
    print('Jobs done.')

    print('Creating checkpoint...')
    # Build the frame straight from the results, one row per link. Pairing the
    # links with the results positionally avoids packing both into a 2-D numpy
    # array, which only works while every result is a uniform scalar-like object.
    class_frame = pd.DataFrame(data=list(classifications), index=pd.Index(list(batch), name='link'))
    class_frame.to_csv(path_or_buf='data/checkpoints/pe32_static_av_{}.csv'.format(idx + 1))
    # Drop the reference so the batch results can be freed before the next batch.
    classifications = []
    print('Checkpoint done.')

Total batches: 38
Starting batch 1...
Jobs done.
Creating checkpoint...
Checkpoint done.
Starting batch 2...
Jobs done.
Creating checkpoint...
Checkpoint done.
Starting batch 3...
Jobs done.
Creating checkpoint...
Checkpoint done.
Starting batch 4...
Jobs done.
Creating checkpoint...
Checkpoint done.
Starting batch 5...
Jobs done.
Creating checkpoint...
Checkpoint done.
Starting batch 6...
Jobs done.
Creating checkpoint...
Checkpoint done.
Starting batch 7...
Jobs done.
Creating checkpoint...
Checkpoint done.
Starting batch 8...
Jobs done.
Creating checkpoint...
Checkpoint done.
Starting batch 9...
Jobs done.
Creating checkpoint...
Checkpoint done.
Starting batch 10...
Jobs done.
Creating checkpoint...
Checkpoint done.
Starting batch 11...
Jobs done.
Creating checkpoint...
Checkpoint done.
Starting batch 12...
Jobs done.
Creating checkpoint...
Checkpoint done.
Starting batch 13...
Jobs done.
Creating checkpoint...
Checkpoint done.
Starting batch 14...
Jobs done.
Creating checkpoint...


In [10]:
# Concatenate the checkpoints
# Checkpoint files are numbered 1..38; every column is read as a string.
checkpoint_frames = [
    pd.read_csv('data/checkpoints/pe32_static_av_{}.csv'.format(cp), dtype=str)
    for cp in range(1, 39)
]

# Stack all checkpoints and key the rows by sample link.
final = pd.concat(checkpoint_frames).set_index('link')
# First column is obviously wrong, probably some malformed HTML, drop it
bogus_column = final.columns[0]
final = final.drop(bogus_column, axis=1)
final.to_csv(path_or_buf='data/mined_data/pe32_static_av.csv')