# Generate Static Imports

Extracts the imports from the samples, generating a CSV with link and imports.

**This notebook requires a running celery worker.** Start one before executing the cells: `celery -A tasks worker --loglevel=info -Ofair`

In [6]:
%%time
import pandas as pd
import lib.data_loading as jcfg_data_loading
import tasks as jcfg_celery_tasks
import numpy as np
from celery import group

samples = pd.read_csv('data/mined_data/pe32_samples.csv')
samples = jcfg_data_loading.set_pd_datetime_index(samples)

# Split the sample links into batches of ~BATCH_SIZE entries.
# At least one batch is always used: np.array_split raises ValueError when
# asked for 0 sections, which is what the integer division yields whenever
# there are fewer than BATCH_SIZE samples.
BATCH_SIZE = 5000
n_batches = max(1, len(samples) // BATCH_SIZE)
batches = np.array_split(samples.link, n_batches)

print('Total batches: {}'.format(n_batches))

# For each batch, dispatch one celery `extract_imports` task per link and
# write the collected results to a numbered checkpoint CSV (1-based).
for idx, batch in enumerate(batches):
    print('Starting batch {}...'.format(idx + 1))

    # Creates one job per link (~BATCH_SIZE jobs) and waits for them to finish.
    jobs = group([jcfg_celery_tasks.extract_imports.s(link) for link in batch])
    result = jobs.apply_async()
    # join() blocks until every task has completed and returns their results
    # in the same order as the tasks, so a second get() call is unnecessary.
    batch_imports = result.join()
    print('Jobs done.')

    print('Creating checkpoint...')
    # Build the frame directly from the links and results; going through a
    # 2-D numpy array would coerce both columns to one common dtype.
    imp_frame = pd.DataFrame(
        {'imports': batch_imports},
        index=pd.Index(list(batch), name='link'),
    )
    imp_frame.to_csv(path_or_buf='data/checkpoints/pe32_static_imports_{}.csv'.format(idx + 1))
    print('Checkpoint done.')

Total batches: 77
Starting batch 1...
Jobs done.
Creating checkpoint...
Checkpoint done.


In [None]:
# Concatenate the checkpoints
# The extraction cell writes one checkpoint per batch, numbered from 1 up to
# and including the total number of batches, so the range end must be
# N_CHECKPOINTS + 1 (range() excludes its stop value).
# NOTE: keep N_CHECKPOINTS in sync with the 'Total batches' value printed by
# the extraction cell.
N_CHECKPOINTS = 77
checkpoint_frames = [
    pd.read_csv('data/checkpoints/pe32_static_imports_{}.csv'.format(cp), dtype=str)
    for cp in range(1, N_CHECKPOINTS + 1)
]

final = pd.concat(checkpoint_frames)
final = final.set_index('link')
# Drop samples for which no imports were extracted (empty CSV fields load as NaN).
final = final.dropna()
final.to_csv(path_or_buf='data/mined_data/pe32_static_imports.csv')