# Generate Samples

Generates a CSV with the date, MD5, link (reduced to the submission ID) and file type of every PE32 sample that was downloaded.

In [1]:
import pandas as pd
from os import walk

# Build the list of PE32 samples that were actually downloaded and save it as
# a CSV indexed by submission date.
#
# Inputs : data/header_analyses.csv  (one row per analysed sample)
#          data/analyses_gz/         (one downloaded file per submission ID)
# Output : data/mined_data/pe32_samples.csv
#
# Every step below creates a new frame instead of mutating a slice in place,
# so the cell can be re-run safely and never triggers SettingWithCopyWarning
# (which the old `samples.is_copy = False` hack was silencing).

headers = pd.read_csv(filepath_or_buffer='data/header_analyses.csv')
ori_len = len(headers)

# Filter PE32 samples. Values are cast to str first so missing or non-string
# entries compare as plain text (e.g. 'nan') and are simply rejected.
crit_pe32 = headers.file_type.astype(str).str.startswith('PE32 ', na=False)

headers = (
    headers[crit_pe32]
    # Remove unwanted columns
    .drop(['antivirus', 'file_name'], axis=1)
    # Keep just the submission ID: the third '/'-separated component of the
    # link. A malformed link yields NaN here and is then excluded below,
    # because it cannot match any downloaded file name.
    .assign(link=lambda df: df.link.str.split('/').str[2])
)
new_len = len(headers)

print('Filtered {} out of {} samples ({:.2f}%)'.format(
    new_len, ori_len, new_len / ori_len * 100 if ori_len else 0.0))

# Get all downloaded samples: the file names directly inside the download
# directory. Only the first entry produced by walk() is used, so any
# sub-directories are ignored instead of breaking the unpacking.
download_dir = 'data/analyses_gz'
top_level = next(walk(download_dir), None)
if top_level is None:
    # walk() yields nothing when the directory is missing or unreadable.
    raise FileNotFoundError(
        'Download directory not found or unreadable: {}'.format(download_dir))
downloaded = top_level[2]

# Get the links
links = list(headers.link)

# Get the intersection
available = set(downloaded).intersection(links)

print('Downloaded {} out of {} samples ({:.2f}%)'.format(
    len(available), len(links),
    len(available) / len(links) * 100 if links else 0.0))

# Create the final CSV with the downloaded samples, indexed by date.
# Rows keep the order of the input CSV; no explicit sort is applied.
samples = (
    headers[headers.link.isin(available)]
    .assign(date=lambda df: pd.to_datetime(df['date'], format='%d/%m/%Y'))
    .set_index('date')
)
samples.to_csv(path_or_buf='data/mined_data/pe32_samples.csv')

print('CSV Created')

Filtered 388702 out of 642698 samples (60.48%)
Downloaded 388513 out of 388702 samples (99.95%)
CSV Created
