# Malwr File Info
Filter the file info to remove empty entries, non-PE32 samples, and statistically irrelevant executable types.

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Location and suffix shared by every CSV data set used in this notebook.
data_folder = '../../data/csv_data/'
extension = '.csv.gz'


def build_path(x):
    """Return the path of the data set named *x*.

    The path is the data folder, followed by *x*, followed by the
    ``.csv.gz`` extension.
    """
    return data_folder + x + extension

In [3]:
# Number of most frequent file types to keep; rarer types are dropped.
top_file_types = 20

# Load the raw file info, index it by sample link and drop rows that have
# any missing value.
malwr_info = pd.read_csv(build_path('malwr_file_info')).set_index('link').dropna()

# Make sure all our samples are PE32
non_pe32 = ~malwr_info.file_type.str.startswith('PE32')
assert not non_pe32.any(), 'found samples whose file type is not PE32'

# value_counts() sorts by descending frequency, so the first entries of its
# index are the most common file types.
file_type_freq = malwr_info.file_type.value_counts()
frequent_types = file_type_freq.index[:top_file_types]
# Take an explicit copy: a column is added below, and writing to a filtered
# selection would otherwise raise SettingWithCopyWarning. This replaces the
# former `is_copy = False` assignment, which relied on an attribute that is
# no longer part of pandas.
malwr_info2 = malwr_info[malwr_info.file_type.isin(frequent_types)].copy()

# Regex alternation: a file type counts as compressed when its description
# contains any of these words.
compressed_list = '|'.join(['compressed', 'extracting'])
# Evaluate the regex once and reuse the mask for both the counts and the flag.
is_compressed = malwr_info2.file_type.str.contains(compressed_list)

compressed_size = is_compressed.sum()
uncompressed_size = (~is_compressed).sum()
i = len(malwr_info)   # samples before the frequency filter
k = len(malwr_info2)  # samples kept by the frequency filter
# Sanity check: every kept sample is either compressed or uncompressed.
assert compressed_size + uncompressed_size == k

# Flag each kept sample; the mask shares malwr_info2's index, so it aligns
# row by row.
malwr_info2['compressed'] = is_compressed

In [4]:
# Report the share of samples kept by the filter, then the uncompressed /
# compressed split among the kept samples.
for template, value in (
    ('Remaining types of samples: {:.2f}%', 100 * k / i),
    ('Uncompressed PE32: {}', uncompressed_size),
    ('Compressed PE32: {}', compressed_size),
):
    display(template.format(value))

'Remaining types of samples: 99.08%'

'Uncompressed PE32: 329906'

'Compressed PE32: 55032'

In [5]:
# Persist the filtered table as a gzip-compressed CSV.
filtered_path = build_path('malwr_file_info_filtered')
malwr_info2.to_csv(filtered_path, compression='gzip')