In [1]:
import numpy as np
import gzip
from os import listdir
from os.path import isfile, join

# Directory that the gzip-compressed analysis files are read from
gz_path = "data/analyses_gz"
# Names of all entries currently present in that directory
analyses = listdir(gz_path)

import numpy as np
import pandas as pd
from lib.data_loading import loadHeadersCSV
from lib.plotting import *
from lib.helpers import *

# Base table loaded from the header-analyses CSV; all other dataframes in this
# notebook are meant to be built from this one.
headers = loadHeadersCSV('data/header_analyses.csv')

In [2]:
# Filter only PE32 samples. str() lets non-string file_type entries pass
# through the check without raising.
crit_pe32 = headers.file_type.map(lambda x: str(x).startswith('PE32 '))
# Take an explicit copy so the column assignment below writes to an independent
# frame rather than to a slice of `headers`. This replaces the former
# `pe32.is_copy = False` hack, which newer pandas versions no longer support.
pe32 = headers[crit_pe32].copy()

# Change link: keep only the third '/'-separated component of each value
# (presumably the sample ID used as the analysis file name -- confirm).
pe32['link'] = pe32['link'].map(lambda x: str(x).split('/')[2])

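# Samples that have no entry in `analyses` (the listing of gz_path). The list
# below is hard-coded because recomputing it takes 20+ min; it was produced by:
# missing = [s for s in samples if s not in analyses]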
missing = ['OTBlNjFmNDRlZDI1NDZkMmI4M2I2N2Y4MGM4NjliOTQ', 'YTE0OGZlMjRkMTdjNGY2ZWE0ZjY4Y2FlNTcxMGRlOWI', 'OTQxOWM3YzY1MGQ0NDkzZmFhODRjOTU0NDY1MmMxOTI', 'OWVhYzAyYzM4YTQzNDkzNDhhNmRmNTMxOGUzMWVkOTA', 'NDc2NDhhY2RhNzk3NGM1MmJjNWJlNTE1NzdjZWM1YjU', 'NTM1MGVjZWM2N2UyNGViMzk3ODc0MzcwMTgyNjkzZTU', 'MmRlYzQwYTg1MDQ0NGIwZTllZjdiYjk5ZmVjNTcwODQ', 'NWJhNGE2ZDM0NDVmNDY5ZGE3YWZkYjc0ODBjNjgzZmU', 'MWMwNjEwZjYxM2IxNGVmMmFiNGY5MTY2Y2UwZmM3MTY', 'MWRkMjc0ZTA3MTI1NGNjMmE1ODUyZTRkNGJkMTkyOTU', 'ZWRiMTNlOWRhMGY2NGY4YTkzY2FhODIwYjQ2NmY4ODU', 'ZDg4Mjk5MjhlOGQ2NGJjMjliMTA3NjdkMmVhNjdjMmY', 'YmFhMDI1ZTY1NTg0NDgxMDg1ZDI1NWYyOTk1MjBjMjc', 'NWY3Y2E0MmJiMjMwNDI1MGEwYTRmMTk1NDc3OTMzMjE', 'NWExNTQyZmM3OGUxNDgzMGI4YmRkOTdhZmUxNTE1MDA', 'MjVlNGRiZmFmMDE2NDlmNmJmZDY0YjUxYzFjZDVjNjA', 'MzAxNTA3NjliNTBiNGMzMmI4MTM3MGRmMDdiYjdhMzY', 'N2YyYWU1MTNhNWY5NDBjOGE1ZTY2MGRiMGNkODVjNzQ', 'OTA3YjMyN2VjYmUyNDg4NGEwYzUzZjVlNjIyNmM3Nzg', 'MzhlMDE0YmQwYjE0NDMyM2IwOGIwZmZhNDcwNTNkNzY', 'ZGM5Yzc3NWFjZjg0NDY2ZmE5Mjk0ZGJiODhhN2VhNGM', 'OGQwYWIwMDc2ZmYyNGI1MDg5YTNkZjE4MjU5MGNkOGY', 'ZjRmZTlkZWNmNmI5NGZmZTgzYjg5ZDNlZTY4NmJmMDY', 'OTVjZTU4MzE3NDI2NDM4OWE2YTQxZjkwNDgwYWQ1NDE', 'ODQ4Yjk5NzE5NGJkNDY0OGE0MjIzZmQ1MDg1N2JlNjY', 'NDQ0MWFhNDMyMjVlNDBmYTkzYzZmNzMwM2I4YjRiODQ', 'YjliN2Y5ZDJlZjhhNGYyM2FkNDFlZWNmOGFmMmNjNjY', 'NDQ1NjljZWExNDBiNDEwNDlmMTBhN2M1MzcwMjkxOGI', 'YWM2YjM5YzQ2MDQzNGRjODliNjI2MTY5ZmE0OTZjZGI', 'MTVkZGY5OTdiMDllNGMxODlkNjBhODhkMWMzNGUxMDA', 'NWQ4YWJkYmM0ZDc1NGFlOWEwNGY5YjU3MWRmM2JkYWM', 'MTJmNWVkNTU4MmI5NDNjMDgyYzE2MWQ1M2EwZGYzYmY', 'YTk0OTFlNmEyNjA3NDVhZWIzMTNmNTU5ODllMDM4ODg', 'MjliMzM3MGNiNTI4NDllMDkzZDEwOTFlMGQzNTEwMzI', 'YjUwZGFjNDdlYzM0NGUwODg4ZmI5NDE1ODZmMWUxOTI', 'M2EzOGNlMjgwNTlkNDE3NjliMjU2NzYzOTg0NTQ4ZDM', 'MzY0NjM4MDQ1NjY5NGUwMDgwNWVhOWNlZjdjZjg5ODQ', 'M2FkOTNkODgwYTBhNDkyNGEzZWUxNTU2MTM4OTU1NjE', 'MWIzMDk5M2I1MTZkNGYyOTg1NmNlYzQ2ODQwNmE1MDk', 'ODUzYmQ5Y2NkNWY2NGJhY2FiYjNiM2RmOTNhMzg0MjA', 'NWQyMTBjNGExOWQxNGU1ZWFjZjM4YjdkM2IxNTM3NTE', 'NDk5OGZiMGRlZmVlNDBjNjkwYjE1MWJjZGE0MWQ1MTg', 
'ZTU5NWUxMTcyMjYwNGVkNjlkOGRmMjBjZTcwMDdjMDQ', 'NTNjYzNkYzhlODEyNGI1NmJhNWE5ZTUxMjU5YWVlZjE', 'MzU2MTBhMzRiYTI2NDdlYWIxMDg1ZmE1MGJjOTJjNjY', 'YWRmNTdiODFmOTAwNDM0N2I3NmVjNDI4Y2EwZTk1YWM', 'MmRlMzhiYWNmZDAzNDM4MjliNTc5NTBmMzYyMmIzZGQ', 'NzBjNzZjYjdhMDUwNDQxNmE1OWE0M2MyMzUyMzQ2ZTI', 'Y2RhM2Y3NTEwZTk2NGFjN2EwNDhmNDZhOTY2ZmVmODY', 'Y2NmNmM2ZGIyZjRmNGFhMGE5YjNiZDc5OTM4MzlmYTU', 'YTY2NjYyNGM2YzU3NGE3Yjk5NTFhMmVjZmUyYzYwZmU', 'ZWRhNTU3ZTk3MWFlNGQ4YzlkYmRkOTU2MjZjMjdlMGY', 'NDRmOWE4Y2U1MWRkNGY3Yzg3YTZiZGY2OGE0ZGYxMTk', 'ZGUwZTg0NTY0NmFiNDViOWIxNmU5NzZkMmMzYmZmNDg', 'MzM4YWRlZTNlODAxNGI1OTk2NzhiNTViYjZhNzhkMDE', 'ODExZDQ3OTJkNzdiNDExN2JkMzIzNTA2NzgyZjAzMDk', 'ZjY3MGVkYjQ0YmUwNGMxNmIzMTVhZDc5YzYwZjFjNzg', 'YWVhZjAzZTRmOWI0NDA5MDkyNjA5MGVlZDc1ZWJkNDc', 'NzJhZTJmZTRmNDhmNDhhNDlhOWZiMTU2NDMxNzQ4ZTA', 'NzcyN2FlN2YzYTFiNGQ5YWJlNDVlMzZhNmZlMWYwODc', 'ZDI0NzFlNjY4ZmNhNDViZmI5NTY2NDljZjMzNWE2MDU', 'Y2VlMGNkYmUwMWVlNGE2MGE5MDM3OWM0ODAzNDA0OGM', 'YmFjZjUwYjg0ZjQ1NDNjZGE4OGVmNDRjNTg1NWEwMjY', 'NzZlZjRhZTMzMTgwNGYzMjg0NmY5YjUzZjE5NzRlMzg', 'ZjAwODRjNzQ1ZGVjNDZjYzg3YmM3YTk0NjliNWYzZWE', 'Y2U0NmY0YzhiNzQ4NDFiZjgyNGMzNzFjNjViOGE1ZDI', 'NzkyOWExZGU2MzBlNDc2NjllODIwMzllYjY1ZDMwMDU', 'MzU3YTIwMGU4YjljNGNkODgwMmQ5NDdjZTMyMmM0NjE', 'ZGNmZDMzNjJhZDdjNDI5NzhlYzIyMzljZThhOGYxNmM', 'YmRjZjRjNWQ2YWEzNGNjZGE5M2NlYzUzNjZlOTBkMWM', 'NTljM2RkZjMwYWQ1NGIwZWFhODk5MjIzNTYwZTYzMDk', 'YjMyMmJlODc2ZDQ1NGZlNjk0NDUzOGFjYTg4ZTFkMjE', 'ZWI4N2RjY2QzODg4NDY3MDhjZjIzNGQ5Yjg3M2M0MWU', 'NmYwZDBmOGVlNjM0NDJkY2EzOTYxODlmYzkxMjQ1YWM', 'YTdiOGIwYTAxODRkNGI2YjljZGVhNDlkZGRkYTc4ZmE', 'YzUwOTIzMDA5NDhjNDcwYjkwMDI1NzgxMTZkMmEzZjk', 'ZGI4ZGVhMTNkYzUyNDkzZWFjMDkyNGU0ODZlY2RjNjE', 'N2E0OTNiYjA3MDc4NGU2OTgyYjY4MThmN2JmNWRkMTA', 'MzQ3ZGUxMThmZWE5NDYxYWE2ZmQ5MWNkYTBhYWE0MmE', 'NzE5YzczM2Y1OGE5NGJiY2IzZDY4OTZmN2YwOWViYjA', 'YTNlNjJlYjQ4NzEyNDQwNjljMTA2NjkzMGNhMDVmMDY', 'ZDU2MjM1ZmM2MTMwNGQwOWFlZTdlMmE2YTRlMjAyNTE', 'NGM5NWUxN2Y3YjA3NDRmNzk0OTRkYWQ2YzcxOWNhYTM', 'N2U3ODFjY2RmMWE1NDgzODhiMjM4ZmFjMTU1NDU5Yjg', 
'Y2Y5YjI3N2QwNTU4NDdlNWE0N2RkNWNiZTRmZWViNWI', 'ZWYxNTllNTk3ZmRmNDZhYzg3ZTQzMzA5ZTAwMDMzN2Y', 'YzNjOGUxMTc4ZDk0NGEwOGJlMjAwMzJiOTkwNDgxNzc', 'NDc3NjdkZGI2MzkyNGJlMjgxZDNhYzhhZWZkNzJiNjg', 'OWM4MWVjZjFlZWJlNGQxNGEwMWY5MzE3NzM3ZTY5MjI', 'Yzk3NjEwNzBiOTMzNGEzY2IwZjU2YWY2MWIyYjJjZTU', 'OWI0MWI4NGJmZGMwNDg1NWI5M2E5Y2U2NGMyNzFkMzA', 'ZTQxMmFlODc5MjE4NDJiMWFjNGU0ODQzNjVkN2ZmYzE', 'Yjk0M2RiYmJjYjZhNGQ0MzhjODY2N2ZhNmIyODRhYWI', 'OGNhMmNmMWFjZmZiNDE0MTgyZTNhZjY2NTJiMTgwNjQ', 'MTA4OTdjMmE0YjBlNGYzYjlmNDc5YTllZWQwMTEyYzQ', 'YzZjN2Q4OGIwNDY3NDQwY2FjMDdiMTQ0MDNiNGY3ODE', 'MTNhNDg0YTc4MmVlNDljMzg4YzIwNTgzMWNmZWNiYTc', 'MmQxZmFlZjI0NTMzNDczMDhjNGRhYWE4MjIxOWMzYjI', 'ZjRhY2FjOTQwNWFlNDZmMGE3YmY0YTk3MGYyMDliODA', 'OTBiYjUzN2Y2NmJjNGYwYzkxZTQ1YTM3MDU2ZDMyOWU', 'NWFhYzQxNzA2NTI3NDI4NDlhOGExMDYwZTY2M2U5ZjY', 'YWJlNjYyZDQxNjY3NDczNWJkNzYzM2NkZjBhZTI0YzA', 'ZWQ2M2FlMmViMDhhNDA4NmE2YWQzNWQxNmMwOWFhMmE', 'NjU4YTZjMjM2ZDNlNDJlZGEyMTUyNmE2OGRlMzZiNDU', 'NzE0MmY5NjdjYjU0NGQxMmI1ZDM2OGU0N2YwNTJiMTM', 'NTAyYjRhOWI3MTQwNDEyMjk3ZjZhYjlhNDZhNTY3MjI', 'Nzk4YzYzOWNhMDk5NDMzZmFkZWRmMGM3MDE2M2FmODE', 'YmJiMTA4MmJiMDU0NGI0ZmEzNTBkNzA2ZmQ0NzM2MzM', 'ZDM5NzQ0ZmEyNjcyNDExYzhlMzM3MjVhMDYwNTQ2MWU', 'ZDIzOTI2NTZhMDlmNDQ5Yzg2YTRiYjJlYmJmMmViMzI', 'MGE3NWE4MmJjYTY2NGVkYTg5MWE1NGFkODZhMjIzYzA', 'NjAzZGUzNmE0M2ZhNGE4ZGI1ZjA3OTMxMGUzNjc5MTQ', 'MDc3MTk4MzNiMDdjNGVjNzhkNGU5NmVjYTk3NWViYWI', 'NGI4OGMwMjQ1OTE4NDZhZWIzZWIwMTQwMGNmNTE0YTg', 'NzU5ZmYyMGU3OGVjNDBkMThkZTZlOTU4MzdmYjE0NTM', 'NzEyNzVkNTNkYjA0NDQwYmEwZTVkOWVjOTUwOTEwMjE', 'NDc4MmI0ZjU4YjllNDFlYWI3ODk2ZjA4YzczNjVjNjQ', 'ZTJiNzk1MTEzNjFhNDIyYjk0NTY2NGU3NjExOWQ2NjE', 'OWY5NThlNGQ4YzhmNDViYWFhNTlkOGM2NmU3NmVjNjU', 'MzdiZGRjZGEyOGRhNDc5NmI0MWE2NmQ3YjIzY2ViY2Q', 'NDczOWFmNTIyMWNjNDAxODllMjE2MTc5MjQzMzBkNDY', 'ZjkwYWVlY2RmNDllNDc2ZmIyOTFiZWE5NjQ2Y2UxZDc', 'ZTRjODVjY2IyZDE4NDU3ZGFlMTUzYmNjMTEyN2IxMWQ', 'MDc5NTI3NGVhNzIzNDRkMGEyOTE2NDA2OTdjY2Y3NTQ', 'MDU1ZWZhODhjNDMyNGQyZjk2OTlkYjQ1ODNhY2NkOGQ', 'MmZmZTYxYWMzYmE3NDk3MWFhOTI0OWE5OTNjODE0NmM', 
'ODNmODA3ZGMyZmMyNGZlOGFiN2FlNmFlOTdiMmY0OTc', 'ZTA3MDgxNjljMmExNDc1ZDkyMDBkZWE3ZTJiYjdlZTc', 'Y2FmMDEyM2Y3NTY2NGM2NTg3YjlkNzAzMWQwZjI2ZGU', 'NDNiZTliOTkyM2ZjNDExN2IwOWQ0MjAxOWY0Y2Y1NDg', 'Zjc5MjA2ZWQ5ZGFjNDg4YzllMjRmNzFlMjk2ZmM4Zjg', 'ZTExMDE1ZDJkOWNkNDVlZGFlNmQxODRiNWViOGM2NTU', 'ZDM3YzRiNjUzNjQ1NDMxNzkyZWM3NGY3MTc1Zjk3M2E', 'YWFlZjdjOTExYzVhNDM0ZGJkODU4ZDE4ZjlhZTgwY2U', 'ZGZiZmNhMzVmOTg1NDYzZTlkNDM3MmZjMWQ2NzlhODA', 'NjFlM2I5Zjg1ODY5NGFlM2E5Y2E2ZTFmMzU3OTgzZjk', 'MGY5OWM3ZWMyZGZkNGQ2MDkwZDU3NWEzNDA3MTQ0ODY', 'MDNhZTk3ZDMwNzQwNGVmYWJjMjhiZjdmMGE4NWZkNmM', 'YzllNGFhYzY3ZDkzNGY3ODk4ZTc3YjRlNGUyMWIyY2Q', 'OGVkNzQ5ZDkyMTg1NGVjMjg5YTNlZmExMWY4YmY1NTU', 'Njg1NTljODU5Zjc0NDljZTljZTQwOTIzMDZiZDgzNWQ', 'MDMxYzBlYWRiYWQ5NGM2ZTk4N2Q1YTg5ODIxOTMyZjQ', 'YWJkNTNlNTc1MDliNGFmNGI5N2E5MDQ1YzdmNGU5MzA', 'ODlmMDg0YmNkODI4NDc4ZjhjM2RjN2YzODhmYmM4ZjM', 'OWM3ODI0MGRiNTY3NGI3MTliMDgzZGZjYzBlZWEwZmQ', 'YTAxOTYwNzJjZGMyNGFjYTlhNmMxMjNjNTllZjhmM2U', 'M2Q3MTgxMTZhNGQyNDBmZmIyNmM1YjIwMmE0NjRjOTY', 'MjRhYzkyMmQ2MjgwNDg4YmFlZWY1ZTJkMTAyNzcwYTc', 'M2Y3MmQyMjI0MWU1NDA3YmFiZDJmYzI2Nzc5Mjg2YzA', 'OTBjYjJlMDNkYTlmNGUzNThmODFjN2M2NWMwNjljN2Y', 'MDUyM2QzYjQ4YjdhNDEwMGFmNDJlZGM5NTJkZGY3OGQ', 'MmY4OGZhNGYwZjRjNDZmMDg4NWNmOWNhZWNlNjk2MDY', 'MjZiYmIzMTk0OGYzNDNhNDliNzRjNTcyNmRjOWU2NGQ', 'YmMzOWE3NWExYTMxNDZjYmIwYWY5OTY1YzE1Nzk1NWI', 'MzJiNGFlN2QyNjA4NDJlOGIyMjYzMGFlNjk0OTQ2YTE', 'Y2E2ODhlMDNhMjdkNGNmY2I2OGI0ZDY0YzA2MjZjOGY', 'YWIxODA0M2E3MjI0NGNmMjgwMWUzOWQ4M2UwYjg5NmE', 'NmI4Y2U5Mjg2ZjI0NDUwOWE4MWZiOTc1OGI0NWExZDc', 'Y2EyOTRlZDUzMDBkNDM1NjljN2ZlMDhlMjAyZmZkZTc', 'ZjJlN2EyMmRlYzA1NGMyZjhhYjY1MWIxMDI3YTEzMDA', 'YjIzZjdiODIyYmUyNDUwZDkxMTVlN2FlNjA4ZTBiYjk', 'YTM5ZmRiMDg0NDg0NDliYzhlYWEzZmJlYjFkZjY5ZmM', 'ZDM1MmI0NzJmOTBmNDc1YWIxYjk0ZWU5YTU5YzliZTM', 'NjhlMjFhNjg4NWU0NGI1MGIzNDYwMDBmYTJiMjg4YTk', 'NzViYTVjYmVkZTJhNDU3Y2ExZjY3ODgxNmZkNjc1MGE', 'YTZkNGMwOGE1OWYyNGM0MTljMjA1MDE2YzYyZWQ1MmQ', 'ODBhNjEzYTc0ZmNhNDM2MWEzZmQyNzFjODlkZjNiMDA', 'MTFkMzliOTQ5MGIyNDc1MDlhMGRlNWVmNDQ3NGRlYzY', 
'MjA4MDFjN2M0MzgxNGQ5YTlkODY0NjYwNzMwOTU0MjM', 'NzczYThhOGI1MjZkNDBlODlhM2I1OGJmY2Y2MjRjYmM', 'MWNkOTY5YmY0OGZmNGJmMzhiMzIzYTNmZDQ5ZjllYmE', 'MGRjZWNkNjA4MDNiNGRhYWE4MzZkYjllNGJlOWJhYjk', 'OTM5NDA0ZmI1MDJiNDk1ZDg1YjUxMzE2Y2M4ZjQ4MmE', 'NjY1MGNkNTVkMDIxNGVjMDgwNWM1MWRlZWM2NTI2YTM', 'MTM3NDdmNDE0Njk4NDRlODhhYWRmZTlkNmQ2OTUwOWU', 'YjAzNjIxYjRjZjMzNDEwOTg4MWZjZjUzMDIxYWMwZTY', 'ZjFkMTMwNzBmNzkyNGUwYTk5YTI3MDU0ZjNlYjc2ZTE', 'YTA1NzQ4YzhkMTU0NDdkNDhmNDIwODgwNjY0MDkxMTE', 'MWYzZGU1OTlmNTJiNDc2NWI2MTEwMjIzMjk0Njg3NTA', 'ZmIwYzMwODFlYWIwNDM5N2EwN2Y1ZDBlYThkMmE0ZDY', 'MzA2OTU0MWM0ODEyNGNiYmE0NGE5N2E3YTllNWEwYmU', 'MDM0ZjAyMDE2OTEwNDZkZjhhNGZiNTRmYTcxMmVhN2M', 'YTRmMGMwOWQ0ZjdjNDExNDk3ZDViOGM3NGRlY2Y5YWE', 'NDk1OTFlZGFhY2YxNGEzY2I1NTkzMmE3YjAzOWY5Y2E', 'MTAxZjQ1OGZkNDFjNDkyY2JlZTZiYTExOWQ1ZjJmMmE', 'ZjUwYzU3MTc2NGUxNDQ1MGI4MGRhMzA3MzQ3YjkxMDE', 'MzhhNGM5ZjdhYWVjNDcxNWI0YWE3ZDEyZWI0ZDliNmU', 'MzY0NGFhNTI2MDliNDQ5NTlkNDE0N2JjOWNjY2M0Y2U', 'YWI1MjUyYTVhZWE2NDljNTk3MTQwMmQxYTEwNTAyOTA']

# Remove the missing samples. isin() builds a hash lookup from `missing` once
# instead of scanning the whole list for every row, and it always returns a
# boolean mask.
crit_miss = ~pe32.link.isin(missing)
pe32 = pe32[crit_miss]

In [7]:
# For pseudo multithreading
import gevent
# NOTE(review): `monkey.patch_all` is only referenced on the next line, not
# called (there are no parentheses), so this statement applies no patching.
# Presumably `monkey.patch_all()` was intended -- confirm before changing it,
# since patching this late in an already-running kernel may have side effects.
from gevent import monkey; monkey.patch_all

def check_behavior(file, directory=None, marker=b'#behavior', chunk_size=1 << 20):
    """Report whether a gzip-compressed file contains a byte marker.

    The file is decompressed and searched chunk by chunk, so memory use stays
    bounded by roughly ``chunk_size + len(marker)`` regardless of how large the
    decompressed content is.

    Parameters
    ----------
    file : str
        File name inside ``directory``.
    directory : str, optional
        Directory containing the file. Defaults to the module-level
        ``gz_path`` (looked up at call time).
    marker : bytes, optional
        Non-empty byte sequence to search for.
    chunk_size : int, optional
        Maximum number of decompressed bytes requested per read.

    Returns
    -------
    bool
        True if ``marker`` occurs anywhere in the decompressed content.

    Raises
    ------
    ValueError
        If ``marker`` is empty.
    OSError
        Propagated from ``gzip.open`` / ``read`` (e.g. file not found or not
        a valid gzip stream).
    """
    if not marker:
        raise ValueError("marker must be a non-empty bytes object")
    if directory is None:
        directory = gz_path

    # Bytes carried over from the previous window so that a marker split
    # across two reads is still detected.
    overlap = len(marker) - 1
    tail = b''
    with gzip.open(directory + '/' + file, 'rb') as gz:
        while True:
            chunk = gz.read(chunk_size)
            if not chunk:
                return False
            window = tail + chunk
            if marker in window:
                return True
            # window[-0:] would be the whole window, hence the explicit guard.
            tail = window[-overlap:] if overlap else b''

# Estimate the share of PE32 analyses for which check_behavior() is True by
# checking a random sample of them.
# A stdlib thread pool is used so the per-file checks can run concurrently.
from concurrent.futures import ThreadPoolExecutor

SAMPLE_SEED = 0    # fixed seed so the estimate is reproducible across runs
N_BATCHES = 10
BATCH_SIZE = 1000
N_WORKERS = 8

rng = np.random.RandomState(SAMPLE_SEED)
total = 0
with ThreadPoolExecutor(max_workers=N_WORKERS) as pool:
    for batch_no in range(N_BATCHES):
        # Drawn with replacement, as in the original sampling
        samples = rng.choice(pe32.link, size=BATCH_SIZE)
        # Exceptions raised by a worker propagate here instead of being
        # silently turned into a None result.
        total += sum(pool.map(check_behavior, samples))

# The divisor is derived from the constants so it cannot drift out of sync
total / (N_BATCHES * BATCH_SIZE)

0.9306