In [1]:
#!ls ~

In [2]:
import json
from pathlib import Path
import pandas as pd

json_file = Path.home()/'C_CPP_binaries_O0.json'

with open(json_file, 'r') as f:
    data = json.load(f)

In [3]:
list(data.keys())[:25]

['dev-libs/cJSON-1.7.15/libcjson.so.1.7.15',
 'dev-libs/libtecla-1.6.3/enhance',
 'dev-libs/nettle-3.7.3/pkcs1-conv',
 'dev-libs/pmdk-1.9.2/daxio',
 'dev-libs/satyr-0.39/satyr',
 'dev-libs/nettle-3.7.3/nettle-pbkdf2',
 'dev-libs/tree-sitter-python-0.19.1_p20211112/libtree-sitter-python.so.13',
 'dev-libs/pmdk-1.9.2/rpmemd',
 'dev-libs/libwacom-1.12/libwacom-list-devices',
 'dev-libs/pmdk-1.9.2/pmempool',
 'dev-libs/libwacom-1.12/libwacom-list-local-devices',
 'dev-libs/nettle-3.7.3/nettle-lfib-stream',
 'dev-libs/nettle-3.7.3/nettle-hash',
 'dev-libs/nettle-3.7.3/sexp-conv',
 'dev-libs/libffi-3.4.2-r1/libffi.so.8.1.0',
 'dev-libs/libmodbus-3.1.6/libmodbus.so.5.1.0',
 'dev-libs/libwacom-1.12/libwacom.so.2.6.1',
 'dev-libs/libtar-1.2.20-r4/libtar',
 'dev-libs/libserialport-0.1.1-r1/libserialport.so.0.1.0',
 'dev-libs/libtecla-1.6.3/libtecla.so.1.6.3',
 'dev-libs/liblognorm-2.0.6/lognormalizer',
 'dev-libs/libtecla-1.6.3/libtecla_r.so.1.6.3',
 'dev-libs/pmdk-1.9.2/librpmem.so.1.0.0',
 'de

In [4]:
k = list(data.keys())[0]
data[k]

{'dwarf_info': 'yes',
 'stripped': 'no',
 'language': 'C',
 'O_flag': ['-O0'],
 'inlined_func': 'no',
 'hash': '5af209fb7c21e61e80aba905d4788b1b',
 'x86-64': 'yes',
 'cython': 'no'}

In [5]:
f"{len([k for k in data if data[k]['stripped']=='no'])/len(data)*100}% of binaries are not stripped"

'99.99390113743787% of binaries are not stripped'

In [6]:
langs = [data[k]['language'] for k in data]
# len([x for x in langs if x == 'CPP'])
# len([x for x in langs if x == 'C'])
# len([x for x in langs if x == 'both'])

df = pd.DataFrame(data).transpose().reset_index().rename({'index': 'binary'}, axis=1)
df['Opt'] = df.O_flag.apply(lambda x: ','.join(x))

In [7]:
df.groupby('language').count()['binary']

language
C       22493
CPP      9169
both     1131
Name: binary, dtype: int64

In [8]:
# calculate target N binaries before we filter out unwanted cases
N = int(len(df)*.08)
N

2623

In [25]:
# keep only debug bins
df = df[df.dwarf_info=='yes']
df = df[df.stripped=='no']
df = df[df.language=='C']
df = df[df.Opt=='-O0']

In [27]:
print(f'Filtered out {int(N/.08)-len(df):,} binaries')
df

Filtered out 11,653 binaries


Unnamed: 0,binary,dwarf_info,stripped,language,O_flag,inlined_func,hash,x86-64,cython,Opt
0,dev-libs/cJSON-1.7.15/libcjson.so.1.7.15,yes,no,C,[-O0],no,5af209fb7c21e61e80aba905d4788b1b,yes,no,-O0
1,dev-libs/libtecla-1.6.3/enhance,yes,no,C,[-O0],no,809eb422c5367fbf4527e7b4f7e95de5,yes,no,-O0
2,dev-libs/nettle-3.7.3/pkcs1-conv,yes,no,C,[-O0],no,a71c58c9694c00e24bb5708a6222d626,yes,no,-O0
4,dev-libs/satyr-0.39/satyr,yes,no,C,[-O0],no,7d34e12083bcf0aa2beeb14f2eec68b9,yes,no,-O0
5,dev-libs/nettle-3.7.3/nettle-pbkdf2,yes,no,C,[-O0],no,65388dfea1edb19fd9174312062342c2,yes,no,-O0
...,...,...,...,...,...,...,...,...,...,...
32780,dev-erlang/esip-1.0.45/esip_drv.so,yes,no,C,[-O0],no,21383d15270ca5b911ea5fc833e74c88,yes,no,-O0
32785,dev-erlang/fast_yaml-1.0.32/fast_yaml.so,yes,no,C,[-O0],no,cbecd9c132a82e512db00a8bf7fc59ab,yes,no,-O0
32786,dev-erlang/fast_xml-1.1.48/fxml_stream.so,yes,no,C,[-O0],no,8f499ef065c604c6723746df90da7082,yes,no,-O0
32789,dev-erlang/eimp-1.0.21/eimp,yes,no,C,[-O0],no,5bae5bdf4ebe701ddfce2aa03603e072,yes,no,-O0


In [28]:
#df.sample(frac=0.08)

# this should approximate the TyDA-min 8% of binaries
df.sample(n=N)

Unnamed: 0,binary,dwarf_info,stripped,language,O_flag,inlined_func,hash,x86-64,cython,Opt
10392,media-plugins/gst-transcoder-1.14.1-r1/libgstt...,yes,no,C,[-O0],no,e760d87637f0a14950d35e87c2a7f40e,yes,no,-O0
11346,dev-python/pyproj-3.3.1/_datadir.cpython-39-x8...,yes,no,C,[-O0],no,f09f6052be9a24f08ad056b81fe5bc6f,yes,yes,-O0
4969,sys-libs/tdb-1.4.5/tdbrestore,yes,no,C,[-O0],no,c22f2f7c62f66c2277c2af9cd15fc790,yes,no,-O0
18497,media-video/vlc-3.0.17.3-r2/libswscale_plugin.so,yes,no,C,[-O0],no,494a0e08e1b75591f8acc76b60ba8e1e,yes,no,-O0
13956,net-mail/qmrtg-2.1-r2/qfilt,yes,no,C,[-O0],no,d7fcb627b48150221554dcc54572d3d3,yes,no,-O0
...,...,...,...,...,...,...,...,...,...,...
7912,media-libs/libpulse-16.0/pacat,yes,no,C,[-O0],no,7526dd3cc9122fbd859c3bdd86fd136d,yes,no,-O0
12877,dev-lang/lua-5.3.6-r2/luac5.3,yes,no,C,[-O0],no,481ca7b686eb48f035e9b4ffff8e72a8,yes,no,-O0
17019,app-arch/unadf-0.7.12-r1/unadf,yes,no,C,[-O0],no,8c6ef6a6cd66e33346256c43df636bfa,yes,no,-O0
8415,sys-block/libzbc-5.5.1/libzbc-5.5.1.so,yes,no,C,[-O0],no,8939418ecc286a95f862665a5f558bff,yes,no,-O0


In [29]:
df.sample(3)

Unnamed: 0,binary,dwarf_info,stripped,language,O_flag,inlined_func,hash,x86-64,cython,Opt
31122,media-radio/ax25-tools-0.0.10_rc5/rsmemsiz,yes,no,C,[-O0],no,fc5c92ae337cf0e30f4b686f8532a1e5,yes,no,-O0
26170,sys-cluster/pmix-2.1.1/mca_preg_native.so,yes,no,C,[-O0],yes,364b608e445d9a36c19d991ff2263695,yes,no,-O0
7719,media-libs/netpbm-10.86.30/pbmtopi3,yes,no,C,[-O0],no,e96e7b57401240ef2a3117e9e6353dc7,yes,no,-O0


In [41]:
df[df.binary.apply(lambda x: 'dev-games' in x)]

Unnamed: 0,binary,dwarf_info,stripped,language,O_flag,inlined_func,hash,x86-64,cython,Opt
28548,dev-games/libnw-1.30.02/nwresext,yes,no,C,[-O0],no,f36db8b4ff4a9b15fded1847a9a58b3b,yes,no,-O0
28549,dev-games/libnw-1.30.02/nwstrref,yes,no,C,[-O0],no,7daa0eee1a58009f8b938bdab49c2c01,yes,no,-O0
28550,dev-games/libnw-1.30.02/nwtsfix,yes,no,C,[-O0],no,b2e3af52ff5b6e32d69bf2c27e81a5b7,yes,no,-O0
28551,dev-games/libnw-1.30.02/nwaggext,yes,no,C,[-O0],no,b18dffb2be6f63d2d3b6121faf9d0737,yes,no,-O0
28552,dev-games/libnw-1.30.02/nw2da2csv,yes,no,C,[-O0],no,afa2e0402ea3489687abdaf22dcf8e0b,yes,no,-O0
28553,dev-games/libnw-1.30.02/nwareamap,yes,no,C,[-O0],no,d58d46ae632a9f0f1c9f0b9da077ae11,yes,no,-O0
28554,dev-games/libnw-1.30.02/nwmrgplc,yes,no,C,[-O0],no,5d25e1f3ced924411da5cdba6bfb092e,yes,no,-O0
28555,dev-games/libnw-1.30.02/nwtsmerge,yes,no,C,[-O0],no,6f3f1631420e70584b8b48f649964ff5,yes,no,-O0
28556,dev-games/libnw-1.30.02/nwtsrefs,yes,no,C,[-O0],no,d863c3ee3bd108996c86ea57ed9a76fd,yes,no,-O0
28557,dev-games/libnw-1.30.02/itpdis,yes,no,C,[-O0],no,05b5493533c8776f60070c9edd3240dc,yes,no,-O0
