In [1]:
#!ls ~

In [2]:
import json
from pathlib import Path
import pandas as pd

json_file = Path.home()/'C_CPP_binaries_O0.json'

with open(json_file, 'r') as f:
    data = json.load(f)

In [3]:
list(data.keys())[:25]

['dev-libs/cJSON-1.7.15/libcjson.so.1.7.15',
 'dev-libs/libtecla-1.6.3/enhance',
 'dev-libs/nettle-3.7.3/pkcs1-conv',
 'dev-libs/pmdk-1.9.2/daxio',
 'dev-libs/satyr-0.39/satyr',
 'dev-libs/nettle-3.7.3/nettle-pbkdf2',
 'dev-libs/tree-sitter-python-0.19.1_p20211112/libtree-sitter-python.so.13',
 'dev-libs/pmdk-1.9.2/rpmemd',
 'dev-libs/libwacom-1.12/libwacom-list-devices',
 'dev-libs/pmdk-1.9.2/pmempool',
 'dev-libs/libwacom-1.12/libwacom-list-local-devices',
 'dev-libs/nettle-3.7.3/nettle-lfib-stream',
 'dev-libs/nettle-3.7.3/nettle-hash',
 'dev-libs/nettle-3.7.3/sexp-conv',
 'dev-libs/libffi-3.4.2-r1/libffi.so.8.1.0',
 'dev-libs/libmodbus-3.1.6/libmodbus.so.5.1.0',
 'dev-libs/libwacom-1.12/libwacom.so.2.6.1',
 'dev-libs/libtar-1.2.20-r4/libtar',
 'dev-libs/libserialport-0.1.1-r1/libserialport.so.0.1.0',
 'dev-libs/libtecla-1.6.3/libtecla.so.1.6.3',
 'dev-libs/liblognorm-2.0.6/lognormalizer',
 'dev-libs/libtecla-1.6.3/libtecla_r.so.1.6.3',
 'dev-libs/pmdk-1.9.2/librpmem.so.1.0.0',
 'de

In [4]:
k = list(data.keys())[0]
data[k]

{'dwarf_info': 'yes',
 'stripped': 'no',
 'language': 'C',
 'O_flag': ['-O0'],
 'inlined_func': 'no',
 'hash': '5af209fb7c21e61e80aba905d4788b1b',
 'x86-64': 'yes',
 'cython': 'no'}

In [5]:
f"{len([k for k in data if data[k]['stripped']=='no'])/len(data)*100}% of binaries are not stripped"

'99.99390113743787% of binaries are not stripped'

In [6]:
langs = [data[k]['language'] for k in data]
# len([x for x in langs if x == 'CPP'])
# len([x for x in langs if x == 'C'])
# len([x for x in langs if x == 'both'])

df = pd.DataFrame(data).transpose().reset_index().rename({'index': 'binary'}, axis=1)
df['Opt'] = df.O_flag.apply(lambda x: ','.join(x))

In [7]:
df.groupby('language').count()['binary']

language
C       22493
CPP      9169
both     1131
Name: binary, dtype: int64

In [8]:
# calculate target N binaries before we filter out unwanted cases
N = int(len(df)*.08)
N

2623

In [25]:
# keep only debug bins
df = df[df.dwarf_info=='yes']
df = df[df.stripped=='no']
df = df[df.language=='C']
df = df[df.Opt=='-O0']

In [27]:
print(f'Filtered out {int(N/.08)-len(df):,} binaries')
df

Filtered out 11,653 binaries


Unnamed: 0,binary,dwarf_info,stripped,language,O_flag,inlined_func,hash,x86-64,cython,Opt
0,dev-libs/cJSON-1.7.15/libcjson.so.1.7.15,yes,no,C,[-O0],no,5af209fb7c21e61e80aba905d4788b1b,yes,no,-O0
1,dev-libs/libtecla-1.6.3/enhance,yes,no,C,[-O0],no,809eb422c5367fbf4527e7b4f7e95de5,yes,no,-O0
2,dev-libs/nettle-3.7.3/pkcs1-conv,yes,no,C,[-O0],no,a71c58c9694c00e24bb5708a6222d626,yes,no,-O0
4,dev-libs/satyr-0.39/satyr,yes,no,C,[-O0],no,7d34e12083bcf0aa2beeb14f2eec68b9,yes,no,-O0
5,dev-libs/nettle-3.7.3/nettle-pbkdf2,yes,no,C,[-O0],no,65388dfea1edb19fd9174312062342c2,yes,no,-O0
...,...,...,...,...,...,...,...,...,...,...
32780,dev-erlang/esip-1.0.45/esip_drv.so,yes,no,C,[-O0],no,21383d15270ca5b911ea5fc833e74c88,yes,no,-O0
32785,dev-erlang/fast_yaml-1.0.32/fast_yaml.so,yes,no,C,[-O0],no,cbecd9c132a82e512db00a8bf7fc59ab,yes,no,-O0
32786,dev-erlang/fast_xml-1.1.48/fxml_stream.so,yes,no,C,[-O0],no,8f499ef065c604c6723746df90da7082,yes,no,-O0
32789,dev-erlang/eimp-1.0.21/eimp,yes,no,C,[-O0],no,5bae5bdf4ebe701ddfce2aa03603e072,yes,no,-O0


In [59]:
#df.sample(frac=0.08)

# sample2k5 = df.sample(n=2500)

# this should approximate the TyDA-min 8% of binaries
sample = df.sample(n=N)     # tydamin size
sample

Unnamed: 0,binary,dwarf_info,stripped,language,O_flag,inlined_func,hash,x86-64,cython,Opt
11032,dev-python/pycryptodome-3.14.1/_raw_des3.abi3.so,yes,no,C,[-O0],no,731958c28196a1e0bc996c38e96538aa,yes,no,-O0
263,dev-libs/glib-2.72.1/gdbus,yes,no,C,[-O0],no,aa5fee0075dac5ca914e29ea006b21ac,yes,no,-O0
20774,sci-libs/cdf-3.8.0-r1/cdfcompare,yes,no,C,[-O0],no,87cb6ab573667911d0315bbbe5401397,yes,no,-O0
11306,dev-python/patiencediff-0.2.2/_patiencediff_c....,yes,no,C,[-O0],no,44f6c3723e32b180e31c7846d6cb8bef,yes,no,-O0
17290,dev-tcltk/tclpython-5.0-r1/tclpython3.so.5.0,yes,no,C,[-O0],no,68226a174584b1d2c182d15f12405e19,yes,no,-O0
...,...,...,...,...,...,...,...,...,...,...
3360,media-sound/poc-0.4.1/mp3length,yes,no,C,[-O0],no,d1905ac0f40d1b40bb76ac43ddbf1901,yes,no,-O0
23991,dev-perl/UUID-0.280.0/UUID.so,yes,no,C,[-O0],no,1b7ae4f8080bc8375a9caf95037e383c,yes,no,-O0
23981,dev-perl/Tk-804.36.0/InputO.so,yes,no,C,[-O0],no,cab872c3992bd214f6d9c5ea67cd08b9,yes,no,-O0
22376,net-misc/asterisk-18.8.0-r1/app_bridgewait.so,yes,no,C,[-O0],no,1c94d0814e6fd146fe4a6ad57bf69122,yes,no,-O0


In [29]:
df.sample(3)

Unnamed: 0,binary,dwarf_info,stripped,language,O_flag,inlined_func,hash,x86-64,cython,Opt
31122,media-radio/ax25-tools-0.0.10_rc5/rsmemsiz,yes,no,C,[-O0],no,fc5c92ae337cf0e30f4b686f8532a1e5,yes,no,-O0
26170,sys-cluster/pmix-2.1.1/mca_preg_native.so,yes,no,C,[-O0],yes,364b608e445d9a36c19d991ff2263695,yes,no,-O0
7719,media-libs/netpbm-10.86.30/pbmtopi3,yes,no,C,[-O0],no,e96e7b57401240ef2a3117e9e6353dc7,yes,no,-O0


In [45]:
test = df.sample(3)
test

Unnamed: 0,binary,dwarf_info,stripped,language,O_flag,inlined_func,hash,x86-64,cython,Opt
29411,net-libs/gssdp-1.4.0.1/libgssdp-1.2.so.0.104.0,yes,no,C,[-O0],no,7fd8b3ff9acca656b16db2328f8a56ce,yes,no,-O0
9489,sci-biology/samtools-1.15.1/md5fa,yes,no,C,[-O0],no,f522ab5fbc673577b3fa70f87de5dc44,yes,no,-O0
3801,app-misc/mosquitto-2.0.14/mosquitto_sub,yes,no,C,[-O0],no,d1ca180ef68d06d733294dfec0dc518d,yes,no,-O0


In [64]:
# [x for x in test.binary]
import os
import shutil
from tqdm import tqdm

BASE = Path.home()/'Downloads/c_cpp'
dest = Path.home()/'Downloads/sample_tydamin'
# dest = Path.home()/'Downloads/sample_test'
# ------------------------------------

dest.mkdir()
for x in tqdm(sample.binary):
    Path(dest/x).parent.mkdir(exist_ok=True, parents=True)
    os.symlink(BASE/x, dest/x)

    # here is the copy version if we want it later:
    # shutil.copy2(BASE/x, dest/x)

100%|██████████| 3/3 [00:00<00:00, 2318.58it/s]


In [None]:
# 2441, 1836, 1193, 569, 346, 200, 86, 87