In [22]:
import json
from pathlib import Path
import pandas as pd
# Notebook-wide pandas settings.
pd.options.display.max_colwidth = 20          # truncate wide cells; the JSON-derived columns render badly otherwise
pd.set_option('mode.copy_on_write', True)     # experimental: trying out copy-on-write mode

# Root directory of the binaries that are already part of our dataset.
first_dataset = Path.home().joinpath('Downloads', 'sample_tydamin')

In [10]:
import os

# Collect the full path of every file found anywhere under the first dataset,
# in the order os.walk visits the directories.
binary_files = [
    Path(root) / name
    for root, _subdirs, names in os.walk(first_dataset)
    for name in names
]

In [15]:
# Sanity check: how many of the collected paths actually exist on disk.
sum(path.exists() for path in binary_files)

2623

In [24]:
# Load the per-binary metadata file.
json_file = Path.home()/'C_CPP_binaries_O0.json'
# Be explicit about the encoding instead of relying on the platform default,
# which is not UTF-8 everywhere (e.g. legacy Windows code pages).
with open(json_file, 'r', encoding='utf-8') as f:
    data = json.load(f)

# One row per binary: the top-level JSON keys become the `binary` column, and the
# O_flag list is flattened into a comma-separated `Opt` string so it can be compared.
df_all = (
    pd.DataFrame(data)
    .transpose()
    .reset_index()
    .rename(columns={'index': 'binary'})
    .assign(Opt=lambda frame: frame['O_flag'].apply(','.join))
)

# Keep only debug bins: DWARF info present, not stripped, C, built with exactly -O0.
# A single combined mask keeps the unfiltered frame (df_all) available for inspection.
keep = (
    (df_all['dwarf_info'] == 'yes')
    & (df_all['stripped'] == 'no')
    & (df_all['language'] == 'C')
    & (df_all['Opt'] == '-O0')
)
df = df_all[keep]

In [25]:
# Display the filtered frame (DWARF info, unstripped, C, -O0 only).
df

Unnamed: 0,binary,dwarf_info,stripped,language,O_flag,inlined_func,hash,x86-64,cython,Opt
0,dev-libs/cJSON-1...,yes,no,C,[-O0],no,5af209fb7c21e61e...,yes,no,-O0
1,dev-libs/libtecl...,yes,no,C,[-O0],no,809eb422c5367fbf...,yes,no,-O0
2,dev-libs/nettle-...,yes,no,C,[-O0],no,a71c58c9694c00e2...,yes,no,-O0
4,dev-libs/satyr-0...,yes,no,C,[-O0],no,7d34e12083bcf0aa...,yes,no,-O0
5,dev-libs/nettle-...,yes,no,C,[-O0],no,65388dfea1edb19f...,yes,no,-O0
...,...,...,...,...,...,...,...,...,...,...
32780,dev-erlang/esip-...,yes,no,C,[-O0],no,21383d15270ca5b9...,yes,no,-O0
32785,dev-erlang/fast_...,yes,no,C,[-O0],no,cbecd9c132a82e51...,yes,no,-O0
32786,dev-erlang/fast_...,yes,no,C,[-O0],no,8f499ef065c604c6...,yes,no,-O0
32789,dev-erlang/eimp-...,yes,no,C,[-O0],no,5bae5bdf4ebe701d...,yes,no,-O0


In [39]:
# Paths of the binaries on disk, relative to the dataset root.
# as_posix() keeps '/' as the separator on every OS so the strings line up with the
# '/'-separated entries in the `binary` column (str() would yield '\\' on Windows
# and the isin() below would then silently match nothing).
rel_filenames = [x.relative_to(first_dataset).as_posix() for x in binary_files]

# Metadata rows for the binaries we already have in the first dataset.
df1 = df[df.binary.isin(rel_filenames)]
df1

Unnamed: 0,binary,dwarf_info,stripped,language,O_flag,inlined_func,hash,x86-64,cython,Opt
4,dev-libs/satyr-0...,yes,no,C,[-O0],no,7d34e12083bcf0aa...,yes,no,-O0
5,dev-libs/nettle-...,yes,no,C,[-O0],no,65388dfea1edb19f...,yes,no,-O0
10,dev-libs/libwaco...,yes,no,C,[-O0],no,6a338e287d3a8eff...,yes,no,-O0
12,dev-libs/nettle-...,yes,no,C,[-O0],no,996a24c58a895fc2...,yes,no,-O0
25,dev-libs/libtar-...,yes,no,C,[-O0],no,15008c2f0fb2af80...,yes,no,-O0
...,...,...,...,...,...,...,...,...,...,...
32753,games-board/xska...,yes,no,C,[-O0],no,b45dc1767b365ba6...,yes,no,-O0
32762,games-board/gnom...,yes,no,C,[-O0],no,dbc6fdf482f0cb95...,yes,no,-O0
32765,games-board/pion...,yes,no,C,[-O0],no,1b324583d8f7f641...,yes,no,-O0
32770,games-board/pion...,yes,no,C,[-O0],no,42edd4b7f8f736ad...,yes,no,-O0


In [45]:
# Everything in df that is NOT already part of the first dataset (df1).
already_have = df.binary.isin(df1.binary)
remaining_df = df.loc[~already_have]
remaining_df

Unnamed: 0,binary,dwarf_info,stripped,language,O_flag,inlined_func,hash,x86-64,cython,Opt
0,dev-libs/cJSON-1...,yes,no,C,[-O0],no,5af209fb7c21e61e...,yes,no,-O0
1,dev-libs/libtecl...,yes,no,C,[-O0],no,809eb422c5367fbf...,yes,no,-O0
2,dev-libs/nettle-...,yes,no,C,[-O0],no,a71c58c9694c00e2...,yes,no,-O0
8,dev-libs/libwaco...,yes,no,C,[-O0],no,0e8803d7cc14af8d...,yes,no,-O0
11,dev-libs/nettle-...,yes,no,C,[-O0],no,ed5becfb555cdab9...,yes,no,-O0
...,...,...,...,...,...,...,...,...,...,...
32780,dev-erlang/esip-...,yes,no,C,[-O0],no,21383d15270ca5b9...,yes,no,-O0
32785,dev-erlang/fast_...,yes,no,C,[-O0],no,cbecd9c132a82e51...,yes,no,-O0
32786,dev-erlang/fast_...,yes,no,C,[-O0],no,8f499ef065c604c6...,yes,no,-O0
32789,dev-erlang/eimp-...,yes,no,C,[-O0],no,5bae5bdf4ebe701d...,yes,no,-O0


In [52]:
# Draw the second half: as many rows as df1 has, picked at random from the
# binaries we do not have yet. The fixed random_state makes the draw reproducible.
df2 = remaining_df.sample(n=df1.shape[0], random_state=720)
df2

Unnamed: 0,binary,dwarf_info,stripped,language,O_flag,inlined_func,hash,x86-64,cython,Opt
12556,dev-lang/lua-5.1...,yes,no,C,[-O0],no,22f5ffd3385ab94c...,yes,no,-O0
7629,media-libs/netpb...,yes,no,C,[-O0],no,4efafffe0923b690...,yes,no,-O0
6823,media-libs/aften...,yes,no,C,[-O0],no,44d232ff95c75a7f...,yes,no,-O0
2332,www-client/httra...,yes,no,C,[-O0],no,c731d8f087b6584a...,yes,no,-O0
29274,net-libs/libsrsi...,yes,no,C,[-O0],no,50b6db3af84d138a...,yes,no,-O0
...,...,...,...,...,...,...,...,...,...,...
3509,sys-fs/avfs-1.1....,yes,no,C,[-O0],no,72c776ff385b0694...,yes,no,-O0
10995,dev-python/panda...,yes,no,C,[-O0],no,c9029fca0530dfcd...,yes,yes,-O0
20892,sci-libs/eccodes...,yes,no,C,[-O0],no,5bdb37a7f2a59d8e...,yes,no,-O0
4959,sys-libs/libutem...,yes,no,C,[-O0],no,850351bc6bb2846d...,yes,no,-O0


In [53]:
from tqdm import tqdm
import shutil

# BASE is where the sampled binaries are copied from; dest receives the copies.
BASE = Path.home()/'Downloads/c_cpp'
dest = Path.home()/'Downloads/sample2_tydamin'

# exist_ok so the cell can be re-run (e.g. Restart & Run All) without raising
# FileExistsError; copy2 simply overwrites files copied by an earlier run.
dest.mkdir(parents=True, exist_ok=True)
for x in tqdm(df2.binary):
    target = dest/x
    # Recreate the relative directory layout under dest before copying.
    target.parent.mkdir(exist_ok=True, parents=True)
    shutil.copy2(BASE/x, target)    # copy2 also preserves file metadata where possible
    # Alternative that links instead of copying: os.symlink(BASE/x, dest/x)

100%|██████████| 2623/2623 [01:40<00:00, 26.16it/s]
