In [2]:
#!ls ~/trained_models/

In [3]:
from itertools import chain
from pathlib import Path
from datatype_recovery.models.dataset import load_dataset_from_path, max_typesequence_len_in_dataset
from datatype_recovery.models.dataset.encoding import *

import torch
import torch_geometric.transforms as T
from torch_geometric.loader import DataLoader
from tqdm import tqdm

from datatype_recovery.models.metrics import acc_heuristic_numcorrect

def eval_model_on_dataset(model_path:Path, device:str, dataset_path:Path) -> float:
    '''
    Convenience wrapper around eval_model_on_subset for a whole dataset: loads the
    dataset stored at dataset_path, looks up its maximum type sequence length, and
    forwards both (plus model_path and device) to eval_model_on_subset.

    Returns whatever accuracy eval_model_on_subset reports.
    '''
    full_dataset = load_dataset_from_path(dataset_path)
    longest_true_seq = max_typesequence_len_in_dataset(dataset_path)

    return eval_model_on_subset(
        model_path=model_path,
        device=device,
        dataset=full_dataset,
        max_true_seq_len=longest_true_seq,
    )

def eval_model_on_subset(model_path:Path, device:str, dataset, max_true_seq_len:int) -> float:
    '''
    Evaluates the model on the given subset and returns its accuracy against the
    dataset labels, as counted by acc_heuristic_numcorrect

    model_path: file holding a whole model object; it is read with torch.load, which
                unpickles arbitrary Python objects, so only pass trusted files
    device: torch device string that the model and every batch are moved to
    dataset: dataset (or slice of one) to evaluate. NOTE: its transform attribute
             is overwritten by this function
    max_true_seq_len: maximum type sequence length of the labels in the dataset

    Returns num_correct / len(dataset)
    '''
    # SECURITY: torch.load unpickles the file - never point this at an untrusted model.
    # map_location puts the weights directly on the requested device, so a model that
    # was saved from a different GPU can still be loaded here
    model = torch.load(model_path, map_location=device)
    print(model)

    # take the max of model seq length and max seq length of dataset so we
    # calculate accuracy correctly (without truncating something)
    max_len = max(model.max_seq_len, max_true_seq_len)

    # prepare the data loaders
    batch_size = 64
    dataset.transform = T.Compose([ToBatchTensors(), ToFixedLengthTypeSeq(max_len)])

    # split the dataset into the part divisible by batch size and the leftovers
    # we can chain these together for performance - our metrics simply iterate
    # through all elements in the batch
    batched_total = len(dataset)-(len(dataset)%batch_size)
    batch_loader = DataLoader(dataset[:batched_total], batch_size=batch_size)
    leftovers_loader = DataLoader(dataset[batched_total:], batch_size=1)

    print('Running eval...')

    model.to(device)
    model.eval()
    num_correct = 0

    # pure inference: disable autograd so no graph/activations are kept per batch
    with torch.no_grad():
        for data in tqdm(chain(batch_loader, leftovers_loader), total=len(batch_loader)+len(leftovers_loader)):
            data = data.to(device)
            out = model(data.x, data.edge_index, data.batch)
            num_correct += acc_heuristic_numcorrect(data.y, out)

    accuracy = num_correct/len(dataset)
    print(f'Accuracy = {accuracy*100:,.2f}%')

    return accuracy

In [4]:
# TODO: eval on an interesting subset of the data
# from torch.utils.data import Subset
# Subset()

In [5]:
# Dataset and trained model are both looked up under the current user's home directory.
# NOTE(review): the dataset name says "5hops" while the model file name says "3hops" -
# confirm this pairing is intended before reading much into the accuracy.
dataset_path = Path.home().joinpath('trainset_astera_5hops_nocomp')
model_path = Path.home().joinpath('trained_models/structural_3out_3hops_nocomp.pt')

# Kept as the last bare expression so the returned accuracy is shown as the cell output.
eval_model_on_dataset(model_path=model_path, device='cuda:3', dataset_path=dataset_path)

Calculating max true sequence length in dataset trainset_astera_5hops_nocomp


100%|██████████| 24691/24691 [00:20<00:00, 1184.87it/s]


StructuralTypeSeqModel(
  (gat_layers): ModuleList(
    (0): GATConv(31, 128, heads=1)
    (1-2): 2 x GATConv(128, 128, heads=1)
  )
  (pred_head): Linear(128, 66, bias=True)
)
Running eval...


100%|██████████| 436/436 [00:17<00:00, 25.02it/s]


Accuracy = 2.22%


0.022234822404924873

In [6]:
# Load the dataset again at notebook scope so individual samples can be inspected
# (the copy loaded inside eval_model_on_dataset is local to that function).
dataset = load_dataset_from_path(dataset_path)

In [7]:
# Peek at the varid attached to the first sample in the dataset.
first_sample = dataset[0]
first_sample.varid

(0, 0, 1323424, '17', 'l')

In [8]:
import pandas as pd

# pd.read_csv(dataset.root/dataset.raw_file_names[0])
from datatype_recovery.models.dataset import TypeSequenceDataset

# Construct the dataset class directly from the dataset folder; the following cell
# reads its exp_runs_path attribute to locate the experiment-runs CSV.
ds = TypeSequenceDataset(dataset_path)

In [9]:
# Read the table of experiment runs: one row per run, with the run folder and the
# paths of that run's binaries/funcs/params/locals CSV files.
# NOTE(review): the 'Unnamed: 0' column in the displayed table looks like a row index
# that was written into the CSV - confirm, and consider index_col=0 if so.
exp_runs = pd.read_csv(ds.exp_runs_path)
exp_runs

Unnamed: 0,RunGid,RunFolder,BinariesCsv,FuncsCsv,ParamsCsv,LocalsCsv
0,0,/home/cls0027/exp_builds/astera.exp/rundata/run1,/home/cls0027/exp_builds/astera.exp/rundata/ru...,/home/cls0027/exp_builds/astera.exp/rundata/ru...,/home/cls0027/exp_builds/astera.exp/rundata/ru...,/home/cls0027/exp_builds/astera.exp/rundata/ru...


In [10]:

# TODO: to slice and dice this dataset, we really need to UNIFY the data frames (binaries/funcs/params/locals)
# (NOTE this should actually happen during initial dataset creation, unless we specify a --quick mode or something)
#
# we HAVE RunGid mapping (RunGID->files)
#
# --> convert (local) binary IDs to a global binary ID (binary GID)
# (find # binaries in each run, take the max # binaries and round up to the NEXT largest 1,000...this is the BASE binary GID for this run)
# (now add each binary ID to the BASE to get its GID, for all tables (bins/funcs/params/locals))
# (save the base in this exp_runs table)
#       --> HAVE TO COPY THE FILES LOCALLY during this step so we don't overwrite the originals!
# --> pd.concat() all tables across all runs for each table type, one at a time (binary GIDs should now be unique)

# ...NOW we should be able to run our analysis to slice/dice the data based on the
# whole dataset (split/index using the varid's for individual data points)

In [11]:
# TODO: do this for all projects in the dataset...

# Remap each run's local BinaryId values to global IDs by adding a per-run base.
# NOTE: bins_df is rebound on every iteration, so after the loop only the table of
# the LAST run in exp_runs is still available.
base_gid = 1000     # start here

for i in range(len(exp_runs)):
    bins_df = pd.read_csv(exp_runs.iloc[i].BinariesCsv)

    # the next run starts at this run's base plus the number of binaries rounded
    # down to a multiple of 1,000, plus another 1,000
    next_base_gid = base_gid + (len(bins_df)//1000)*1000 + 1000

    # keep the original (run-local) ID so other tables can be remapped through it
    bins_df['OrigBinaryId'] = bins_df['BinaryId']
    bins_df['BinaryId'] = bins_df.BinaryId + base_gid

    base_gid = next_base_gid    # update for next entry

# A bare expression inside the loop body is never displayed by the notebook; show the
# remapped table here instead, as the last top-level expression of the cell.
# (raises NameError if exp_runs is empty and bins_df was never assigned)
bins_df

Unnamed: 0,BinaryId,Name,OrigBinaryId
0,1000,fighter,0
1,1001,assets,1
2,1002,config,2
3,1003,audio,3
4,1004,input,4
5,1005,debug,5
6,1006,sprites,6
7,1007,collision,7
8,1008,pakutil,8


In [32]:
# Load the remaining tables of the first run and rewrite their (run-local) BinaryId
# column to the global IDs assigned in bins_df.
# NOTE: bins_df must be the remapped binaries table of this same run (run 0); the
# remapping loop leaves bins_df holding the table of the last run it processed.
funcs_df = pd.read_csv(exp_runs.iloc[0].FuncsCsv)
params_df = pd.read_csv(exp_runs.iloc[0].ParamsCsv)
locals_df = pd.read_csv(exp_runs.iloc[0].LocalsCsv)

df_list = [funcs_df, params_df, locals_df]

# Build the original-ID -> global-ID lookup once instead of filtering bins_df for every
# row. drop_duplicates keeps the first match, like the previous per-row .iloc[0] did.
gid_by_orig_id = bins_df.drop_duplicates('OrigBinaryId').set_index('OrigBinaryId')['BinaryId']

for df in df_list:
    remapped = df['BinaryId'].map(gid_by_orig_id)

    # map() yields NaN for IDs that are not in the lookup; fail loudly rather than
    # silently writing NaN into the table
    missing = remapped.isna()
    if missing.any():
        unknown_ids = df.loc[missing, 'BinaryId'].unique().tolist()
        raise KeyError(f'BinaryId values not found in bins_df.OrigBinaryId: {unknown_ids}')

    df['BinaryId'] = remapped.astype(bins_df['BinaryId'].dtype)

In [39]:
# TODO: concat all funcs_dfs, params_dfs, etc. before writing to dataset root/processed/params.csv
# TODO: write all 4 files back to local root folder (root/processed I guess)
# TODO: have the VariableGraphBuilder use these files while creating the dataset instead of the original files
# - download copies files locally if desired
# - process FIRST does this logic (remap binaries, combine dfs, write to csv) then
#   creates var graphs from THESE csvs (so varid will have global binid)

In [12]:
# RunFolder value of the first row in exp_runs.
exp_runs['RunFolder'].iloc[0]

RunGid                                                         0
RunFolder       /home/cls0027/exp_builds/astera.exp/rundata/run1
BinariesCsv    /home/cls0027/exp_builds/astera.exp/rundata/ru...
FuncsCsv       /home/cls0027/exp_builds/astera.exp/rundata/ru...
ParamsCsv      /home/cls0027/exp_builds/astera.exp/rundata/ru...
LocalsCsv      /home/cls0027/exp_builds/astera.exp/rundata/ru...
Name: 0, dtype: object