## Imports

In [1]:
# import os
# os.chdir('../')

In [2]:
"""
    IMPORTING LIBS
"""
import inspect
import sys
import dgl

import numpy as np
import os
import socket
import time
import random
import glob
import argparse, json
import pickle

import torch
import torch.nn as nn
import torch.nn.functional as F

import torch.optim as optim
from torch.utils.data import DataLoader

from tensorboardX import SummaryWriter
from tqdm import tqdm

import random
import dgl
import torch
import torch.nn as nn
import torch.nn.functional as F
import itertools
import numpy as np
import scipy.sparse as sp

import pandas as pd

import dgl.function as fn
# Default model name. NOTE(review): the driver cell later in this notebook rebinds
# MODEL_NAME (to 'GatedGCN') before calling any function, so this value looks unused.
MODEL_NAME = 'GraphSage'

class DotDict(dict):
    """Dictionary whose entries are also reachable as attributes.

    ``d.key`` and ``d['key']`` refer to the same storage, for both reads and
    writes.
    """

    def __init__(self, **kwds):
        super().__init__(**kwds)
        # Point the instance attribute namespace at the mapping itself so that
        # attribute access and item access share one store.
        self.__dict__ = self
        
# Put the parent of this notebook's directory at the front of sys.path
# (presumably so the project packages imported right after this resolve).
_this_file = inspect.getfile(inspect.currentframe())
current_dir = os.path.dirname(os.path.abspath(_this_file))
parent_dir = os.path.dirname(current_dir)
sys.path.insert(0, parent_dir)

"""
    IMPORTING CUSTOM MODULES/METHODS
"""
# from nets.COLLAB_edge_classification.load_net import gnn_model # import all GNNS
from nets.COLLAB_edge_classification.load_net import gnn_model
from data.data import LoadData
"""
    GPU Setup
"""
def gpu_setup(use_gpu, gpu_id):
    """Choose the torch device for this run.

    Sets ``CUDA_DEVICE_ORDER`` to ``PCI_BUS_ID`` and ``CUDA_VISIBLE_DEVICES``
    to ``str(gpu_id)`` in the process environment, then picks the device.

    Args:
        use_gpu: Truthy to use CUDA when it is available.
        gpu_id: Value written (as a string) to ``CUDA_VISIBLE_DEVICES``.

    Returns:
        torch.device: ``cuda`` when CUDA is available and ``use_gpu`` is
        truthy, otherwise ``cpu``.
    """
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)

    cuda_ready = torch.cuda.is_available()
    if not (cuda_ready and use_gpu):
        # Also reached when CUDA exists but the caller asked for the CPU.
        print('cuda not available')
        return torch.device("cpu")

    print('cuda available with GPU:',torch.cuda.get_device_name(0))
    return torch.device("cuda")
# CPU-only defaults: no GPU requested, placeholder GPU id, device not resolved yet.
use_gpu = False
gpu_id = -1
device = None
# """
#     USER CONTROLS
# """

  from .autonotebook import tqdm as notebook_tqdm


## LOAD DATA

In [3]:
def load_datset(DATASET_NAME):
    """Load a dataset through ``LoadData`` and report progress on stdout.

    Args:
        DATASET_NAME: Name handed unchanged to ``LoadData``.

    Returns:
        Whatever ``LoadData(DATASET_NAME)`` returns.
    """
    print("[I] Loading data (notebook) ...")
    loaded = LoadData(DATASET_NAME)
    print("[I] Finished loading.....")
    return loaded

## DEFINE PARAMETER

In [4]:
def define_parameter(MODEL_NAME, dataset):
    """Build the network and training hyper-parameter dictionaries for a model.

    Args:
        MODEL_NAME: One of ``'MF'``, ``'MLP'``, ``'GCN'``, ``'GraphSage'``,
            ``'GAT'``, ``'GIN'``, ``'MoNet'`` or ``'GatedGCN'``.
        dataset: Object whose ``graph.ndata['feat']`` and
            ``graph.edata['feat']`` expose ``shape``; the last dimension of
            each becomes ``in_dim`` and ``in_dim_edge``.

    Returns:
        tuple: ``(net_params, params)``. ``net_params`` holds the architecture
        settings (its ``'device'`` entry is the module-level ``device``
        variable); ``params`` holds the optimisation settings.

    Raises:
        ValueError: If ``MODEL_NAME`` is not one of the supported names.
    """
    # Settings that differ per model; anything not listed uses the shared
    # defaults applied further down.
    model_table = {
        'MF':        {'L': 0,  'hidden_dim': 256, 'init_lr': 0.01, 'num_embs': 235868},
        'MLP':       {'L': 3,  'hidden_dim': 80},
        'GCN':       {'L': 5,  'hidden_dim': 74},
        'GraphSage': {'L': 10, 'hidden_dim': 38, 'layer_type': 'edgefeat'},
        'GAT':       {'L': 3,  'hidden_dim': 19, 'n_heads': 3},
        'GIN':       {'L': 3,  'hidden_dim': 60},
        'MoNet':     {'L': 3,  'hidden_dim': 53},
        'GatedGCN':  {'L': 5,  'hidden_dim': 35, 'layer_type': 'edgereprfeat'},
    }
    if MODEL_NAME not in model_table:
        # Fail early and clearly instead of hitting an unbound local later.
        raise ValueError(
            "Unknown MODEL_NAME {!r}; expected one of {}".format(MODEL_NAME, sorted(model_table))
        )
    chosen = model_table[MODEL_NAME]

    hidden_dim = chosen['hidden_dim']
    n_heads = chosen.get('n_heads', -1)
    # GAT is the only model whose out_dim is n_heads * hidden_dim; every other
    # model uses out_dim == hidden_dim.
    out_dim = n_heads * hidden_dim if MODEL_NAME == 'GAT' else hidden_dim

    net_params = {
        # generic network settings
        'device': device,
        'in_dim': dataset.graph.ndata['feat'].shape[-1],
        'in_dim_edge': dataset.graph.edata['feat'].shape[-1],
        'residual': True,
        'hidden_dim': hidden_dim,
        'out_dim': out_dim,
        'n_classes': 1,
        'n_heads': n_heads,
        'L': chosen['L'],  # min L should be 2
        'readout': "mean",
        'layer_norm': True,
        'batch_norm': True,
        'in_feat_dropout': 0.0,
        'dropout': 0.0,
        'edge_feat': False,
        'self_loop': False,
        'layer_type': chosen.get('layer_type', 'dgl'),
        # for MF
        'num_embs': chosen.get('num_embs', -1),
        # for MLPNet (set to True for the gated MLP variant)
        'gated': False,
        # specific for MoNet
        'pseudo_dim_MoNet': 2,
        'kernel': 3,
        # specific for GIN
        'n_mlp_GIN': 2,
        'learn_eps_GIN': True,
        'neighbor_aggr_GIN': 'sum',
        # specific for graphsage
        'sage_aggregator': 'maxpool',
        # positional encoding
        'pos_enc': True,
        'pos_enc_dim': 10,
    }

    params = {
        'seed': 41,
        'epochs': 500,
        'batch_size': 32*1024,
        'init_lr': chosen.get('init_lr', 0.001),
        'lr_reduce_factor': 0.5,
        'lr_schedule_patience': 10,
        'min_lr': 1e-5,
        'weight_decay': 0,
        'print_epoch_interval': 5,
        'max_time': 12,
    }

    return net_params, params
    


## WRITE TO FILE

In [5]:
def write_to_file(test_edges, test_pred, src_id, APPLICATION_NAME):
    """Write the scored candidate edges of one source node to a CSV file.

    Each edge's destination id is looked up among the function-like nodes of
    ``../new/<APPLICATION_NAME>_node.csv`` to attach its file name, start line
    and name. Rows are sorted by descending score, scores are formatted with
    12 decimals, and the table is saved as
    ``candidates_csv/candidate_<APPLICATION_NAME>_<src_id>.csv``.

    Args:
        test_edges: Indexable of edges; ``edge[0]`` / ``edge[1]`` are the
            source / destination ids and must support ``.item()``.
        test_pred: Indexable of scores aligned with ``test_edges``; each entry
            must support ``.item()``.
        src_id: Source node id, used only in the output file name.
        APPLICATION_NAME: Application whose node table is read.
    """
    function_types = ['FunctionDeclaration', 'ArrowFunctionExpression', 'FunctionExpression']
    nodes = pd.read_csv("../new/" + APPLICATION_NAME + '_node.csv')
    function_nodes = nodes[nodes.type.isin(function_types)]

    columns = {'src': [], 'dst': [], 'score': [], 'file_name': [], 'start_line': [], 'name': []}
    for position, edge in enumerate(test_edges):
        dst_id = edge[1].item()
        # First function node carrying this id; raises if there is none.
        match = function_nodes[function_nodes['new_id'] == dst_id].iloc[0]
        file_name = match['file_name']
        start_line = match['start_line']
        # A missing name is written as an empty string rather than NaN.
        name = "" if pd.isna(match['name']) else match['name']

        columns['src'].append(str(edge[0].item()))
        columns['dst'].append(str(dst_id))
        columns['score'].append(float(test_pred[position].item()))
        columns['file_name'].append(file_name)
        columns['start_line'].append(start_line)
        columns['name'].append(name)

    ranked = pd.DataFrame(columns)
    ranked = ranked.sort_values(by=['score'], ascending=False).reset_index(drop=True)
    # Sort on the numeric value first, then freeze the textual representation.
    ranked['score'] = [format(value, ".12f") for value in ranked['score'].tolist()]
    ranked.to_csv("candidates_csv/candidate_" + APPLICATION_NAME + "_" + str(src_id) + ".csv")

    # out_csv_file.flush()


## GENERATE TEST DATA

In [6]:
# def get_test_data(src_id):
#     print(src_id)
#     ROOT_PATH = "new_dataset/new/"
#     df = pd.read_csv(ROOT_PATH+ APPLICATION_NAME+'_node.csv')
#     stmt_type=['FunctionDeclaration', 'ArrowFunctionExpression', 'FunctionExpression']
#     df1 = df[df.type.isin(stmt_type)]
#     test_neg_id = df1['new_id'].tolist()
#     src = [src_id]*len(test_neg_id)
#     d={'src':src, 'dst':test_neg_id}
#     test_df = pd.DataFrame(d)
#     # print(len(test_neg_id), d)
#     test_edges = torch.from_numpy(test_df.to_numpy())
#     return test_edges


def get_test_data(src_id, for_true_negative=False, APPLICATION_NAME=""):
    """Build the candidate (source, destination) edges for one source node.

    Candidate destinations are the function-like nodes
    (``FunctionDeclaration``, ``ArrowFunctionExpression``,
    ``FunctionExpression``) located in the source node's own file or in any
    file listed for that file in
    ``../full_ast/<APPLICATION_NAME>_import_dict.json``. Listed paths
    containing ``lodash/internal`` are rewritten to ``lodash/.internal``
    before matching.

    Args:
        src_id: ``new_id`` of the source node in
            ``../new/<APPLICATION_NAME>_node.csv``.
        for_true_negative: When true, return the candidate destination rows
            themselves instead of an edge tensor.
        APPLICATION_NAME: Application whose node table and import dictionary
            are read.

    Returns:
        ``pandas.DataFrame`` of candidate destination nodes when
        ``for_true_negative`` is true; otherwise a tensor with one
        ``[src_id, dst_id]`` row per distinct candidate (shape ``(0, 2)``,
        dtype int64, when there is no candidate).

    Raises:
        IndexError: If ``src_id`` is not present in the node table.
        KeyError: If the source node's file is not a key of the import
            dictionary.
    """
    function_types = ['FunctionDeclaration', 'ArrowFunctionExpression', 'FunctionExpression']
    nodes = pd.read_csv("../new/" + APPLICATION_NAME + '_node.csv')
    # Context manager so the JSON handle is released even if parsing fails.
    with open('../full_ast/' + APPLICATION_NAME + '_import_dict.json') as handle:
        imports_by_file = json.load(handle)

    src_file = nodes[nodes['new_id'] == src_id].iloc[0]['file_name']
    candidate_files = [src_file]
    candidate_files.extend(
        imported.replace("lodash/internal", "lodash/.internal")
        for imported in imports_by_file[src_file]
    )

    dst_df = nodes[(nodes.type.isin(function_types)) & (nodes.file_name.isin(candidate_files))]
    if for_true_negative:
        return dst_df

    # Keep each destination once, in first-seen order.
    dst_ids = list(dict.fromkeys(dst_df['new_id'].tolist()))
    if not dst_ids:
        # An empty frame has object dtype, which torch.from_numpy rejects;
        # return a well-formed empty edge tensor so callers can test len().
        return torch.empty((0, 2), dtype=torch.int64)

    test_df = pd.DataFrame({'src': [src_id] * len(dst_ids), 'dst': dst_ids})
    return torch.from_numpy(test_df.to_numpy())

In [7]:
# get_test_data(8439, APPLICATION_NAME='lodash')

## VIEW MODEL PARAMETER

In [8]:
def view_model_param(MODEL_NAME, net_params):
    """Count the parameters of a freshly built model.

    Args:
        MODEL_NAME: Model name passed to ``gnn_model``.
        net_params: Network settings passed to ``gnn_model``.

    Returns:
        Sum, over every tensor yielded by ``model.parameters()``, of the
        product of its dimensions (``0`` when the model has no parameters).
    """
    network = gnn_model(MODEL_NAME, net_params)
    return sum(np.prod(list(weights.data.size())) for weights in network.parameters())

# Wall-clock timestamp taken when this cell runs.
# NOTE(review): nothing in the visible notebook reads tim1 — confirm it is still needed.
tim1 = time.time()


## Load Data

In [9]:
def LoadDF(APPLICATION_NAME):
    """Read the node table of an application.

    Args:
        APPLICATION_NAME: Application name used to build the CSV path.

    Returns:
        pandas.DataFrame parsed from ``../new/<APPLICATION_NAME>_node.csv``.
    """
    node_csv = "../new/" + APPLICATION_NAME + "_node.csv"
    return pd.read_csv(node_csv)

In [10]:
def set_parameters(MODEL_NAME, dataset, DATASET_NAME):
    """Assemble the final network and training parameters for a run.

    Starts from ``define_parameter`` and then fills in the run-specific
    entries: the device chosen by ``gpu_setup`` from the module-level
    ``use_gpu`` / ``gpu_id`` switches, the batch size, the input feature
    dimensions read from the dataset graph, and the model's parameter count.

    Args:
        MODEL_NAME: Model name.
        dataset: Dataset exposing ``graph.ndata['feat']`` and
            ``graph.edata['feat']``.
        DATASET_NAME: Dataset name; accepted for interface compatibility and
            not otherwise used here.

    Returns:
        tuple: ``(net_params, params)``.
    """
    net_params, params = define_parameter(MODEL_NAME=MODEL_NAME, dataset=dataset)

    net_params.update({
        'device': gpu_setup(use_gpu, gpu_id),
        'gpu_id': gpu_id,
        'batch_size': params['batch_size'],
        'in_dim': dataset.graph.ndata['feat'].shape[-1],
        'in_dim_edge': dataset.graph.edata['feat'].shape[-1],
        'n_classes': 1,  # binary prediction
    })
    # Needs the completed net_params because it instantiates the model.
    net_params['total_param'] = view_model_param(MODEL_NAME, net_params)

    return net_params, params

## Load Trained Model (restore checkpoint)

In [11]:
def train_model(dataset, EPOC_NUMBER, APPLICATION_NAME, net_params, params, MODEL_NAME):
    """Rebuild the network and restore its weights from a saved checkpoint.

    Despite its name, this function performs no training: it optionally adds
    positional encodings to the dataset (GatedGCN with ``pos_enc`` enabled),
    seeds the random number generators, instantiates the model and loads the
    state dict stored at
    ``out/debug/checkpoints/<MODEL_NAME>_NEW_DATASET--<APPLICATION_NAME>/RUN_/epoch_<EPOC_NUMBER>.pkl``.

    Args:
        dataset: Dataset exposing ``graph`` and the train/val/test edge sets.
        EPOC_NUMBER: Epoch number of the checkpoint to restore.
        APPLICATION_NAME: Application name used in the checkpoint path.
        net_params: Network settings; ``'device'``, ``'pos_enc'`` and
            ``'pos_enc_dim'`` are read here, the whole dict goes to
            ``gnn_model``.
        params: Training settings; only ``'seed'`` is read here.
        MODEL_NAME: Model name.

    Returns:
        tuple: ``(model, graph)`` with the restored model (moved to
        ``net_params['device']``) and ``dataset.graph``.
    """
    t0 = time.time()

    if MODEL_NAME in ['GatedGCN'] and net_params['pos_enc']:
        print("[!] Adding graph positional encoding",net_params['pos_enc_dim'])
        dataset._add_positional_encodings(net_params['pos_enc_dim'])
        print('Time PE:',time.time()-t0)

    graph = dataset.graph
    train_edges = dataset.train_edges
    val_edges, val_edges_neg = dataset.val_edges, dataset.val_edges_neg
    test_edges, test_edges_neg = dataset.test_edges, dataset.test_edges_neg

    device = net_params['device']
    random.seed(params['seed'])
    np.random.seed(params['seed'])
    torch.manual_seed(params['seed'])
    if device.type == 'cuda':
        torch.cuda.manual_seed(params['seed'])

    print("Graph: ", graph)
    print("Training Edges: ", len(train_edges))
    print("Validation Edges: ", len(val_edges) + len(val_edges_neg))
    print("Test Edges: ", len(test_edges) + len(test_edges_neg))

    print(net_params)
    model = gnn_model(MODEL_NAME, net_params)
    model = model.to(device)

    out_dir = 'out/debug/'
    PATH = out_dir + 'checkpoints/' + MODEL_NAME + "_NEW_DATASET--" + APPLICATION_NAME + "/RUN_/epoch_"+str(EPOC_NUMBER)+".pkl"
    # Read the checkpoint once, remapping its tensors onto the run device so a
    # checkpoint saved on a GPU still loads on a CPU-only machine.
    # NOTE: torch.load unpickles the file; only load checkpoints from a trusted source.
    state_dict = torch.load(PATH, map_location=device)
    model.load_state_dict(state_dict)

    # No optimizer or scheduler is created: nothing is trained here and only
    # the model and graph are returned.
    return model, graph


In [12]:
def get_EPOC_Number(PATH):
    """Return the highest epoch number found among the ROC-curve images.

    Looks at every entry of ``<PATH>/ROC_CURVE`` and reads the integer between
    the last underscore of the file name and its ``.png`` extension
    (e.g. ``roc_142.png`` -> ``142``). Entries that do not follow that pattern
    are ignored instead of aborting the scan.

    Args:
        PATH: Results directory containing a ``ROC_CURVE`` sub-directory.

    Returns:
        int: The largest epoch number, or ``-1`` when no matching file exists.
    """
    latest = -1
    for entry in glob.glob(PATH + "/ROC_CURVE/*"):
        stem, extension = os.path.splitext(os.path.basename(entry))
        if extension != ".png":
            continue
        _, separator, suffix = stem.rpartition("_")
        if not separator:
            continue
        try:
            epoch = int(suffix)
        except ValueError:
            # Something like "summary_final.png": not an epoch image.
            continue
        latest = max(latest, epoch)
    return latest

## plot graph

In [13]:
def plot_graph(rank_list,APPLICATION_NAME):
    """Save a histogram of candidate ranks for one application.

    The figure is written to
    ``candidate_figures/positive_<APPLICATION_NAME>.pdf``.

    Args:
        rank_list: Non-empty sequence of numeric ranks.
        APPLICATION_NAME: Application name used in the output file name.

    Raises:
        ValueError: If ``rank_list`` is ``None`` or empty.
    """
    import matplotlib.pyplot as plt
    import seaborn as sns

    if rank_list is None or len(rank_list) == 0:
        raise ValueError("rank_list is empty; nothing to plot")

    # One bin per two rank positions, but never fewer than one: a bin count of
    # zero (largest rank below 2) is rejected by the histogram routine.
    n_bins = max(1, int(max(rank_list)/2))

    plt.figure(figsize=(16, 8))
    # NOTE(review): seaborn has deprecated distplot; consider histplot once the
    # minimum seaborn version of this project is confirmed.
    sns.distplot(rank_list, hist=True, kde=False,
                bins=n_bins, color = 'blue',
                hist_kws={'edgecolor':'black'})

    plt.title('Histogram of Candidate Ranking')
    plt.xlabel('Ranking')
    plt.ylabel('Count')
    plt.savefig("candidate_figures/positive_"+APPLICATION_NAME+".pdf")
    # plt.savefig("ms_0.7.0.png")


## CANDIDATE FOR MISSED EDGES

In [14]:
def evaluate_network_for_missed_edge(model, device, graph, test_edges,
                     batch_size, DATASET_NAME="", MODEL_NAME="", SOURCE_NODE=0, DST_NODE=0, APPLICATION_NAME=""):
    """Score candidate edges with the model and rank them by score.

    Node embeddings are computed once for the whole graph, then
    ``model.edge_predictor`` scores the candidate edges batch by batch.

    Args:
        model: Network exposing ``eval()``, a forward call
            ``model(graph, x, e[, x_pos_enc])`` and ``edge_predictor``.
        device: Device the graph, features and edges are moved to.
        graph: Graph exposing ``to``, ``ndata['feat']``, ``edata['feat']`` and
            optionally ``ndata['pos_enc']``.
        test_edges: Tensor with one ``[src, dst]`` row per candidate edge.
        batch_size: Number of edges scored per ``edge_predictor`` call.
        DATASET_NAME, MODEL_NAME, SOURCE_NODE, DST_NODE, APPLICATION_NAME:
            Accepted for interface compatibility; not used by this function.

    Returns:
        pandas.DataFrame with columns ``src``, ``dst`` (ints) and ``score``
        (string with 12 decimals), sorted by descending numeric score with NaN
        scores first. Empty when ``test_edges`` has no rows.
    """
    if test_edges.size(0) == 0:
        return pd.DataFrame({"src": [], "dst": [], "score": []})

    model.eval()
    with torch.no_grad():
        graph = graph.to(device)
        x = graph.ndata['feat'].to(device)
        e = graph.edata['feat'].to(device).float()

        try:
            x_pos_enc = graph.ndata['pos_enc'].to(device)
        except KeyError:
            # Graph carries no positional encoding.
            x_pos_enc = None

        if x_pos_enc is None:
            h = model(graph, x, e)
        else:
            try:
                h = model(graph, x, e, x_pos_enc)
            except TypeError:
                # Model forward does not take a positional-encoding argument.
                h = model(graph, x, e)

        test_edges = test_edges.to(device)

        batch_scores = []
        for perm in DataLoader(range(test_edges.size(0)), batch_size):
            edge = test_edges[perm].t()
            # reshape(-1) keeps one dimension even for a single-edge batch, so
            # the batches can always be concatenated.
            batch_scores.append(model.edge_predictor(h[edge[0]], h[edge[1]]).reshape(-1).cpu())
        test_pred = torch.cat(batch_scores, dim=0)

    ranked = pd.DataFrame({
        "src": [int(node) for node in test_edges[:, 0].tolist()],
        "dst": [int(node) for node in test_edges[:, 1].tolist()],
        "score": [float(value) for value in test_pred.tolist()],
    })
    # Sort on the numeric score; sorting the formatted strings would order
    # them lexicographically (e.g. "9.5" ahead of "10.2").
    ranked = ranked.sort_values(by='score', ascending=False, na_position='first').reset_index(drop=True)
    ranked['score'] = ["{:.12f}".format(value) for value in ranked['score'].tolist()]
    return ranked

In [18]:
def print_rank(df, SOURCE_NODE, node_df):
    """Print the ranked candidates of the call sites that are under inspection.

    A row of ``df`` is printed only when its source node lives in one of the
    hard-coded files below AND starts on the line recorded for that file.
    Rows whose source file is not listed are skipped. No score threshold is
    applied.

    Side effect: ``df['score']`` is converted in place to float (empty strings
    become NaN).

    Args:
        df: Candidate table with ``src``, ``dst`` and ``score`` columns.
        SOURCE_NODE: Accepted for interface compatibility; not used here.
        node_df: Node table with ``new_id``, ``file_name``, ``name``,
            ``start_line`` and ``start_column`` columns.
    """
    # Source file -> start line of the call site to report.
    # NOTE(review): these are absolute paths from one developer machine;
    # they must match the paths stored in the node table.
    call_site_line = {
        '/Users/masudulhasanmasudbhuiyan/Documents/gitlab/libraries/lodash/cloneWith.js': 37,
        '/Users/masudulhasanmasudbhuiyan/Documents/gitlab/libraries/lodash/test/nthArg.js': 21,
        '/Users/masudulhasanmasudbhuiyan/Documents/gitlab/libraries/lodash/test/reduceRight.js': 40,
        '/Users/masudulhasanmasudbhuiyan/Documents/gitlab/libraries/lodash/keyBy.js': 28,
        '/Users/masudulhasanmasudbhuiyan/Documents/gitlab/libraries/lodash/test/wrap.js': 44,
        '/Users/masudulhasanmasudbhuiyan/Documents/gitlab/libraries/lodash/test/divide.test.js': 6,
        '/Users/masudulhasanmasudbhuiyan/Documents/gitlab/libraries/lodash/test/rearg.js': 68,
        '/Users/masudulhasanmasudbhuiyan/Documents/gitlab/libraries/formula-parser/src/grammar-parser/grammar-parser.js': 479,
        '/Users/masudulhasanmasudbhuiyan/Documents/gitlab/libraries/lodash/cloneDeep.js': 25,
    }

    df['score']=df.score.replace('',np.nan).astype(float)

    should_print = False
    for i in range(len(df)):
        candidate = df.iloc[i]
        src = int(candidate['src'])
        dst = int(candidate['dst'])
        score = candidate['score']

        src_node = node_df[node_df['new_id']==src].iloc[0]
        src_file_name = src_node['file_name']
        src_start_line = src_node['start_line']
        src_start_column = src_node['start_column']

        # Unknown files yield None, so they can never match (and never reuse
        # the expected line of a previous row).
        expected_line = call_site_line.get(src_file_name)
        if expected_line is None or src_start_line != expected_line:
            continue

        should_print = True
        dst_node = node_df[node_df['new_id']==dst].iloc[0]
        file_name = dst_node['file_name']
        name = dst_node['name']
        start_line = dst_node['start_line']
        start_column = dst_node['start_column']
        print(src_file_name, src_start_line, src_start_column,'==========>', file_name, name, start_line, start_column, '=======>', score)

    if should_print:
        # Blank lines separate the output of consecutive call sites.
        print("\n\n")

In [23]:
def get_candidate_for_missed_call_site(PATH, EPOC_NUMBER, model, graph, params, DATASET_NAME, MODEL_NAME, APPLICATION_NAME, include=None):
    """Rank and print candidate callees for the missed call sites of an application.

    Missed call sites are the ids of
    ``../csv_files/id_files/<APPLICATION_NAME>_missed_call_site_ids.csv`` that
    never appear as ``src`` in
    ``dynamic_edges/dynamic_edges_<APPLICATION_NAME>.csv``. For each such node
    located in one of the ``include`` files, candidate edges are built with
    ``get_test_data``, scored with ``evaluate_network_for_missed_edge`` and
    reported through ``print_rank``.

    Args:
        PATH: Accepted for interface compatibility; not used here.
        EPOC_NUMBER: Accepted for interface compatibility; not used here.
        model: Restored network used for scoring.
        graph: Graph the model is evaluated on.
        params: Training settings; only ``'batch_size'`` is read.
        DATASET_NAME: Forwarded to ``evaluate_network_for_missed_edge``.
        MODEL_NAME: Forwarded to ``evaluate_network_for_missed_edge``.
        APPLICATION_NAME: Application whose CSV files are read.
        include: Optional collection of source file paths to process.
            Defaults to the lodash ``test/nthArg.js`` file.

    Returns:
        None. Results are printed.
    """
    if include is None:
        # NOTE(review): absolute path from one developer machine; it must match
        # the file_name values stored in the node table.
        include = ['/Users/masudulhasanmasudbhuiyan/Documents/gitlab/libraries/lodash/test/nthArg.js']

    dynamic_edges = pd.read_csv("dynamic_edges/dynamic_edges_"+APPLICATION_NAME+".csv")
    dynamic_edges = dynamic_edges.drop_duplicates(keep='first').reset_index(drop=True)

    missed_df = pd.read_csv("../csv_files/id_files/"+APPLICATION_NAME+"_missed_call_site_ids.csv")
    # Set lookup instead of scanning the source list once per id.
    covered_sources = set(dynamic_edges['src'].tolist())
    missed_ids = [node_id for node_id in missed_df['id'].tolist() if node_id not in covered_sources]

    node_df_org = pd.read_csv("../new/"+APPLICATION_NAME+"_node.csv")
    node_df = node_df_org[node_df_org['new_id'].isin(missed_ids)].reset_index(drop=True)

    # Score on the device the model actually lives on; the module-level
    # `device` is only a fallback for a model without parameters.
    first_param = next(iter(model.parameters()), None)
    run_device = first_param.device if first_param is not None else device

    for i in range(len(node_df)):
        if node_df.iloc[i]['file_name'] not in include:
            continue
        src_id = node_df.iloc[i]['new_id']
        test_edges = get_test_data(src_id=src_id, APPLICATION_NAME=APPLICATION_NAME)
        if len(test_edges) > 0:
            ranked = evaluate_network_for_missed_edge(
                    model, run_device, graph, test_edges, params['batch_size'], DATASET_NAME=DATASET_NAME, MODEL_NAME=MODEL_NAME, SOURCE_NODE=src_id, APPLICATION_NAME=APPLICATION_NAME)
            print_rank(ranked, src_id, node_df_org)

                # print("\n\n")




## Candidates for missed call sites (driver)

In [22]:
# Driver: for each selected application, restore the checkpoint with the highest
# epoch number found under PATH/ROC_CURVE and print the ranked candidates for
# its missed call sites.
print(os.getcwd())

MODEL_NAME = 'GatedGCN'

# Other application sets used with this notebook:
#   ['lodash', 'formula-parser', 'mathjs']
#   ['formula-parser', 'lodash', 'express', 'js-yaml']
to_do_list = ['lodash']
app_list = ['lodash']

for i, APPLICATION_NAME in enumerate(app_list):
    DATASET_NAME = 'NEW_DATASET--'+APPLICATION_NAME

    print("\033[97m ------------------------------------------------------------------")
    print("\033[97m ", APPLICATION_NAME)
    print("\033[97m ------------------------------------------------------------------")

    node_df = LoadDF(APPLICATION_NAME)

    PATH = "results/NEW_DATASET--"+APPLICATION_NAME+"/"+MODEL_NAME
    EPOC_NUMBER = get_EPOC_Number(PATH=PATH)
    print(EPOC_NUMBER)

    dataset = load_datset(DATASET_NAME)
    net_params, params = set_parameters(MODEL_NAME, dataset, DATASET_NAME)
    model, graph = train_model(dataset, EPOC_NUMBER, APPLICATION_NAME, net_params, params, MODEL_NAME)

    rank_list = get_candidate_for_missed_call_site(
        PATH, EPOC_NUMBER, model, graph, params, DATASET_NAME, MODEL_NAME, APPLICATION_NAME)

/Users/masudulhasanmasudbhuiyan/Documents/gitlab/fresh_start/with_bidirectional_edge
[97m ------------------------------------------------------------------
[97m  lodash
[97m ------------------------------------------------------------------
142
[I] Loading data (notebook) ...
[I] Loading dataset lodash...
y ==>  77423 77423


  y=torch.tensor(targets, dtype=torch.float32)


[I] Finished loading.
[I] Data load time: 51.1365s
[I] Finished loading.....
cuda not available
[!] Adding graph positional encoding 10
Time PE: 24.760075569152832
Graph:  Graph(num_nodes=77423, num_edges=239893,
      ndata_schemes={'x_one_hot': Scheme(shape=(53,), dtype=torch.int64), 'x': Scheme(shape=(1,), dtype=torch.int64), 'param_len_one_hot': Scheme(shape=(9,), dtype=torch.int64), 'param_len': Scheme(shape=(1,), dtype=torch.int64), 'args_len_one_hot': Scheme(shape=(9,), dtype=torch.int64), 'args_len': Scheme(shape=(1,), dtype=torch.int64), 'name_one_hot': Scheme(shape=(1451,), dtype=torch.int64), 'name': Scheme(shape=(1,), dtype=torch.int64), 'feat': Scheme(shape=(1522,), dtype=torch.int64), 'pos_enc': Scheme(shape=(10,), dtype=torch.float32)}
      edata_schemes={'feat': Scheme(shape=(1,), dtype=torch.int64)})
Training Edges:  215686
Validation Edges:  47980
Test Edges:  434
{'device': device(type='cpu'), 'in_dim': 1522, 'in_dim_edge': 1, 'residual': True, 'hidden_dim': 35, 'ou

In [24]:
# Re-runs candidate generation with the globals left behind by the driver loop
# (PATH, model, graph, ... of the last application it processed).
get_candidate_for_missed_call_site(PATH, EPOC_NUMBER, model, graph, params, DATASET_NAME, MODEL_NAME, APPLICATION_NAME)






## Dynamic edge

In [None]:
# List the missed call sites of one application that never appear as the source
# of a dynamic edge, and collect their names in `lst`.
APPLICATION_NAME = 'formula-parser'

df = pd.read_csv("dynamic_edges/dynamic_edges_"+APPLICATION_NAME+".csv")
df = df.drop_duplicates(keep='first').reset_index(drop=True)

missed_df = pd.read_csv("../csv_files/id_files/"+APPLICATION_NAME+"_missed_call_site_ids.csv")
ids = missed_df['id'].tolist()
src = df['src'].tolist()

# Missed call-site ids with no dynamic edge starting from them.
missed_lst = [call_site for call_site in ids if call_site not in src]
lst = missed_lst

node_df = pd.read_csv("../new/"+APPLICATION_NAME+"_node.csv")
node_df = node_df[node_df['new_id'].isin(lst)].reset_index(drop=True)

# Names left out of the printed listing (they are still collected in `lst`).
exclude = ['toUpperCase','require','forEach','copySync','resolve','replace','Error','reduce','toString','push','fn','bind','on','done','split','slice','substr',
'match','log','matrix','bignumber','throws','map', 'func', 'constant','each','assign','toLowerCase']

# From here on `lst` holds the name of every missed call site, not ids.
lst = []
for i in range(len(node_df)):
    row = node_df.iloc[i]
    file_name, name = row['file_name'], row['name']
    start_line, start_column = row['start_line'], row['start_column']
    lst.append(name)
    # Skip excluded names and anything located under a /dist/ directory.
    if not (name in exclude or "/dist/" in file_name):
        print(file_name, name, start_line, start_column)
        

/Users/masudulhasanmasudbhuiyan/Documents/gitlab/libraries/formula-parser/src/error.js call 49 8
/Users/masudulhasanmasudbhuiyan/Documents/gitlab/libraries/formula-parser/src/evaluate-by-operator/operator/formula-function.js nan 18 17
/Users/masudulhasanmasudbhuiyan/Documents/gitlab/libraries/formula-parser/src/evaluate-by-operator/operator/formula-function.js nestedFormula 36 17
/Users/masudulhasanmasudbhuiyan/Documents/gitlab/libraries/formula-parser/src/grammar-parser/grammar-parser.js callVariable 93 15
/Users/masudulhasanmasudbhuiyan/Documents/gitlab/libraries/formula-parser/src/grammar-parser/grammar-parser.js evaluateByOperator 143 15
/Users/masudulhasanmasudbhuiyan/Documents/gitlab/libraries/formula-parser/src/grammar-parser/grammar-parser.js toNumber 189 15
/Users/masudulhasanmasudbhuiyan/Documents/gitlab/libraries/formula-parser/src/grammar-parser/grammar-parser.js callFunction 200 15
/Users/masudulhasanmasudbhuiyan/Documents/gitlab/libraries/formula-parser/src/grammar-parser

In [None]:
from collections import Counter

# Frequency of each name collected in `lst` by the previous cell.
name_counts = Counter(lst)
print(name_counts)

Counter({'parseNumber': 328, 'anyIsError': 151, 'flatten': 102, 'Error': 99, '__webpack_require__': 92, 'parseNumberArray': 77, 'push': 74, 'substring': 57, 'cdf': 56, 'call': 42, 'indexOf': 41, 'multiply': 34, 'on': 33, 'parseDate': 30, 'split': 30, 'apply': 29, 'toString': 29, 'inv': 26, 'rest': 25, 'slice': 24, 'mean': 24, 'reverse': 24, 'map': 21, 'getDate': 21, 'fn': 21, 'pdf': 20, 'test': 18, 'getFullYear': 17, 'concat': 15, 'arrayEach': 15, 'substr': 14, 'evaluateByOperator': 14, 'toUpperCase': 13, 'add': 13, 'toNumber': 13, 'replace': 12, 'getMonth': 12, 'f': 12, 'REPT': 12, 'sort': 11, 'forEach': 11, 'passfunc': 11, 'match': 10, '_random_fn': 10, 'reduce': 9, 'cols': 9, 'transpose': 9, 'S': 8, 'P': 8, 'parse': 8, 'toArray': 8, 'emit': 8, 'createToken': 7, 'compute': 7, 'numbers': 6, 'func': 6, 'setTimeout': 6, 'subtract': 6, nan: 6, 'join': 5, 'rows': 5, 'toLowerCase': 5, 'norm': 5, 'matrixmult': 5, 'done': 5, 'COUNT': 4, 'COUNTA': 4, 'pop': 4, 'argsToArray': 4, 'COMBIN': 4, '

In [None]:
import glob
import pandas as pd

# For every dynamic-edge CSV, print how many of its (src, dst) edges are absent
# from the matching function-edge table.
filepath = "dynamic_edges/*"
project_list = glob.glob(filepath)

for x in project_list:
    print(x)

    # Dataset name = text between the last "_" and the ".csv" suffix.
    index = x.rfind("_")
    last_index = x.rfind(".csv")
    DATASET_NAME = x[index+1:last_index]
    print(DATASET_NAME)

    df = (
        pd.read_csv(x)
        .drop_duplicates(subset=['src', 'dst'], keep='first')
        .reset_index(drop=True)
    )
    func_df = pd.read_csv("../new/"+DATASET_NAME+"_function_edges.csv")

    # Outer merge with an indicator column; "left_only" rows exist only in the
    # dynamic edges.
    merged = df.merge(func_df, indicator=True, how='outer')
    new_df = merged.query('_merge=="left_only"').drop('_merge', axis=1)

    print(len(df), len(func_df), len(new_df))
    # print(df.shape)

# print(lst)

dynamic_edges/dynamic_edges_formula-parser.csv
formula-parser
1327 2841 833
dynamic_edges/dynamic_edges_lodash.csv
lodash
905 2263 673
dynamic_edges/dynamic_edges_express.csv
express
4732 1221 3646
dynamic_edges/dynamic_edges_js-yaml.csv
js-yaml
382 688 176
dynamic_edges/dynamic_edges_mathjs.csv
mathjs
673 7489 338
