In [None]:
import keras
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Dropout, Flatten, Conv2D, MaxPooling2D
from keras.layers import Input, Embedding
from keras import backend as K

import os, sys, glob, numpy as np, sqlite3, json, random, difflib, cPickle as cp
from copy import deepcopy
from sklearn.cluster import KMeans
from sklearn.metrics import classification_report
sys.path.append('..')
from util.helpers import Tree_Dataset as Dataset, make_dir_if_not_exists as mkdir, get_rev_dict, prepend_line_numbers
from util.ast_helpers import get_subtree_list

from scipy.spatial.distance import cdist

In [None]:
def get_org_model(max_subtrees, max_nodes, embedding_dim, vocab_size, best_checkpoint):
    '''Build the complete pass/fail classifier and load its weights from a checkpoint.

    The returned model takes three inputs, in this order: 'program' (int32,
    shape (max_subtrees, max_nodes)), 'problem_id' (int32, shape (1,)) and
    'test_id' (int32, shape (1,)), and outputs a softmax over 2 classes.

    The sizes of the problem-id and test-id embedding tables are read from the
    module-level globals cnt_problem_ids and test_suite_size, not from arguments.

    best_checkpoint: path of the weights file handed to load_weights; weights
        are matched to layers by layer name (by_name=True).
    '''
    program_input = Input(shape=(max_subtrees, max_nodes), dtype='int32', name='program')
    embedded_program = Embedding(output_dim=embedding_dim, input_dim=vocab_size, name='program_embedding')(program_input)

    # Use convolution over program input
    base = Conv2D(64, (1, 1), padding='same', activation='relu', name='base')(embedded_program)
    # tower1: kernel spans one subtree and all max_nodes nodes
    tower_1 = Conv2D(64, (1, max_nodes), padding='valid', activation='relu', name='tower1')(base)
    # tower2: kernel spans three subtrees at a time, moving in steps of three subtrees
    tower_2 = Conv2D(64, (3, max_nodes), strides=(3, 1), padding='valid', activation='relu', name='tower2')(base)
    program_features = keras.layers.concatenate([tower_1, tower_2], axis=1)
    program_vector = Flatten()(program_features)

    # embed test_id, and problem_id input
    problem_id_input = Input(shape=(1,), dtype='int32', name='problem_id')
    embedded_problem_id = Embedding(output_dim=5, input_dim=cnt_problem_ids, name='problem_id_embedding')(problem_id_input)
    # drop the length-1 sequence axis so the embedding can be concatenated with the flat program vector
    embedded_problem_id = keras.layers.Reshape((5,))(embedded_problem_id)

    test_id_input = Input(shape=(1,), dtype='int32', name='test_id')
    embedded_test_id = Embedding(output_dim=5, input_dim=test_suite_size, name='test_id_embedding')(test_id_input)
    embedded_test_id = keras.layers.Reshape((5,))(embedded_test_id)

    merged = keras.layers.concatenate([program_vector, embedded_test_id, embedded_problem_id])

    hidden = Dense(128, activation='relu')(merged)
    hidden = Dense(64, activation='relu')(hidden)
    hidden = Dense(32, activation='relu')(hidden)
    logits = Dense(2)(hidden)
    output = keras.layers.Softmax()(logits)
    org_model = Model(inputs=[program_input, problem_id_input, test_id_input], outputs=output)

    print 'original model summary!'
    org_model.summary()
    # NOTE(review): the Dense/Softmax layers are unnamed, so by_name loading relies on
    # Keras' auto-generated names matching those stored in the checkpoint -- confirm.
    org_model.load_weights(best_checkpoint, by_name=True, skip_mismatch=False, reshape=False)
    org_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    return org_model

def get_emb_model(max_subtrees, max_nodes, embedding_dim, vocab_size):
    '''Build the embedding-only front half of the classifier.

    Takes the int32 'program' input of shape (max_subtrees, max_nodes) and
    returns the output of a single 'program_embedding' layer. The layer is
    freshly initialised here; no trained weights are loaded by this function.
    '''
    program_input = Input(shape=(max_subtrees, max_nodes), dtype='int32', name='program')
    output_embedded_program = Embedding(output_dim=embedding_dim, input_dim=vocab_size, name='program_embedding')(program_input)
    emb_model = Model(inputs=program_input, outputs=output_embedded_program)
    
    emb_model.summary()
    return emb_model

def get_rest_of_the_model(max_subtrees, max_nodes, embedding_dim, vocab_size):
    '''Build the back half of the classifier, which starts from an embedded program.

    Same layers as get_org_model after the program embedding, but the first
    input is the already-embedded program of shape
    (max_subtrees, max_nodes, embedding_dim). Inputs, in order: embedded
    program, 'problem_id', 'test_id'. Output: softmax over 2 classes.

    Reads the module-level globals cnt_problem_ids and test_suite_size.
    vocab_size is accepted but not used here. No weights are loaded.
    '''
    embedded_program_input = Input(shape=(max_subtrees, max_nodes, embedding_dim))

    # NOTE(review): layers are created in the same order as in get_org_model; the notebook
    # later copies org_model.get_weights()[1:] into this model positionally -- keep the order in sync.
    base = Conv2D(64, (1, 1), padding='same', activation='relu', name='base')(embedded_program_input)
    tower_1 = Conv2D(64, (1, max_nodes), padding='valid', activation='relu', name='tower1')(base)
    tower_2 = Conv2D(64, (3, max_nodes), strides=(3, 1), padding='valid', activation='relu', name='tower2')(base)
    program_features = keras.layers.concatenate([tower_1, tower_2], axis=1)
    program_vector = Flatten()(program_features)

    problem_id_input = Input(shape=(1,), dtype='int32', name='problem_id')
    embedded_problem_id = Embedding(output_dim=5, input_dim=cnt_problem_ids, name='problem_id_embedding')(problem_id_input)
    embedded_problem_id = keras.layers.Reshape((5,))(embedded_problem_id)

    test_id_input = Input(shape=(1,), dtype='int32', name='test_id')
    embedded_test_id = Embedding(output_dim=5, input_dim=test_suite_size, name='test_id_embedding')(test_id_input)
    embedded_test_id = keras.layers.Reshape((5,))(embedded_test_id)

    merged = keras.layers.concatenate([program_vector, embedded_test_id, embedded_problem_id])

    hidden = Dense(128, activation='relu')(merged)
    hidden = Dense(64, activation='relu')(hidden)
    hidden = Dense(32, activation='relu')(hidden)
    logits = Dense(2)(hidden)
    output = keras.layers.Softmax()(logits)
    model = Model(inputs=[embedded_program_input, problem_id_input, test_id_input], outputs=output)
    print 'rest of the model summary!'
    model.summary()
    return model

def prepare_failing_data(data_directory, program_ids_for_eval):
    '''collect negative examples (which fail a test case)'''

    neg_example_indices = []
    for idx, program_id in enumerate(program_ids):
        # each program appears with many test cases, some of which can pass
        if program_id in program_ids_for_eval and not verdicts[idx]:
            neg_example_indices.append(idx)
    print 'len(neg_example_indices):', len(neg_example_indices)

    neg_program_ids, neg_programs, neg_problem_ids, neg_test_ids,  neg_verdicts, neg_buggy_subtrees = [], [], [], [], [], []
    done = set()
    for idx in neg_example_indices:
        neg_program_ids.append(program_ids[idx])
        neg_programs.append(programs[idx])
        neg_problem_ids.append(problem_ids[idx])
        neg_test_ids.append(test_ids[idx])
        neg_verdicts.append(verdicts[idx])
        neg_buggy_subtrees.append(buggy_subtrees[idx])
        done.add('{}_{}'.format(program_ids[idx], test_ids[idx]))
    
    print 'len(done):', len(done)

    # to make things compatible with older negative subtree processing
    with open(os.path.join(data_directory, 'examples-eval.pkl'), 'r') as f:
        eval_examples = cp.load(f)

    _, eval_program_ids, _, _, _, eval_buggy_subtrees = zip(*eval_examples)

    pid_to_buggy_subtree_map = {}
    for eval_program_id, eval_buggy_subtree in zip(eval_program_ids, eval_buggy_subtrees):
        pid_to_buggy_subtree_map[eval_program_id] = eval_buggy_subtree
        
    to_delete = []
    neg_buggy_line_to_subtrees = []
    for idx, neg_program_id in enumerate(neg_program_ids):
        try:
            neg_buggy_line_to_subtrees.append(pid_to_buggy_subtree_map[neg_program_id])
        except KeyError:
            to_delete.append(idx)
            
    print len(to_delete), len(neg_buggy_line_to_subtrees)
    assert len(neg_program_ids) == len(neg_buggy_line_to_subtrees), len(neg_buggy_line_to_subtrees)

    return neg_program_ids, neg_programs, neg_problem_ids, neg_test_ids,  neg_verdicts, neg_buggy_subtrees, neg_buggy_line_to_subtrees

def get_negative_example(evaluation_dataset, index):
    '''Pick the index-th entry out of each of the seven parallel columns.

    evaluation_dataset: 7-tuple of parallel sequences (program ids, programs,
        problem ids, test ids, verdicts, buggy subtrees, buggy-line-to-subtree maps).

    Returns the corresponding 7-tuple for the single example at `index`.
    '''
    # unpacking enforces that exactly seven columns were supplied
    ids, program_vecs, problems, tests, outcomes, subtrees, line_maps = evaluation_dataset
    columns = (ids, program_vecs, problems, tests, outcomes, subtrees, line_maps)
    return tuple(column[index] for column in columns)

def get_models_result_for_example(example_tuple, org_model, emb_model, model):
    '''Run a single example through both the split model and the original model.

    example_tuple: 7-tuple as returned by get_negative_example; only the
        program, problem id and test id (positions 1 to 3) are used.
    org_model: predicts from [program, problem_id, test_id].
    emb_model: maps [program] to the embedded program.
    model: predicts from [embedded program, problem_id, test_id].

    Returns (model_result, org_model_result): the first row of each model's
    prediction for the one-example batch.
    '''
    _, neg_program, neg_problem_id, neg_test_id, _, _, _ = example_tuple

    # wrap everything in a leading batch axis of size 1
    example_x = [np.array([neg_program]), np.array([[neg_problem_id]]), np.array([[neg_test_id]])]

    emb_program = emb_model.predict([example_x[0]])
    test_x = [emb_program, example_x[1], example_x[2]]

    model_result = model.predict(test_x, verbose=False)
    org_model_result = org_model.predict(example_x, verbose=False)

    return model_result[0], org_model_result[0]

def genetate_classification_report(evaluation_dataset, org_model, emb_model, model):

    org_model_Y_pred, model_Y_pred = [], []
    neg_program_ids, neg_programs, neg_problem_ids, neg_test_ids, neg_verdicts, neg_buggy_subtrees, neg_buggy_line_to_subtrees = evaluation_dataset
    todo_cnt = len(neg_program_ids)
    
    for idx in range(test_cnt):
        neg_program_id, neg_program, neg_problem_id, neg_test_id, neg_verdict, \
        neg_buggy_subtree, neg_buggy_line_to_subtree = get_negative_example(evaluation_dataset, idx)
        neg_problem_id = rev_problem_id_dict[neg_problem_id]

        example_x = [ np.array([neg_program]), np.array([[neg_problem_id]]), np.array([[neg_test_id]]) ]
        example_y = keras.utils.to_categorical([neg_verdict], num_classes=2)

        emb_program = emb_model.predict([example_x[0]])
        test_x = [emb_program, example_x[1], example_x[2]]
        test_y = example_y
    
        model_Y_pred.append(np.squeeze(model.predict(test_x, verbose=False)))
        org_model_Y_pred.append(np.squeeze(org_model.predict(example_x, verbose=False)))

        print 'new model results:'
        model_Y_pred = np.argmax(model_Y_pred, axis=1)
        print '#correct predictions:', np.sum(neg_verdicts[:test_cnt] == model_Y_pred)
        print(classification_report(neg_verdicts[:test_cnt], model_Y_pred))

        print '\norg model results:'
        org_model_Y_pred = np.argmax(org_model_Y_pred, axis=1)
        print '#correct predictions:', np.sum(neg_verdicts[:test_cnt] == org_model_Y_pred)
        print(classification_report(neg_verdicts[:test_cnt], org_model_Y_pred))

################################# attribution helpers

# embedding model: program -> embedded program, memoised per program id
global_embedded_incorrect_programs_cache = {}
def get_embedded_program(session, emb_model, program, program_id):
    '''Return emb_model's prediction for `program`, cached by `program_id`.

    session: accepted for signature compatibility; not used.
    emb_model: object whose predict([program]) yields the embedding.
    program: input handed to emb_model.predict inside a one-element list.
    program_id: cache key. With None the cache is bypassed entirely; otherwise
        the first result computed for that id is stored in
        global_embedded_incorrect_programs_cache and returned on later calls,
        regardless of the `program` passed then.
    '''
    if program_id is None:
        return emb_model.predict([program])

    cache = global_embedded_incorrect_programs_cache
    try:
        return cache[program_id]
    except KeyError:
        cache[program_id] = emb_model.predict([program])
        return cache[program_id]

# rest of the model: predictions and gradients for an embedded program
def make_predictions_and_gradients(session, model, softmax_output, label_gradients):
    '''Create a function that evaluates softmax_output and label_gradients.

    session: backend session providing make_callable.
    model: model whose first three inputs are fed (embedded program,
        problem id, test id).
    softmax_output, label_gradients: the two tensors fetched on every call.

    Also feeds the module-level global placeholder target_label_one_hot.

    Returns preds_and_grads_fn(emb_program, example_x, example_y), which feeds
    emb_program, example_x[1], example_x[2] and example_y and returns
    (softmax predictions, label gradients).
    '''
    fetches = [softmax_output, label_gradients]
    feeds = [model.input[0], model.input[1], model.input[2], target_label_one_hot]
    run_graph = session.make_callable(fetches, feed_list=feeds)

    def preds_and_grads_fn(emb_program, example_x, example_y):
        # example_x[0] (the raw program) is ignored; the embedded program is fed instead
        predictions, gradients = run_graph(emb_program, example_x[1], example_x[2], example_y)
        return predictions, gradients

    return preds_and_grads_fn

def top_label_id_and_score(emb_program, example_x, preds_and_grads_fn):
    '''Return (index, score) of the highest-scoring class for one example.

    Only the predictions are needed, so a placeholder one-hot label for
    class 0 is fed to preds_and_grads_fn and the returned gradients are
    discarded. The first row of the predictions is used.
    '''
    placeholder_label = keras.utils.to_categorical(0, num_classes=2)
    softmax_rows, _ = preds_and_grads_fn(emb_program, example_x, [placeholder_label])
    scores = softmax_rows[0]
    best = np.argmax(scores)
    return best, scores[best]

def integrated_gradients(emb_program, example_x, example_y, predictions_and_gradients, emb_baseline, steps=100):
    '''Approximate integrated gradients of emb_program relative to a baseline.

    The input is interpolated linearly from the baseline to emb_program in
    steps+1 points; predictions_and_gradients is evaluated once per point.
    The gradients of the first `steps` points are averaged and multiplied
    element-wise by (emb_program - baseline).

    emb_program: numpy array, the embedded input to attribute.
    example_x, example_y: passed through unchanged to predictions_and_gradients.
    predictions_and_gradients: callable (emb, example_x, example_y) ->
        (prediction, gradient), where gradient has the shape of emb.
    emb_baseline: array with the same shape as emb_program, or None for an
        all-zero baseline.
    steps: number of interpolation intervals (must be non-zero).

    Returns (attributions, predictions): attributions has the shape of
    emb_program; predictions is the squeezed array of the steps+1 predictions.

    Raises AssertionError when the baseline or averaged-gradient shape does not
    match emb_program.
    '''
    inp = emb_program
    baseline = 0 * inp if emb_baseline is None else emb_baseline
    assert baseline.shape == inp.shape, 'baseline.shape:{}, inp.shape:{}'.format(baseline.shape, inp.shape)

    # Scale input and compute gradients. One model evaluation per point: the
    # prediction needed for reporting comes from the same call as the gradient.
    predictions, grads = [], []
    for i in range(0, steps + 1):
        scaled_input = baseline + (float(i) / steps) * (inp - baseline)
        prediction, grad = predictions_and_gradients(scaled_input, example_x, example_y)
        predictions.append(prediction)
        grads.append(grad)

    # the gradient at the final point (the input itself) is left out of the average
    avg_grads = np.average(np.array(grads[:-1]), axis=0)
    assert np.shape(avg_grads) == np.shape(emb_program), 'avg_grads shape:{}, inp shape:{}'.format(np.shape(avg_grads), np.shape(emb_program))
    integrated_gradients_value = (inp - baseline) * avg_grads  # shape: <inp.shape>
    return integrated_gradients_value, np.squeeze(np.array(predictions))

################################################# other helper functions

def get_id_map(ast, program_id=None):
    '''Map every identifier found in the AST to a small integer index.

    Identifiers are taken from nodes containing both '_<id>_' and '@': the
    text between the first '_<id>_' and the following '@'. They are collected
    in order of first appearance; when program_id is not None the list is
    shuffled after seeding the global `random` generator with program_id,
    so the same program always gets the same mapping.

    ast: iterable of (subtree, coord) pairs, subtree being an iterable of
        node strings.
    Returns a dict identifier -> index (0 .. number of identifiers - 1).
    '''
    found = []
    for subtree, _coord in ast:
        for node in subtree:
            if '_<id>_' not in node or '@' not in node:
                continue
            identifier = node.split('_<id>_')[1].split('@')[0]
            if identifier not in found:
                found.append(identifier)

    if program_id is not None:
        random.seed(program_id)
        random.shuffle(found)

    return dict((identifier, position) for position, identifier in enumerate(found))

def normalize_ids(ast, id_map):
    '''Rewrite identifier names in AST nodes to their index from id_map.

    In every node containing both '_<id>_' and '@', the identifier between
    them is replaced by str(id_map[identifier]); other nodes are copied
    unchanged. Coordinates are kept as they are.

    Returns a new list of (subtree, coord) pairs; the input is not modified.
    Raises KeyError if an identifier is missing from id_map.
    '''
    def _renamed(node):
        if '_<id>_' in node and '@' in node:
            org_id = node.split('_<id>_')[1].split('@')[0]
            return node.replace('_<id>_' + org_id + '@', '_<id>_' + str(id_map[org_id]) + '@')
        return node

    return [([_renamed(node) for node in subtree], coord) for subtree, coord in ast]


def vectorize_subtree_list_ast(_tl_dict, subtree_list_ast, max_subtrees_per_program, max_nodes_per_subtree, buggy_line):
    '''Translate an AST (list of subtrees) into lists of token indices.

    _tl_dict: mapping token -> integer index.
    subtree_list_ast: sequence of (subtree, coord) pairs; coord is a
        'line:char' string.
    buggy_line: line number to look for; the index of the first subtree whose
        coord line equals it is reported.

    Returns (vec_ast, buggy_subtree). vec_ast is a list with one list of token
    indices per subtree; buggy_subtree is that subtree index or None. Returns
    (None, None) when the program has more than max_subtrees_per_program
    subtrees or some subtree has more than max_nodes_per_subtree tokens.

    Raises KeyError for a token missing from _tl_dict.
    '''
    if len(subtree_list_ast) > max_subtrees_per_program:
        return None, None

    encoded_ast = []
    buggy_index = None
    for position, (tokens, coord) in enumerate(subtree_list_ast):
        # coordinates are only parsed until the buggy subtree has been found
        if buggy_index is None:
            line_no, _column = [int(part) for part in coord.split(':')]
            if line_no == buggy_line:
                buggy_index = position

        encoded = [_tl_dict[token] for token in tokens]
        encoded_ast.append(encoded)

        if len(encoded) > max_nodes_per_subtree:
            return None, None
    return encoded_ast, buggy_index


def load_all_correct_programs(db_path, problem_id_set):
    query='''SELECT p.program_id, program, user_id, subtree_list_ast_without_leaves, time_stamp FROM
            programs p INNER JOIN orgsource o ON o.program_id = p.program_id
            INNER JOIN test_run_summary trs ON trs.program_id = p.program_id
            WHERE trs.verdict="ALL_PASS" AND problem_id=?;'''

    correct_programs = {}

    with sqlite3.connect(db_path) as conn:
        c = conn.cursor()

        for problem_id in problem_id_set:
            correct_programs[problem_id] = []
            vectorization_errors = 0
            size_mismatch_errors = 0

            for row in c.execute(query, (problem_id,)):
                program_id, program, user_id, subtree_list_ast, time_stamp = row
                subtree_list_ast = json.loads(subtree_list_ast)
                program = program.encode('utf-8','ignore')

                id_map = get_id_map(subtree_list_ast, program_id=program_id)
                norm_id_subtree_list_ast = normalize_ids(subtree_list_ast, id_map)
                try:
                    vec_ast, buggy_subtree = vectorize_subtree_list_ast(tl_dict, norm_id_subtree_list_ast, dataset.max_subtrees_per_program, dataset.max_nodes_per_subtree, buggy_line=0)
                except KeyError:
                    vectorization_errors += 1
                
                if vec_ast is not None:
                    correct_programs[problem_id].append((program_id, program, user_id, time_stamp, id_map, vec_ast))
                else:
                    size_mismatch_errors += 1

            print 'problem_id:', problem_id, '#correct_programs:', len(correct_programs[problem_id]),
            print 'vectorization_errors:', vectorization_errors, 'size_mismatch_errors:', size_mismatch_errors

        c.close()
    return correct_programs

def get_all_correct_program_embeddings(correct_programs):
    correct_program_embeddings = {}
    correct_program_id_map = {}
    _user_indices = {}
    for problem_id in correct_programs:
        _user_indices[problem_id] = {}
        correct_program_id_map[problem_id] = []
        ast_list = []
        for idx, (program_id, program, user_id, time_stamp, id_map, vec_ast) in enumerate(correct_programs[problem_id]):
            ast_list.append(vec_ast)
            correct_program_id_map[problem_id].append(program_id)
            if user_id not in _user_indices[problem_id]:
                _user_indices[problem_id][user_id] = set()
            _user_indices[problem_id][user_id].add(idx)
            
        ast_batch, _, _ = dataset.prepare_batch(ast_list)
        embeddings_batch = get_embedded_program(sess, emb_model, ast_batch, None)
        correct_program_embeddings[problem_id] = embeddings_batch
        correct_program_id_map[problem_id] = np.array(correct_program_id_map[problem_id])
        print problem_id, len(correct_program_embeddings[problem_id]), len(_user_indices[problem_id])

    return correct_program_embeddings, correct_program_id_map, _user_indices

def get_correct_embeddings(problem_id, user_id_to_exclude, in_time_stamp=None):
    '''Return embeddings and program ids of the correct programs of one problem.

    Reads the module-level globals _user_indices, correct_programs,
    correct_program_embeddings and correct_program_id_map.

    problem_id: problem whose correct programs are wanted.
    user_id_to_exclude: programs of this user are left out (None: keep all users).
    in_time_stamp: when given, programs of this problem whose time stamp is
        greater than it are left out as well.

    Returns (embeddings, program_ids) restricted to the remaining positions,
    in ascending position order.

    Raises AssertionError if nothing remains and in_time_stamp is None.
    '''
    indices_to_exclude = set()
    if user_id_to_exclude is not None:
        indices_to_exclude.update(_user_indices[problem_id].get(user_id_to_exclude, ()))

    if in_time_stamp is not None:
        # Only this problem's programs are inspected: positions are relative to
        # one problem's list, and problem_id must not be rebound here.
        for idx, (_, _, _, time_stamp, _, _) in enumerate(correct_programs[problem_id]):
            if time_stamp > in_time_stamp:
                indices_to_exclude.add(idx)

    all_indices = set(range(len(correct_program_embeddings[problem_id])))
    indices = sorted(all_indices - indices_to_exclude)

    assert in_time_stamp is not None or len(indices) > 0, '#indices:%d, problem_id:%s, user_id:%s' % (len(indices), problem_id, user_id_to_exclude)
    return correct_program_embeddings[problem_id][indices], correct_program_id_map[problem_id][indices]

def get_correct_embeddings_using_clustering(problem_id, user_id_to_exclude, inc_embedding):
    '''Return the correct programs that share a k-means cluster with inc_embedding.

    Reads the module-level globals cluster_store, _user_indices,
    correct_program_embeddings and correct_program_id_map. For a problem,
    cluster_store[problem_id][0] is the fitted k-means object and
    cluster_store[problem_id][3] maps a cluster label to a set of positions.

    inc_embedding: 4-d array; it is flattened to (first axis, rest) and only
        the cluster predicted for its first row is used.
    user_id_to_exclude: positions of this user's programs are removed
        (None: keep everything).

    Returns (embeddings, program_ids) for the remaining positions, ascending.
    '''
    store = cluster_store[problem_id]
    kmeans, _labels, positions_by_cluster = store[0], store[1], store[3]

    shape = np.shape(inc_embedding)
    flattened = np.reshape(inc_embedding, (shape[0], shape[1] * shape[2] * shape[3]))
    cluster_index = kmeans.predict(flattened)[0]

    excluded = set()
    if user_id_to_exclude is not None and user_id_to_exclude in _user_indices[problem_id]:
        excluded = _user_indices[problem_id][user_id_to_exclude]

    indices = sorted(positions_by_cluster[cluster_index] - excluded)
    return correct_program_embeddings[problem_id][indices], correct_program_id_map[problem_id][indices]

def get_program_details(program_id):
    '''Look up problem id, user id and source text of a program.

    Queries the database at the module-level global db_path. If the query
    yields several rows, the last one is used.

    Returns (problem_id, user_id, program) with program encoded as UTF-8
    (characters that cannot be encoded are ignored). Fails with an unbound
    local variable error when the program id is not found.
    '''
    query = '''SELECT problem_id, user_id, program 
                FROM orgsource o INNER JOIN programs p ON o.program_id=p.program_id
                WHERE o.program_id=?;'''
    with sqlite3.connect(db_path) as conn:
        cursor = conn.cursor()
        for row in cursor.execute(query, (program_id,)):
            last_row = row
    problem_id, user_id, program = last_row
    return problem_id, user_id, program.encode('utf-8','ignore')

def remove_empty_lines(program):
    '''Drop lines that are empty or whitespace-only; keep the others untouched.

    Lines are split on and re-joined with '\\n'.
    '''
    kept = []
    for line in program.split('\n'):
        if line.strip():
            kept.append(line)
    return '\n'.join(kept)

def get_least_k(arr, k):
    '''Return indices and values of the k smallest entries of arr, ascending.

    arr: 1-d numpy array. When k is at least len(arr), all entries are returned.
    Returns (indices, values), both numpy arrays sorted by increasing value.
    '''
    # the partition position must be a valid index, hence the clamp to len(arr)-1
    pivot = min(len(arr) - 1, k)
    candidates = np.argpartition(arr, pivot)[:k]
    order = np.argsort(arr[candidates])
    ranked = candidates[order]
    return ranked, arr[ranked]

global_ref_program_detail_cache = {}
def get_ref_program(program_id):
    '''Return details of an incorrect program and of its reference program.

    The reference program id, diff output and edit locations come from the
    module-level global direct_eval_set[program_id]; sources, user ids and
    ASTs are read from the database at the module-level global db_path.
    Results are memoised in global_ref_program_detail_cache.

    Returns the tuple (inc_program, inc_user_id, inc_subtree_list_ast,
    c_program_id, ref_program, ref_user_id, ref_subtree_list_ast, diff_out,
    edit_locations). Program sources are UTF-8 encoded; ASTs are decoded
    from JSON.
    '''
    if program_id in global_ref_program_detail_cache:
        return global_ref_program_detail_cache[program_id]

    c_program_id, diff_out, diff_len, edit_locations = direct_eval_set[program_id]

    query='''SELECT program, user_id, subtree_list_ast_without_leaves FROM
            programs p INNER JOIN orgsource o ON o.program_id = p.program_id
            WHERE p.program_id=?;'''

    def _fetch(cursor, wanted_id):
        # first matching row only; StopIteration if the id is unknown
        source, user_id, ast_json = next(cursor.execute(query, (wanted_id,)))
        return source.encode('utf-8','ignore'), user_id, json.loads(ast_json)

    with sqlite3.connect(db_path) as conn:
        c = conn.cursor()
        inc_program, inc_user_id, inc_subtree_list_ast = _fetch(c, program_id)
        ref_program, ref_user_id, ref_subtree_list_ast = _fetch(c, c_program_id)
        c.close()

        global_ref_program_detail_cache[program_id] = (
            inc_program, inc_user_id, inc_subtree_list_ast,
            c_program_id, ref_program, ref_user_id, ref_subtree_list_ast,
            diff_out, edit_locations,
        )

    return global_ref_program_detail_cache[program_id]


def get_baselines(incorrect_embedding, correct_embeddings, correct_embedding_ids, k=25):
    '''Pick the k correct-program embeddings closest to an incorrect one.

    Closeness is cosine distance between the flattened embeddings.

    incorrect_embedding: 4-d array with a leading axis of size 1.
    correct_embeddings: array of candidate embeddings, one per leading index.
    correct_embedding_ids: numpy array of ids parallel to correct_embeddings.
    k: number of baselines wanted; capped at the number of candidates.

    Returns (baselines, baseline_ids), nearest first; baselines has the shape
    of incorrect_embedding with the leading axis replaced by k.
    '''
    original_shape = np.shape(incorrect_embedding)
    flat_len = original_shape[1] * original_shape[2] * original_shape[3]

    query = np.reshape(incorrect_embedding, (1, flat_len))
    candidates = np.reshape(correct_embeddings, (np.shape(correct_embeddings)[0], flat_len))

    distances = cdist(query, candidates, 'cosine')[0]
    nearest, _ = get_least_k(distances, k)

    k = min(k, len(candidates))
    chosen = nearest[:k]
    baseline_ids = correct_embedding_ids[chosen]
    # restore the per-program embedding shape, now with k programs in front
    baselines = np.reshape(candidates[chosen], [k] + list(original_shape[1:]))
    return baselines, baseline_ids

def get_verified_baseline(baselines, baseline_ids, problem_id, test_id):
    '''Return the first baseline that the model classifies as label 1.

    Baselines are tried in the given order using the module-level global
    predictions_and_gradients for the given problem and test.

    Returns (baseline, baseline_id, position), where baseline keeps a leading
    axis of size 1, or (None, None, None) when no baseline qualifies.
    '''
    # The raw program slot is a placeholder: only the problem and test ids of
    # example_x are fed, alongside the embedded baseline.
    dummy_program_vec = [0]
    example_x = [np.array([dummy_program_vec]), np.array([[problem_id]]), np.array([[test_id]])]

    for position in range(len(baselines)):
        candidate = baselines[position:position + 1]
        candidate_id = baseline_ids[position]
        top_label_id, _score = top_label_id_and_score(candidate, example_x, predictions_and_gradients)
        if np.equal(top_label_id, 1):
            return candidate, candidate_id, position
    return None, None, None

def get_line_attribution(subtree_attributions, program_id):
    '''Aggregate per-subtree attributions into per-line attributions.

    The subtree -> line mapping is read from the module-level global
    subtree_to_line_maps[program_id] (indexable by subtree position).
    Attributions are consumed in order until a subtree position has no line
    mapping; each line's value is the mean of its subtrees' attributions.

    Returns (line_attributions, min_line): a numpy array with one value per
    distinct line, ordered by increasing line number, and the smallest line
    number seen.

    Raises IndexError when no subtree could be mapped to a line.
    '''
    subtree_to_line = subtree_to_line_maps[program_id]
    line_attributions = {}
    for idx, attrb in enumerate(subtree_attributions):
        try:
            line = subtree_to_line[idx]
        except (KeyError, IndexError):
            # ran past the last subtree that has a line mapping
            break
        line_attributions.setdefault(line, []).append(attrb)

    lines = sorted(line_attributions)
    min_line = lines[0]
    line_attributions_list = [np.mean(line_attributions[line]) for line in lines]
    return np.array(line_attributions_list), min_line

def get_top_k_lines(subtree_attributions, program_id, k):
    '''Return the k lines with the highest attribution, best first.

    Returns (lines, attributions, line_count): line numbers, their
    attribution values, and the number of distinct lines that received an
    attribution. k is capped at line_count.
    '''
    per_line, first_line = get_line_attribution(subtree_attributions, program_id)
    line_count = len(per_line)
    ranked_offsets, ranked_scores = get_top_k(per_line, min(line_count, k))
    # NOTE(review): offset + first_line equals the real line number only if the
    # attributed lines are consecutive -- confirm against the subtree-to-line maps.
    lines = [offset + first_line for offset in ranked_offsets]
    return lines, ranked_scores, line_count

def get_top_k(arr, k):
    '''Return indices and values of the k largest entries of arr, descending.

    arr: 1-d numpy array. Returns (indices, values): a list of indices ordered
    by decreasing value and the numpy array of the matching values.
    '''
    candidates = np.argpartition(arr, -k)[-k:]
    ascending = candidates[np.argsort(arr[candidates])]
    descending = list(ascending[::-1])
    return descending, arr[descending]

In [None]:
# Locations of the network inputs, the trained checkpoints and the SQLite database.
data_directory = 'data/network_inputs/bugloc-original/'
checkpoints_directory = 'data/checkpoints/bugloc-original/'
db_path = 'data/dataset.db'
# Filled in by the data-loading cell below, which only builds it while it is None.
dataset = None

In [None]:
class custom_args:
    '''Plain container for the notebook's settings (stands in for parsed command-line arguments).

    data_directory: directory with the network inputs.
    checkpoints_directory: directory with the model checkpoints.
    batch_size: batch size. The default is 16, the value this class always
        used before the parameter was honoured.
    embedding_dim, dropout, epochs, only_test: stored unchanged.
    '''
    def __init__(self, data_directory, checkpoints_directory, batch_size=16, embedding_dim=32, dropout=0.2, epochs=50, only_test=False):
        self.data_directory = data_directory
        self.checkpoints_directory = checkpoints_directory
        # honour the caller's value instead of silently forcing 16
        self.batch_size = batch_size
        self.embedding_dim = embedding_dim
        self.dropout = dropout
        self.epochs = epochs
        self.only_test = only_test
        
# Settings object used by the cells below; everything except the two directories keeps its default.
args = custom_args(data_directory=data_directory, checkpoints_directory=checkpoints_directory)

In [None]:
## Load data
data_directory = args.data_directory
checkpoints_directory = args.checkpoints_directory
batch_size = args.batch_size
embedding_dim = args.embedding_dim

# Build the dataset only once; re-running this cell reuses the existing object.
if dataset is None:
    dataset = Dataset(data_directory)
tl_dict = dataset.get_tl_dict()
rev_tl_dict = dataset.get_rev_tl_dict()
# reversed versions of the dataset's problem-id and test dictionaries
rev_problem_id_dict = get_rev_dict(dataset.get_problem_id_dict())
rev_test_dict = get_rev_dict(dataset.get_test_dict())

num_train, num_validation, num_test, num_all = dataset.data_size
print 'Training:', num_train, '\nValidation:', num_validation, '\nTest:', num_test, '\nAll:', num_all
print 'vocabulary size:', dataset.vocab_size

# Globals read by the model-building functions (cnt_problem_ids and test_suite_size
# are used inside get_org_model / get_rest_of_the_model without being passed in).
vocab_size = dataset.vocab_size
cnt_problem_ids = dataset.cnt_problem_IDs
test_suite_size = dataset.test_suite_size
max_subtrees = dataset.max_subtrees_per_program
max_nodes = dataset.max_nodes_per_subtree

print 'cnt_problem_ids:', cnt_problem_ids, 'test_suite_size:', test_suite_size
print 'max_subtrees:', max_subtrees, 'max_nodes:', max_nodes

In [None]:
programs, program_lengths, subtree_lengths, problem_ids, test_ids, verdicts, buggy_subtrees, program_ids = dataset.get_batch(start=0, end=num_all, which='all')

train_x = [programs[:num_train], np.array(problem_ids[:num_train]), np.array(test_ids[:num_train])]
train_y = keras.utils.to_categorical(verdicts[:num_train], num_classes=2)

valid_x = [programs[num_train:num_train+num_validation], np.array(problem_ids[num_train:num_train+num_validation]), np.array(test_ids[num_train:num_train+num_validation])]
valid_y = keras.utils.to_categorical(verdicts[num_train:num_train+num_validation], num_classes=2)

otest_x = [programs[num_train+num_validation:], np.array(problem_ids[num_train+num_validation:]), np.array(test_ids[num_train+num_validation:])]
otest_y = keras.utils.to_categorical(verdicts[num_train+num_validation:], num_classes=2)

print '#train:', len(train_y), '#valid:', len(valid_y), '#test:', len(otest_y)

In [None]:
# distinct program ids of the test part (everything after the train and validation examples)
eval_program_ids = set(program_ids[num_train+num_validation:])

In [None]:
eval_set = np.load('data/eval_set.npy').item()
print '#problems:', len(eval_set),

direct_eval_set = {}
for problem_id in eval_set:
    for program_id, row in eval_set[problem_id].items():
        direct_eval_set[program_id] = row

eval_set_program_ids = direct_eval_set.keys()
print '#programs:', len(eval_set_program_ids)

subtree_to_line_maps = np.load('data/subtree_to_line_map.npy').item()
print 'len(subtree_to_line_maps):', len(subtree_to_line_maps)

In [None]:
# Ids excluded from evaluation (subtracted from eval_program_ids further down).
# NOTE(review): if this array has object dtype, newer NumPy needs allow_pickle=True here -- confirm.
errs_in_finding_buggy_subtrees = set(np.load(os.path.join(args.data_directory, 'errs_in_finding_buggy_subtrees.npy')))
print len(errs_in_finding_buggy_subtrees)

## First load original model

In [None]:
# find checkpoint
ckpts = glob.glob(os.path.join(checkpoints_directory, "weights.*-*.hdf5"))
best_checkpoint, initial_epoch = None, 0
if len(ckpts) > 0:
    for ckpt in ckpts:
        # ckpt_epoch = int(ckpt.split('-')[0].split('.')[1])
        ckpt_epoch = int(ckpt.split('/')[-1].split('-')[0].split('.')[1])
        if initial_epoch < ckpt_epoch:
            initial_epoch = ckpt_epoch
            best_checkpoint = ckpt
            
print best_checkpoint

In [None]:
org_model = get_org_model(max_subtrees, max_nodes, embedding_dim, vocab_size, best_checkpoint)
print 'evaluating org model'
print 'valid'
print org_model.evaluate(valid_x, valid_y, verbose=1)

print '\ntest'
print org_model.evaluate(otest_x, otest_y, verbose=1)

print '\ntrain'
train_eval_cnt = 14337
small_train_x = [train_x[0][:train_eval_cnt], train_x[1][:train_eval_cnt], train_x[2][:train_eval_cnt]]
print org_model.evaluate(small_train_x, train_y[:train_eval_cnt], verbose=1)

## Break model in two parts -
### 1) Given program, get embedding

In [None]:
# embedding-only model; its weights are still untrained at this point
emb_model = get_emb_model(max_subtrees, max_nodes, embedding_dim, vocab_size)

In [None]:
## use org model to initialize emb_model weights correctly
# layers[1] is the 'program_embedding' layer (layers[0] is the input layer).
# NOTE(review): assumes the first entry of org_model.get_weights() is the program
# embedding matrix -- confirm against the org model's layer order.
emb_model.layers[1].set_weights([org_model.get_weights()[0]])

### 2) Given program embedding as input, do rest of the computation

In [None]:
# model that starts from an embedded program; weights are copied in by the next cell
model = get_rest_of_the_model(max_subtrees, max_nodes, embedding_dim, vocab_size)
# guards the attribution-graph cell so its tensors are only built once per model
model_updated_for_attribution = False

In [None]:
## use org model to initialize the weights of `model` (the rest of the network)
# The copy is positional, so both models must create their layers in the same order.
weights_list = org_model.get_weights()   # 1st correspond to embedding layer, which needs to be removed
model.set_weights(weights_list[1:])

## Gather failing test case examples

In [None]:
# Failing (program, test) examples of the test split, leaving out the program ids
# listed in errs_in_finding_buggy_subtrees.
neg_program_ids, neg_programs, neg_problem_ids, neg_test_ids,  neg_verdicts, neg_buggy_subtrees, \
neg_buggy_line_to_subtrees = prepare_failing_data(data_directory, eval_program_ids-errs_in_finding_buggy_subtrees)

# the same seven parallel lists bundled in the order get_negative_example expects
evaluation_dataset = (neg_program_ids, neg_programs, neg_problem_ids, neg_test_ids, neg_verdicts, neg_buggy_subtrees, neg_buggy_line_to_subtrees)
evaluation_dataset_length = len(neg_program_ids)

In [None]:
# Sanity check: the split model (emb_model followed by model) and the original
# model are run on every failing example, one example per predict call, and
# their classification reports are printed for comparison.
org_model_Y_pred, model_Y_pred = [], []
test_cnt = len(neg_verdicts)
for idx in range(test_cnt):
    neg_program_id = neg_program_ids[idx]
    neg_program = neg_programs[idx]
    neg_problem_id = neg_problem_ids[idx]
    neg_test_id = neg_test_ids[idx]
    neg_verdict = neg_verdicts[idx]
    neg_buggy_subtree = neg_buggy_subtrees[idx]

    # every input gets a leading batch axis of size 1
    example_x = [ np.array([neg_program]), np.array([[neg_problem_id]]), np.array([[neg_test_id]]) ]
    example_y = keras.utils.to_categorical([neg_verdict], num_classes=2)

    emb_program = emb_model.predict([example_x[0]])
    test_x = [emb_program, example_x[1], example_x[2]]
    test_y = example_y
   
    model_Y_pred.append(np.squeeze(model.predict(test_x, verbose=False)))
    org_model_Y_pred.append(np.squeeze(org_model.predict(example_x, verbose=False)))

print 'new model results:'
# from here on model_Y_pred holds predicted class ids instead of per-class scores
model_Y_pred = np.argmax(model_Y_pred, axis=1)
print '#correct predictions:', np.sum(neg_verdicts[:test_cnt] == model_Y_pred)
print(classification_report(neg_verdicts[:test_cnt], model_Y_pred))

print '\norg model results:'
org_model_Y_pred = np.argmax(org_model_Y_pred, axis=1)
print '#correct predictions:', np.sum(neg_verdicts[:test_cnt] == org_model_Y_pred)
print(classification_report(neg_verdicts[:test_cnt], org_model_Y_pred))

## Update model for attribution

In [None]:
# Build the extra tensors needed for attribution, once per model instance.
if not model_updated_for_attribution:
    # one-hot selector for the class whose score is differentiated
    target_label_one_hot = K.placeholder(shape=(None,2,))
    softmax_output = model.layers[-1].output
    # scalar: softmax score of the selected class, summed over everything that is fed
    label_softmax_output = K.sum(softmax_output * target_label_one_hot)
    program_embedding_input = model.input[0]
    # gradient of that score with respect to the embedded program input
    label_gradients = K.gradients(label_softmax_output, program_embedding_input)[0]
    model_updated_for_attribution = True

In [None]:
# backend session used to evaluate the attribution tensors
sess = K.get_session()
print sess

In [None]:
# callable (emb_program, example_x, example_y) -> (softmax predictions, label gradients);
# also read as a global by get_verified_baseline
predictions_and_gradients = make_predictions_and_gradients(sess, model, softmax_output, label_gradients)

## Gather all correct programs

In [None]:
query='''SELECT p.program_id, program, user_id, subtree_list_ast_without_leaves, time_stamp FROM
        programs p INNER JOIN orgsource o ON o.program_id = p.program_id
        INNER JOIN test_run_summary trs ON trs.program_id = p.program_id
        WHERE trs.verdict="ALL_PASS" AND problem_id=?;'''

correct_programs = {}

with sqlite3.connect(db_path) as conn:
    c = conn.cursor()
    
    problem_ids_used = [str(row[0]) for row in c.execute('SELECT DISTINCT problem_id FROM orgsource;')]

    for problem_id in problem_ids_used:
        correct_programs[problem_id] = []
        vectorization_errors = 0
        size_mismatch_errors = 0

        for row in c.execute(query, (problem_id,)):
            program_id, program, user_id, subtree_list_ast, time_stamp = row
            subtree_list_ast = json.loads(subtree_list_ast)
            program = program.encode('utf-8','ignore')

            id_map = get_id_map(subtree_list_ast, program_id=program_id)
            norm_id_subtree_list_ast = normalize_ids(subtree_list_ast, id_map)
            try:
                vec_ast, buggy_subtree = vectorize_subtree_list_ast(tl_dict, norm_id_subtree_list_ast, dataset.max_subtrees_per_program, dataset.max_nodes_per_subtree, buggy_line=0)
            except KeyError:
                vectorization_errors += 1
            
            if vec_ast is not None:
                correct_programs[problem_id].append((program_id, program, user_id, time_stamp, id_map, vec_ast))
            else:
                size_mismatch_errors += 1

        print 'problem_id:', problem_id, '#correct_programs:', len(correct_programs[problem_id]),
        print 'vectorization_errors:', vectorization_errors, 'size_mismatch_errors:', size_mismatch_errors

    c.close()

## Get embeddings corresponding to correct programs

In [None]:
correct_program_embeddings = {}
correct_program_id_map = {}
_user_indices = {}
for problem_id in correct_programs:
    _user_indices[problem_id] = {}
    correct_program_id_map[problem_id] = []
    ast_list = []
    for idx, (program_id, program, user_id, time_stamp, id_map, vec_ast) in enumerate(correct_programs[problem_id]):
        ast_list.append(vec_ast)
        correct_program_id_map[problem_id].append(program_id)
        if user_id not in _user_indices[problem_id]:
            _user_indices[problem_id][user_id] = set()
        _user_indices[problem_id][user_id].add(idx)
        
    ast_batch, _, _ = dataset.prepare_batch(ast_list)
    embeddings_batch = get_embedded_program(sess, emb_model, ast_batch, None)
    correct_program_embeddings[problem_id] = embeddings_batch
    correct_program_id_map[problem_id] = np.array(correct_program_id_map[problem_id])
    print problem_id, len(correct_program_embeddings[problem_id]), len(_user_indices[problem_id])

## bug-localization

In [None]:
bug_to_line_map = np.load('data/bug_to_line_map.npy').item()
print 'len(bug_to_line_map):', len(bug_to_line_map)

In [None]:
all_eval_programs = set()
for idx in range(len(neg_program_ids)):
    neg_program_id = neg_program_ids[idx]
    all_eval_programs.add(neg_program_id)
print len(all_eval_programs)

In [None]:
wrong_predictions = []
empty_baselines = []

all_faulty_lines = {}
faulty_lines_found = {10:{}, 5:{}, 1:{}}

pairs_localized = {10:set(), 5:set(), 1:set()}
pairs_missed = {10:set(), 5:set(), 1:set()}
programs_localized = {10:set(), 5:set(), 1:set()}
all_eval_programs = set()
remaining_eval_programs = set()
max_k = 10

baseline_indices = []

todo_cnt = len(neg_program_ids)
done = 0
skipped = []

for idx in range(todo_cnt):
    neg_program_id = neg_program_ids[idx]
    neg_program = neg_programs[idx]
    neg_problem_id = neg_problem_ids[idx]
    neg_test_id = neg_test_ids[idx]
    neg_verdict = neg_verdicts[idx]
    neg_buggy_subtree = neg_buggy_subtrees[idx]
    neg_buggy_line_to_subtree = neg_buggy_line_to_subtrees[idx]

    if neg_program_id not in all_faulty_lines:
        all_faulty_lines[neg_program_id] = {}
    
    if neg_program_id in bug_to_line_map:
        try:
            all_faulty_lines[neg_program_id][neg_test_id] = deepcopy(bug_to_line_map[neg_program_id][rev_test_dict[neg_test_id]])
        except KeyError:
            skipped.append((neg_program_id, rev_problem_id_dict[neg_problem_id], rev_test_dict[neg_test_id]))
            print '#Skipped:', len(skipped), '\r',
            continue
    else:
        all_faulty_lines[neg_program_id][neg_test_id] = set(neg_buggy_line_to_subtree.keys())
        
    all_eval_programs.add(neg_program_id)
        
    for each in [10,5,1]:
        if neg_program_id not in faulty_lines_found[each]:
            faulty_lines_found[each][neg_program_id] = {neg_test_id:set()}
        else:
            assert neg_test_id not in faulty_lines_found[each][neg_program_id]
            faulty_lines_found[each][neg_program_id][neg_test_id] = set()
    
    assert len(all_faulty_lines[neg_program_id][neg_test_id])>0, neg_program_id
        

    example_x = [ np.array([neg_program]), np.array([[neg_problem_id]]), np.array([[neg_test_id]]) ]
    example_y = keras.utils.to_categorical([neg_verdict], num_classes=2)
    
    emb_inc_program = get_embedded_program(sess, emb_model, example_x[0], neg_program_id)

    top_label_id, score = top_label_id_and_score(emb_inc_program, example_x, predictions_and_gradients)
    correct_neg_prediction = np.equal(top_label_id, neg_verdict)

    if not correct_neg_prediction:
        wrong_predictions.append((neg_program_id,neg_test_id))
        continue
    else:
        remaining_eval_programs.add(neg_program_id)
    
    user_id_to_exclude = get_ref_program(neg_program_id)[1]
    emb_corr_programs, emb_corr_program_ids = get_correct_embeddings(rev_problem_id_dict[neg_problem_id], user_id_to_exclude)
    baselines, baseline_ids = get_baselines(emb_inc_program, emb_corr_programs, emb_corr_program_ids, k=25)
    v_baseline, v_baseline_id, v_baseline_index = get_verified_baseline(baselines, baseline_ids, neg_problem_id, neg_test_id)
    baseline_indices.append(v_baseline_index)
    
    if v_baseline is None:
        empty_baselines.append((neg_program_id,neg_test_id))
        continue

    baseline_emb = top_baseline = v_baseline
    
    attributions, predictions = integrated_gradients(emb_inc_program, example_x, example_y, predictions_and_gradients, baseline_emb, steps=100)
    subtree_attributions = np.squeeze(np.mean(np.amax(attributions, axis=-1), axis=-1))  # np.squeeze()
    top_k_subtrees, top_k_subtree_vals = get_top_k(subtree_attributions, max_k)

    top_k_lines, top_k_line_vals, program_length = get_top_k_lines(subtree_attributions, neg_program_id, k=max_k)
    
    for top_k in [10,5,1]:
        some_line_found = False
        for line in all_faulty_lines[neg_program_id][neg_test_id]:
            if line in top_k_lines[:top_k]:
                faulty_lines_found[top_k][neg_program_id][neg_test_id].add(line)
                some_line_found = True
                           
        if some_line_found:
            pairs_localized[top_k].add((neg_program_id,neg_test_id))
            programs_localized[top_k].add((neg_program_id))
        else:
            pairs_missed[top_k].add((neg_program_id,neg_test_id))
        

    remaining = todo_cnt - idx - 1
    denom = idx+1
    
    if idx%5==0 and idx>0:
        for top_k in [10,5,1]:
            print '%d|F:%4d, M:%4d, A:%5.2f%%' % (top_k, len(pairs_localized[top_k]), len(pairs_missed[top_k]), 100.0*len(pairs_localized[top_k])/denom),
        print '|| CNT:%4d, WP:%3d, EB:%3d \r' % (remaining, len(wrong_predictions), len(empty_baselines)),
        
    done += 1
        
for top_k in [10,5,1]:
    print '%d|F:%4d, M:%4d, A:%5.2f%%' % (top_k, len(pairs_localized[top_k]), len(pairs_missed[top_k]), 100.0*len(pairs_localized[top_k])/denom),
print '|| CNT:%4d, WP:%3d, EB:%3d \r' % (remaining, len(wrong_predictions), len(empty_baselines))

print 'skipped:', len(skipped), '\n'

In [None]:
total_pairs = sum(map(len, [pairs_localized[top_k], pairs_missed[top_k], wrong_predictions]))
print 'total_pairs:', total_pairs, 'pairs localized:'
print 'top_k localized'
for top_k in [10,5,1]:
    print top_k, len(pairs_localized[top_k]), '%4.2f%%' % (100.0*len(pairs_localized[top_k])/total_pairs)

In [None]:
prog_faulty_lines = {}
for program_id in all_faulty_lines:
    if program_id not in prog_faulty_lines: prog_faulty_lines[program_id] = set()
    for test_id in all_faulty_lines[program_id]:
        prog_faulty_lines[program_id].update(all_faulty_lines[program_id][test_id])
        
cnt_faulty_lines = 0
for program_id in prog_faulty_lines:
    cnt_faulty_lines += len(prog_faulty_lines[program_id])

prog_faulty_lines_found = {10:{},5:{},1:{}}
# print cnt_faulty_lines,
for top_k in [10,5,1]:
    for program_id in faulty_lines_found[top_k]:
        if program_id not in prog_faulty_lines_found[top_k]: prog_faulty_lines_found[top_k][program_id] = set()
        for test_id in faulty_lines_found[top_k][program_id]:
            prog_faulty_lines_found[top_k][program_id].update(faulty_lines_found[top_k][program_id][test_id])
    
    cnt_lines_found = 0
    for program_id in prog_faulty_lines_found[top_k]:
        cnt_lines_found += len(prog_faulty_lines_found[top_k][program_id])

#     print cnt_lines_found, '%4.2f%%' % (100.0*cnt_lines_found/cnt_faulty_lines),
# print '\n'

# for programs with more than 1 faulty lines
print 'Grouped by line diff counts:'
cnt_faulty_progs, cnt_faulty_progs_found = {}, {}
cnt_faulty_lines, cnt_lines_found = {}, {}
for top_k in [10,5,1]:
    cnt_faulty_progs[top_k], cnt_faulty_progs_found[top_k] = {'total':0}, {'total':0}
    cnt_faulty_lines[top_k], cnt_lines_found[top_k] = {'total':0}, {'total':0}
    for num_buggy_lines in range(1,6):
        cnt_faulty_progs[top_k][num_buggy_lines], cnt_faulty_progs_found[top_k][num_buggy_lines] = 0, 0
        cnt_faulty_lines[top_k][num_buggy_lines], cnt_lines_found[top_k][num_buggy_lines] = 0, 0
        for program_id in prog_faulty_lines:
            if len(prog_faulty_lines[program_id]) == num_buggy_lines:
                cnt_faulty_progs[top_k][num_buggy_lines] += 1
                cnt_faulty_lines[top_k][num_buggy_lines] += num_buggy_lines
                cnt_faulty_progs[top_k]['total'] += 1
                cnt_faulty_lines[top_k]['total'] += num_buggy_lines
                
                cnt_faulty_progs_found[top_k][num_buggy_lines] += 1 if len(prog_faulty_lines_found[top_k][program_id]) > 0 else 0
                cnt_lines_found[top_k][num_buggy_lines] += len(prog_faulty_lines_found[top_k][program_id])
                cnt_faulty_progs_found[top_k]['total'] += 1 if len(prog_faulty_lines_found[top_k][program_id]) > 0 else 0
                cnt_lines_found[top_k]['total'] += len(prog_faulty_lines_found[top_k][program_id])

print '\t\t\t\t top-10 \t\t top-5 \t\t top-1'
print 'diff|progs lines|', '%-22s|' % ('#10 PF \t #10 LF'), '%-22s|' % ('#5 PF \t #5 LF'), '%-22s|' % ('#1 PF \t #1 LF') 

for key in cnt_faulty_lines[10]:    
    if cnt_faulty_progs[10][key] == 0:
        continue
    print '{}\t'.format(key), '%4d %4d' % (cnt_faulty_progs[10][key], cnt_faulty_lines[10][key]),
    for top_k in [10,5,1]:
        print '%4d (%6.2f%%)' % (cnt_faulty_progs_found[top_k][key], (100.0*cnt_faulty_progs_found[top_k][key]/cnt_faulty_progs[top_k][key])),
        print '%4d (%6.2f%%)' % (cnt_lines_found[top_k][key], (100.0*cnt_lines_found[top_k][key]/cnt_faulty_lines[top_k][key]) ),
    print

print

baseline_indices = list(sorted(baseline_indices))
for percentile in [50,60,70,80,90,95,98,99]:
    i = int( percentile * len(baseline_indices) / 100.0 )
    print 'percentile:', percentile, 'baseline_indices:',  baseline_indices[i]

### results with wrong baselines counted explicitly

In [None]:
total_pairs = sum(map(len, [pairs_localized[top_k], pairs_missed[top_k]]))
print 'total_pairs', 10, 5, 1
print total_pairs,
for top_k in [10,5,1]:
    # print top_k, tests_localized[top_k], tests_localized[top_k] + tests_missed[top_k] + len(skipped) + wrong_predictions
    print len(pairs_localized[top_k]), '%4.2f%%' % (100.0*len(pairs_localized[top_k])/total_pairs),

In [None]:
prog_faulty_lines = {}
for program_id in all_faulty_lines:
    if program_id not in prog_faulty_lines: prog_faulty_lines[program_id] = set()
    for test_id in all_faulty_lines[program_id]:
        if (program_id, test_id) not in wrong_predictions:
            prog_faulty_lines[program_id].update(all_faulty_lines[program_id][test_id])
        
cnt_faulty_lines = 0
for program_id in prog_faulty_lines:
    cnt_faulty_lines += len(prog_faulty_lines[program_id])

prog_faulty_lines_found = {10:{},5:{},1:{}}
# print 'total top-10 top-5 top-1'
# print cnt_faulty_lines,
for top_k in [10,5,1]:
    for program_id in faulty_lines_found[top_k]:
        if program_id not in prog_faulty_lines_found[top_k]: prog_faulty_lines_found[top_k][program_id] = set()
        for test_id in faulty_lines_found[top_k][program_id]:
            prog_faulty_lines_found[top_k][program_id].update(faulty_lines_found[top_k][program_id][test_id])
    
    cnt_lines_found = 0
    for program_id in prog_faulty_lines_found[top_k]:
        cnt_lines_found += len(prog_faulty_lines_found[top_k][program_id])

#     print cnt_lines_found, '%5.2f%%' % (100.0*cnt_lines_found/cnt_faulty_lines),
# print '\n'


print 'Grouped by line diff counts:'
cnt_faulty_progs, cnt_faulty_progs_found = {}, {}
cnt_faulty_lines, cnt_lines_found = {}, {}
for top_k in [10,5,1]:
    cnt_faulty_progs[top_k], cnt_faulty_progs_found[top_k] = {'total':0}, {'total':0}
    cnt_faulty_lines[top_k], cnt_lines_found[top_k] = {'total':0}, {'total':0}
    for num_buggy_lines in range(1,6):
        cnt_faulty_progs[top_k][num_buggy_lines], cnt_faulty_progs_found[top_k][num_buggy_lines] = 0, 0
        cnt_faulty_lines[top_k][num_buggy_lines], cnt_lines_found[top_k][num_buggy_lines] = 0, 0
        for program_id in prog_faulty_lines:
            if len(prog_faulty_lines[program_id]) == num_buggy_lines:
                cnt_faulty_progs[top_k][num_buggy_lines] += 1
                cnt_faulty_lines[top_k][num_buggy_lines] += num_buggy_lines
                cnt_faulty_progs[top_k]['total'] += 1
                cnt_faulty_lines[top_k]['total'] += num_buggy_lines
                
                cnt_faulty_progs_found[top_k][num_buggy_lines] += 1 if len(prog_faulty_lines_found[top_k][program_id]) > 0 else 0
                cnt_lines_found[top_k][num_buggy_lines] += len(prog_faulty_lines_found[top_k][program_id])
                cnt_faulty_progs_found[top_k]['total'] += 1 if len(prog_faulty_lines_found[top_k][program_id]) > 0 else 0
                cnt_lines_found[top_k]['total'] += len(prog_faulty_lines_found[top_k][program_id])

print '\t\t\t\t top-10 \t\t top-5 \t\t top-1'
print 'diff|progs lines|', '%-22s|' % ('#10 PF \t #10 LF'), '%-22s|' % ('#5 PF \t #5 LF'), '%-22s|' % ('#1 PF \t #1 LF') 

for key in cnt_faulty_lines[10]:    
    if cnt_faulty_progs[10][key] == 0:
        continue
    print '{}\t'.format(key), '%4d %4d' % (cnt_faulty_progs[10][key], cnt_faulty_lines[10][key]),
    for top_k in [10,5,1]:
        print '%4d (%6.2f%%)' % (cnt_faulty_progs_found[top_k][key], (100.0*cnt_faulty_progs_found[top_k][key]/cnt_faulty_progs[top_k][key])),
        print '%4d (%6.2f%%)' % (cnt_lines_found[top_k][key], (100.0*cnt_lines_found[top_k][key]/cnt_faulty_lines[top_k][key]) ),
    print

## Store programs with correct prediction for comparison with baseline techniques

In [None]:
print len(wrong_predictions), len(remaining_eval_programs)
print wrong_predictions[0], list(remaining_eval_programs)[0], type(remaining_eval_programs)
wrong_predictions_ = map(lambda (x,y):(x,rev_test_dict[y]), wrong_predictions)
np.save('data/TCNN_wrong_classifications', wrong_predictions_)
np.save('data/TCNN_correct_classifications', remaining_eval_programs)
np.save('data/test_wise_faulty_lines', all_faulty_lines)

## Repeat with clustering of baselines

In [None]:
# Cluster correct embeddings
no_of_clusters = 5
cluster_store = {}
for problem_id in correct_programs:
    X = deepcopy(correct_program_embeddings[problem_id])
    emb_shape = np.shape(X)
    flat_len = emb_shape[1] * emb_shape[2] * emb_shape[3]
    X = np.reshape(X, (emb_shape[0], flat_len))
    
    kmeans = KMeans(n_clusters=no_of_clusters)
    kmeans.fit(X)
    kmeans_labels = kmeans.labels_
    cluster_centers = kmeans.cluster_centers_
    kmeans_labels_to_index_map = {}
    for idx in range(len(kmeans_labels)):
        if kmeans_labels[idx] not in kmeans_labels_to_index_map:
            kmeans_labels_to_index_map[kmeans_labels[idx]] = set()
        kmeans_labels_to_index_map[kmeans_labels[idx]].add(idx)
    cluster_store[problem_id] = [kmeans, kmeans_labels, cluster_centers, kmeans_labels_to_index_map]

In [None]:
c_wrong_predictions = []
c_empty_baselines = []

c_all_faulty_lines = {}
c_faulty_lines_found = {10:{}, 5:{}, 1:{}}

c_pairs_localized = {10:set(), 5:set(), 1:set()}
c_pairs_missed = {10:set(), 5:set(), 1:set()}
c_programs_localized = {10:set(), 5:set(), 1:set()}
c_all_eval_programs = set()
max_k = 10

c_baseline_indices = []

c_todo_cnt = len(neg_program_ids)
c_done = 0
c_skipped = []

for idx in range(c_todo_cnt):
    neg_program_id = neg_program_ids[idx]
    neg_program = neg_programs[idx]
    neg_problem_id = neg_problem_ids[idx]
    neg_test_id = neg_test_ids[idx]
    neg_verdict = neg_verdicts[idx]
    neg_buggy_subtree = neg_buggy_subtrees[idx]
    neg_buggy_line_to_subtree = neg_buggy_line_to_subtrees[idx]

    if neg_program_id not in c_all_faulty_lines:
        c_all_faulty_lines[neg_program_id] = {}
    
    if neg_program_id in bug_to_line_map:
        try:
            c_all_faulty_lines[neg_program_id][neg_test_id] = deepcopy(bug_to_line_map[neg_program_id][rev_test_dict[neg_test_id]])
        except KeyError:
            c_skipped.append((neg_program_id, rev_problem_id_dict[neg_problem_id], rev_test_dict[neg_test_id]))
            print '#Skipped:', len(c_skipped), '\r',
            continue
    else:
        c_all_faulty_lines[neg_program_id][neg_test_id] = set(neg_buggy_line_to_subtree.keys())
        
    c_all_eval_programs.add(neg_program_id)
        
    for each in [10,5,1]:
        if neg_program_id not in c_faulty_lines_found[each]:
            c_faulty_lines_found[each][neg_program_id] = {neg_test_id:set()}
        else:
            assert neg_test_id not in c_faulty_lines_found[each][neg_program_id]
            c_faulty_lines_found[each][neg_program_id][neg_test_id] = set()
    
    assert len(c_all_faulty_lines[neg_program_id][neg_test_id])>0, neg_program_id
        

    example_x = [ np.array([neg_program]), np.array([[neg_problem_id]]), np.array([[neg_test_id]]) ]
    example_y = keras.utils.to_categorical([neg_verdict], num_classes=2)
    
    emb_inc_program = get_embedded_program(sess, emb_model, example_x[0], neg_program_id)

    top_label_id, score = top_label_id_and_score(emb_inc_program, example_x, predictions_and_gradients)
    correct_neg_prediction = np.equal(top_label_id, neg_verdict)

    if not correct_neg_prediction:
        c_wrong_predictions.append((neg_program_id,neg_test_id))
        continue
    
    user_id_to_exclude = get_ref_program(neg_program_id)[1]
    emb_corr_programs, emb_corr_program_ids = get_correct_embeddings_using_clustering(rev_problem_id_dict[neg_problem_id], user_id_to_exclude, emb_inc_program)
    baselines, baseline_ids = get_baselines(emb_inc_program, emb_corr_programs, emb_corr_program_ids, k=25)
    v_baseline, v_baseline_id, v_baseline_index = get_verified_baseline(baselines, baseline_ids, neg_problem_id, neg_test_id)
    c_baseline_indices.append(v_baseline_index)
    
    if v_baseline is None:
        c_empty_baselines.append((neg_program_id,neg_test_id))
        continue

    baseline_emb = top_baseline = v_baseline
    
    attributions, predictions = integrated_gradients(emb_inc_program, example_x, example_y, predictions_and_gradients, baseline_emb, steps=100)
    subtree_attributions = np.squeeze(np.mean(np.amax(attributions, axis=-1), axis=-1))  # np.squeeze()
    top_k_subtrees, top_k_subtree_vals = get_top_k(subtree_attributions, max_k)

    top_k_lines, top_k_line_vals, program_length = get_top_k_lines(subtree_attributions, neg_program_id, k=max_k)
    
    for top_k in [10,5,1]:
        c_some_line_found = False
        for line in c_all_faulty_lines[neg_program_id][neg_test_id]:
            # line level attribution
            if line in top_k_lines[:top_k]:
                c_faulty_lines_found[top_k][neg_program_id][neg_test_id].add(line)
                c_some_line_found = True
                           
        if c_some_line_found:
            c_pairs_localized[top_k].add((neg_program_id,neg_test_id))
            c_programs_localized[top_k].add((neg_program_id))
        else:
            c_pairs_missed[top_k].add((neg_program_id,neg_test_id))
        
    c_remaining = c_todo_cnt - idx - 1
    c_denom = idx+1
    
    if idx%5==0 and idx>0:
        for top_k in [10,5,1]:
            print '%d|F:%4d, M:%4d, A:%5.2f%%' % (top_k, len(c_pairs_localized[top_k]), len(c_pairs_missed[top_k]), 100.0*len(c_pairs_localized[top_k])/c_denom),
        print '|| CNT:%4d, WP:%3d, EB:%3d \r' % (c_remaining, len(c_wrong_predictions), len(c_empty_baselines)),
        
    c_done += 1
        
for top_k in [10,5,1]:
    print '%d|F:%4d, M:%4d, A:%5.2f%%' % (top_k, len(c_pairs_localized[top_k]), len(c_pairs_missed[top_k]), 100.0*len(c_pairs_localized[top_k])/c_denom),
print '|| CNT:%4d, WP:%3d, EB:%3d \r' % (c_remaining, len(c_wrong_predictions), len(c_empty_baselines))

print 'skipped:', len(c_skipped), '\n'

In [None]:
c_total_pairs = sum(map(len, [c_pairs_localized[top_k], c_pairs_missed[top_k], c_wrong_predictions]))
print 'total_pairs:', 10, 5, 1
print c_total_pairs, 
for top_k in [10,5,1]:
    print len(c_pairs_localized[top_k]), '%4.2f%%' % (100.0*len(c_pairs_localized[top_k])/c_total_pairs),

In [None]:
c_prog_faulty_lines = {}
for program_id in c_all_faulty_lines:
    if program_id not in c_prog_faulty_lines: c_prog_faulty_lines[program_id] = set()
    for test_id in c_all_faulty_lines[program_id]:
        c_prog_faulty_lines[program_id].update(c_all_faulty_lines[program_id][test_id])
        
c_cnt_faulty_lines = 0
for program_id in c_prog_faulty_lines:
    c_cnt_faulty_lines += len(c_prog_faulty_lines[program_id])

c_prog_faulty_lines_found = {10:{},5:{},1:{}}
# print c_cnt_faulty_lines,
for top_k in [10,5,1]:
    for program_id in c_faulty_lines_found[top_k]:
        if program_id not in c_prog_faulty_lines_found[top_k]: c_prog_faulty_lines_found[top_k][program_id] = set()
        for test_id in c_faulty_lines_found[top_k][program_id]:
            c_prog_faulty_lines_found[top_k][program_id].update(c_faulty_lines_found[top_k][program_id][test_id])
    
    c_cnt_lines_found = 0
    for program_id in c_prog_faulty_lines_found[top_k]:
        c_cnt_lines_found += len(c_prog_faulty_lines_found[top_k][program_id])

#     print c_cnt_lines_found, '%4.2f%%' % (100.0*c_cnt_lines_found/c_cnt_faulty_lines), 
# print

# for programs with more than 1 faulty lines
print '\n', 'Grouped by line diff counts:'
c_cnt_faulty_progs, c_cnt_faulty_progs_found = {}, {}
c_cnt_faulty_lines, c_cnt_lines_found = {}, {}
for top_k in [10,5,1]:
    c_cnt_faulty_progs[top_k], c_cnt_faulty_progs_found[top_k] = {'total':0}, {'total':0}
    c_cnt_faulty_lines[top_k], c_cnt_lines_found[top_k] = {'total':0}, {'total':0}
    for num_buggy_lines in range(1,6):
        c_cnt_faulty_progs[top_k][num_buggy_lines], c_cnt_faulty_progs_found[top_k][num_buggy_lines] = 0, 0
        c_cnt_faulty_lines[top_k][num_buggy_lines], c_cnt_lines_found[top_k][num_buggy_lines] = 0, 0
        for program_id in c_prog_faulty_lines:
            if len(c_prog_faulty_lines[program_id]) == num_buggy_lines:
                c_cnt_faulty_progs[top_k][num_buggy_lines] += 1
                c_cnt_faulty_lines[top_k][num_buggy_lines] += num_buggy_lines
                c_cnt_faulty_progs[top_k]['total'] += 1
                c_cnt_faulty_lines[top_k]['total'] += num_buggy_lines
                
                c_cnt_faulty_progs_found[top_k][num_buggy_lines] += 1 if len(c_prog_faulty_lines_found[top_k][program_id]) > 0 else 0
                c_cnt_lines_found[top_k][num_buggy_lines] += len(c_prog_faulty_lines_found[top_k][program_id])
                c_cnt_faulty_progs_found[top_k]['total'] += 1 if len(c_prog_faulty_lines_found[top_k][program_id]) > 0 else 0
                c_cnt_lines_found[top_k]['total'] += len(c_prog_faulty_lines_found[top_k][program_id])
                
print '\t\t\t\t top-10 \t\t top-5 \t\t top-1'
print 'diff|progs lines|', '%-22s|' % ('#10 PF \t #10 LF'), '%-22s|' % ('#5 PF \t #5 LF'), '%-22s|' % ('#1 PF \t #1 LF') 

for key in c_cnt_faulty_lines[10]:    
    if c_cnt_faulty_progs[10][key] == 0:
        continue
    print '{}\t'.format(key), '%4d %4d' % (c_cnt_faulty_progs[10][key], c_cnt_faulty_lines[10][key]),
    for top_k in [10,5,1]:
        print '%4d (%6.2f%%)' % (c_cnt_faulty_progs_found[top_k][key], (100.0*c_cnt_faulty_progs_found[top_k][key]/c_cnt_faulty_progs[top_k][key])),
        print '%4d (%6.2f%%)' % (c_cnt_lines_found[top_k][key], (100.0*c_cnt_lines_found[top_k][key]/c_cnt_faulty_lines[top_k][key]) ),
    print

print 
c_baseline_indices = list(sorted(c_baseline_indices))
for percentile in [50,60,70,80,90,95,98,99]:
    i = int( percentile * len(c_baseline_indices) / 100.0 )
    print 'percentile:', percentile, 'baseline_indices:',  c_baseline_indices[i]

### results with explicit wrong baselines

In [None]:
c_total_pairs = sum(map(len, [c_pairs_localized[top_k], c_pairs_missed[top_k], c_empty_baselines]))
print 'total_pairs', 10, 5, 1
print c_total_pairs,
for top_k in [10,5,1]:
    print len(c_pairs_localized[top_k]), '%4.2f%%' % (100.0*len(c_pairs_localized[top_k])/c_total_pairs),

In [None]:
c_prog_faulty_lines = {}
for program_id in c_all_faulty_lines:
    if program_id not in c_prog_faulty_lines: c_prog_faulty_lines[program_id] = set()
    for test_id in c_all_faulty_lines[program_id]:
        if (program_id, test_id) not in c_wrong_predictions:
            c_prog_faulty_lines[program_id].update(c_all_faulty_lines[program_id][test_id])
        
c_cnt_faulty_lines = 0
for program_id in c_prog_faulty_lines:
    c_cnt_faulty_lines += len(c_prog_faulty_lines[program_id])

c_prog_faulty_lines_found = {10:{},5:{},1:{}}
# print '\ntotal top-10 top-5 top-1'
# print c_cnt_faulty_lines,
for top_k in [10,5,1]:
    for program_id in c_faulty_lines_found[top_k]:
        if program_id not in c_prog_faulty_lines_found[top_k]: c_prog_faulty_lines_found[top_k][program_id] = set()
        for test_id in c_faulty_lines_found[top_k][program_id]:
            c_prog_faulty_lines_found[top_k][program_id].update(c_faulty_lines_found[top_k][program_id][test_id])
    
    c_cnt_lines_found = 0
    for program_id in c_prog_faulty_lines_found[top_k]:
        c_cnt_lines_found += len(c_prog_faulty_lines_found[top_k][program_id])

#     print c_cnt_lines_found, '(%5.2f%%)' % (100.0*c_cnt_lines_found/c_cnt_faulty_lines),
# print '\n'

print 'Grouped by line diff counts:'
c_cnt_faulty_progs, c_cnt_faulty_progs_found = {}, {}
c_cnt_faulty_lines, c_cnt_lines_found = {}, {}
for top_k in [10,5,1]:
    c_cnt_faulty_progs[top_k], c_cnt_faulty_progs_found[top_k] = {'total':0}, {'total':0}
    c_cnt_faulty_lines[top_k], c_cnt_lines_found[top_k] = {'total':0}, {'total':0}
    for num_buggy_lines in range(1,6):
        c_cnt_faulty_progs[top_k][num_buggy_lines], c_cnt_faulty_progs_found[top_k][num_buggy_lines] = 0, 0
        c_cnt_faulty_lines[top_k][num_buggy_lines], c_cnt_lines_found[top_k][num_buggy_lines] = 0, 0
        for program_id in c_prog_faulty_lines:
            if len(c_prog_faulty_lines[program_id]) == num_buggy_lines:
                c_cnt_faulty_progs[top_k][num_buggy_lines] += 1
                c_cnt_faulty_lines[top_k][num_buggy_lines] += num_buggy_lines
                c_cnt_faulty_progs[top_k]['total'] += 1
                c_cnt_faulty_lines[top_k]['total'] += num_buggy_lines
                
                c_cnt_faulty_progs_found[top_k][num_buggy_lines] += 1 if len(c_prog_faulty_lines_found[top_k][program_id]) > 0 else 0
                c_cnt_lines_found[top_k][num_buggy_lines] += len(c_prog_faulty_lines_found[top_k][program_id])
                c_cnt_faulty_progs_found[top_k]['total'] += 1 if len(c_prog_faulty_lines_found[top_k][program_id]) > 0 else 0
                c_cnt_lines_found[top_k]['total'] += len(c_prog_faulty_lines_found[top_k][program_id])
                
print '\t\t\t\t top-10 \t\t top-5 \t\t top-1'
print 'diff|progs lines|', '%-22s|' % ('#10 PF \t #10 LF'), '%-22s|' % ('#5 PF \t #5 LF'), '%-22s|' % ('#1 PF \t #1 LF') 

for key in c_cnt_faulty_lines[10]:    
    if c_cnt_faulty_progs[10][key] == 0:
        continue
    print '{}\t'.format(key), '%4d %4d' % (c_cnt_faulty_progs[10][key], c_cnt_faulty_lines[10][key]),
    for top_k in [10,5,1]:
        print '%4d (%6.2f%%)' % (c_cnt_faulty_progs_found[top_k][key], (100.0*c_cnt_faulty_progs_found[top_k][key]/c_cnt_faulty_progs[top_k][key])),
        print '%4d (%6.2f%%)' % (c_cnt_lines_found[top_k][key], (100.0*c_cnt_lines_found[top_k][key]/c_cnt_faulty_lines[top_k][key]) ),
    print