In [11]:
import os, sys, time, json, sqlite3, re, difflib, numpy as np, cPickle as cp, glob, random, itertools
from collections import OrderedDict
import subprocess32 as subprocess
from functools import partial
np.random.seed(1307)
sys.path.append('..')
from pycparser import parse_file, c_ast

from util.helpers import get_rev_dict, remove_non_ascii, make_dir_if_not_exists as mkdir
from util.helpers import get_curr_time_string, tokens_to_source, clang_format_from_source as cf
from util.helpers import isolate_line, fetch_line, extract_line_number, get_lines, recompose_program
from util.ast_helpers import get_subtree_list, get_linearized_ast, get_ast
from concurrent.futures import ThreadPoolExecutor
from pprint import pprint
from util.c_tokenizer import C_Tokenizer
tokenize = C_Tokenizer().tokenize

In [12]:
db_path = '../data/dataset.db'
with sqlite3.connect(db_path) as conn:
    c = conn.cursor()

In [13]:
# Load eval set
eval_set = np.load(os.path.join('../data/', 'eval_set.npy')).item()

eval_dict = {}
for problem_id in eval_set:
    for program_id, row in eval_set[problem_id].items():
        eval_dict[program_id] = row
eval_set_program_ids = eval_dict.keys()

TCNN_correct_classifications = np.load('../data/TCNN_correct_classifications.npy').item()
test_wise_faulty_lines = np.load(os.path.join('../data/', 'test_wise_faulty_lines.npy')).item()

prog_faulty_lines = {}
for program_id in test_wise_faulty_lines:
    if program_id not in prog_faulty_lines: prog_faulty_lines[program_id] = set()
    for test_id in test_wise_faulty_lines[program_id]:
        prog_faulty_lines[program_id].update(test_wise_faulty_lines[program_id][test_id])

In [14]:
def get_id_map(ast, program_id=None):
    '''shuffles ids before assigning them indices using 
    program_id as randomness seed if program_id is not None'''
    
    ids = []
    for subtree, coord in ast:
        for node in subtree:
            if '_<id>_' in node and '@' in node:
                org_id = node.split('_<id>_')[1].split('@')[0]
                if org_id not in ids:
                    ids.append(org_id)
                    
    if program_id is not None:
        random.seed(program_id)
        random.shuffle(ids)

    id_map = {}
    for id_ in ids:
        id_map[id_] = len(id_map)
    return id_map

def normalize_ids(ast, id_map):
    new_ast = []
    for subtree, coord in ast:
        new_subtree = []
        for node in subtree:
            if '_<id>_' in node and '@' in node:
                org_id = node.split('_<id>_')[1].split('@')[0]
                new_subtree.append(node.replace('_<id>_' + org_id + '@', '_<id>_' + str(id_map[org_id]) + '@'))
            else:
                new_subtree.append(node)
        assert len(new_subtree) == len(subtree)
        new_ast.append((new_subtree, coord))
    return new_ast

def size_n_vocab_match(subtree_list_ast):
    global tl_dict, max_subtrees_per_program, max_nodes_per_subtree
    
    if len(subtree_list_ast) > max_subtrees_per_program:
        return False
    
    vec_ast = []
    
    for subtree, coord in subtree_list_ast:
        vec_subtree = []
        for token in subtree:
            try:
                vec_subtree.append(tl_dict[token])
            except KeyError:
                return None

        if len(vec_subtree) > max_nodes_per_subtree:
            return False

    return True

In [15]:
problem_ids = [str(row[0]) for row in c.execute('SELECT DISTINCT problem_id FROM orgsource;')]

query='''SELECT p.program_id, program, user_id, trs.verdict FROM
        programs p INNER JOIN orgsource o ON o.program_id = p.program_id
        INNER JOIN test_run_summary trs ON trs.program_id = p.program_id
        WHERE trs.verdict<>"ALL_FAIL" AND problem_id=?;'''

data_dict = {0:{}, 1:{}}
counts = {'correct':{'prob':0, 'user':0, 'prog':0}, 'incorrect':{'prob':0, 'user':0, 'prog':0}}

TCNN_correct_classifications_counter = 0

for problem_id in problem_ids:
    for verdict in [0,1]:
        data_dict[verdict][problem_id] = {}

    for row in c.execute(query, (problem_id,)):
        program_id, program, user_id, verdict = row
        program = program.encode('utf-8','ignore')
        verdict= 1 if verdict == 'ALL_PASS' else 0

        if verdict == 0: 
            if program_id in TCNN_correct_classifications:
                TCNN_correct_classifications_counter += 1
            else:
                continue
        

        if user_id not in data_dict[verdict][problem_id]:
            data_dict[verdict][problem_id][user_id] = []
        data_dict[verdict][problem_id][user_id] += [(program_id, program)]

## Do diff

In [6]:
def prepend_line_no(prog):
    lines=prog.split('\n')
    lines = ['[%2d] %s' % (idx+1, line) for idx, line in enumerate(lines)]
    return '\n'.join(lines)

def remove_all_white_space(line):
    return ''.join(line.split())

def normalize_brackets(program):
    program = program.replace('\r', '\n')
    lines = [line for line in program.split('\n') if len(line.strip()) > 0]
    
    if len(lines) == 1:
        raise ValueError()

    for i in range(len(lines)-1, -1, -1):
        line = lines[i]
        wsr_line = remove_all_white_space(line)
        if wsr_line == '}' or wsr_line == '}}' or wsr_line == '}}}' or wsr_line == '};' \
        or wsr_line == '}}}}' or wsr_line == '}}}}}' or wsr_line == '{' or wsr_line == '{{':
            if i > 0:
                lines[i-1] += ' ' + line.strip()
                lines[i]    = ''
            else:
                # can't handle this case!
                raise ValueError()

    # Remove empty lines
    for i in range(len(lines)-1, -1, -1):
        if lines[i] == '':
            del lines[i]

    for line in lines:
        assert(lines[i].strip() != '')

    return '\n'.join(lines)

def remove_empty_lines(lines):
    return [line for line in lines if len(line.strip()) > 0]

def do_diff(file1_data, file2_data, file1_name='INC', file2_name='REF'):
    file1_data, file2_data = map(normalize_brackets, [file1_data, file2_data])
    file1_lines = remove_empty_lines(file1_data.replace('\r', '\n').split('\n'))
    file2_lines = remove_empty_lines(file2_data.replace('\r', '\n').split('\n'))
    
    diff_generator = difflib.unified_diff(file1_lines, file2_lines, file1_name, file2_name, n=0)    
    output = []
    for line in diff_generator:
        output.append(line)
    output = '\n'.join(output)
    return output

def get_diff_len(diff_out):
    diff_lines = [line for line in diff_out.split('\n') if len(line.strip())>0 and \
                    not line.startswith('+++') and not line.startswith('---') and \
                    not line.startswith('@') and \
                    (line.startswith('+') or line.startswith('-'))]
    # ignore white space diffs: having only + or - diff indicators
    diff_lines = [line for line in diff_lines if len(line.strip())>1]
    return len(diff_lines)

def get_localization_info(diff_output):
    diff_lines = [line for line in diff_output.split('\n') if len(line.strip())>0 and \
                    not line.startswith('+++') and not line.startswith('---') and \
                    (line.startswith('+') or line.startswith('-') or line.startswith('@@'))]
    
    edit_locations = []
    
    for line in diff_lines:
        if line.startswith('@@'):                
            more = 0
            inc_file_line_token = line.strip().split()[1]
            if ',' in inc_file_line_token:
                line_no, offset = map(int, inc_file_line_token.split(','))
            else:
                line_no, offset = int(inc_file_line_token), 1

            if line_no < 0:
                edit_locations.append(abs(line_no))
                    
        elif line.startswith('-'):
            if len(line.strip()) > 1:
                if (abs(line_no)+more) not in edit_locations:
                    edit_locations.append(abs(line_no)+more)
            else:
                assert abs(line_no) > 2, abs(line_no)
                if (abs(line_no)-1) not in edit_locations:
                    edit_locations.append(abs(line_no)-1)
            more += 1
            assert more <= offset, 'more:%d, offset:%d' % (more, offset)

    return edit_locations


def get_prog(program_id, flag_prepend_line_no=True, clean_up=False):
    query='''SELECT program FROM orgsource WHERE program_id=?;'''

    with sqlite3.connect(db_path) as conn:
        c = conn.cursor()
        for row in c.execute(query, (program_id, )):
            program = row[0].encode('utf-8', 'ignore')
    
            if clean_up:
                program = normalize_brackets(program)
                program_lines = remove_empty_lines(program.replace('\r', '\n').split('\n'))
                program = '\n'.join(program_lines)

            if flag_prepend_line_no:
                return prepend_line_no(program)
            else:
                return program
                

### Diff with submissions from different students

In [7]:
inc_data = {}
corr_data = {}

inc_data_len = 0
corr_data_len = 0

for problem_id in data_dict[0].keys():
    inc_data[problem_id] = []
    for user in data_dict[0][problem_id].keys():
        for program_id, program in data_dict[0][problem_id][user]:
            if program_id in eval_set_program_ids:
                inc_data[problem_id].append( (user, program_id, program) )
                inc_data_len += 1
print 'inc_data_len', inc_data_len

for problem_id in data_dict[1].keys():
    corr_data[problem_id] = []
    for user in data_dict[1][problem_id].keys():
        for program_id, program in data_dict[1][problem_id][user]:
            corr_data[problem_id].append( (user, program_id, program) )
            corr_data_len += 1
print 'corr_data_len', corr_data_len

diff_lens = {}
diff_outputs = {}
min_diff = {}

for problem_id in inc_data.keys():
    diff_outputs[problem_id] = {}
    for (i_user, i_program_id, i_program), (c_user, c_program_id, c_program) in \
    itertools.product(inc_data[problem_id], corr_data[problem_id]):
        if i_user != c_user:
            diff_out = do_diff(i_program, c_program, i_program_id, c_program_id)
            if user not in diff_outputs[problem_id]:
                diff_outputs[problem_id][user] = []
            diff_outputs[problem_id][user] += [(i_program, c_program, diff_out)]
            diff_len = get_diff_len(diff_out)
            if i_program_id in diff_lens:
                if diff_len < diff_lens[i_program_id]:
                    diff_lens[i_program_id] = diff_len
                    min_diff[i_program_id] = (c_program_id, diff_out, diff_len)
            else:
                diff_lens[i_program_id] = diff_len
                min_diff[i_program_id] = (c_program_id, diff_out, diff_len)

inc_data_len 1449
corr_data_len 8192


In [8]:
def localize_bugs(buggy_lines, predictions):
    set_buggy_lines = set(buggy_lines)
    set_predictions = set(predictions)
    lines_found = len(set_buggy_lines & set_predictions)
    return lines_found

In [9]:
total_programs_found, total_programs = {1:0,5:0,10:0}, 0
total_lines_found, total_lines = {1:0,5:0,10:0}, 0

top_k = [1,5,10]

for i_program_id in min_diff:
    c_program_id, diff_out, diff_len = min_diff[i_program_id]
    edit_locations = get_localization_info(diff_out)
    lines_found = localize_bugs(prog_faulty_lines[i_program_id], edit_locations)
    if lines_found > 0:
        for k in top_k:
            if len(edit_locations) <= k:
                total_programs_found[k] += 1
                total_lines_found[k] += (lines_found)
    total_programs += 1
    total_lines += len(eval_dict[i_program_id][3])
    
print 'total_lines:', total_lines, 'total_programs:', total_programs, '\n'
    
for k in top_k:
    print 'k:%2d |' % k,
    print 'lines_found: %3d (%5.2f%%) |' % (total_lines_found[k], (100.0*total_lines_found[k]/total_lines)),
    print 'programs_found: %3d (%5.2f%%)' % (total_programs_found[k], (100.0*total_programs_found[k]/total_programs))

total_lines: 2496 total_programs: 1449 

k: 1 | lines_found:   0 ( 0.00%) | programs_found:   0 ( 0.00%)
k: 5 | lines_found: 136 ( 5.45%) | programs_found: 116 ( 8.01%)
k:10 | lines_found: 801 (32.09%) | programs_found: 600 (41.41%)


In [10]:
c.close()
conn.close()