In [1]:
import os, sys, time, json, sqlite3, re, difflib, numpy, cPickle as cp, glob, random, itertools, glob
from copy import deepcopy
import operator
from math import sqrt
import subprocess32 as subprocess
sys.path.append('..')
from util.helpers import make_dir_if_not_exists as mkdir

Using TensorFlow backend.


In [2]:
db_path = '../data/dataset.db'
with sqlite3.connect(db_path) as conn:
    cursor = c = conn.cursor()

In [3]:
# Load eval set
eval_set = numpy.load('../data/eval_set.npy').item()
bug_to_line_map = numpy.load('../data/bug_to_line_map.npy').item()

eval_dict = {}
for problem_id in eval_set:
    for program_id, row in eval_set[problem_id].items():
        eval_dict[program_id] = row

eval_set_program_ids = set(eval_dict.keys())
print '#programs:', len(eval_set_program_ids)

TCNN_correct_classifications = numpy.load('../data/TCNN_correct_classifications.npy').item()

eval_dict = {}
for problem_id in eval_set:
    for program_id, row in eval_set[problem_id].items():
            eval_dict[program_id] = set(row[3])

TCNN_wrong_classifications_list = numpy.load('../data/TCNN_wrong_classifications.npy')

TCNN_wrong_classifications = {}
for program_id, test_id in TCNN_wrong_classifications_list:
    program_id = int(program_id)
    if program_id not in TCNN_wrong_classifications:
        TCNN_wrong_classifications[program_id] = set()
    TCNN_wrong_classifications[program_id].add(test_id)

#programs: 2167


In [4]:
# program_id, buggy location
for key, value in eval_dict.items()[:10]:
    print key, '-', value, '|',

1048576 - set([10]) | 1048577 - set([8]) | 1064963 - set([24, 23, 22, 15]) | 1064961 - set([14]) | 1048585 - set([5, 6]) | 1048586 - set([10, 5]) | 1024011 - set([4, 6]) | 1064984 - set([6, 7]) | 1024025 - set([20, 21]) | 1024029 - set([8, 5]) |


In [5]:
def remove_empty_lines(lines):
    return [line for line in lines if len(line.strip()) > 0]

def prepend_line_no(prog):
    lines=prog.split('\n')
    lines = ['[%2d] %s' % (idx+1, line) for idx, line in enumerate(lines)]
    return '\n'.join(lines)

def remove_all_white_space(line):
    return ''.join(line.split())

def normalize_brackets(program):
    program = program.replace('\r', '\n')
    lines = [line for line in program.split('\n') if len(line.strip()) > 0]
    
    if len(lines) == 1:
        raise ValueError()

    for i in range(len(lines)-1, -1, -1):
        line = lines[i]
        wsr_line = remove_all_white_space(line)
        if wsr_line == '}' or wsr_line == '}}' or wsr_line == '}}}' or wsr_line == '};' \
        or wsr_line == '}}}}' or wsr_line == '}}}}}' or wsr_line == '{' or wsr_line == '{{':
            if i > 0:
                lines[i-1] += ' ' + line.strip()
                lines[i]    = ''
            else:
                # can't handle this case!
                raise ValueError()
                return ''

    # Remove empty lines
    for i in range(len(lines)-1, -1, -1):
        if lines[i] == '':
            del lines[i]

    for line in lines:
        assert(lines[i].strip() != '')

    return '\n'.join(lines)

def get_program(program_id, flag_prepend_line_no=False, clean_up=False):
    query='''SELECT program, problem_id FROM orgsource WHERE program_id=?;'''
    global cursor

    for row in cursor.execute(query, (program_id, )):
        program = row[0].encode('utf-8', 'ignore')
        problem_id = row[1]

        if clean_up:
            program = normalize_brackets(program)
            program_lines = remove_empty_lines(program.replace('\r', '\n').split('\n'))
            program = '\n'.join(program_lines)

        if flag_prepend_line_no:
            program = prepend_line_no(program)

        return program, str(problem_id)
    
def get_program_test_info(program_id):
    query='''SELECT problem_id, t.test_id, t.verdict FROM orgsource o INNER JOIN test_runs t 
            ON o.program_id=t.program_id WHERE o.program_id=?;'''

    global cursor

    rows = []
    for row in cursor.execute(query, (program_id, )):
        problem_id, test_id, verdict = row
        rows.append((problem_id, test_id, verdict))
    return rows
    
def get_test_count(problem_id):
    query = '''SELECT test_cnt FROM problems WHERE problem_id=?'''
    global cursor
    for row in cursor.execute(query, (problem_id, )):
        return row[0]

In [6]:
output_dir = '../data/eval-programs/'
mkdir(output_dir)

In [7]:
# Write programs to a directory
for program_id in TCNN_correct_classifications:
    program, problem_id = get_program(program_id, clean_up=True)
    if not os.path.exists(os.path.join(output_dir, problem_id)):
        mkdir(os.path.join(output_dir, problem_id))
    with open(os.path.join(output_dir, problem_id, '{}.c'.format(program_id)), 'w') as f:
        f.write(program)

## now run the following two commands in the ../data directory and wait for script to finish



In [8]:
#1 find tests -name '*.gc*' -delete
#2 ./get_coverage.sh eval-programs
#  NOTE: Make sure that there is no leading / after `./get_coverage.sh eval-programs`

## read gcov files

In [8]:
def get_coverage(gcov_file):
    
    def process_line(line):
        tag, line_no, code = line.strip().split(':', 2)
        return tag.strip(), int(line_no.strip()), code
    
    coverage = {}
        
    for idx, line in enumerate(gcov_file.split('\n')):
        if idx <= 4 or len(line.strip()) == 0:
            continue
            
        try:
            tag, line_no, code = process_line(line)
        except:
            print 'idx:', idx, 'line:', line
            print line.strip().split(':', 2)
            raise
        assert idx!=5 or line_no==1, gcov_file
        
        if tag == '-':
            continue
        elif tag == '#####':
            coverage[line_no] = 0
        else:  
            tag = int(tag) 
            coverage[line_no] = 1
            
    return coverage

In [9]:
# gcov doesn't run on programs which don't handle a run time exception!

gcov_files, all_coverage = {}, {}
done = 0
for problem_id in os.listdir(output_dir):
    gcov_files[problem_id] = {}
    all_coverage[problem_id] = {}
    for gcov_file in glob.glob(os.path.join('../data/tests/', problem_id, '*.gcov')):
        with open(gcov_file, 'r') as f:
            filename = gcov_file.split('/')[-1]
            test_name, filename = filename.split('-')
            filename = int(filename.split('.')[0])
            if filename not in gcov_files[problem_id]:
                gcov_files[problem_id][filename] = {}
                all_coverage[problem_id][filename] = {}
            coverage_file = f.read()
            gcov_files[problem_id][filename][test_name] = coverage_file
            all_coverage[problem_id][filename][test_name] = get_coverage(coverage_file)
            done += 1
            
print 'files', done, 'programs:', sum([len(gcov_files[problem_id]) for problem_id in gcov_files])

files 11642 programs: 1449


In [10]:
# collect test execution information    
test_execution_details = {}
for problem_id in all_coverage:
    test_execution_details[problem_id] = {}
    for program_id in all_coverage[problem_id]:
        test_execution_details[problem_id][program_id] = {}
        for _, test_id, verdict in get_program_test_info(program_id):
            test_execution_details[problem_id][program_id][test_id] = verdict
    
test_counts = {}
for problem_id in test_execution_details:
    test_counts[problem_id] = get_test_count(problem_id)

In [11]:
# eval_set_dict
program_ids_with_coverage = set()

for problem_id in all_coverage.keys():
    for program_id in all_coverage[problem_id]:
        program_ids_with_coverage.add(program_id)

In [12]:
# select all passing test_ids per program
passing_tests = {}
for problem_id in test_execution_details:
    for program_id in test_execution_details[problem_id]:
        for test_id in test_execution_details[problem_id][program_id]:
            if test_execution_details[problem_id][program_id][test_id]:
                if program_id not in passing_tests:
                    passing_tests[program_id] = []
                passing_tests[program_id].append(test_id)

In [13]:
selected_passing_tests = {}
for program_id in passing_tests:
    random.seed(program_id)
    selected_passing_tests[program_id] = random.choice(passing_tests[program_id])
    
print len(selected_passing_tests)

1449


In [14]:
def get_scores(af, ap, nf, np):
    tarantula, ochiai = {}, {}
    for key in af:
        afv, apv, nfv, npv = af[key], ap[key], nf[key], np[key]
        try:
            tarantula[key] = (afv/(afv+nfv)) / ( (afv/(afv+nfv)) + (apv/(apv+npv)) )
        except: 
            tarantula[key] = 0
        
        try:
            ochiai[key] = afv / sqrt( (afv + nfv) * (afv + apv) )
        except:
            ochiai[key] = 0
            
    sorted_tarantula = sorted(tarantula.items(), key=operator.itemgetter(1), reverse=True)
    sorted_ochiai = sorted(ochiai.items(), key=operator.itemgetter(1), reverse=True)    
    return map(lambda (x,y):x, sorted_tarantula), map(lambda (x,y):x, sorted_ochiai)

## Using one passing test

In [15]:
all_faulty_lines = {}
faulty_lines_found = { 'tarantula':{10:{}, 5:{}, 1:{}}, 'ochiai':{10:{}, 5:{}, 1:{}} }

pairs_localized = { 'tarantula':{10:set(), 5:set(), 1:set()}, 'ochiai':{10:set(), 5:set(), 1:set()} }
pairs_missed = { 'tarantula':{10:set(), 5:set(), 1:set()}, 'ochiai':{10:set(), 5:set(), 1:set()} }
programs_localized = { 'tarantula':{10:set(), 5:set(), 1:set()}, 'ochiai':{10:set(), 5:set(), 1:set()} }

skipped = []
continue_1 = 0

all_eval_programs = set()
coverage_not_found = set()

for problem_id in all_coverage.keys():
    
    for program_id in all_coverage[problem_id]:
        
        if program_id not in all_faulty_lines:
            all_faulty_lines[program_id] = {}
        
        for test_id in all_coverage[problem_id][program_id]:
            
            # passing test, continue
            if test_execution_details[problem_id][program_id][test_id]:  
                continue
                
            # TCNN made a wrong prediction, continue
            if program_id in TCNN_wrong_classifications and '{}{}'.format(problem_id,test_id) in TCNN_wrong_classifications[program_id]:
                continue
                    
            if program_id in bug_to_line_map:
                try:
                    all_faulty_lines[program_id][test_id] = deepcopy(bug_to_line_map[program_id]['{}{}'.format(problem_id,test_id)])
                except KeyError:
                    continue
            else:
                try:
                    all_faulty_lines[program_id][test_id] = eval_dict[program_id]
                except:
                    continue
                
            assert len(all_faulty_lines[program_id][test_id])>0, program_id
            
            all_eval_programs.add(program_id)
                
            for tech in ['tarantula', 'ochiai']:
                for each in [10,5,1]:
                    if program_id not in faulty_lines_found[tech][each]:
                        faulty_lines_found[tech][each][program_id] = {test_id:set()}
                    else:
                        assert test_id not in faulty_lines_found[tech][each][program_id]
                        faulty_lines_found[tech][each][program_id][test_id] = set()

            try:
                failing_coverage = all_coverage[problem_id][program_id][test_id]
                passing_test_id = selected_passing_tests[program_id]
                passing_coverage = all_coverage[problem_id][program_id][passing_test_id]
            except:
                continue
            
            af, ap, nf, np = {}, {}, {}, {}

            for coverage, verdict in zip([failing_coverage, passing_coverage],[0,1]):
                for key, appearance in coverage.items():
                    appearance = float(appearance)
                    af[key] = abs(appearance * (verdict-1)) if key not in af else af[key] + abs(appearance * (verdict-1))
                    ap[key] = abs(appearance * verdict) if key not in ap else ap[key] + abs(appearance * verdict)
                    nf[key] = abs((appearance-1) * (verdict-1)) if key not in nf else nf[key] + abs(appearance * (verdict-1))
                    np[key] = abs((appearance-1) * verdict) if key not in np else np[key] + abs(appearance * verdict)
                    
            sorted_tarantula, sorted_ochiai = get_scores(af, ap, nf, np)
            for tech, top_k_lines in zip(['tarantula', 'ochiai'], [sorted_tarantula, sorted_ochiai]):

                for top_k in [10,5,1]:
                    some_line_found = False
                    for line in all_faulty_lines[program_id][test_id]:
                        if line in top_k_lines[:top_k]:
                            faulty_lines_found[tech][top_k][program_id][test_id].add(line)
                            some_line_found = True

                    if some_line_found:
                        pairs_localized[tech][top_k].add((program_id, test_id))
                        programs_localized[tech][top_k].add((program_id))
                    else:
                        pairs_missed[tech][top_k].add((program_id, test_id))

In [16]:
print '#all_programs:', len(TCNN_correct_classifications), 'programs localized:'
for tech in ['tarantula', 'ochiai']:
    print tech.upper()
    for top_k in [10,5,1]:
        print '%2d' % top_k, '%4d' % len(programs_localized[tech][top_k]), '%4.2f%%' % (100.0*len(programs_localized[tech][top_k])/len(TCNN_correct_classifications))
    print

#all_programs: 1449 programs localized:
TARANTULA
10  958 66.11%
 5  461 31.82%
 1    6 0.41%

OCHIAI
10 1137 78.47%
 5  811 55.97%
 1  217 14.98%



## Using all passing tests

In [17]:
all_faulty_lines = {}
faulty_lines_found = { 'tarantula':{10:{}, 5:{}, 1:{}}, 'ochiai':{10:{}, 5:{}, 1:{}} }

pairs_localized = { 'tarantula':{10:set(), 5:set(), 1:set()}, 'ochiai':{10:set(), 5:set(), 1:set()} }
pairs_missed = { 'tarantula':{10:set(), 5:set(), 1:set()}, 'ochiai':{10:set(), 5:set(), 1:set()} }
programs_localized = { 'tarantula':{10:set(), 5:set(), 1:set()}, 'ochiai':{10:set(), 5:set(), 1:set()} }

skipped = []

all_eval_programs = set()
coverage_not_found = set()

for problem_id in all_coverage.keys():
    for program_id in all_coverage[problem_id]:
        
        if program_id not in all_faulty_lines:
            all_faulty_lines[program_id] = {}
        
        for test_id in all_coverage[problem_id][program_id]:
            
            if test_execution_details[problem_id][program_id][test_id]:
                continue
                
            if program_id in TCNN_wrong_classifications and '{}{}'.format(problem_id,test_id) in TCNN_wrong_classifications[program_id]:
                continue
                    
            if program_id in bug_to_line_map:
                try:
                    all_faulty_lines[program_id][test_id] = deepcopy(bug_to_line_map[program_id]['{}{}'.format(problem_id,test_id)])
                except KeyError:
                    continue
            else:
                all_faulty_lines[program_id][test_id] = eval_dict[program_id]
                
            assert len(all_faulty_lines[program_id][test_id])>0, program_id
            
            all_eval_programs.add(program_id)
                
            for tech in ['tarantula', 'ochiai']:
                for each in [10,5,1]:
                    if program_id not in faulty_lines_found[tech][each]:
                        faulty_lines_found[tech][each][program_id] = {test_id:set()}
                    else:
                        assert test_id not in faulty_lines_found[tech][each][program_id]
                        faulty_lines_found[tech][each][program_id][test_id] = set()

            af, ap, nf, np = {}, {}, {}, {}
            for tid in all_coverage[problem_id][program_id]:
                verdict = test_execution_details[problem_id][program_id][tid]
                coverage = all_coverage[problem_id][program_id][tid]
                if not (verdict or tid == test_id):
                    continue

                for key, appearance in coverage.items():
                    appearance = float(appearance)
                    af[key] = abs(appearance * (verdict-1)) if key not in af else af[key] + abs(appearance * (verdict-1))
                    ap[key] = abs(appearance * verdict) if key not in ap else ap[key] + abs(appearance * verdict)
                    nf[key] = abs((appearance-1) * (verdict-1)) if key not in nf else nf[key] + abs(appearance * (verdict-1))
                    np[key] = abs((appearance-1) * verdict) if key not in np else np[key] + abs(appearance * verdict)

            sorted_tarantula, sorted_ochiai = get_scores(af, ap, nf, np)


            for tech, top_k_lines in zip(['tarantula', 'ochiai'], [sorted_tarantula, sorted_ochiai]):

                for top_k in [10,5,1]:
                    some_line_found = False
                    for line in all_faulty_lines[program_id][test_id]:
                        if line in top_k_lines[:top_k]:
                            faulty_lines_found[tech][top_k][program_id][test_id].add(line)
                            some_line_found = True

                    if some_line_found:
                        pairs_localized[tech][top_k].add((program_id, test_id))
                        programs_localized[tech][top_k].add((program_id))
                    else:
                        pairs_missed[tech][top_k].add((program_id, test_id))

In [18]:
print '#all_programs:', len(TCNN_correct_classifications), 'programs localized:'
for tech in ['tarantula', 'ochiai']:
    print tech.upper()
    for top_k in [10,5,1]:
        print '%2d' % top_k, '%4d' % len(programs_localized[tech][top_k]), '%4.2f%%' % (100.0*len(programs_localized[tech][top_k])/len(TCNN_correct_classifications))
    print

#all_programs: 1449 programs localized:
TARANTULA
10 1151 79.43%
 5  809 55.83%
 1  346 23.88%

OCHIAI
10 1151 79.43%
 5  835 57.63%
 1  385 26.57%



In [19]:
c.close()
conn.close()