In [1]:
import sh
import json
import os
import sys
import re
import io
import pandas as pd

In [2]:
project_path = os.getcwd()

In [3]:
def print_progress(i, l, step=1):
    if i % step == 0:
            print("\r>> {}/{}".format(i + 1, l), end="")
            sys.stdout.flush()

In [4]:
if not os.path.exists(os.getcwd() + '/camel'):
    print("Path doesn\'t exists, cloning repo:")
    sh.git.clone("https://github.com/apache/camel")
else:
    print("Repository exists")

Repository exists


In [5]:
git = sh.git.bake(_cwd='camel')
git.checkout("trunk")
git.status()

On branch trunk
Your branch is up-to-date with 'origin/trunk'.


It took 2.50 seconds to enumerate untracked files. 'status -uno'
may speed it up, but you have to be careful not to forget to add
new files yourself (see 'git help status').
nothing to commit, working directory clean

In [6]:
def extract_bug(file):
    if not file.endswith('.json'):
            return None
    else:
        bug_json_string = open("issue_CAMEL/" + file).read()
        bug = json.loads(bug_json_string)
        bug_fields = bug.get('fields')
        
        if bug_fields['issuetype']['name'] != 'Bug':
            return None
   
        if bug_fields['resolution'] == None:
            return None 
                
        if bug_fields['resolution']['name'] != 'Fixed':
            return None
    return bug

In [7]:
#Extracting all the fixed bugs from the bug repo
bugs = dict() #Dictionary of bugs indexed by LUCENE+ID
dir_list = os.listdir(os.getcwd() + "/issue_CAMEL")
l = len(dir_list)
print("Progress:")
for idx, file in enumerate(dir_list):
    print_progress(idx,l,10)
    bug = extract_bug(file)
    if(bug == None):
        continue
    bugs[bug['key'].upper()] = bug

Progress:
>> 9001/9002

In [8]:
commits = []

log = git.log("--reverse", "--pretty=format:%H")
for c_hash in log:
    if(c_hash.endswith('\n')):
        c_hash = c_hash[:-1]
    commits.append(c_hash)
len(commits)

13992

In [9]:
def get_fixed_bugs(commit_hash):
    details = git.show("--name-only","--pretty=format:%B", commit_hash)
    #init = details.split(':')[0].upper()
    f_keys = []
    
    keys = re.findall("CAMEL-[0-9]+",str(details))
    if len(keys) != 0:
        for k in keys:
            if bugs.get(k) != None:
                #Then it fixes a bug
                f_keys.append(k)
    return f_keys

In [10]:
#Splits the bug fixing ones from the others

fix_commits = [] #List that will be filled with couples (commit, bugs-fixed)
non_fix_commits = [] #List that will be filled only with commit hashes
l = len(commits)
print("Progress:")
for idx, c in enumerate(commits):
    
    print_progress(idx, l, 10)
        
    details = git.show("--name-only","--pretty=format:%B", c)
    files = git.show("--name-only","--pretty=format:",c)
    
    
    f_keys = get_fixed_bugs(c)    
    if len(f_keys) != 0:
        fix_commits.append((c,f_keys))
    else:
        non_fix_commits.append(c)
        
#Some commits cite more than one issue...which lines are the bug fix, if one of the two is only an improvement ?
#HP: we consider all the removed lines as implicated (fix-inducing)

Progress:
>> 13991/13992

In [11]:
print(len(fix_commits)) #Number of LUCENE bug fixing commits
print(len(non_fix_commits)) #Number of LUCENE non bug fixing commits
print(len(commits)) #Number of LUCENE + SOLR commits
#We can see that on average half of the commits were only solr related, so we discarded them

2198
11794
13992


In [12]:
def get_bug_commits(commit_hash, line_num, bug_hash):    
    git.checkout(str(commit_hash + "^"))

    # Go through all the files that have been changed
    for key_file_name in line_num:
        ## Check if file is a java file if not continue with next file
        if(key_file_name.find('.java') == -1):
            continue
            
        blame_output = git.blame("-l",key_file_name)
        blame_output_split = blame_output.split('\n')
        
        # Get hash for all the lines that have been changed.
        for i in line_num[key_file_name]:
            hash_of_line = blame_output_split[i-1].split(' ')[0]
            if bug_hash.get(hash_of_line) == None:
                bug_hash[hash_of_line] = set([key_file_name])
            else:
                bug_hash[hash_of_line].add(key_file_name)
            
    return bug_hash

In [13]:
def get_added_lines(commitHash):
    file = open('diff_file.txt', 'w+', encoding="ISO-8859-1")
    git.diff("--no-color", commitHash + "^", commitHash, _out="diff_file.txt").wait()
    output_diff = file.read()
    file.close()
    
    # Get diff output
#     stream = io.StringIO()
#     git.diff("--no-color", commitHash + "^", commitHash, _out=stream)
#     output_diff = stream.getvalue()
#     stream.close()
    
    # Cut output up in separate lines
    lines = output_diff.split('\n')

    
    line_num = dict()
    start_line = 0
    name_of_file = ""
    i = 0
    name_file_set = set()

    # Go through the diff output to determine line number(s) of deleted lines
    for line in lines:
        # Header where the name of the file is located.
        if line[0:4] == "diff":
            name_of_file = line.split(' b/')[-1]


        # New section of changed code find starting line number
        elif line[0:2] == "@@":
            lineNum = re.findall('\-[0-9]+\,',line)
            start_line = int(re.findall('\-[0-9]+',line)[0][1:])
            i = 0

        # Lines that has been changed
        elif (line[0:2] == "- "):
            if line_num.get(name_of_file) == None:
                line_num[name_of_file] = []
                name_file_set.add(name_of_file)
            line_num[name_of_file].append(start_line + i);
            i += 1


        # Indentaion infront of a line of code that hasn't changed
        elif line[0:2] == "  ":
            i += 1
            
    return line_num

In [14]:
total = len(fix_commits)
bug_commits = dict()

print("Progress:")
for idx, fix_hash in enumerate(fix_commits):
    new_lines = get_added_lines(fix_hash[0])
    bug_commits = get_bug_commits(fix_hash[0], new_lines, bug_commits)

    print_progress(idx, total,1)
#     if idx == 10:
#         break


Progress:
>> 70/2198

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xca in position 6565: invalid continuation byte

In [None]:
hash_file_list = []
for key, file_name_set in bug_commits.items():
    for file_name in file_name_set:
        hash_file_list.append([key, file_name, 1])

In [None]:
df = pd.DataFrame(hash_file_list, columns=['commit_hash','file', 'implicated'])
df.to_csv("dataset/implicated_files.csv")

df
