In [1]:
import io
import json
import os
import re
import subprocess
import sys

import pandas as pd
import sh
from dateutil import parser

In [2]:
#LUCENE-SOLR
# Settings for the Lucene/Solr run. The CAMEL cell that follows assigns the
# same names, so execute only one of the two configuration cells per run.

# project_folder is the parent of both the repository clone and the issue folders;
# output_folder receives the result CSV.
project_folder = "lucene"
output_folder = "out"

# Repository location inside project_folder, clone URL and branch to check out.
repository_folder = "lucene-solr"
repository_link = "https://github.com/apache/lucene-solr.git"
checkout_branch = "trunk"

# Folders (inside project_folder) with the exported issue JSON files, and the
# issue-key prefixes searched for in the commit details (e.g. LUCENE-1234).
issues_folders = ["issue_LUCENE", "issue_SOLR"]
issues_tags = ["LUCENE", "SOLR"]

In [None]:
#CAMEL
# Alternative settings for the Camel project. These assignments replace the
# LUCENE-SOLR values defined above, so run this cell only when analysing Camel.

# project_folder is the parent of both the repository clone and the issue folders;
# output_folder receives the result CSV.
project_folder = "camel"
output_folder = "out"

# Repository location inside project_folder, clone URL and branch to check out.
repository_folder = "camel"
repository_link = "https://github.com/apache/camel"
checkout_branch = "master"

# Folders (inside project_folder) with the exported issue JSON files, and the
# issue-key prefixes searched for in the commit details (e.g. CAMEL-1234).
issues_folders = ["issue_CAMEL"]
issues_tags = ["CAMEL"]

In [3]:
def print_progress(i, l, step=1):
    """Rewrite the current output line with a ``>> done/total`` counter.

    i    -- zero-based index of the item being processed (shown as ``i + 1``)
    l    -- total number of items
    step -- the counter is only refreshed when ``i`` is a multiple of ``step``
    """
    if i % step != 0:
        return
    # The leading carriage return moves the cursor back so the counter overwrites itself.
    sys.stdout.write("\r>> {}/{}".format(i + 1, l))
    sys.stdout.flush()

In [4]:
# Clone the repository into the project folder unless a checkout is already present.
if os.path.exists("{}/{}/{}".format(os.getcwd(), project_folder, repository_folder)):
    print("Repository exists")
else:
    print("Path doesn\'t exists, cloning repo:")
    git = sh.git.bake(_cwd=project_folder)
    git.clone(repository_link)

Repository exists


In [5]:
# Bind `git` to the cloned repository so every later git call runs inside it.
git = sh.git.bake(_cwd=project_folder + '/' + repository_folder)
# Switch to the configured branch.
git.checkout(checkout_branch)
# Last expression of the cell: its result is shown as the cell output.
git.status()

On branch trunk
Your branch is up-to-date with 'origin/trunk'.

nothing to commit, working directory clean

In [6]:
def extract_bug(file_path):
    """Load one exported issue and return it only if it is a fixed bug.

    file_path -- path of the issue file; anything not ending in ``.json`` is ignored

    Returns the parsed issue (a dict) when its issue type is ``Bug`` and its
    resolution is ``Fixed``; returns ``None`` in every other case.
    """
    # Decide on the path that was passed in, not on a global of the calling cell.
    if not file_path.endswith('.json'):
        return None

    # The context manager closes the handle even when the JSON is malformed.
    with open(file_path) as bug_file:
        bug = json.load(bug_file)

    bug_fields = bug.get('fields')

    if bug_fields['issuetype']['name'] != 'Bug':
        return None

    # Unresolved issues carry no resolution object at all.
    resolution = bug_fields['resolution']
    if resolution is None or resolution['name'] != 'Fixed':
        return None

    return bug

In [10]:
def get_fixed_bugs(commit_hash):
    """Return the keys of the known fixed bugs that a commit refers to.

    The text searched is the output of ``git show --name-only --pretty=format:%B``,
    i.e. the commit message body followed by the names of the changed files.

    commit_hash -- commit to inspect

    Returns a list of issue keys (``TAG-number``) that appear in that text and
    are present in the global ``bugs`` dictionary. Each key is listed once,
    in order of first appearance per tag; the list is empty when the commit
    cites no known fixed bug.
    """
    details = str(git.show("--name-only", "--pretty=format:%B", commit_hash))

    f_keys = []
    for tag in issues_tags:
        for k in re.findall(re.escape(tag) + "-[0-9]+", details):
            # Only issues recorded in `bugs` count as fixes; a key that is
            # cited several times in the same commit is reported once.
            if k in bugs and k not in f_keys:
                f_keys.append(k)
    return f_keys

In [7]:
# Collect every fixed bug found in the exported issue folders.
bugs = {}  # fixed bugs keyed by upper-cased TAG-ID (e.g. LUCENE-1234)
for fld in issues_folders:
    path = "{}/{}".format(project_folder, fld)
    dir_list = os.listdir("{}/{}".format(os.getcwd(), path))
    l = len(dir_list)
    print()
    print("Progress for {}:".format(fld))
    for idx, file in enumerate(dir_list):
        print_progress(idx, l, 10)
        bug = extract_bug("{}/{}".format(path, file))
        # extract_bug returns None for everything that is not a fixed bug.
        if bug is not None:
            bugs[bug['key'].upper()] = bug

print()
print(len(bugs), "fixed bugs extracted.")


Progress for issue_LUCENE:
>> 6641/6641
Progress for issue_SOLR:
>> 7721/7728
4011 fixed bugs extracted.


In [8]:
# Hashes of all commits on the checked-out branch, oldest first.
commits = []

log = git.log("--reverse", "--pretty=format:%H")
for c_hash in log:
    # Drop the line terminator that may follow the hash.
    c_hash = c_hash[:-1] if c_hash.endswith('\n') else c_hash
    commits.append(c_hash)
len(commits)

16308

In [None]:
#Splits the bug fixing ones from the others

fix_commits = [] #List that will be filled with couples (commit, bugs-fixed)
non_fix_commits = [] #List that will be filled only with commit hashes
l = len(commits)
print("Progress:")
for idx, c in enumerate(commits):
    print_progress(idx, l, 10)

    # get_fixed_bugs inspects the commit itself, so a single git call per
    # commit is enough here.
    f_keys = get_fixed_bugs(c)
    if f_keys:
        fix_commits.append((c, f_keys))
    else:
        non_fix_commits.append(c)

#Some commits cite more than one issue...which lines are the bug fix, if one of the two is only an improvement ?
#HP: we consider all the removed lines as implicated (fix-inducing)

Progress:
>> 2251/16308

In [None]:
# Commit counts for the analysed repository, one per line:
#   1. bug fixing commits
#   2. non bug fixing commits
#   3. all commits
print(len(fix_commits), len(non_fix_commits), len(commits), sep="\n")
#We can see that on average half of the commits were only solr related, so we discarded them

In [None]:
def get_bug_commits(commit_hash, line_num, bug_hash):
    """Find the commits that last touched the lines removed by a fix commit.

    The repository is force-checked-out at the parent of ``commit_hash`` and
    every Java file in ``line_num`` is blamed there.

    commit_hash -- hash of the fix commit
    line_num    -- dict mapping a file name to the list of 1-based line
                   numbers (in the parent version) removed by the fix
    bug_hash    -- dict mapping a commit hash to the set of files it is
                   implicated in; it is updated in place

    Returns ``bug_hash``.
    """
    blame_path = project_folder + '_blame_file.txt'
    repository_path = "./" + project_folder + '/' + repository_folder + '/'

    # checkout previous commit
    git.checkout("--force", commit_hash + "^")

    # Go through all the files that have been changed
    for key_file_name in line_num:
        # Only Java sources are analysed.
        if not key_file_name.endswith('.java'):
            continue

        # Blame the file as it was before the fix. The arguments are passed as
        # a list (no shell), so file names with spaces or shell metacharacters
        # are handled safely. --root shows boundary commits as normal commits.
        with open(blame_path, 'wb') as blame_file:
            git_blame = subprocess.Popen(
                ["git", "-C", repository_path, "blame", "-l", "--root", "--", key_file_name],
                stdout=blame_file)
            git_blame.wait()

        with open(blame_path, 'rb') as blame_file:
            blame_output_split = blame_file.read().decode("latin1").split('\n')

        # Get hash for all the lines that have been changed.
        for i in line_num[key_file_name]:
            # Skip line numbers the blame output does not cover (e.g. when the
            # blame failed and produced no output).
            if not 1 <= i <= len(blame_output_split):
                continue
            hash_of_line = blame_output_split[i - 1].split(' ')[0]
            if not hash_of_line:
                continue
            bug_hash.setdefault(hash_of_line, set()).add(key_file_name)

    return bug_hash

In [None]:
def get_added_lines(commitHash):
    """Return the line numbers that a commit removed, grouped by file.

    Despite its name, the function collects the lines *removed* by the commit:
    the diff between ``commitHash^`` and ``commitHash`` is parsed and every
    ``-`` line of a hunk is recorded with its line number in the parent
    version of the file.

    commitHash -- hash of the commit to inspect

    Returns a dict mapping a file name (the part after `` b/`` in the
    ``diff`` header) to the list of 1-based removed line numbers. Files
    without removed lines do not appear in the dict.
    """
    diff_path = project_folder + '_diff_file.txt'
    repository_path = "./" + project_folder + '/' + repository_folder + '/'

    # Get diff of current and previous commit and store this in a file.
    # The arguments are passed as a list, so no shell is involved.
    with open(diff_path, 'wb') as diff_file:
        git_diff = subprocess.Popen(
            ["git", "-C", repository_path, "diff", "--no-color", commitHash + "^", commitHash],
            stdout=diff_file)
        git_diff.wait()

    with open(diff_path, 'rb') as diff_file:
        lines = diff_file.read().decode("latin1").split('\n')

    line_num = dict()
    name_of_file = ""
    old_line = 0      # line number, in the parent version, of the next hunk line
    in_hunk = False   # False while reading the per-file header (index/---/+++ lines)

    # Go through the diff output to determine line number(s) of deleted lines
    for line in lines:
        # Header where the name of the file is located.
        if line.startswith("diff"):
            name_of_file = line.split(' b/')[-1]
            in_hunk = False

        # New section of changed code: find its starting line in the old file.
        elif line.startswith("@@"):
            old_line = int(re.findall(r'-[0-9]+', line)[0][1:])
            in_hunk = True

        # Header lines such as "--- a/..." must not be mistaken for removals.
        elif not in_hunk:
            continue

        # Removed line: the marker is the first character only, whatever the
        # removed text itself starts with.
        elif line.startswith('-'):
            line_num.setdefault(name_of_file, []).append(old_line)
            old_line += 1

        # Unchanged context line; an empty line is treated as blank context.
        elif line.startswith(' ') or line == '':
            old_line += 1

        # Added lines ("+") and "\ No newline at end of file" markers do not
        # exist in the old file, so they leave the counter untouched.

    return line_num

In [None]:
def get_time_between_commits(bug_hash, fix_hash):
    """Return the time elapsed between a bug-introducing commit and its fix.

    bug_hash -- hash of the commit blamed for the bug
    fix_hash -- hash of the commit that fixed it

    The committer date of each commit (``git show -s --format=%ci``) is
    written to a scratch file named after ``project_folder`` and parsed from
    there. The UTC offsets of the two dates are kept, so the difference is
    exact even when the commits were made in different time zones.

    Returns a ``datetime.timedelta`` (fix time minus bug time).
    """
    def commit_time(commit, scratch_path):
        # git writes to the very file that is read back afterwards.
        git.show("-s", "--format=%ci", commit, _out=scratch_path).wait()
        with open(scratch_path, encoding="ISO-8859-1") as scratch_file:
            return parser.parse(scratch_file.read())

    time_bug = commit_time(bug_hash, project_folder + '_time_bug.txt')
    time_fix = commit_time(fix_hash, project_folder + '_time_fix.txt')

    return time_fix - time_bug


In [None]:
# Trial run of the pipeline on a single fix commit.
# fix_commits holds (commit hash, fixed issue keys) couples; use the hash of the first one.
fix_hash = fix_commits[0][0]

bug_dict = dict()
df = pd.DataFrame(columns=('SHA_fix', 'SHA_bug', 'file', 'time_in_days'))
location = 0

# Get the implicated lines
new_lines = get_added_lines(fix_hash)
# Get the hash of the implicated lines and to which file they belong
bug_dict = get_bug_commits(fix_hash, new_lines, bug_dict)


# Calculate the time between commit and fix
for key, file_name in bug_dict.items():
    for file in file_name:
        time = get_time_between_commits(key, fix_hash)
        df.loc[location] = [fix_hash, key, file, time.days]
        location +=1


df

In [None]:
# Build the table of (fix commit, implicated commit, file, days until the fix).
# Rows are collected in a plain list and turned into a DataFrame once at the
# end, which avoids re-allocating the frame for every appended row.
rows = []

l = len(fix_commits)
# Iterate through all the fix commits; each entry is a (commit hash, issue keys) couple
for idx, fix_hash in enumerate(fix_commits):
    # Get the implicated lines
    new_lines = get_added_lines(fix_hash[0])
    # Get the hash of the implicated lines and to which file they belong
    bug_dict = get_bug_commits(fix_hash[0], new_lines, dict())

    # Calculate the time between commit and fix
    for key, file_name in bug_dict.items():
        # The elapsed time depends only on the two commits, so it is looked
        # up once per implicated commit rather than once per file.
        time = get_time_between_commits(key, fix_hash[0])
        for file in file_name:
            rows.append([fix_hash[0], key, file, time.days])

    print_progress(idx, l,1)

df = pd.DataFrame(rows, columns=['SHA_fix', 'SHA_bug', 'file', 'time_in_days'])

In [None]:
# Create the output folder on demand so the export cannot fail on a missing directory.
os.makedirs(output_folder, exist_ok=True)
df.to_csv(output_folder + '/' +  project_folder + '_implicated_fix_time.csv', index = False)
df

In [None]:
# Remove the scratch files written during the analysis. "-f" keeps the cell
# from failing when one of them was never created.
for scratch_suffix in ('_blame_file.txt', '_diff_file.txt', '_time_bug.txt', '_time_fix.txt'):
    sh.rm("-f", project_folder + scratch_suffix)