In [1]:
import sh
import json
import os
import sys
import re
import io
import pandas as pd
from dateutil import parser

In [2]:
project_path = os.getcwd()

In [3]:
def print_progress(i, l, step=1):
    if i % step == 0:
            print("\r>> {}/{}".format(i + 1, l), end="")
            sys.stdout.flush()

In [4]:
if not os.path.exists(os.getcwd() + '/Lucene/lucene-solr'):
    print("Path doesn\'t exists, cloning repo:")
    os.chdir(project_path + "/Lucene")
    sh.git.clone("https://github.com/apache/lucene-solr.git")
    os.chdir(project_path)
else:
    print("Repository exists")

Repository exists


In [5]:
git = sh.git.bake(_cwd='Lucene/lucene-solr')
git.checkout("trunk")
git.status()

On branch trunk
Your branch is behind 'origin/trunk' by 113 commits, and can be fast-forwarded.
  (use "git pull" to update your local branch)
nothing to commit, working directory clean

In [6]:
def extract_bug(file):
    if not file.endswith('.json'):
            return None
    else:
        bug_json_string = open("Lucene/issue_LUCENE/" + file).read()
        bug = json.loads(bug_json_string)
        bug_fields = bug.get('fields')
        
        if bug_fields['issuetype']['name'] != 'Bug':
            return None
   
        if bug_fields['resolution'] == None:
            return None 
                
        if bug_fields['resolution']['name'] != 'Fixed':
            return None
    return bug

In [7]:
#Extracting all the fixed bugs from the bug repo
bugs = dict() #Dictionary of bugs indexed by LUCENE+ID
dir_list = os.listdir(os.getcwd() + "/Lucene/issue_LUCENE")
l = len(dir_list)
print("Progress:")
for idx, file in enumerate(dir_list):
    print_progress(idx,l,10)
    bug = extract_bug(file)
    if(bug == None):
        continue
    bugs[bug['key'].upper()] = bug


Progress:
>> 6641/6641

In [8]:
commits = []

log = git.log("--reverse", "--pretty=format:%H")
for c_hash in log:
    if(c_hash.endswith('\n')):
        c_hash = c_hash[:-1]
    commits.append(c_hash)
len(commits)

16066

In [9]:
#THIS IS SPECIFIC FOR LUCENE: It removes the SOLR commits, then splits the bug fixing ones from the others

fix_commits = [] #List that will be filled with couples (commit, bugs-fixed)
non_fix_commits = [] #List that will be filled only with commit hashes
l = len(commits)
print("Progress:")
for idx, c in enumerate(commits):
    
    print_progress(idx, l, 10)
        
    details = git.show("--name-only","--pretty=format:%B", c)
    files = git.show("--name-only","--pretty=format:",c)
    
    #Discarding commits that modify only solr files
    only_solr = True
    for file in files:
        if file.startswith('lucene'):
            only_solr = False

    if only_solr is True:
        continue
    
    init = details.split(':')[0].upper()
    
    #if init.find('SOLR') != -1:
    #    continue
    
    if init.startswith('LUCENE') or init.startswith('SOLR'):
        if init.find(', ') != -1:
            keys = init.split(', ')
        elif init.find(',') != -1:
            keys = init.split(',')
        else:
            keys = [init.split(' ')[0]]
            
        f_keys = []    
        for k in keys:
            if bugs.get(k) != None:
                #Then it fixes a bug
                f_keys.append(k)
        
        if len(f_keys) != 0:
            fix_commits.append((c,f_keys))
        else:
            non_fix_commits.append(c)
    else:
        non_fix_commits.append(c)
        
#Some commits cite more than one issue...which lines are the bug fix, if one of the two is only an improvement ?
#HP: we consider all the removed lines as implicated (fix-inducing)

Progress:
>> 16061/16066

In [10]:
print(len(fix_commits)) #Number of LUCENE bug fixing commits
print(len(non_fix_commits)) #Number of LUCENE non bug fixing commits
print(len(commits)) #Number of LUCENE + SOLR commits
#We can see that on average half of the commits were only solr related, so we discarded them

1613
6688
16066


In [40]:

# INPUT: fix_hash - Hash of the fix commit
#        line_num - All implicated lines
#        bug_hash_dict - Dictionary of all the implicated commits with the implicated files
# OUTPUT:bug_hash_dict - Dictionary of all the implicated commits with the implicated files 
def get_bug_commits(fix_hash, line_num, bug_hash_dict):    
    git.checkout(str(fix_hash + "^"))

    # Go through all the files that have been changed
    for key_file_name in line_num:
        ## Check if file is a java file if not continue with next file
        if(key_file_name.find('.java') == -1):
            continue
        
        ## If the file is part of the SOLR files then continue with next file
        if(key_file_name.find('solr') != -1):
            continue
            
        blame_output = git.blame("-l",key_file_name)
        blame_output_split = blame_output.split('\n')
        
        # Get hash for all the lines that have been changed.
        for i in line_num[key_file_name]:
            hash_of_implicated_line = blame_output_split[i-1].split(' ')[0]
            if bug_hash_dict.get(hash_of_implicated_line) == None:
                bug_hash_dict[hash_of_implicated_line] = set([key_file_name])
            else:
                bug_hash_dict[hash_of_implicated_line].add(key_file_name)
            
    return bug_hash_dict

In [41]:

# INPUT: Commit hash
# OUTPUT: All lines that were changed (implicated lines)
def get_added_lines(commitHash):
    file = open('diff_file.txt', 'w+', encoding="ISO-8859-1")
    git.diff("--no-color", commitHash + "^", commitHash, _out="diff_file.txt").wait()
    output_diff = file.read()
    file.close()
    
    # Cut output up in separate lines
    lines = output_diff.split('\n')

    
    line_num = dict()
    start_line = 0
    name_of_file = ""
    i = 0
    name_file_set = set()

    # Go through the diff output to determine line number(s) of deleted lines
    for line in lines:
        # Header where the name of the file is located.
        if line[0:4] == "diff":
            name_of_file = line.split(' b/')[-1]


        # New section of changed code find starting line number
        elif line[0:2] == "@@":
            lineNum = re.findall('\-[0-9]+\,',line)
            start_line = int(re.findall('\-[0-9]+',line)[0][1:])
            i = 0

        # Lines that has been changed
        elif (line[0:2] == "- "):
            if line_num.get(name_of_file) == None:
                line_num[name_of_file] = []
                name_file_set.add(name_of_file)
            line_num[name_of_file].append(start_line + i);
            i += 1


        # Indentaion infront of a line of code that hasn't changed
        elif line[0:2] == "  ":
            i += 1
            
    return line_num

In [34]:
# Get time between bug and fix
# INPUT: Bug and fix hash
# OUTPUT: time between the commits
def get_time_between_commits(bug_hash, fix_hash):
    file = open('time_bug.txt', 'w+', encoding="ISO-8859-1")
    git.show("-s", "--format=%ci", bug_hash, _out="time_bug.txt").wait()
    output_show_bug = file.read()
    file.close()
    
    file = open('time_fix.txt', 'w+', encoding="ISO-8859-1")
    git.show("-s", "--format=%ci", fix_hash, _out="time_fix.txt").wait()
    output_show_fix = file.read()
    file.close()
    
    time_bug = parser.parse(output_show_bug).replace(tzinfo=None)
    time_fix = parser.parse(output_show_fix).replace(tzinfo=None)
    
    delta_time = time_fix-time_bug
    return delta_time


In [21]:
total = len(fix_commits)
fix_to_bug_dict = dict()
bug_dict = dict()

df = pd.DataFrame(columns=('SHA_fix', 'SHA_bug', 'file', 'time_in_days'))
location = 0

l = len(fix_commits)

# Get the implicated lines
new_lines = get_added_lines(fix_hash)
# Get the hash of the implicated lines and to which file they belong
bug_dict = get_bug_commits(fix_hash, new_lines, bug_dict)


# Calculate the time between commit and fix
for key, file_name in bug_dict.items():
    for file in file_name:
        time = get_time_between_commits(key, fix_hash)
        df.loc[location] = [fix_hash, key, file, time.days]
        location +=1

        
df

bed318cd3a0b861269abf771d309f5110497ccf2


Unnamed: 0,SHA_fix,SHA_bug,file,time_in_days
0,bed318cd3a0b861269abf771d309f5110497ccf2,c27ba2e27271c167868a190861ee2f3cf391a6ee,lucene/contrib/misc/src/java/org/apache/lucene...,60


In [50]:
total = len(fix_commits)
fix_to_bug_dict = dict()
bug_dict = dict()
df = pd.DataFrame(columns=('SHA_fix', 'SHA_bug', 'file', 'time_in_days'))
location = 0

l = len(fix_commits)
# Iterate through all the fix commits
for idx, fix_hash in enumerate(fix_commits):
    bug_dict.clear()
    # Get the implicated lines
    new_lines = get_added_lines(fix_hash[0])
    # Get the hash of the implicated lines and to which file they belong
    bug_dict = get_bug_commits(fix_hash[0], new_lines, bug_dict)
    
    # Calculate the time between commit and fix
    for key, file_name in bug_dict.items():
        for file in file_name:
            time = get_time_between_commits(key, fix_hash[0])
            df.loc[location] = [fix_hash[0], key, file, time.days]
            location +=1
    
    print_progress(idx, l,1)

>> 1613/1613

In [55]:
df.to_csv("dataset/lucene_implicated_fix_time.csv")
df

Unnamed: 0,SHA_fix,SHA_bug,file,time_in_days
0,1750b26b33a42bb3f86609a1d3167f00181e64d9,3ee0ace1ba6b9bff3ffaa278c0bba07e6064057d,lucene/src/java/org/apache/lucene/store/Direct...,5
1,1750b26b33a42bb3f86609a1d3167f00181e64d9,3ee0ace1ba6b9bff3ffaa278c0bba07e6064057d,lucene/src/java/org/apache/lucene/store/FSDire...,5
2,1750b26b33a42bb3f86609a1d3167f00181e64d9,3ee0ace1ba6b9bff3ffaa278c0bba07e6064057d,lucene/src/java/org/apache/lucene/store/FileSw...,5
3,1750b26b33a42bb3f86609a1d3167f00181e64d9,3ee0ace1ba6b9bff3ffaa278c0bba07e6064057d,lucene/src/java/org/apache/lucene/store/MMapDi...,5
4,1750b26b33a42bb3f86609a1d3167f00181e64d9,3ee0ace1ba6b9bff3ffaa278c0bba07e6064057d,lucene/src/java/org/apache/lucene/index/Segmen...,5
5,1750b26b33a42bb3f86609a1d3167f00181e64d9,3ee0ace1ba6b9bff3ffaa278c0bba07e6064057d,lucene/src/java/org/apache/lucene/store/NIOFSD...,5
6,1750b26b33a42bb3f86609a1d3167f00181e64d9,3ee0ace1ba6b9bff3ffaa278c0bba07e6064057d,lucene/src/test/org/apache/lucene/store/MockRA...,5
7,1750b26b33a42bb3f86609a1d3167f00181e64d9,3ee0ace1ba6b9bff3ffaa278c0bba07e6064057d,lucene/src/java/org/apache/lucene/index/Direct...,5
8,1750b26b33a42bb3f86609a1d3167f00181e64d9,3ee0ace1ba6b9bff3ffaa278c0bba07e6064057d,lucene/src/java/org/apache/lucene/index/IndexW...,5
9,1750b26b33a42bb3f86609a1d3167f00181e64d9,3ee0ace1ba6b9bff3ffaa278c0bba07e6064057d,lucene/src/java/org/apache/lucene/store/Simple...,5
