In [1]:
import sh
import json
import os
import sys
import re
import io
import pandas as pd
import subprocess

In [2]:
project_path = os.getcwd()
print(project_path)
project = "hadoop"
issues = "issue_HADOOP"
ISSUE_ID = "HADOOP"
repository = "https://github.com/apache/hadoop.git"

/Users/kajdreef/Documents/Programmeren/Master/IN4334-Mining_Software_repositories/dataset_creation/hadoop


In [3]:
def print_progress(i, l, step=1):
    if i % step == 0:
            print("\r>> {}/{}".format(i + 1, l), end="")
            sys.stdout.flush()

In [4]:
if not os.path.exists(os.getcwd() + '/' + project):
    print("Path doesn\'t exists, cloning repo:")
    sh.git.clone(repository)
else:
    print("Repository exists")

Repository exists


In [5]:
git = sh.git.bake(_cwd=project)
git.checkout("master")
git.status()

On branch master
Your branch is up-to-date with 'origin/master'.
nothing to commit, working directory clean

In [6]:
def extract_bug(file):
    if not file.endswith('.json'):
            return None
    else:
        bug_json_string = open(issues + '/' + file).read()
        bug = json.loads(bug_json_string)
        bug_fields = bug.get('fields')
        
        if bug_fields['issuetype']['name'] != 'Bug':
            return None
   
        if bug_fields['resolution'] == None:
            return None 
                
        if bug_fields['resolution']['name'] != 'Fixed':
            return None
    return bug

In [7]:
#Extracting all the fixed bugs from the bug repo
bugs = dict() #Dictionary of bugs indexed by HADOOP+ID
dir_list = os.listdir(os.getcwd() + '/' + issues)
l = len(dir_list)
print("Progress:")
for idx, file in enumerate(dir_list):
    print_progress(idx,l,10)
    bug = extract_bug(file)
    if(bug == None):
        continue
    bugs[bug['key'].upper()] = bug

Progress:
>> 10191/10191

In [8]:
commits = []

log = git.log("--reverse", "--pretty=format:%H")
for c_hash in log:
    if(c_hash.endswith('\n')):
        c_hash = c_hash[:-1]
    commits.append(c_hash)
len(commits)

9714

In [9]:
def get_fixed_bugs(commit_hash):
    details = git.show("--name-only","--pretty=format:%B", commit_hash)
    #init = details.split(':')[0].upper()
    f_keys = []
    keys = re.findall(ISSUE_ID + "-[0-9]+",str(details))
    if len(keys) != 0:
        for k in keys:
            if bugs.get(k) != None:
                #Then it fixes a bug
                f_keys.append(k)
    return f_keys

In [10]:
#Splits the bug fixing ones from the others

fix_commits = [] #List that will be filled with couples (commit, bugs-fixed)
non_fix_commits = [] #List that will be filled only with commit hashes
l = len(commits)
print("Progress:")
for idx, c in enumerate(commits):
    
    print_progress(idx, l, 10)
        
    details = git.show("--name-only","--pretty=format:%B", c)
    files = git.show("--name-only","--pretty=format:",c)
    
    
    f_keys = get_fixed_bugs(c)    
    if len(f_keys) != 0:
        fix_commits.append((c,f_keys))
    else:
        non_fix_commits.append(c)
        
#Some commits cite more than one issue...which lines are the bug fix, if one of the two is only an improvement ?
#HP: we consider all the removed lines as implicated (fix-inducing)

Progress:
>> 9711/9714

In [11]:
print(len(fix_commits)) #Number of HADOOP bug fixing commits
print(len(non_fix_commits)) #Number of HADOOP non bug fixing commits
print(len(commits)) #Number of HADOOP commits
#We can see that on average half of the commits were only solr related, so we discarded them

1443
8271
9714


In [22]:
def get_added_lines(commitHash):
    
    # Get diff of current and previous commit and store this in a file
    with open('diff_file.txt', 'wb+') as file:
        cmd = ["git " + "-C ./" + project + " diff --no-color " + commitHash +'^ ' + commitHash]
        gitDiff = subprocess.Popen(cmd, shell=True, universal_newlines=True, stdout=file)
        gitDiff.wait()
        file.flush()

    # Get diff of the commit
    output_diff = ""
    with open('diff_file.txt', 'rb+') as file:
        output_diff = file.read()
    
    file.close()
    
    # Cut output up in separate lines
    lines = output_diff.decode("latin1").split('\n')
    
    line_num = dict()
    start_line = 0
    name_of_file = ""
    i = 0
    name_file_set = set()

    # Go through the diff output to determine line number(s) of deleted lines
    for line in lines:
        # Header where the name of the file is located.
        if line[0:4] == "diff":
            name_of_file = line.split(' b/')[-1]


        # New section of changed code find starting line number
        elif line[0:2] == "@@":
            lineNum = re.findall('\-[0-9]+\,',line)
            start_line = int(re.findall('\-[0-9]+',line)[0][1:])
            i = 0

        # Lines that has been changed
        elif (line[0:2] == "- "):
            if line_num.get(name_of_file) == None:
                # create set of lines that have been changed for each file
                line_num[name_of_file] = []
                name_file_set.add(name_of_file)
            # Add lines to the set that have been changed
            line_num[name_of_file].append(start_line + i);
            i += 1


        # Indentaion infront of a line of code that hasn't changed
        elif line[0:2] == "  ":
            i += 1
            
    return line_num

In [23]:
def get_bug_commits(commit_hash, line_num, bug_hash):
    # checkout previous commit
    git.checkout("--force", commit_hash + "^")

    # Go through all the files that have been changed
    for key_file_name in line_num:
        ## Check if file is a java file if not continue with next file
        if(key_file_name.find('.java') == -1):
            continue
        
        # blame file to see when the lines where last changed (on previous commit)
        with open('blame_file.txt', 'wb+') as file:
            # --root is added to see boundary commits as normal commits
            cmd = ["git " + "-C ./"+ project +" blame -l --root " + key_file_name]
            gitBlame = subprocess.Popen(cmd, shell=True, universal_newlines=True, stdout=file)
            gitBlame.wait()
            file.flush()
            
        output_blame = ""
        with open('blame_file.txt', 'rb+') as file:
            output_blame = file.read()

        # Cut output up in separate lines
        blame_output = output_blame.decode("latin1")        
        blame_output_split = blame_output.split('\n')
                
        # Get hash for all the lines that have been changed.
        for i in line_num[key_file_name]:
            line = blame_output_split[i-1]
            hash_of_line = line.split(' ')[0]
            
            if bug_hash.get(hash_of_line) == None:
                bug_hash[hash_of_line] = set([key_file_name])
            else:
                bug_hash[hash_of_line].add(key_file_name)
            
    return bug_hash

In [24]:
total = len(fix_commits)
bug_commits = dict()

print("Progress:")
for idx, fix_hash in enumerate(fix_commits):
    new_lines = get_added_lines(fix_hash[0])
    bug_commits = get_bug_commits(fix_hash[0], new_lines, bug_commits)

    print_progress(idx, total,1)



Progress:
>> 1443/1443

In [25]:
hash_file_list = []
for key, file_name_set in bug_commits.items():
    for file_name in file_name_set:
        hash_file_list.append([key, file_name, 1])

In [26]:
df = pd.DataFrame(hash_file_list, columns=['commit_hash','file', 'implicated'])
df.to_csv("dataset/implicated_files.csv")

df
# df[19:20:]['commit_hash']

Unnamed: 0,commit_hash,file,implicated
0,666a8e160035d60b7fd3634e4d703d8e28a1af6c,src/test/core/org/apache/hadoop/fs/TestListFil...,1
1,666a8e160035d60b7fd3634e4d703d8e28a1af6c,src/java/org/apache/hadoop/fs/FileSystem.java,1
2,666a8e160035d60b7fd3634e4d703d8e28a1af6c,src/java/org/apache/hadoop/fs/FilterFileSystem...,1
3,666a8e160035d60b7fd3634e4d703d8e28a1af6c,src/java/org/apache/hadoop/fs/ChecksumFileSyst...,1
4,bec76354fd7ef5f9f1e1abe2adb3407e102e57a8,hadoop-common-project/hadoop-common/src/main/j...,1
5,f3bdc03270b2dd1d66959e54ca556921986d7d23,hadoop-common-project/hadoop-common/src/main/j...,1
6,1409441e45d1b114f3c061370cefc8c9c8669f64,hadoop-hdfs-project/hadoop-hdfs/src/test/java/...,1
7,d83061b71b305dda003002a26df1f29dd6634a45,hadoop-common-project/hadoop-common/src/main/j...,1
8,d83061b71b305dda003002a26df1f29dd6634a45,hadoop-common-project/hadoop-common/src/test/j...,1
9,d83061b71b305dda003002a26df1f29dd6634a45,hadoop-hdfs-project/hadoop-hdfs/src/main/java/...,1
