In [1]:
import sh
import json
import os
import glob2
from collections import Counter
import csv
import sys
import re
from collections import namedtuple
from types import SimpleNamespace

In [2]:
def print_progress(i, l, step=1):
    """Rewrite the current console line with a ``>> done/total`` counter.

    i    -- zero-based index of the item currently being processed
    l    -- total number of items
    step -- the counter is refreshed only when ``i`` is a multiple of ``step``
    """
    if i % step != 0:
        return
    # The leading carriage return moves the cursor back to the start of the
    # line; no newline is emitted, so stdout is flushed explicitly.
    print("\r>> {}/{}".format(i + 1, l), end="")
    sys.stdout.flush()

In [None]:
#LUCENE-SOLR: settings for analysing the apache/lucene-solr repository.
# NOTE: every project cell assigns the same names, so the last one executed wins.
project_folder = "lucene"  # directory holding the clone and the issue folders
output_folder = "out"  # directory receiving the generated CSV files

# Git repository to analyse
repository_folder = "lucene-solr"
repository_link = "https://github.com/apache/lucene-solr.git"
checkout_branch = "trunk"

# Issue folders (inside project_folder) and the issue-key prefixes looked up in commits
issues_folders = ["issue_LUCENE", "issue_SOLR"]
issues_tags = ["LUCENE", "SOLR"]

# Commit window handed to `git log` (--after start, --before end)
start = "Mar 18 2010"
end = "2015-01-01"

implicated_list_file = '{}/{}_implicated_files.csv'.format(output_folder, project_folder)

In [3]:
#CAMEL: settings for analysing the apache/camel repository.
# NOTE: every project cell assigns the same names, so the last one executed wins.
project_folder = "camel"  # directory holding the clone and the issue folders
output_folder = "out"  # directory receiving the generated CSV files

# Git repository to analyse
repository_folder = "camel"
repository_link = "https://github.com/apache/camel"
checkout_branch = "master"

# Issue folders (inside project_folder) and the issue-key prefixes looked up in commits
issues_folders = ["issue_CAMEL"]
issues_tags = ["CAMEL"]

# Commit window handed to `git log` (--after start, --before end)
start = "Mar 20 2007"
end = "2015-01-01"

implicated_list_file = '{}/{}_implicated_files.csv'.format(output_folder, project_folder)

In [None]:
#ZOOKEEPER: settings for analysing the apache/zookeeper repository.
# NOTE: every project cell assigns the same names, so the last one executed wins.
project_folder = "zookeeper"  # directory holding the clone and the issue folders
output_folder = "out"  # directory receiving the generated CSV files

# Git repository to analyse
repository_folder = "zookeeper"
repository_link = "https://github.com/apache/zookeeper"
checkout_branch = "trunk"

# Issue folders (inside project_folder) and the issue-key prefixes looked up in commits
issues_folders = ["issue_ZOOKEEPER"]
issues_tags = ["ZOOKEEPER"]

# Commit window handed to `git log` (--after start, --before end)
start = "Nov 3 2007"
end = "2015-01-01"

implicated_list_file = '{}/{}_implicated_files.csv'.format(output_folder, project_folder)

In [None]:
#MAVEN: settings for analysing the apache/maven repository.
# NOTE: every project cell assigns the same names, so the last one executed wins.
project_folder = "maven"  # directory holding the clone and the issue folders
output_folder = "out"  # directory receiving the generated CSV files

# Git repository to analyse
repository_folder = "maven"
repository_link = "https://github.com/apache/maven"
checkout_branch = "master"

# Issue folders (inside project_folder) and the issue-key prefixes looked up in commits
issues_folders = ["issue_MNG"]
issues_tags = ["MNG"]

# Commit window handed to `git log` (--after start, --before end)
start = "2004-01-01"
end = "2015-01-01"

implicated_list_file = '{}/{}_implicated_files.csv'.format(output_folder, project_folder)

In [None]:
#MAHOUT: settings for analysing the apache/mahout repository.
# NOTE: every project cell assigns the same names, so the last one executed wins.
project_folder = "mahout"  # directory holding the clone and the issue folders
output_folder = "out"  # directory receiving the generated CSV files

# Git repository to analyse
repository_folder = "mahout"
repository_link = "https://github.com/apache/mahout"
checkout_branch = "master"

# Issue folders (inside project_folder) and the issue-key prefixes looked up in commits
issues_folders = ["issue_MAHOUT"]
issues_tags = ["MAHOUT"]

# Commit window handed to `git log` (--after start, --before end)
start = "Feb 20 2008"
end = "2015-01-01"

implicated_list_file = '{}/{}_implicated_files.csv'.format(output_folder, project_folder)

In [4]:
# Clone the repository into project_folder unless a checkout is already present.
if os.path.exists(os.getcwd() + '/' + project_folder + '/' + repository_folder):
    print("Repository exists")
else:
    print("Path doesn\'t exists, cloning repo:")
    # `git clone` is run from inside project_folder so the clone lands in
    # project_folder/repository_folder.
    git = sh.git.bake(_cwd=project_folder)
    git.clone(repository_link)

Repository exists


In [5]:
# Bind `git` to the cloned repository: every later git.<command>(...) call in
# this notebook runs with that directory as its working directory.
git = sh.git.bake(_cwd='{}/{}'.format(project_folder, repository_folder))
git.checkout(checkout_branch)
# Last expression of the cell: the status output is shown as the cell result.
git.status()

On branch master
Your branch is up-to-date with 'origin/master'.

nothing to commit, working directory clean

In [6]:
# Hashes of the commits between `start` and `end`, oldest first (--reverse).
log = git.log("--reverse", "--pretty=format:%H", '--before="{}"'.format(end), '--after="{}"'.format(start))
# Each item yielded by the log may carry a trailing newline; strip exactly one.
commits = [c_hash[:-1] if c_hash.endswith('\n') else c_hash for c_hash in log]
len(commits)

18367

In [7]:
# Lines added / deleted in one file by one commit.
Changes = namedtuple('Changes', 'added, deleted')


def _stat_count(label, diff_stat):
    """Return the number preceding ``label`` in a ``git diff --stat`` summary, or 0."""
    m = re.search("([0-9]+) " + label, diff_stat)
    return int(m.group(1)) if m is not None else 0


def count_changes(commit_hash, file_matching_regexp = r'.*\.java'):
    """Count the lines added and deleted in each matching file of a commit.

    Uses the module-level ``git`` command object.

    commit_hash          -- hash of the commit to inspect
    file_matching_regexp -- regular expression; only paths in which it finds a
                            match are considered (a search, not a full match,
                            so the default also accepts e.g. ``Foo.java.orig``)

    Returns a dict mapping file path -> Changes(added, deleted).

    Errors raised by the underlying git calls are propagated; the diff is taken
    against ``commit_hash^``, so a commit without a parent is expected to fail
    here.
    """
    # str(...).splitlines() does not depend on how the command result iterates
    # and, unlike slicing off the last character, cannot truncate a final path
    # that has no trailing newline.
    listing = str(git.show("--name-only", "--pretty=format:", commit_hash))
    changes_count = dict()
    for f in listing.splitlines():
        if not f or re.search(file_matching_regexp, f) is None:
            continue
        diff = str(git.diff("--stat", commit_hash + "^", commit_hash, "--", f))
        # The summary line reads "N insertion(s)(+), M deletion(s)(-)"; either
        # part is missing when its count is zero.
        changes_count[f] = Changes(_stat_count("insertion", diff),
                                   _stat_count("deletion", diff))

    return changes_count

In [8]:
def get_authorship(blame_output, authors_list):
    """Count how often each author name occurs in a blame output.

    blame_output -- blame result (anything ``str()`` turns into the blame text)
    authors_list -- iterable of author names to look for

    Returns a dict mapping author -> number of occurrences of that name.

    The count is a plain, non-overlapping substring count over the whole text,
    so a name that also appears inside the blamed source lines is counted
    there as well.
    """
    # Use the argument (not a global) and match names literally: author names
    # may contain characters such as '.', '(' or '+' that are special in regexes.
    text = str(blame_output)
    return {author: text.count(author) for author in authors_list}

In [9]:
def count_comments(blame_output):
    """Count the lines covered by ``//`` and ``/* ... */`` comments.

    blame_output -- text to scan (converted with ``str()``)

    Returns the total number of lines spanned by all matched comments; a block
    comment contributes one line per line it spans.

    Original author's note: counting on the blame output or on the source code
    gave the same result.
    """
    # Raw string: the pattern is unchanged, but '\*' is no longer an invalid
    # escape sequence in a normal string literal.
    # DOTALL lets '.' cross newlines so block comments may span several lines.
    regex = re.compile(r"(?://[^\n]*|/\*(?:(?!\*/).)*\*/)", re.DOTALL)
    return sum(len(c.split('\n')) for c in regex.findall(str(blame_output)))

In [10]:
def get_complete_authorship(sha, file_path):
    """Count, per author e-mail, the blamed lines of a file at a given commit.

    Uses the module-level ``git`` command object (``git blame -e``).

    sha       -- commit at which the file is blamed
    file_path -- path of the file inside the repository

    Returns a dict mapping e-mail address -> number of blame lines attributed
    to it. Errors raised by the git call are propagated.
    """
    blame_text = str(git.blame("-e", sha, "--", file_path))
    # With -e the author of a line is printed as "(<address>". Only the first
    # such marker of each line is used, and addresses are tallied directly
    # rather than re-searched as regular expressions: characters like '+' or
    # '.' in an address would otherwise change what is matched, and an address
    # contained in another one would be counted twice.
    header = re.compile(r"\(<(.+?)>")
    authorship = dict()
    for line in blame_text.splitlines():
        m = header.search(line)
        if m is None:
            continue
        email = m.group(1)
        authorship[email] = authorship.get(email, 0) + 1
    return authorship

In [11]:
def update_counters(file_dict, file_name, author, changes):
    """Add one commit's changes to the running counters of a file and an author.

    file_dict -- dict mapping file name -> counters object; updated in place
    file_name -- file touched by the commit
    author    -- author of the commit
    changes   -- object exposing ``added`` and ``deleted`` line counts
    """
    # File-level counters, created on first sight of the file.
    file_entry = file_dict.get(file_name)
    if file_entry is None:
        file_entry = SimpleNamespace(author_dict=dict(),
                                     tot_lines_added=0,
                                     tot_lines_deleted=0,
                                     tot_commits=0)
        file_dict[file_name] = file_entry

    file_entry.tot_lines_added += changes.added
    file_entry.tot_lines_deleted += changes.deleted
    file_entry.tot_commits += 1

    # Per-author counters for this file, created on the author's first commit to it.
    author_entry = file_entry.author_dict.get(author)
    if author_entry is None:
        author_entry = SimpleNamespace(lines_added=0, lines_deleted=0, commits=0)
        file_entry.author_dict[author] = author_entry

    author_entry.lines_added += changes.added
    author_entry.lines_deleted += changes.deleted
    author_entry.commits += 1

In [12]:
def extract_bug(file_path):
    """Load an issue from a JSON file and return it only if it is a fixed bug.

    file_path -- path of the issue file

    Returns the decoded issue (a dict) when its type is 'Bug' and its
    resolution is 'Fixed'; returns None for non-``.json`` paths and for every
    other issue.
    """
    # Decide on the argument itself rather than on a global loop variable.
    if not file_path.endswith('.json'):
        return None

    # The context manager closes the file even if decoding fails.
    with open(file_path) as handle:
        bug = json.loads(handle.read())
    bug_fields = bug.get('fields')

    if bug_fields['issuetype']['name'] != 'Bug':
        return None

    # Unresolved issues carry no resolution object at all.
    resolution = bug_fields['resolution']
    if resolution is None or resolution['name'] != 'Fixed':
        return None

    return bug

In [13]:
# Load every fixed bug found in the issue folders.
bugs = dict() # bugs indexed by upper-cased TAG-ID key (e.g. LUCENE-1234)
for fld in issues_folders:
    path = '{}/{}'.format(project_folder, fld)
    dir_list = os.listdir(os.getcwd() + '/' + path)
    l = len(dir_list)
    print()
    print("Progress for " + fld + ":")
    # NOTE: the loop variable keeps the name `file`; extract_bug has relied on
    # a global of that name.
    for idx, file in enumerate(dir_list):
        print_progress(idx, l, 10)
        bug = extract_bug(path + '/' + file)
        if bug is not None:
            bugs[bug['key'].upper()] = bug

print()
print(len(bugs), "fixed bugs extracted.")


Progress for issue_CAMEL:
>> 9001/9002
2525 fixed bugs extracted.


In [14]:
from datetime import datetime

# Write, for every extracted bug, its creation date, resolution date and the
# number of days between the two.
filename = output_folder + '/' + project_folder + '_issue_dates.csv'
# `with` flushes and closes the file when the cell finishes (or fails);
# newline='' is how the csv module expects files handed to csv.writer to be opened.
with open(filename, 'w', newline='') as file:
    wr = csv.writer(file, quoting=csv.QUOTE_ALL)
    wr.writerow(['issue_key', 'created', 'resolutiondate', 'days_to_solve'])
    for bug in bugs.values():
        # Only the leading YYYY-MM-DD part of each timestamp is used.
        c_date = datetime.strptime(bug['fields']['created'][:10], "%Y-%m-%d").date()
        r_date = datetime.strptime(bug['fields']['resolutiondate'][:10], "%Y-%m-%d").date()
        wr.writerow([bug['key'], c_date, r_date, (r_date - c_date).days])

In [15]:
def get_fixed_bugs(commit_hash):
    """Return the keys of the known fixed bugs referenced by a commit.

    Uses the module-level ``git``, ``issues_tags`` and ``bugs``.

    commit_hash -- hash of the commit to inspect

    The text searched is the output of ``git show --name-only`` with the commit
    body as format, i.e. the commit message followed by the changed paths.
    Returns the matching keys that are present in ``bugs``, each one once, in
    order of first appearance per tag.
    """
    details = str(git.show("--name-only","--pretty=format:%B", commit_hash))
    f_keys = []

    for tag in issues_tags:
        # re.escape: the tag is meant literally, whatever characters it holds.
        for k in re.findall(re.escape(tag) + "-[0-9]+", details):
            # A key is often mentioned several times in one message; report it
            # once so the derived affected versions are not duplicated either.
            if k in bugs and k not in f_keys:
                f_keys.append(k)
    return f_keys

In [16]:
def get_affected_versions(fixed_bug_keys):
    """List the names of the versions affected by the given bugs.

    fixed_bug_keys -- keys into the module-level ``bugs`` dict

    Returns the version names of all bugs concatenated in input order; names
    shared by several bugs appear once per bug.
    """
    return [version['name']
            for key in fixed_bug_keys
            for version in bugs[key]['fields']['versions']]
#e.g.
#get_affected_versions(['LUCENE-2244', 'LUCENE-3255'])

In [20]:
# Replay the selected commits in order and write one CSV row per
# (commit, changed file, author who has touched that file so far).
l = len(commits)

# Running per-file / per-author counters, maintained by update_counters.
file_dict = dict()

# Header of the output CSV, in the order the row values are written below.
columns = ['project', 'file', 'sha', 'author',
           'author_file_tot_added', 'author_file_added_this_commit', 'file_tot_added',
           'author_file_tot_deleted', 'author_file_deleted_this_commit', 'file_tot_deleted',
           'author_file_commits', 'file_tot_commits',
           'current_lines_authored', 'current_file_size', 'current_comment_lines',
           'max_current_author', 'total_current_authors',
           'commit_date',
           'bug_fix', 'fixed_bugs', 'affected_versions']

filename = output_folder + '/' + project_folder + '_out.csv'
# `with` closes (and flushes) the CSV even if the loop is interrupted;
# newline='' is how the csv module expects files handed to csv.writer to be opened.
with open(filename, 'w', newline='') as file:
    wr = csv.writer(file, quoting=csv.QUOTE_ALL)
    wr.writerow(columns)

    for idx, commit_hash in enumerate(commits):
        print_progress(idx, l)
        # First output line is '"Name <email>"' (quotes included): keep what
        # precedes '<', minus the opening quote and the trailing space.
        s = str(git.show("--name-only", '--format="%aN <%aE>"', commit_hash)).split('\n')[0].split('<')
        author_name = s[0][1:-1]
        date = str(git.show("-s", "--format=%ci", commit_hash))[:-1].replace("\n", '')
        try:
            file_changes = count_changes(commit_hash)
        except Exception:
            #Bad revision commit_hash^ sometimes happened, skip these commits.
            # `except Exception` (not a bare except) so Ctrl-C still stops the run.
            print("\rSkipped commit number {}".format(idx))
            continue

        #Checking if it is a bug_fix and the affected versions
        f_keys = get_fixed_bugs(commit_hash)
        bug_fix = 1 if len(f_keys) != 0 else 0
        # Identical for every row of this commit, so computed once.
        affected_versions = get_affected_versions(f_keys)

        for file_name, changes in file_changes.items():
            update_counters(file_dict, file_name, author_name, changes)

            f = file_dict[file_name]
            #Authorship and size:
            try:
                # Kept as the module-level name `blame` (see get_authorship).
                blame = git.blame(commit_hash, "--", file_name)
                authorship = get_authorship(blame, f.author_dict.keys())
                # If the whole history is not considered, these authored-line
                # counts will not sum to the file size: the file can still
                # contain lines from an author who never contributed inside
                # the time window.
                size = len(blame.split("\n")) - 1
                comments = count_comments(blame)

                #Authorship metrics
                compl_authorship = get_complete_authorship(commit_hash, file_name)
                total_current_authors = len(compl_authorship)
                max_authorship = max(compl_authorship.values())

            except Exception:
                #This happens if the file has been deleted by the commit.
                # Reset every blame-derived metric; otherwise the values of the
                # previously processed file would be written for this one.
                size = 0
                comments = 0
                max_authorship = 0
                total_current_authors = 0

            for author, author_counter in f.author_dict.items():
                wr.writerow([project_folder, #project
                             file_name, #file
                             commit_hash, #sha
                             author, #author
                             #Lines added
                             author_counter.lines_added, #author_file_tot_added
                             (changes.added if author == author_name else 0), #author_file_added_this_commit
                             f.tot_lines_added, #file_tot_added
                             #Lines deleted
                             author_counter.lines_deleted, #author_file_tot_deleted
                             (changes.deleted if author == author_name else 0), #author_file_deleted_this_commit
                             f.tot_lines_deleted, #file_tot_deleted
                             #Commits
                             author_counter.commits, #author_file_commits
                             f.tot_commits, #file_tot_commits
                             #Authorship
                             (authorship[author] if size != 0 else 0), #current_lines_authored
                             size, #current_file_size
                             comments, #current_comment_lines
                             max_authorship, #max_current_author
                             total_current_authors, #total_current_authors
                             #Other stuff
                             date, #commit_date
                             bug_fix, #bug_fix
                             f_keys, #fixed_bugs
                             affected_versions #affected_versions
                            ])

Skipped commit number 16271
>> 18367/18367

In [21]:
import pandas as pd

# Load the per-commit / per-file / per-author CSV whose path is currently held
# in `filename`, and display it as the cell result.
df = pd.read_csv(filepath_or_buffer=filename)
df

Unnamed: 0,project,file,sha,author,author_file_tot_added,author_file_added_this_commit,file_tot_added,author_file_tot_deleted,author_file_deleted_this_commit,file_tot_deleted,...,file_tot_commits,current_lines_authored,current_file_size,current_comment_lines,max_current_author,total_current_authors,commit_date,bug_fix,fixed_bugs,affected_versions
0,camel,camel-core/src/main/java/org/apache/camel/proc...,d0cba2ad2abda2a4e6a18b29a43f86405452f031,James Strachan,95,95,95,0,0,0,...,1,95,95,23,95,1,2007-03-21 09:56:11 +0000,0,[],[]
1,camel,camel-core/src/main/java/org/apache/camel/proc...,d0cba2ad2abda2a4e6a18b29a43f86405452f031,James Strachan,129,129,129,0,0,0,...,1,129,129,39,129,1,2007-03-21 09:56:11 +0000,0,[],[]
2,camel,camel-core/src/main/java/org/apache/camel/proc...,d0cba2ad2abda2a4e6a18b29a43f86405452f031,James Strachan,226,226,226,0,0,0,...,1,226,226,75,226,1,2007-03-21 09:56:11 +0000,0,[],[]
3,camel,camel-core/src/main/java/org/apache/camel/impl...,97431e9df68c8b869675bdb37e611140c4d977ad,James Strachan,9,9,9,9,9,9,...,1,128,128,21,128,1,2007-03-21 12:33:32 +0000,0,[],[]
4,camel,camel-core/src/main/java/org/apache/camel/buil...,97431e9df68c8b869675bdb37e611140c4d977ad,James Strachan,32,32,32,0,0,0,...,1,98,102,43,98,2,2007-03-21 12:33:32 +0000,0,[],[]
5,camel,camel-core/src/main/java/org/apache/camel/proc...,97431e9df68c8b869675bdb37e611140c4d977ad,James Strachan,149,20,149,0,0,0,...,2,149,149,45,149,1,2007-03-21 12:33:32 +0000,0,[],[]
6,camel,camel-core/src/main/java/org/apache/camel/proc...,97431e9df68c8b869675bdb37e611140c4d977ad,James Strachan,231,5,231,0,0,0,...,2,231,231,75,231,1,2007-03-21 12:33:32 +0000,0,[],[]
7,camel,camel-core/src/main/java/org/apache/camel/buil...,97431e9df68c8b869675bdb37e611140c4d977ad,James Strachan,45,45,45,0,0,0,...,1,45,45,22,45,1,2007-03-21 12:33:32 +0000,0,[],[]
8,camel,camel-core/src/main/java/org/apache/camel/buil...,97431e9df68c8b869675bdb37e611140c4d977ad,James Strachan,60,60,60,3,3,3,...,1,235,241,27,235,2,2007-03-21 12:33:32 +0000,0,[],[]
9,camel,camel-core/src/main/java/org/apache/camel/buil...,97431e9df68c8b869675bdb37e611140c4d977ad,James Strachan,3,3,3,3,3,3,...,1,138,144,44,138,2,2007-03-21 12:33:32 +0000,0,[],[]


In [22]:
# Load the list of implicated (sha, file) pairs for the selected project and
# display it as the cell result.
df2 = pd.read_csv(filepath_or_buffer=implicated_list_file)
df2

Unnamed: 0,sha,file,implicated
0,b093da3c4bcaa0edad52e47d174c7e768043f47c,camel-core/src/main/java/org/apache/camel/util...,1
1,0c13d3011e90353c91160f19f0e54d63a94ee881,components/camel-xmpp/src/main/java/org/apache...,1
2,2b965b2cae22357e0f7a389ec365e153e4f778fd,camel-core/src/main/java/org/apache/camel/comp...,1
3,ac1c73167096bb11ba47e8fd11260556f75c4a5a,camel-core/src/main/java/org/apache/camel/impl...,1
4,5dd8d352fef28f8cc31232c49d78e0e8d206696c,camel-core/src/main/java/org/apache/camel/util...,1
5,b8f68ff7e4567bc20254d6178e855bd54e51c148,components/camel-spring/src/test/java/org/apac...,1
6,39c121ce766021d2e3a427c8fcd1c59e0c92658d,components/camel-hbase/src/main/java/org/apach...,1
7,df18cc87de42ac547fddf57b40518573dc2ab4f5,camel-core/src/main/java/org/apache/camel/impl...,1
8,df18cc87de42ac547fddf57b40518573dc2ab4f5,camel-core/src/test/java/org/apache/camel/impl...,1
9,bfd2939754b49975232b78ecb52e040f56c22744,components/camel-http/src/main/java/org/apache...,1


In [23]:
# Attach the `implicated` flag to every (sha, file) row of the dataset.
# Dropping an existing `implicated` column first makes the cell safe to re-run
# (a second merge would otherwise create implicated_x / implicated_y).
df = (df.drop(columns=['implicated'], errors='ignore')
        .merge(df2[['sha', 'file', 'implicated']], on=['sha', 'file'], how='left'))
# Rows absent from the implicated list get 0. Only this column is filled, so
# missing values in unrelated columns are not overwritten with 0.
df['implicated'] = df['implicated'].fillna(0)
df

Unnamed: 0,project,file,sha,author,author_file_tot_added,author_file_added_this_commit,file_tot_added,author_file_tot_deleted,author_file_deleted_this_commit,file_tot_deleted,...,current_lines_authored,current_file_size,current_comment_lines,max_current_author,total_current_authors,commit_date,bug_fix,fixed_bugs,affected_versions,implicated
0,camel,camel-core/src/main/java/org/apache/camel/proc...,d0cba2ad2abda2a4e6a18b29a43f86405452f031,James Strachan,95,95,95,0,0,0,...,95,95,23,95,1,2007-03-21 09:56:11 +0000,0,[],[],0
1,camel,camel-core/src/main/java/org/apache/camel/proc...,d0cba2ad2abda2a4e6a18b29a43f86405452f031,James Strachan,129,129,129,0,0,0,...,129,129,39,129,1,2007-03-21 09:56:11 +0000,0,[],[],1
2,camel,camel-core/src/main/java/org/apache/camel/proc...,d0cba2ad2abda2a4e6a18b29a43f86405452f031,James Strachan,226,226,226,0,0,0,...,226,226,75,226,1,2007-03-21 09:56:11 +0000,0,[],[],0
3,camel,camel-core/src/main/java/org/apache/camel/impl...,97431e9df68c8b869675bdb37e611140c4d977ad,James Strachan,9,9,9,9,9,9,...,128,128,21,128,1,2007-03-21 12:33:32 +0000,0,[],[],0
4,camel,camel-core/src/main/java/org/apache/camel/buil...,97431e9df68c8b869675bdb37e611140c4d977ad,James Strachan,32,32,32,0,0,0,...,98,102,43,98,2,2007-03-21 12:33:32 +0000,0,[],[],0
5,camel,camel-core/src/main/java/org/apache/camel/proc...,97431e9df68c8b869675bdb37e611140c4d977ad,James Strachan,149,20,149,0,0,0,...,149,149,45,149,1,2007-03-21 12:33:32 +0000,0,[],[],1
6,camel,camel-core/src/main/java/org/apache/camel/proc...,97431e9df68c8b869675bdb37e611140c4d977ad,James Strachan,231,5,231,0,0,0,...,231,231,75,231,1,2007-03-21 12:33:32 +0000,0,[],[],0
7,camel,camel-core/src/main/java/org/apache/camel/buil...,97431e9df68c8b869675bdb37e611140c4d977ad,James Strachan,45,45,45,0,0,0,...,45,45,22,45,1,2007-03-21 12:33:32 +0000,0,[],[],0
8,camel,camel-core/src/main/java/org/apache/camel/buil...,97431e9df68c8b869675bdb37e611140c4d977ad,James Strachan,60,60,60,3,3,3,...,235,241,27,235,2,2007-03-21 12:33:32 +0000,0,[],[],0
9,camel,camel-core/src/main/java/org/apache/camel/buil...,97431e9df68c8b869675bdb37e611140c4d977ad,James Strachan,3,3,3,3,3,3,...,138,144,44,138,2,2007-03-21 12:33:32 +0000,0,[],[],0


In [24]:
# Persist the merged dataset (the row index is not written).
df.to_csv('{}/{}_dataset.csv'.format(output_folder, project_folder), index=False)

In [None]:
#Example of file with a lot of owners and commits
#df_test = df[df['file'] == 'lucene/core/src/java/org/apache/lucene/index/IndexWriter.java']
#df_test[df_test['sha'] == '5ec48108df8997430e3e8b47c056d0d63c6d2db3']