In [9]:
import sh
import json
import os
import glob2
from collections import Counter
import csv
import sys
import re
from collections import namedtuple
from types import SimpleNamespace

In [10]:
def print_progress(i, l, step=1):
    """Rewrite an in-place ``>> done/total`` counter on stdout.

    Parameters
    ----------
    i : int
        Zero-based index of the item currently being processed.
    l : int
        Total number of items.
    step : int, optional
        Refresh the counter only when ``i`` is a multiple of ``step`` (default 1).

    The counter is also refreshed for the last item (``i == l - 1``), so the
    display always finishes on ``l/l`` even when ``l - 1`` is not a multiple
    of ``step``.
    """
    if i % step == 0 or i == l - 1:
        # "\r" returns to the start of the line so each update overwrites the previous one.
        print("\r>> {}/{}".format(i + 1, l), end="")
        sys.stdout.flush()

In [11]:
# LUCENE-SOLR: project-specific settings read by the cells below.
project_folder, output_folder = "lucene", "out"

# Repository to clone (inside project_folder) and the branch to check out.
repository_link = "https://github.com/apache/lucene-solr.git"
repository_folder = "lucene-solr"
checkout_branch = "trunk"

# Issue dump folders (looked up inside project_folder) and the issue-key prefixes to search for.
issues_tags = ["LUCENE", "SOLR"]
issues_folders = ["issue_LUCENE", "issue_SOLR"]

# Bounds handed to `git log --after=... --before=...`.
start = "Mar 18 2010"
end = "2015-01-01"

# CSV later merged into the dataset on (sha, file).
implicated_list_file = "{}/{}_implicated_files.csv".format(output_folder, project_folder)

In [8]:
# CAMEL: alternative project settings (same variable names as the LUCENE-SOLR cell).
# NOTE(review): this cell's execution count (In [8]) is lower than the LUCENE-SOLR
# cell's and the recorded outputs below are for lucene, so it looks like only one of
# the two config cells is meant to be run. On a top-to-bottom run these assignments
# override the LUCENE-SOLR ones - confirm which project is intended.
project_folder, output_folder = "camel", "out"

# Repository to clone (inside project_folder) and the branch to check out.
repository_link = "https://github.com/apache/camel"
repository_folder = "camel"
checkout_branch = "master"

# Issue dump folders (looked up inside project_folder) and the issue-key prefixes to search for.
issues_tags = ["CAMEL"]
issues_folders = ["issue_CAMEL"]

# Bounds handed to `git log --after=... --before=...`.
start = "Mar 20 2007"
end = "2015-01-01"

# CSV later merged into the dataset on (sha, file).
implicated_list_file = "{}/{}_implicated_files.csv".format(output_folder, project_folder)

In [12]:
# Clone the repository unless <cwd>/<project_folder>/<repository_folder> is already there.
if os.path.exists("/".join([os.getcwd(), project_folder, repository_folder])):
    print("Repository exists")
else:
    print("Path doesn't exists, cloning repo:")
    # git runs with project_folder as its working directory, so the clone is created
    # under it (presumably in a directory named repository_folder - verify the config).
    git = sh.git.bake(_cwd=project_folder)
    git.clone(repository_link)

Repository exists


In [13]:
# Bind `git` to the repository checkout; the functions below call git through this global.
git = sh.git.bake(_cwd="/".join((project_folder, repository_folder)))
git.checkout(checkout_branch)
git.status()

On branch trunk
Your branch is up-to-date with 'origin/trunk'.

nothing to commit, working directory clean

In [14]:
# Hashes of the commits between `start` and `end`, oldest first (--reverse).
log = git.log(
    "--reverse",
    "--pretty=format:%H",
    '--before="{}"'.format(end),
    '--after="{}"'.format(start),
)
# A line yielded by `log` may end with a newline; drop exactly that one character.
commits = [line[:-1] if line.endswith('\n') else line for line in log]
len(commits)

13841

In [15]:
Changes = namedtuple('Changes', 'added, deleted')


def _stat_count(diff_stat, noun):
    """Return N from the first ``N <noun>`` found in ``diff_stat`` (0 when absent)."""
    found = re.search("([0-9]+) " + noun, diff_stat)
    return int(found.group(1)) if found else 0


def count_changes(commit_hash, file_matching_regexp=r'.*\.java'):
    """Count the lines added and deleted per file by a single commit.

    Parameters
    ----------
    commit_hash : str
        Commit to inspect. Each file is diffed between ``commit_hash^`` and
        ``commit_hash``.
    file_matching_regexp : str, optional
        Only paths in which this regular expression finds a match are
        considered (default: paths containing ``.java``).

    Returns
    -------
    dict
        Maps every matching path listed by ``git show --name-only`` to a
        ``Changes(added, deleted)`` tuple parsed from ``git diff --stat``.
        A count that is missing from the stat summary is reported as 0.
    """
    # Take the listing as text and split it into lines: this strips only real line
    # terminators (a last path without a trailing newline is not truncated) and does
    # not depend on the command object yielding lines when iterated.
    listing = str(git.show("--name-only", "--pretty=format:", commit_hash))
    changes_count = dict()
    for path in listing.splitlines():
        if not re.search(file_matching_regexp, path):
            continue
        diff = str(git.diff("--stat", commit_hash + "^", commit_hash, "--", path))
        changes_count[path] = Changes(
            _stat_count(diff, "insertion"),
            _stat_count(diff, "deletion"),
        )
    return changes_count

In [16]:
def get_authorship(blame_output, authors_list):
    """Count how often each author name occurs in a ``git blame`` output.

    Parameters
    ----------
    blame_output
        The blame output; it is converted with ``str()`` before searching.
    authors_list : iterable of str
        Author names to look for.

    Returns
    -------
    dict
        Maps each name to the number of non-overlapping occurrences of that
        exact text anywhere in the blame output.
    """
    # Search the argument (not a global) and match names literally, so regex
    # metacharacters in a name neither change the count nor raise re.error.
    text = str(blame_output)
    return {author: text.count(author) for author in authors_list}

In [17]:
def count_comments(blame_output):
    """Count the lines covered by ``//`` and ``/* ... */`` comments.

    Parameters
    ----------
    blame_output
        Text to scan; it is converted with ``str()`` first. Per the original
        author's note, counting on blame output or on the plain source gives
        the same result.

    Returns
    -------
    int
        Sum, over all comment matches, of the number of lines each match spans.
    """
    # Raw string: the pattern is unchanged, but "\*" is no longer an invalid
    # escape sequence in the Python string literal.
    regex = re.compile(r"(?://[^\n]*|/\*(?:(?!\*/).)*\*/)", re.DOTALL)
    return sum(len(comment.split('\n')) for comment in regex.findall(str(blame_output)))

In [18]:
def get_complete_authorship(sha, file_path):
    """Count the blamed lines per author e-mail for a file at a given commit.

    Parameters
    ----------
    sha : str
        Revision passed to ``git blame``.
    file_path : str
        Path of the file to blame.

    Returns
    -------
    dict
        Maps each e-mail address (the text between ``(<`` and ``>`` in the
        ``git blame -e`` output) to the number of lines carrying it.
    """
    blame = str(git.blame("-e", sha, "--", file_path))
    lines_per_author = Counter()
    for line in blame.splitlines():
        # Only the first "(<...>" of a line is taken: later occurrences would belong
        # to the file content. Counting captured addresses directly (instead of
        # re-searching the text with the address as a pattern) keeps '.' and '+' in
        # addresses from being treated as regex metacharacters.
        header = re.search(r"\(<(.+?)>", line)
        if header:
            lines_per_author[header.group(1)] += 1
    return dict(lines_per_author)

In [19]:
def update_counters(file_dict, file_name, author, changes):
    """Add one commit's changes to the running per-file and per-author totals.

    ``file_dict`` is updated in place: ``file_dict[file_name]`` holds the file
    totals (``tot_lines_added``, ``tot_lines_deleted``, ``tot_commits``) and an
    ``author_dict`` whose entries hold the same figures for each author
    (``lines_added``, ``lines_deleted``, ``commits``). Missing entries are
    created with zeroed counters. ``changes`` must expose ``added`` and
    ``deleted``. Nothing is returned.
    """
    # File-level entry: create on first sight, then accumulate.
    if file_dict.get(file_name) is None:
        file_dict[file_name] = SimpleNamespace(
            author_dict=dict(),
            tot_lines_added=0,
            tot_lines_deleted=0,
            tot_commits=0,
        )
    file_stats = file_dict[file_name]
    file_stats.tot_lines_added += changes.added
    file_stats.tot_lines_deleted += changes.deleted
    file_stats.tot_commits += 1

    # Author-level entry for this file: same pattern.
    per_author = file_stats.author_dict
    if per_author.get(author) is None:
        per_author[author] = SimpleNamespace(lines_added=0, lines_deleted=0, commits=0)
    author_stats = per_author[author]
    author_stats.lines_added += changes.added
    author_stats.lines_deleted += changes.deleted
    author_stats.commits += 1

In [20]:
def extract_bug(file_path):
    """Load an issue from a JSON file and return it only if it is a fixed bug.

    Parameters
    ----------
    file_path : str
        Path of the issue file. Paths not ending in ``.json`` are ignored.

    Returns
    -------
    dict or None
        The parsed issue when ``fields.issuetype.name`` is ``'Bug'`` and
        ``fields.resolution.name`` is ``'Fixed'``; ``None`` otherwise
        (including when ``fields.resolution`` is null).
    """
    # Test the argument itself rather than a global that merely happens to be
    # set by the calling loop.
    if not file_path.endswith('.json'):
        return None

    # Context manager so the handle is closed even if parsing fails.
    with open(file_path) as issue_file:
        bug = json.load(issue_file)

    bug_fields = bug.get('fields')
    if bug_fields['issuetype']['name'] != 'Bug':
        return None

    resolution = bug_fields['resolution']
    if resolution is None or resolution['name'] != 'Fixed':
        return None
    return bug

In [21]:
# Collect every fixed bug found in the issue folders.
# `bugs` is keyed by the upper-cased issue key (e.g. LUCENE-1234).
# NOTE(review): keep the loop variable named `file` unless extract_bug is confirmed
# not to read a global of that name.
bugs = {}
for fld in issues_folders:
    path = "/".join((project_folder, fld))
    dir_list = os.listdir("/".join((os.getcwd(), path)))
    l = len(dir_list)
    print("\nProgress for {}:".format(fld))
    for idx, file in enumerate(dir_list):
        print_progress(idx, l, 10)
        bug = extract_bug("/".join((path, file)))
        if bug is not None:
            bugs[bug['key'].upper()] = bug

print("\n{} fixed bugs extracted.".format(len(bugs)))


Progress for issue_LUCENE:
>> 6641/6641
Progress for issue_SOLR:
>> 7721/7728
4011 fixed bugs extracted.


In [22]:
from datetime import datetime

# Write one CSV row per fixed bug: key, creation date, resolution date and the
# number of whole days between the two (only the YYYY-MM-DD part of each
# timestamp is used).
filename = output_folder + '/' + project_folder + '_issue_dates.csv'
# The `with` block flushes and closes the file when the cell finishes (the handle
# was previously left open); newline='' is what the csv module asks for on files
# handed to csv.writer.
with open(filename, 'w', newline='') as file:
    wr = csv.writer(file, quoting=csv.QUOTE_ALL)
    wr.writerow(['issue_key', 'created', 'resolutiondate', 'days_to_solve'])
    for key, bug in bugs.items():
        c_date = datetime.strptime(bug['fields']['created'][:10], "%Y-%m-%d")
        r_date = datetime.strptime(bug['fields']['resolutiondate'][:10], "%Y-%m-%d")
        wr.writerow([bug['key'], c_date.date(), r_date.date(), (r_date - c_date).days])

In [23]:
def get_fixed_bugs(commit_hash):
    """Return the keys of the known fixed bugs that a commit refers to.

    The text searched is the output of
    ``git show --name-only --pretty=format:%B <commit_hash>``. For every tag in
    ``issues_tags`` all ``TAG-<digits>`` occurrences are collected, and those
    present in the global ``bugs`` dict are kept.

    Returns
    -------
    list of str
        Matching keys, grouped in ``issues_tags`` order, each listed once even
        when the commit mentions it several times.
    """
    # NOTE(review): --name-only makes git list the changed paths as well, so a key
    # that appears in a file name also counts - confirm this is intended.
    details = str(git.show("--name-only", "--pretty=format:%B", commit_hash))
    f_keys = []
    for tag in issues_tags:
        for key in re.findall(tag + "-[0-9]+", details):
            # Known fixed bug, not reported yet for this commit.
            if key in bugs and key not in f_keys:
                f_keys.append(key)
    return f_keys

In [24]:
def get_affected_versions(fixed_bug_keys):
    """List the ``name`` of every version recorded under ``fields.versions``.

    The versions are read from the global ``bugs`` dict for each key in
    ``fixed_bug_keys``, in order; bugs with no versions contribute nothing and
    repeated names are kept.

    Example: ``get_affected_versions(['LUCENE-2244', 'LUCENE-3255'])``
    """
    return [
        version['name']
        for bug_key in fixed_bug_keys
        for version in bugs[bug_key]['fields']['versions']
    ]

In [25]:
l = len(commits)

# Running per-file / per-author counters, filled in by update_counters().
file_dict = dict()

# Header of the output CSV, kept as a single ", "-separated string.
columns = ', '.join([
    'project', 'file', 'sha', 'author',
    'author_file_tot_added', 'author_file_added_this_commit', 'file_tot_added',
    'author_file_tot_deleted', 'author_file_deleted_this_commit', 'file_tot_deleted',
    'author_file_commits', 'file_tot_commits',
    'current_lines_authored', 'current_file_size', 'current_comment_lines',
    'max_current_author', 'total_current_authors',
    'commit_date',
    'bug_fix', 'fixed_bugs', 'affected_versions',
])

# For every commit and every matching file it touches, one row is written per
# author who has changed that file so far (within the analysed commits).
filename = output_folder + '/' + project_folder + '_out.csv'
# `with` closes the CSV even if a git call fails part-way through the history.
with open(filename, 'w', newline='') as file:
    wr = csv.writer(file, quoting=csv.QUOTE_ALL)
    wr.writerow(columns.split(', '))

    for idx, commit_hash in enumerate(commits):
        print_progress(idx, l)
        # The format argument carries literal double quotes, so the first output line
        # presumably reads `"Name <email>"`: keep what precedes '<', then trim the
        # opening quote and the trailing space.
        s = str(git.show("--name-only", '--format="%aN <%aE>"', commit_hash)).split('\n')[0].split('<')
        author_name = s[0][1:-1]
        date = str(git.show("-s", "--format=%ci", commit_hash))[:-1].replace("\n", '')
        file_changes = count_changes(commit_hash)

        # A commit counts as a bug fix when it mentions at least one known fixed bug.
        f_keys = get_fixed_bugs(commit_hash)
        bug_fix = 1 if len(f_keys) != 0 else 0

        for file_name, changes in file_changes.items():
            update_counters(file_dict, file_name, author_name, changes)
            f = file_dict[file_name]

            # Authorship and size of the file as of this commit.
            try:
                # Kept in the global `blame`: other cells may read it.
                blame = str(git.blame(commit_hash, "--", file_name))
                authorship = get_authorship(blame, f.author_dict.keys())
                # If the whole history is not considered, these authored-line counts
                # will not sum to the file size: the file can still contain lines from
                # an author who never contributed inside the time window.
                size = len(blame.split("\n")) - 1
                comments = count_comments(blame)

                # Authorship metrics over all authors present in the file.
                compl_authorship = get_complete_authorship(commit_hash, file_name)
                total_current_authors = len(compl_authorship)
                max_authorship = max(compl_authorship.values())
            except Exception:
                # Happens when the commit deleted the file (blame fails). Every metric
                # is reset so that no value from a previously processed file leaks into
                # these rows. `Exception` keeps the best-effort behaviour but lets
                # KeyboardInterrupt through.
                authorship = dict()
                size = 0
                comments = 0
                max_authorship = 0
                total_current_authors = 0

            # Same for every author row of this file, so computed once.
            affected_versions = get_affected_versions(f_keys)

            for author, author_counter in f.author_dict.items():
                wr.writerow([project_folder, #project
                             file_name, #file
                             commit_hash, #sha
                             author, #author
                             #Lines added
                             author_counter.lines_added, #author_file_tot_added
                             (changes.added if author == author_name else 0), #author_file_added_this_commit
                             f.tot_lines_added, #file_tot_added
                             #Lines deleted
                             author_counter.lines_deleted, #author_file_tot_deleted
                             (changes.deleted if author == author_name else 0), #author_file_deleted_this_commit
                             f.tot_lines_deleted, #file_tot_deleted
                             #Commits
                             author_counter.commits, #author_file_commits
                             f.tot_commits, #file_tot_commits
                             #Authorship
                             (authorship[author] if size != 0 else 0), #current_lines_authored
                             size, #current_file_size
                             comments, #current_comment_lines
                             max_authorship, #max_current_author
                             total_current_authors, #total_current_authors
                             #Other stuff
                             date, #commit_date
                             bug_fix, #bug_fix
                             f_keys, #fixed_bugs
                             affected_versions #affected_versions
                            ])

>> 13841/13841

In [26]:
import pandas as pd

# Load the CSV whose path is currently held in `filename`
# (last assigned in the dataset-building cell above).
df = pd.read_csv(filepath_or_buffer=filename)
df

Unnamed: 0,project,file,sha,author,author_file_tot_added,author_file_added_this_commit,file_tot_added,author_file_tot_deleted,author_file_deleted_this_commit,file_tot_deleted,...,file_tot_commits,current_lines_authored,current_file_size,current_comment_lines,max_current_author,total_current_authors,commit_date,bug_fix,fixed_bugs,affected_versions
0,lucene,solr/src/test/org/apache/solr/request/SimpleFa...,ce61ff7c1bc9ced9b927e1399810c18bbc3f7d8b,Yonik Seeley,17,17,17,7,7,7,...,1,17,835,118,818,2,2010-03-19 15:43:07 +0000,0,[],[]
1,lucene,solr/src/test/org/apache/solr/SolrTestCaseJ4.java,ce61ff7c1bc9ced9b927e1399810c18bbc3f7d8b,Yonik Seeley,431,431,431,0,0,0,...,1,431,431,138,431,1,2010-03-19 15:43:07 +0000,0,[],[]
2,lucene,solr/src/test/org/apache/solr/BasicFunctionali...,cb5feb49b12c26f532d069a36ef14996a87c7fa7,Mark Robert Miller,54,54,54,59,59,59,...,1,679,686,137,679,2,2010-03-19 21:53:34 +0000,0,[],[]
3,lucene,solr/src/test/org/apache/solr/SolrTestCaseJ4.java,cb5feb49b12c26f532d069a36ef14996a87c7fa7,Mark Robert Miller,4,4,435,0,0,0,...,2,4,435,138,431,2,2010-03-19 21:53:34 +0000,0,[],[]
4,lucene,solr/src/test/org/apache/solr/SolrTestCaseJ4.java,cb5feb49b12c26f532d069a36ef14996a87c7fa7,Yonik Seeley,431,0,435,0,0,0,...,2,431,435,138,431,2,2010-03-19 21:53:34 +0000,0,[],[]
5,lucene,solr/src/test/org/apache/solr/BaseDistributedS...,f17809ac6f7c3217f5ed27e4524098593d08b84b,Robert Muir,5,5,5,14,14,14,...,1,5,543,36,538,2,2010-03-19 21:59:22 +0000,0,[],[]
6,lucene,solr/src/test/org/apache/solr/analysis/TestWor...,e9f6093efdae1a81bddc43d19ad611b402ea9236,Robert Muir,29,29,29,10,10,10,...,1,29,432,79,403,2,2010-03-19 22:32:21 +0000,0,[],[]
7,lucene,solr/src/test/org/apache/solr/client/solrj/Lar...,9c809d64bc38b720a16b1aef49ae3d8b772a6645,Yonik Seeley,3,3,3,3,3,3,...,1,3,115,25,112,2,2010-03-20 01:09:28 +0000,0,[],[]
8,lucene,solr/src/test/org/apache/solr/schema/BadIndexS...,9c809d64bc38b720a16b1aef49ae3d8b772a6645,Yonik Seeley,1,1,1,1,1,1,...,1,1,85,19,84,2,2010-03-20 01:09:28 +0000,0,[],[]
9,lucene,solr/src/test/org/apache/solr/search/TestSort....,9c809d64bc38b720a16b1aef49ae3d8b772a6645,Yonik Seeley,1,1,1,1,1,1,...,1,1,198,32,197,2,2010-03-20 01:09:28 +0000,0,[],[]


In [27]:
# Load the implicated-files list (columns sha, file, implicated per the output below).
df2 = pd.read_csv(filepath_or_buffer=implicated_list_file)
df2

Unnamed: 0,sha,file,implicated
0,b9abe3e600bc7177638f6e8e396e3b46da17665f,lucene/analysis/common/src/java/org/apache/luc...,1
1,d54099b87f8600e5a08966e44194d26109875d68,solr/core/src/test/org/apache/solr/cloud/ZkCLI...,1
2,58ff57393e992633f8df9c7dc10eee21dd629500,solr/src/test/org/apache/solr/handler/TestCSVL...,1
3,1ba83ee5c62b8bbff03de672f737dd6c6965e800,solr/core/src/java/org/apache/solr/core/SolrCo...,1
4,fb85bfe0abc68b209e0e9a5404f1b847ae468ad9,solr/core/src/java/org/apache/solr/search/Expo...,1
5,fb85bfe0abc68b209e0e9a5404f1b847ae468ad9,solr/core/src/test/org/apache/solr/response/Te...,1
6,fb85bfe0abc68b209e0e9a5404f1b847ae468ad9,solr/core/src/java/org/apache/solr/response/So...,1
7,fb85bfe0abc68b209e0e9a5404f1b847ae468ad9,solr/core/src/java/org/apache/solr/search/QPar...,1
8,9085ef830d30c162f52d7510be1b66699e390e8f,lucene/core/src/java/org/apache/lucene/index/S...,1
9,9085ef830d30c162f52d7510be1b66699e390e8f,lucene/core/src/java/org/apache/lucene/index/R...,1


In [28]:
# Attach the columns of df2 to every row with the same (sha, file); rows without a
# match, and any other missing value in the frame, are filled with 0.
# Dropping an existing `implicated` column first makes the cell safe to re-run:
# without it a second run would merge again and yield implicated_x / implicated_y.
df = (
    df.drop(columns=['implicated'], errors='ignore')
      .merge(df2, on=['sha', 'file'], how='left')
      .fillna(0)
)
df

Unnamed: 0,project,file,sha,author,author_file_tot_added,author_file_added_this_commit,file_tot_added,author_file_tot_deleted,author_file_deleted_this_commit,file_tot_deleted,...,current_lines_authored,current_file_size,current_comment_lines,max_current_author,total_current_authors,commit_date,bug_fix,fixed_bugs,affected_versions,implicated
0,lucene,solr/src/test/org/apache/solr/request/SimpleFa...,ce61ff7c1bc9ced9b927e1399810c18bbc3f7d8b,Yonik Seeley,17,17,17,7,7,7,...,17,835,118,818,2,2010-03-19 15:43:07 +0000,0,[],[],1
1,lucene,solr/src/test/org/apache/solr/SolrTestCaseJ4.java,ce61ff7c1bc9ced9b927e1399810c18bbc3f7d8b,Yonik Seeley,431,431,431,0,0,0,...,431,431,138,431,1,2010-03-19 15:43:07 +0000,0,[],[],0
2,lucene,solr/src/test/org/apache/solr/BasicFunctionali...,cb5feb49b12c26f532d069a36ef14996a87c7fa7,Mark Robert Miller,54,54,54,59,59,59,...,679,686,137,679,2,2010-03-19 21:53:34 +0000,0,[],[],1
3,lucene,solr/src/test/org/apache/solr/SolrTestCaseJ4.java,cb5feb49b12c26f532d069a36ef14996a87c7fa7,Mark Robert Miller,4,4,435,0,0,0,...,4,435,138,431,2,2010-03-19 21:53:34 +0000,0,[],[],0
4,lucene,solr/src/test/org/apache/solr/SolrTestCaseJ4.java,cb5feb49b12c26f532d069a36ef14996a87c7fa7,Yonik Seeley,431,0,435,0,0,0,...,431,435,138,431,2,2010-03-19 21:53:34 +0000,0,[],[],0
5,lucene,solr/src/test/org/apache/solr/BaseDistributedS...,f17809ac6f7c3217f5ed27e4524098593d08b84b,Robert Muir,5,5,5,14,14,14,...,5,543,36,538,2,2010-03-19 21:59:22 +0000,0,[],[],0
6,lucene,solr/src/test/org/apache/solr/analysis/TestWor...,e9f6093efdae1a81bddc43d19ad611b402ea9236,Robert Muir,29,29,29,10,10,10,...,29,432,79,403,2,2010-03-19 22:32:21 +0000,0,[],[],0
7,lucene,solr/src/test/org/apache/solr/client/solrj/Lar...,9c809d64bc38b720a16b1aef49ae3d8b772a6645,Yonik Seeley,3,3,3,3,3,3,...,3,115,25,112,2,2010-03-20 01:09:28 +0000,0,[],[],1
8,lucene,solr/src/test/org/apache/solr/schema/BadIndexS...,9c809d64bc38b720a16b1aef49ae3d8b772a6645,Yonik Seeley,1,1,1,1,1,1,...,1,85,19,84,2,2010-03-20 01:09:28 +0000,0,[],[],0
9,lucene,solr/src/test/org/apache/solr/search/TestSort....,9c809d64bc38b720a16b1aef49ae3d8b772a6645,Yonik Seeley,1,1,1,1,1,1,...,1,198,32,197,2,2010-03-20 01:09:28 +0000,0,[],[],0


In [29]:
# Save the merged frame as <output_folder>/<project_folder>_dataset.csv, without the index column.
df.to_csv("{}/{}_dataset.csv".format(output_folder, project_folder), index=False)

In [30]:
#Example of file with a lot of owners and commits
#df_test = df[df['file'] == 'lucene/core/src/java/org/apache/lucene/index/IndexWriter.java']
#df_test[df_test['sha'] == '5ec48108df8997430e3e8b47c056d0d63c6d2db3']