In [16]:
import sh
import json
import os
import glob2
from collections import Counter
import csv
import sys
import re
from collections import namedtuple
from types import SimpleNamespace

In [17]:
def print_progress(i, l, step=1):
    """Overwrite the current output line with a ``>> done/total`` counter.

    Args:
        i: Zero-based index of the item currently being processed.
        l: Total number of items.
        step: Refresh the counter only when ``i`` is a multiple of ``step``.

    The counter is additionally refreshed on the last item (``i == l - 1``)
    so the display always ends at ``l/l``, even when ``step`` does not
    divide the total.
    """
    is_last = (i + 1 == l)
    if i % step == 0 or is_last:
        # "\r" moves back to the start of the line so each update replaces
        # the previous one instead of appending a new line.
        print("\r>> {}/{}".format(i + 1, l), end="")
        sys.stdout.flush()

In [18]:
# Clone lucene-solr into the working directory unless something already
# exists at that path.
repo_path = os.path.join(os.getcwd(), 'lucene-solr')
if os.path.exists(repo_path):
    print("Repository exists")
else:
    print("Path doesn\'t exists, cloning repo:")
    sh.git.clone("https://github.com/apache/lucene-solr.git")

Repository exists


In [19]:
# Pre-bind the working directory so every later `git.<cmd>(...)` call in the
# notebook runs inside the local 'lucene-solr' checkout.
git = sh.git.bake(_cwd='lucene-solr')
# All history mining below is done on the trunk branch.
git.checkout("trunk")
# Last expression of the cell: shows the working-tree status as cell output.
git.status()

On branch trunk
Your branch is up-to-date with 'origin/trunk'.

Untracked files:
  (use "git add <file>..." to include in what will be committed)

	[31mdiff.txt[m


It took 6.26 seconds to enumerate untracked files. 'status -uno'
may speed it up, but you have to be careful not to forget to add
new files yourself (see 'git help status').
nothing added to commit but untracked files present (use "git add" to track)

In [20]:
# Every commit hash, oldest first (--reverse), one hash per output line.
log = git.log("--reverse", "--pretty=format:%H")

# Drop the trailing newline from a line when it carries one.
commits = [
    line[:-1] if line.endswith('\n') else line
    for line in log
]
len(commits)

16125

In [21]:
# This is specific to LUCENE: drop the commits that only touch SOLR files.
lucene_commits = []
l = len(commits)
for idx, c in enumerate(commits):
    print_progress(idx, l, 10)

    # Paths touched by the commit, one per line; the empty --pretty format
    # suppresses the commit header so only the file list is printed.
    files = git.show("--name-only", "--pretty=format:", c)

    # Keep the commit as soon as one touched path starts with 'lucene';
    # commits with no such path are skipped.
    if any(path.startswith('lucene') for path in files):
        lucene_commits.append(c)

len(lucene_commits)

>> 16121/16125

8316

In [22]:
# '@@ -[0-9]+,[0-9]+ \+[0-9]+,[0-9]+ @@'  REGEX TO MATCH THE DIFF LINE INDEXES
#def count_changes(commit_hash, file_matching_regexp):
#    diff = str(git.diff("--stat", commit_hash + "^", commit_hash))
#    
#    changes_count = dict() #This will be filled with tuples: (lines_added, lines_deleted)
#    for l in diff.split('\n'):
#        split = l.split("|")
#        
#        m = re.findall(file_matching_regexp, split[0]) # Only lucene files
#        if len(m) == 0:
#            continue
#        file_name = m[0]
#        
#        m = re.findall("-+", split[1])
#        if len(m) == 0:
#            deleted = 0
#       else:
#           deleted = len(m[0])
#       
#       m = re.findall("\++", split[1])
#       if len(m) == 0:
#           added = 0
#       else:
#           added = len(m[0])

#        changes_count[file_name] = ((added, deleted))
    
#    return changes_count

In [60]:
# Lines added / deleted in one file by one commit.
Changes = namedtuple('Changes', 'added, deleted')


def _stat_count(diff_stat, noun):
    """Return the integer preceding ``noun`` in ``git diff --stat`` output, or 0."""
    found = re.search("([0-9]+) " + noun, diff_stat)
    return int(found.group(1)) if found else 0


def count_changes(commit_hash, file_matching_regexp):
    """Count added/deleted lines per matching file for a single commit.

    Args:
        commit_hash: Hash of the commit to inspect.
        file_matching_regexp: Regular expression (``re.search`` semantics);
            only paths it matches are counted.

    Returns:
        dict mapping each matching path to ``Changes(added, deleted)``, as
        reported by ``git diff --stat <hash>^ <hash> -- <path>``. A count
        that is absent from the summary line is reported as 0.

    Uses the notebook-level ``git`` command object.
    NOTE(review): the diff is taken against ``<hash>^``, so a commit without a
    parent presumably makes git fail here; confirm if root commits matter.
    """
    files = git.show("--name-only", "--pretty=format:", commit_hash)
    changes_count = dict()
    for line in files:
        # Strip only a real line terminator. Slicing off the last character
        # unconditionally would truncate a path that has no trailing newline.
        path = line.rstrip('\r\n')
        if not path or re.search(file_matching_regexp, path) is None:
            continue
        diff = str(git.diff("--stat", commit_hash + "^", commit_hash, "--", path))
        changes_count[path] = Changes(
            _stat_count(diff, "insertion"),
            _stat_count(diff, "deletion"),
        )

    return changes_count

In [61]:
# TEST: manual check of the per-commit extraction on one known commit.
# (To try another one: commit_hash = lucene_commits[4]; print(commit_hash).)
commit_hash = "5190820f3472486143c1ac5b3aa24a0bd297fc41"

# The first output line is the quoted '"Name <email>"' string produced by the
# format below; split it at '<' and slice off the quote / bracket characters.
author_line = str(git.show('--name-only', '--format="%aN <%aE>"', commit_hash)).split('\n')[0]
s = author_line.split('<')
author_name = s[0][1:-1]
author_email = s[1][:-2]

date = git.show("-s", "--format=%ci", commit_hash)
# Equivalent shell command per file: git diff --stat <hash>^ <hash> -- <file>
file_changes = count_changes(commit_hash, 'lucene.*\.java')

for value in (author_email, author_name, date):
    print(value)
file_changes

simonw@apache.org
Simon Willnauer
2012-02-12 15:44:41 +0000



{'lucene/core/src/java/org/apache/lucene/index/DocumentsWriter.java': Changes(added=7, deleted=26),
 'lucene/core/src/java/org/apache/lucene/index/DocumentsWriterFlushControl.java': Changes(added=53, deleted=27),
 'lucene/core/src/java/org/apache/lucene/index/DocumentsWriterPerThreadPool.java': Changes(added=94, deleted=82),
 'lucene/core/src/java/org/apache/lucene/index/FlushByRamOrCountsPolicy.java': Changes(added=1, deleted=1),
 'lucene/core/src/java/org/apache/lucene/index/FlushPolicy.java': Changes(added=4, deleted=6),
 'lucene/core/src/java/org/apache/lucene/index/IndexWriter.java': Changes(added=1, deleted=1),
 'lucene/core/src/java/org/apache/lucene/index/IndexWriterConfig.java': Changes(added=10, deleted=3),
 'lucene/core/src/java/org/apache/lucene/index/ThreadAffinityDocumentsWriterThreadPool.java': Changes(added=1, deleted=6),
 'lucene/core/src/test/org/apache/lucene/index/TestFlushByRamOrCountsPolicy.java': Changes(added=6, deleted=6)}

In [25]:
def update_counters(file_dict, file_name, author, changes):
    """Fold one commit's line changes for ``file_name`` into the running totals.

    ``file_dict`` maps a file name to a namespace holding ``author_dict``,
    ``tot_lines_added``, ``tot_lines_deleted`` and ``tot_commits``;
    ``author_dict`` maps an author to a namespace holding ``lines_added``,
    ``lines_deleted`` and ``commits``. Missing entries are created zeroed,
    then both levels are increased by ``changes.added`` / ``changes.deleted``
    and by one commit. ``file_dict`` is updated in place; nothing is returned.
    """
    # File-level entry: create it on first sight, then accumulate.
    file_entry = file_dict.get(file_name)
    if file_entry is None:
        file_entry = SimpleNamespace(
            author_dict=dict(),
            tot_lines_added=0,
            tot_lines_deleted=0,
            tot_commits=0,
        )
        file_dict[file_name] = file_entry

    file_entry.tot_lines_added += changes.added
    file_entry.tot_lines_deleted += changes.deleted
    file_entry.tot_commits += 1

    # Author-level entry for this file: same create-then-accumulate pattern.
    authors = file_entry.author_dict
    author_entry = authors.get(author)
    if author_entry is None:
        author_entry = SimpleNamespace(lines_added=0, lines_deleted=0, commits=0)
        authors[author] = author_entry

    author_entry.lines_added += changes.added
    author_entry.lines_deleted += changes.deleted
    author_entry.commits += 1

In [26]:
def extract_bug(file, issue_dir="issue_LUCENE"):
    """Load one issue file and return it only if it is a fixed bug.

    Args:
        file: File name inside ``issue_dir``; anything not ending in
            ``.json`` is ignored.
        issue_dir: Directory holding the issue JSON files (defaults to
            ``issue_LUCENE``, relative to the current working directory).

    Returns:
        The parsed issue dict when ``fields.issuetype.name`` is ``'Bug'`` and
        ``fields.resolution.name`` is ``'Fixed'``; otherwise ``None``. An
        issue lacking any of those fields is also reported as ``None``.
    """
    if not file.endswith('.json'):
        return None

    # The context manager closes the handle even if JSON parsing fails.
    # NOTE(review): UTF-8 is assumed for the issue dump; confirm.
    with open(os.path.join(issue_dir, file), encoding="utf-8") as handle:
        bug = json.load(handle)

    bug_fields = bug.get('fields') or {}

    issue_type = bug_fields.get('issuetype') or {}
    if issue_type.get('name') != 'Bug':
        return None

    # An unresolved issue carries a null resolution.
    resolution = bug_fields.get('resolution')
    if resolution is None or resolution.get('name') != 'Fixed':
        return None

    return bug

In [27]:
# Collect every fixed bug found in the local issue dump.
bugs = dict()  # fixed bugs indexed by their upper-cased issue key
dir_list = os.listdir(os.path.join(os.getcwd(), "issue_LUCENE"))
l = len(dir_list)
print("Progress:")
for idx, file in enumerate(dir_list):
    print_progress(idx, l, 10)
    bug = extract_bug(file)
    # extract_bug() answers None for anything that is not a fixed bug.
    if bug is not None:
        bugs[bug['key'].upper()] = bug

Progress:
>> 6641/6641

In [28]:
# An issue key as it appears (upper-cased) in a commit message, e.g. LUCENE-1234.
_ISSUE_KEY = re.compile(r'(?:LUCENE|SOLR)-[0-9]+')


def get_fixed_bugs(commit_hash):
    """Return the keys of the known fixed bugs named at the start of a commit message.

    The text before the first ``:`` of the message is upper-cased and must
    start with ``LUCENE`` or ``SOLR``. Every issue key found in that prefix
    which is present in the notebook-level ``bugs`` dict is returned, in
    order of appearance and without duplicates.

    Keys are located with a pattern rather than by splitting on one separator,
    so ``"LUCENE-1,LUCENE-2, LUCENE-3: ..."`` and
    ``"LUCENE-1 fix foo, bar"`` are both handled.

    Uses the notebook-level ``git`` command object.
    """
    # -s suppresses the diff, so only the message itself is parsed.
    message = str(git.show("-s", "--pretty=format:%B", commit_hash))
    init = message.split(':')[0].upper()

    f_keys = []
    # The prefix could name SOLR-123, LUCENE-234, ...
    if init.startswith(('LUCENE', 'SOLR')):
        for k in _ISSUE_KEY.findall(init):
            # Only keys of known fixed bugs count, each reported once.
            if bugs.get(k) is not None and k not in f_keys:
                f_keys.append(k)
    return f_keys

In [29]:
def get_affected_versions(fixed_bug_keys):
    """List the affected-version names of the given bugs.

    Each key is looked up in the notebook-level ``bugs`` dict and the ``name``
    of every entry under ``fields.versions`` is collected, in key order.
    Bugs with no versions contribute nothing; repeated names are kept.
    """
    return [
        version['name']
        for key in fixed_bug_keys
        for version in bugs[key]['fields']['versions']
    ]
#get_affected_versions(['LUCENE-2244', 'LUCENE-3255'])

In [62]:
project_name = "lucene"

l = len(lucene_commits)

# Running per-file / per-author totals, filled in by update_counters().
file_dict = dict()

# Header of the output CSV, in the order the row values are written below.
columns = '''project, file, sha, author,
author_file_tot_added, author_file_added_this_commit, file_tot_added,
author_file_tot_deleted, author_file_deleted_this_commit, file_tot_deleted,
author_file_commits, file_tot_commits, commit_date,
bug_fix, fixed_bugs, affected_versions'''

filename = 'out.csv'
# newline='' leaves line endings to the csv module (see the csv docs); the
# context manager closes the file even if a git call fails mid-way.
with open(filename, 'w', newline='') as file:
    wr = csv.writer(file, quoting=csv.QUOTE_ALL)
    # Split on commas and strip, so the header does not depend on the exact
    # whitespace or line breaks inside the `columns` string.
    wr.writerow([name.strip() for name in columns.split(',')])

    for idx, commit_hash in enumerate(lucene_commits):
        print_progress(idx, l)

        # -s suppresses the diff; each call prints exactly one line.
        author_name = str(git.show("-s", "--format=%aN", commit_hash)).strip()
        date = str(git.show("-s", "--format=%ci", commit_hash)).strip()
        file_changes = count_changes(commit_hash, r'lucene.*\.java')

        # Bug-fix flag and affected versions depend only on the commit, so
        # they are computed once here rather than once per written row.
        f_keys = get_fixed_bugs(commit_hash)
        bug_fix = 1 if f_keys else 0
        affected_versions = get_affected_versions(f_keys)

        for file_name, changes in file_changes.items():
            update_counters(file_dict, file_name, author_name, changes)
            f = file_dict[file_name]
            # One row per author who has touched the file so far; the
            # "this commit" columns are non-zero only for the commit's author.
            for author, author_counter in f.author_dict.items():
                is_commit_author = (author == author_name)
                wr.writerow([project_name,  # project
                             file_name,  # file
                             commit_hash,  # sha
                             author,  # author
                             # Lines added
                             author_counter.lines_added,  # author_file_tot_added
                             (changes.added if is_commit_author else 0),  # author_file_added_this_commit
                             f.tot_lines_added,  # file_tot_added
                             # Lines deleted
                             author_counter.lines_deleted,  # author_file_tot_deleted
                             (changes.deleted if is_commit_author else 0),  # author_file_deleted_this_commit
                             f.tot_lines_deleted,  # file_tot_deleted
                             # Commits
                             author_counter.commits,  # author_file_commits
                             f.tot_commits,  # file_tot_commits
                             # Other stuff
                             date,  # commit_date
                             bug_fix,  # bug_fix
                             f_keys,  # fixed_bugs
                             affected_versions  # affected_versions
                             ])

>> 8316/8316

In [63]:
import pandas as pd
# Load the CSV whose path is held in `filename` into a DataFrame.
df = pd.read_csv(filename)
# Last expression of the cell: rendered as the cell output.
df

Unnamed: 0,project,file,sha,author,author_file_tot_added,author_file_added_this_commit,file_tot_added,author_file_tot_deleted,author_file_deleted_this_commit,file_tot_deleted,author_file_commits,file_tot_commits,commit_date,bug_fix,fixed_bugs,affected_versions
0,lucene,lucene/contrib/analyzers/common/src/test/org/a...,3ee0ace1ba6b9bff3ffaa278c0bba07e6064057d,Mark Robert Miller,53,53,53,0,0,0,1,1,2010-03-17 20:51:29 +0000,0,[],[]
1,lucene,lucene/contrib/analyzers/common/src/java/org/t...,3ee0ace1ba6b9bff3ffaa278c0bba07e6064057d,Mark Robert Miller,1116,1116,1116,0,0,0,1,1,2010-03-17 20:51:29 +0000,0,[],[]
2,lucene,lucene/contrib/instantiated/src/test/org/apach...,3ee0ace1ba6b9bff3ffaa278c0bba07e6064057d,Mark Robert Miller,619,619,619,0,0,0,1,1,2010-03-17 20:51:29 +0000,0,[],[]
3,lucene,lucene/src/java/org/apache/lucene/index/IntBlo...,3ee0ace1ba6b9bff3ffaa278c0bba07e6064057d,Mark Robert Miller,65,65,65,0,0,0,1,1,2010-03-17 20:51:29 +0000,0,[],[]
4,lucene,lucene/backwards/src/java/org/apache/lucene/se...,3ee0ace1ba6b9bff3ffaa278c0bba07e6064057d,Mark Robert Miller,86,86,86,0,0,0,1,1,2010-03-17 20:51:29 +0000,0,[],[]
5,lucene,lucene/contrib/benchmark/src/java/org/apache/l...,3ee0ace1ba6b9bff3ffaa278c0bba07e6064057d,Mark Robert Miller,114,114,114,0,0,0,1,1,2010-03-17 20:51:29 +0000,0,[],[]
6,lucene,lucene/backwards/src/test/org/apache/lucene/in...,3ee0ace1ba6b9bff3ffaa278c0bba07e6064057d,Mark Robert Miller,268,268,268,0,0,0,1,1,2010-03-17 20:51:29 +0000,0,[],[]
7,lucene,lucene/backwards/src/java/org/apache/lucene/st...,3ee0ace1ba6b9bff3ffaa278c0bba07e6064057d,Mark Robert Miller,210,210,210,0,0,0,1,1,2010-03-17 20:51:29 +0000,0,[],[]
8,lucene,lucene/src/java/org/apache/lucene/search/Slopp...,3ee0ace1ba6b9bff3ffaa278c0bba07e6064057d,Mark Robert Miller,217,217,217,0,0,0,1,1,2010-03-17 20:51:29 +0000,0,[],[]
9,lucene,lucene/contrib/queryparser/src/java/org/apache...,3ee0ace1ba6b9bff3ffaa278c0bba07e6064057d,Mark Robert Miller,222,222,222,0,0,0,1,1,2010-03-17 20:51:29 +0000,0,[],[]


In [64]:
# Load 'implicated_files.csv'; the rendered output shows the columns
# sha, file and implicated.
df2 = pd.read_csv('implicated_files.csv')
# Last expression of the cell: rendered as the cell output.
df2

Unnamed: 0,sha,file,implicated
0,ee69d9294ad820ce3c6c6225d1c3229f3d5aa411,lucene/suggest/src/java/org/apache/lucene/sear...,1
1,ee69d9294ad820ce3c6c6225d1c3229f3d5aa411,lucene/core/src/java/org/apache/lucene/util/fs...,1
2,ee69d9294ad820ce3c6c6225d1c3229f3d5aa411,lucene/suggest/src/test/org/apache/lucene/sear...,1
3,5660a0e2d1969921dbe321c33491baabc20db80a,lucene/core/src/test/org/apache/lucene/search/...,1
4,8461a847c981b9cf5f9d6453349dfc5a48b8fa36,lucene/analysis/phonetic/src/test/org/apache/l...,1
5,7e4580920093ab9312d2b9889b74527d07cbf7d4,lucene/core/src/java/org/apache/lucene/codecs/...,1
6,7e4580920093ab9312d2b9889b74527d07cbf7d4,lucene/codecs/src/java/org/apache/lucene/codec...,1
7,7e4580920093ab9312d2b9889b74527d07cbf7d4,lucene/core/src/java/org/apache/lucene/codecs/...,1
8,f0f1569b41b5b164c4202b97e6cbd2c7abfdb0db,lucene/queries/src/test/org/apache/lucene/quer...,1
9,3acf2ce9ac0a85f86e143ae7dbaa920d987624c4,lucene/core/src/test/org/apache/lucene/index/T...,1


In [65]:
# Attach the columns of df2 to every row of df with the same (sha, file).
# how='left' keeps rows that have no match in df2; the missing values that
# result are then filled with 0.
# Dropping an existing 'implicated' column first keeps the cell re-runnable:
# without it, a second run would merge again and yield
# 'implicated_x' / 'implicated_y'.
df = (
    df.drop(columns=['implicated'], errors='ignore')
      .merge(df2, on=['sha', 'file'], how='left')
      .fillna(0)
)
df

Unnamed: 0,project,file,sha,author,author_file_tot_added,author_file_added_this_commit,file_tot_added,author_file_tot_deleted,author_file_deleted_this_commit,file_tot_deleted,author_file_commits,file_tot_commits,commit_date,bug_fix,fixed_bugs,affected_versions,implicated
0,lucene,lucene/contrib/analyzers/common/src/test/org/a...,3ee0ace1ba6b9bff3ffaa278c0bba07e6064057d,Mark Robert Miller,53,53,53,0,0,0,1,1,2010-03-17 20:51:29 +0000,0,[],[],0
1,lucene,lucene/contrib/analyzers/common/src/java/org/t...,3ee0ace1ba6b9bff3ffaa278c0bba07e6064057d,Mark Robert Miller,1116,1116,1116,0,0,0,1,1,2010-03-17 20:51:29 +0000,0,[],[],0
2,lucene,lucene/contrib/instantiated/src/test/org/apach...,3ee0ace1ba6b9bff3ffaa278c0bba07e6064057d,Mark Robert Miller,619,619,619,0,0,0,1,1,2010-03-17 20:51:29 +0000,0,[],[],1
3,lucene,lucene/src/java/org/apache/lucene/index/IntBlo...,3ee0ace1ba6b9bff3ffaa278c0bba07e6064057d,Mark Robert Miller,65,65,65,0,0,0,1,1,2010-03-17 20:51:29 +0000,0,[],[],0
4,lucene,lucene/backwards/src/java/org/apache/lucene/se...,3ee0ace1ba6b9bff3ffaa278c0bba07e6064057d,Mark Robert Miller,86,86,86,0,0,0,1,1,2010-03-17 20:51:29 +0000,0,[],[],0
5,lucene,lucene/contrib/benchmark/src/java/org/apache/l...,3ee0ace1ba6b9bff3ffaa278c0bba07e6064057d,Mark Robert Miller,114,114,114,0,0,0,1,1,2010-03-17 20:51:29 +0000,0,[],[],0
6,lucene,lucene/backwards/src/test/org/apache/lucene/in...,3ee0ace1ba6b9bff3ffaa278c0bba07e6064057d,Mark Robert Miller,268,268,268,0,0,0,1,1,2010-03-17 20:51:29 +0000,0,[],[],0
7,lucene,lucene/backwards/src/java/org/apache/lucene/st...,3ee0ace1ba6b9bff3ffaa278c0bba07e6064057d,Mark Robert Miller,210,210,210,0,0,0,1,1,2010-03-17 20:51:29 +0000,0,[],[],0
8,lucene,lucene/src/java/org/apache/lucene/search/Slopp...,3ee0ace1ba6b9bff3ffaa278c0bba07e6064057d,Mark Robert Miller,217,217,217,0,0,0,1,1,2010-03-17 20:51:29 +0000,0,[],[],1
9,lucene,lucene/contrib/queryparser/src/java/org/apache...,3ee0ace1ba6b9bff3ffaa278c0bba07e6064057d,Mark Robert Miller,222,222,222,0,0,0,1,1,2010-03-17 20:51:29 +0000,0,[],[],0


In [66]:
# Write df to 'lucene_dataset_v1.csv'; index=False leaves the DataFrame
# index out of the file.
df.to_csv('lucene_dataset_v1.csv', index = False)

In [67]:
# Example of a file with many owners and commits: keep the rows of df for
# IndexWriter.java, then show the rows recorded for one specific commit.
df_test = df[df['file'] == 'lucene/core/src/java/org/apache/lucene/index/IndexWriter.java']
df_test[df_test['sha'] == '5ec48108df8997430e3e8b47c056d0d63c6d2db3']

Unnamed: 0,project,file,sha,author,author_file_tot_added,author_file_added_this_commit,file_tot_added,author_file_tot_deleted,author_file_deleted_this_commit,file_tot_deleted,author_file_commits,file_tot_commits,commit_date,bug_fix,fixed_bugs,affected_versions,implicated
95803,lucene,lucene/core/src/java/org/apache/lucene/index/I...,5ec48108df8997430e3e8b47c056d0d63c6d2db3,Martijn van Groningen,15,0,4881,8,0,827,3,47,2012-05-23 20:10:47 +0000,0,[],[],0
95804,lucene,lucene/core/src/java/org/apache/lucene/index/I...,5ec48108df8997430e3e8b47c056d0d63c6d2db3,Christopher John Male,4,0,4881,4,0,827,1,47,2012-05-23 20:10:47 +0000,0,[],[],0
95805,lucene,lucene/core/src/java/org/apache/lucene/index/I...,5ec48108df8997430e3e8b47c056d0d63c6d2db3,Robert Muir,56,0,4881,48,0,827,15,47,2012-05-23 20:10:47 +0000,0,[],[],0
95806,lucene,lucene/core/src/java/org/apache/lucene/index/I...,5ec48108df8997430e3e8b47c056d0d63c6d2db3,Simon Willnauer,1,0,4881,1,0,827,1,47,2012-05-23 20:10:47 +0000,0,[],[],0
95807,lucene,lucene/core/src/java/org/apache/lucene/index/I...,5ec48108df8997430e3e8b47c056d0d63c6d2db3,Steven Rowe,4087,0,4881,0,0,827,1,47,2012-05-23 20:10:47 +0000,0,[],[],0
95808,lucene,lucene/core/src/java/org/apache/lucene/index/I...,5ec48108df8997430e3e8b47c056d0d63c6d2db3,Michael McCandless,716,11,4881,764,5,827,25,47,2012-05-23 20:10:47 +0000,0,[],[],0
95809,lucene,lucene/core/src/java/org/apache/lucene/index/I...,5ec48108df8997430e3e8b47c056d0d63c6d2db3,Ryan McKinley,2,0,4881,2,0,827,1,47,2012-05-23 20:10:47 +0000,0,[],[],0
