In [1]:
import sh
import json
import os
import glob2
from collections import Counter
import csv
import sys
import re
from collections import namedtuple
from types import SimpleNamespace

In [2]:
def print_progress(i, l, step=1):
    """Overwrite the current output line with a ``>> done/total`` counter.

    i    -- zero-based index of the item being processed (shown as i + 1)
    l    -- total number of items
    step -- only indices that are multiples of ``step`` produce output
    """
    if i % step != 0:
        return
    # The leading carriage return rewinds to the start of the line so each
    # update replaces the previous one instead of scrolling.
    sys.stdout.write("\r>> {}/{}".format(i + 1, l))
    sys.stdout.flush()

In [3]:
# Make sure a checkout of Apache Camel sits in ./camel, cloning it only when
# nothing exists at that path yet.
repo_path = os.getcwd() + '/camel'
if os.path.exists(repo_path):
    print("Repository exists")
else:
    print("Path doesn\'t exists, cloning repo:")
    sh.git.clone("https://github.com/apache/camel")

Repository exists


In [4]:
# `git` is the sh git command pre-bound to run inside the ./camel checkout;
# every later cell calls it as git.<subcommand>(...).
git = sh.git.bake(_cwd='camel')
# Work on the `trunk` branch.
git.checkout("trunk")
# Last expression of the cell: its result is displayed as the cell output.
git.status()

On branch trunk
Your branch is up-to-date with 'origin/trunk'.

nothing to commit, working directory clean

In [36]:
# Build `commits`: the hashes of all commits made after `start`, oldest first
# (--reverse). An upper bound could be added with a matching --before argument.
#end = "2015-01-01"
start = "Mar 20 2007" # To skip the first commits
after_arg = '--after="{}"'.format(start)
log = git.log("--reverse", "--pretty=format:%H", after_arg)
# Iterating the command output yields one line per commit; drop the line
# terminator when there is one.
commits = [line[:-1] if line.endswith('\n') else line for line in log]
len(commits)

13970

In [9]:
Changes = namedtuple('Changes', 'added, deleted')


def _stat_count(diff_text, label):
    """Return the integer that precedes ``label`` in ``git diff --stat`` output, or 0."""
    found = re.search("([0-9]+) " + label, diff_text)
    return int(found.group(1)) if found else 0


def count_changes(commit_hash, file_matching_regexp):
    """Count added and deleted lines per matching file touched by a commit.

    commit_hash          -- commit to inspect; it is diffed against
                            ``commit_hash^``, so it must have a parent
    file_matching_regexp -- regular expression; only paths in which
                            ``re.findall`` finds a match are kept

    Returns a dict mapping each kept path to ``Changes(added, deleted)``.
    """
    listing = git.show("--name-only", "--pretty=format:", commit_hash)
    changes_count = dict()
    # splitlines() removes line terminators only, so the last path survives
    # intact even when the output does not end with a newline.
    for path in str(listing).splitlines():
        if not path:
            continue  # blank line left by the empty --pretty format
        if not re.findall(file_matching_regexp, path):
            continue
        diff = str(git.diff("--stat", commit_hash + "^", commit_hash, "--", path))
        # The summary line reads "N insertion(s)(+), M deletion(s)(-)"; either
        # part is absent when its count is zero.
        changes_count[path] = Changes(_stat_count(diff, "insertion"),
                                      _stat_count(diff, "deletion"))

    return changes_count

In [10]:
def get_authorship(blame_output, authors_list):
    """Count how many times each author name occurs in a blame output.

    blame_output -- ``git blame`` output (anything ``str()`` turns into its text)
    authors_list -- iterable of author names to look for

    Returns a dict mapping every name in ``authors_list`` to its number of
    non-overlapping occurrences in the text. Names are matched literally, so
    characters such as ``.`` or ``+`` in a name carry no regex meaning.
    """
    # Use the argument, not whatever `blame` happens to be bound to globally.
    blame_text = str(blame_output)
    return {author: blame_text.count(author) for author in authors_list}

In [11]:
def count_comments(blame_output):
    """Return the number of text lines covered by ``//`` and ``/* ... */`` comments.

    blame_output -- text to scan (anything ``str()`` turns into text)

    Original author's note: counting on the blame output or on the plain
    source was tested to give the same result.
    """
    # Raw string so the regex escapes reach `re` untouched (the non-raw form
    # relied on invalid string escapes such as "\*").
    # First alternative: a line comment up to the end of the line.
    # Second alternative: a block comment up to the first closing "*/";
    # DOTALL lets it span several lines.
    regex = re.compile(r"(?://[^\n]*|/\*(?:(?!\*/).)*\*/)", re.DOTALL)
    matches = regex.findall(str(blame_output))
    # A comment containing k newlines covers k + 1 lines.
    return sum(comment.count('\n') + 1 for comment in matches)

In [12]:
def get_complete_authorship(sha, file_path):
    """Count the lines of ``file_path`` at revision ``sha`` per author e-mail.

    Runs ``git blame -e`` and reads the ``(<email>`` header that starts the
    annotation of each line.

    Returns a dict mapping each e-mail address to the number of blame lines
    attributed to it (an empty dict when no header is found).
    """
    blame = git.blame("-e", sha, "--", file_path)
    # Take only the first "(<...>" of every line: that is the blame header, so
    # a later "(<" inside the source code is not mistaken for an author, and
    # addresses are counted as literal text rather than reused as regexes.
    emails = re.findall(r"^.*?\(<(.+?)>", str(blame), re.MULTILINE)
    return dict(Counter(emails))

In [16]:
#TEST
# Scratch cell: runs the helpers defined above against one fixed commit and one
# file so that their results can be checked by eye.
#commit_hash = commits[4]
#print(commit_hash)
commit_hash = "dca7fe326efa61e6462b95b134e49bdc8ba7f81e"
f = "camel-core/src/test/java/org/apache/camel/RouteBuilderTest.java"
#command = "git diff --stat " + commit_hash + "^ " + commit_hash
# The format string carries literal double quotes, so the first output line is
# '"<name> <<email>>"'; splitting on '<' separates the name from the e-mail.
s = str(git.show('--name-only','--format="%aN <%aE>"', commit_hash)).split('\n')[0].split('<')
author_name = s[0][1:-1]  # drop the opening quote and the space before '<'
author_email = s[1][:-2]  # drop the closing '>"'
file_changes = count_changes(commit_hash, '.*\.java')
date = git.show("-s", "--format=%ci", commit_hash)
print(author_email)
print(author_name)
print(date)
blame = git.blame(commit_hash, "--", f)
# Grab the "(<author> ... 20" part of each blame line, then keep what precedes
# the first " 2" without the opening parenthesis. Names keep the column padding
# of the blame output (see the trailing spaces in the recorded result).
m = re.findall("\(.+20", str(blame))
authors = []
for i in m:
    authors.append(i.split(" 2")[0][1:])
authors = set(authors)

authorship = get_authorship(blame, authors)

# One blame line per source line; the split leaves one extra trailing element.
size = len(blame.split("\n")) - 1
comments = count_comments(blame)

print("Comments:", comments, "Size: ", size, "Code: ", size - comments)
# NOTE(review): both ratios divide without a zero check; fine for this known
# file, but a file made only of comments or an empty file would raise here.
comments_to_code = comments / float((size - comments))
comments_to_total = comments / size
print("Comments to code: ", comments_to_code)
print("Comments to total: ", comments_to_total)

print(authorship)
#print(authorship.keys())

# E-mail based authorship: number of distinct authors and the largest share.
auth_dict = get_complete_authorship(commit_hash, f)
total = len(auth_dict)
max_authorship = max(auth_dict.values())

print("Max current author: ", max_authorship, "Total current authors: ", total)
# Last expression: the per-file Changes dict is displayed as the cell output.
file_changes

chirino@apache.org
Hiram R. Chirino
2007-03-20 00:22:28 +0000

Comments: 33 Size:  304 Code:  271
Comments to code:  0.12177121771217712
Comments to total:  0.10855263157894737
{'Hiram R. Chirino': 127, 'James Strachan  ': 177}
Max current author:  177 Total current authors:  2


{'camel-core/src/main/java/org/apache/camel/CamelContainer.java': Changes(added=1, deleted=0),
 'camel-core/src/main/java/org/apache/camel/queue/QueueComponent.java': Changes(added=1, deleted=0),
 'camel-core/src/main/java/org/apache/camel/queue/QueueEndpoint.java': Changes(added=1, deleted=0),
 'camel-core/src/test/java/org/apache/camel/RouteBuilderTest.java': Changes(added=57, deleted=18),
 'camel-xbean/src/test/java/org/apache/camel/xbean/XBeanRouteBuilderTest.java': Changes(added=72, deleted=0)}

In [17]:
def update_counters(file_dict, file_name, author, changes):
    """Add one commit's changes to the running totals of a file and of an author.

    file_dict -- dict of per-file counters, updated in place
    file_name -- key of the file touched by the commit
    author    -- key of the commit author inside the file's ``author_dict``
    changes   -- object exposing ``added`` and ``deleted`` line counts

    Missing file or author entries are created with all counters at zero
    before the increments are applied. Nothing is returned.
    """
    # File-level entry: create on first sight, then accumulate.
    file_entry = file_dict.get(file_name)
    if file_entry is None:
        file_entry = SimpleNamespace(author_dict=dict(),
                                     tot_lines_added=0,
                                     tot_lines_deleted=0,
                                     tot_commits=0)
        file_dict[file_name] = file_entry

    file_entry.tot_lines_added += changes.added
    file_entry.tot_lines_deleted += changes.deleted
    file_entry.tot_commits += 1

    # Author-level entry for that file: same pattern.
    author_entry = file_entry.author_dict.get(author)
    if author_entry is None:
        author_entry = SimpleNamespace(lines_added=0, lines_deleted=0, commits=0)
        file_entry.author_dict[author] = author_entry

    author_entry.lines_added += changes.added
    author_entry.lines_deleted += changes.deleted
    author_entry.commits += 1

In [18]:
def extract_bug(file):
    """Load one issue file and return it only if it is a fixed bug.

    file -- file name inside the ``issue_CAMEL/`` directory (relative to the
            current working directory)

    Returns the parsed JSON document when the issue type is ``Bug`` and its
    resolution is ``Fixed``. Returns None for names not ending in ``.json``,
    for other issue types, and for unresolved or differently resolved issues.
    """
    if not file.endswith('.json'):
        return None

    # The context manager closes the handle even if parsing fails.
    with open("issue_CAMEL/" + file) as handle:
        bug = json.load(handle)
    bug_fields = bug.get('fields')

    if bug_fields['issuetype']['name'] != 'Bug':
        return None

    resolution = bug_fields['resolution']
    # An unresolved issue has a null resolution.
    if resolution is None or resolution['name'] != 'Fixed':
        return None

    return bug

In [19]:
#Extracting all the fixed bugs from the bug repo
bugs = dict() #Dictionary of bugs indexed by the bug key
dir_list = os.listdir(os.getcwd() + "/issue_CAMEL")
l = len(dir_list)
print("Progress:")
for idx in range(l):
    file = dir_list[idx]
    print_progress(idx, l, 10)  # refresh the counter every 10 files
    bug = extract_bug(file)
    # extract_bug returns None for everything that is not a fixed bug.
    if bug is not None:
        bugs[bug['key'].upper()] = bug

Progress:
>> 9001/9002

In [20]:
from datetime import datetime

# Write one row per fixed bug: creation date, resolution date and the number of
# whole days between them.
filename = 'dataset/issue_dates.csv'
# Create the output directory if it is missing so the open() below cannot fail
# on a fresh checkout.
os.makedirs(os.path.dirname(filename), exist_ok=True)
# newline='' is the mode the csv module documents for files it writes to; the
# with-block guarantees the rows are flushed and the file is closed.
with open(filename, 'w', newline='') as file:
    wr = csv.writer(file, quoting=csv.QUOTE_ALL)
    wr.writerow(['issue_key', 'created', 'resolutiondate', 'days_to_solve'])
    for key, bug in bugs.items():
        # Only the leading YYYY-MM-DD part of each timestamp is used.
        c_date = datetime.strptime(bug['fields']['created'][:10], "%Y-%m-%d")
        r_date = datetime.strptime(bug['fields']['resolutiondate'][:10], "%Y-%m-%d")
        wr.writerow([bug['key'], c_date.date(), r_date.date(), (r_date - c_date).days])

In [30]:
#TEST FOR BUG EXTRACTION FROM COMMIT
# Scratch check: list every CAMEL-<number> key mentioned in the message (and
# file list) of one known commit.
commit_hash = "13760a4a5f1672d54997aa8ede30aefed951505a"
f_keys = []
details = git.show("--name-only","--pretty=format:%B", commit_hash)
issue_key_pattern = re.compile("CAMEL-[0-9]+")
m = issue_key_pattern.findall(str(details))
m

['CAMEL-9313', 'CAMEL-9377']

In [37]:
def get_fixed_bugs(commit_hash):
    """Return the keys of known fixed bugs referenced by a commit.

    Every ``CAMEL-<number>`` key found in the ``git show`` text of the commit
    (message plus changed file names) is kept when it is a key of the global
    ``bugs`` dict. Order and repetitions follow the text.
    """
    details = git.show("--name-only","--pretty=format:%B", commit_hash)
    mentioned = re.findall("CAMEL-[0-9]+",str(details))
    # A key present in `bugs` means the commit refers to a fixed bug.
    return [key for key in mentioned if bugs.get(key) is not None]

In [38]:
def get_affected_versions(fixed_bug_keys):
    """Return the names of the versions listed on the given bugs.

    fixed_bug_keys -- keys into the global ``bugs`` dict

    The result concatenates, in key order, the ``name`` of every entry in each
    bug's ``fields.versions`` list; bugs with no versions contribute nothing
    and repeated names are kept.
    """
    return [version['name']
            for key in fixed_bug_keys
            for version in bugs[key]['fields']['versions']]

In [41]:
# Main pass: walk every commit in `commits` and, for each Java file it touches,
# write one CSV row per author who has changed that file so far.
project_name = "camel"

l = len(commits)

# Running per-file / per-author counters, filled by update_counters().
file_dict = dict()

# Output column names, in the order the row values are written below.
columns = '''project, file, sha, author,
author_file_tot_added, author_file_added_this_commit, file_tot_added,
author_file_tot_deleted, author_file_deleted_this_commit, file_tot_deleted,
author_file_commits, file_tot_commits,
current_lines_authored, current_file_size, current_comment_lines,
max_current_author, total_current_authors,
commit_date,
bug_fix, fixed_bugs, affected_versions'''

filename = 'out.csv'
# newline='' is the mode the csv module documents for files it writes to; the
# with-block closes the file even when the loop is interrupted by an error.
with open(filename, 'w', newline='') as file:
    wr = csv.writer(file, quoting=csv.QUOTE_ALL)
    # Split on commas and strip each name so the header does not depend on the
    # exact whitespace at the line ends of the string above.
    wr.writerow([name.strip() for name in columns.split(',')])

    for idx, commit_hash in enumerate(commits):
        print_progress(idx, l)
        # First output line is '"<name> <<email>>"' (the quotes are part of the
        # format); keep the name without the opening quote and trailing space.
        s = str(git.show("--name-only", '--format="%aN <%aE>"', commit_hash)).split('\n')[0].split('<')
        author_name = s[0][1:-1]
        date = str(git.show("-s", "--format=%ci", commit_hash))[:-1].replace("\n", '')
        file_changes = count_changes(commit_hash, r'.*\.java')

        # Bug-fix flag and affected versions depend only on the commit, so they
        # are computed once here instead of once per written row.
        f_keys = get_fixed_bugs(commit_hash)
        bug_fix = 1 if len(f_keys) != 0 else 0
        affected_versions = get_affected_versions(f_keys)

        for file_name, changes in file_changes.items():
            update_counters(file_dict, file_name, author_name, changes)

            f = file_dict[file_name]
            # Authorship and size of the file as of this commit.
            try:
                blame = git.blame(commit_hash, "--", file_name)
                # If the whole history is not considered, these per-author
                # counts will not sum to the file size: lines can remain from
                # authors who never committed inside the time window.
                authorship = get_authorship(blame, f.author_dict.keys())
                size = len(blame.split("\n")) - 1
                comments = count_comments(blame)

                # Authorship metrics over all current authors of the file.
                compl_authorship = get_complete_authorship(commit_hash, file_name)
                total_current_authors = len(compl_authorship)
                max_authorship = max(compl_authorship.values())
            except Exception:
                # Happens when the commit deleted the file (blame fails) or the
                # file is empty. Reset every blame-derived metric so the row
                # does not inherit values computed for the previous file.
                # `Exception` (not a bare except) keeps Ctrl-C working.
                authorship = dict()
                size = 0
                comments = 0
                total_current_authors = 0
                max_authorship = 0

            for author, author_counter in f.author_dict.items():
                is_commit_author = (author == author_name)
                wr.writerow([project_name, #project
                             file_name, #file
                             commit_hash, #sha
                             author, #author
                             #Lines added
                             author_counter.lines_added, #author_file_tot_added
                             (changes.added if is_commit_author else 0), #author_file_added_this_commit
                             f.tot_lines_added, #file_tot_added
                             #Lines deleted
                             author_counter.lines_deleted, #author_file_tot_deleted
                             (changes.deleted if is_commit_author else 0), #author_file_deleted_this_commit
                             f.tot_lines_deleted, #file_tot_deleted
                             #Commits
                             author_counter.commits, #author_file_commits
                             f.tot_commits, #file_tot_commits
                             #Authorship
                             (authorship[author] if size != 0 else 0), #current_lines_authored
                             size, #current_file_size
                             comments, #current_comment_lines
                             max_authorship, #max_current_author
                             total_current_authors, #total_current_authors
                             #Other stuff
                             date, #commit_date
                             bug_fix, #bug_fix
                             f_keys, #fixed_bugs
                             affected_versions #affected_versions
                            ])

>> 13970/13970

In [42]:
import pandas as pd
# Load the CSV named by `filename` ('out.csv' when the cells are run in order).
df = pd.read_csv(filename)
# Last expression: the frame is displayed as the cell output.
df

Unnamed: 0,project,file,sha,author,author_file_tot_added,author_file_added_this_commit,file_tot_added,author_file_tot_deleted,author_file_deleted_this_commit,file_tot_deleted,...,file_tot_commits,current_lines_authored,current_file_size,current_comment_lines,max_current_author,total_current_authors,commit_date,bug_fix,fixed_bugs,affected_versions
0,camel,camel-core/src/main/java/org/apache/camel/Comp...,9bca570e177686e4eb8fddf05004ee0e3f153dff,James Strachan,6,6,6,1,1,1,...,1,6,43,32,37,2,2007-03-20 01:15:57 +0000,0,[],[]
1,camel,camel-core/src/main/java/org/apache/camel/queu...,9bca570e177686e4eb8fddf05004ee0e3f153dff,James Strachan,8,8,8,2,2,2,...,1,48,59,27,48,2,2007-03-20 01:15:57 +0000,0,[],[]
2,camel,camel-core/src/main/java/org/apache/camel/Came...,9bca570e177686e4eb8fddf05004ee0e3f153dff,James Strachan,73,73,73,28,28,28,...,1,134,141,37,134,2,2007-03-20 01:15:57 +0000,0,[],[]
3,camel,camel-jms/src/test/java/org/apache/camel/jms/J...,9bca570e177686e4eb8fddf05004ee0e3f153dff,James Strachan,21,21,21,1,1,1,...,1,49,49,23,49,1,2007-03-20 01:15:57 +0000,0,[],[]
4,camel,camel-core/src/main/java/org/apache/camel/buil...,9bca570e177686e4eb8fddf05004ee0e3f153dff,James Strachan,13,13,13,7,7,7,...,1,140,144,39,140,2,2007-03-20 01:15:57 +0000,0,[],[]
5,camel,camel-jms/src/main/java/org/apache/camel/jms/J...,9bca570e177686e4eb8fddf05004ee0e3f153dff,James Strachan,19,19,19,27,27,27,...,1,98,98,20,98,1,2007-03-20 01:15:57 +0000,0,[],[]
6,camel,camel-jms/src/main/java/org/apache/camel/jms/B...,9bca570e177686e4eb8fddf05004ee0e3f153dff,James Strachan,28,28,28,0,0,0,...,1,28,28,21,28,1,2007-03-20 01:15:57 +0000,0,[],[]
7,camel,camel-jms/src/main/java/org/apache/camel/jms/J...,9bca570e177686e4eb8fddf05004ee0e3f153dff,James Strachan,6,6,6,6,6,6,...,1,65,89,40,65,2,2007-03-20 01:15:57 +0000,0,[],[]
8,camel,camel-core/src/main/java/org/apache/camel/Rout...,9bca570e177686e4eb8fddf05004ee0e3f153dff,James Strachan,30,30,30,0,0,0,...,1,30,30,23,30,1,2007-03-20 01:15:57 +0000,0,[],[]
9,camel,camel-core/src/main/java/org/apache/camel/Endp...,9bca570e177686e4eb8fddf05004ee0e3f153dff,James Strachan,16,16,16,0,0,0,...,1,55,56,39,55,2,2007-03-20 01:15:57 +0000,0,[],[]


In [43]:
# Second table, joined to `df` on ('sha', 'file') by the following cell.
# NOTE(review): in the recorded run this read failed with OSError because
# dataset/implicated_files.csv did not exist; the file has to be produced
# elsewhere before this cell and the ones after it can run.
df2 = pd.read_csv('dataset/implicated_files.csv')
df2

OSError: File b'dataset/implicated_files.csv' does not exist

In [None]:
# Left-join the second table onto the dataset by commit and file, then fill
# the cells left empty by the join (rows without a match) with 0.
merged = df.merge(df2, on=['sha', 'file'], how='left')
df = merged.fillna(0)
df

In [None]:
# Save the final table; index=False keeps the DataFrame index out of the file.
df.to_csv('camel_dataset.csv', index = False)

In [None]:
#Example of file with a lot of owners and commits
#df_test = df[df['file'] == 'lucene/core/src/java/org/apache/lucene/index/IndexWriter.java']
#df_test[df_test['sha'] == '5ec48108df8997430e3e8b47c056d0d63c6d2db3']