In [1]:
%config Completer.use_jedi = False
%matplotlib inline

In [2]:
import numpy as np
import pandas as pd
import os
from os import listdir
from os.path import isfile
import datetime
import matplotlib
import matplotlib.pyplot as plt

data_path = '../data/'

# Data Aggregation

In [3]:
# Load one Jira export per year (2010-2019) and stack them into one frame.
years = [str(year) for year in range(2010, 2020)]
dfs = []
for y in years:
    year_df = pd.read_csv(os.path.join('../data', y + '.csv'), index_col=None)
    print('a df of size {} appended.'.format(len(year_df)))
    dfs.append(year_df)
df = pd.concat(dfs, axis=0, ignore_index=True)
# The project key is the part of the issue key before the first '-'.
df['project'] = df['Issue key'].str.split('-').str[0]
# drop_duplicates returns a new frame; without the assignment the
# de-duplication is silently thrown away.
df = df.drop_duplicates(subset=['Issue key'])
df = df[df['project'] != 'ROCKETMQ']
df

a df of size 2503 appended.
a df of size 3321 appended.
a df of size 3821 appended.
a df of size 4888 appended.
a df of size 10599 appended.
a df of size 8233 appended.
a df of size 7450 appended.
a df of size 6529 appended.
a df of size 5582 appended.
a df of size 4077 appended.


Unnamed: 0,Issue Type,Custom field (Patch Info),Issue key,Issue id,Parent id,Summary,Assignee,Reporter,Priority,Status,Resolution,Created,Updated,Due Date,project
0,Bug,,CAMEL-2687,12487331,12487337.0,exec component fails after receiving empty output,davsclaus,chuck,Major,Closed,Fixed,30/Apr/10 15:54,24/Apr/11 10:00,,CAMEL
1,Bug,,CASSANDRA-1040,12463432,,read failure during flush,jbellis,jbellis,Urgent,Resolved,Fixed,30/Apr/10 13:33,16/Apr/19 09:33,,CASSANDRA
2,Bug,,MAPREDUCE-1747,12463406,,Remove documentation for the 'unstable' job-ac...,vinodkv,vinodkv,Blocker,Closed,Fixed,30/Apr/10 06:55,24/Aug/10 21:21,,MAPREDUCE
3,Bug,,MAPREDUCE-1744,12463383,,DistributedCache creates its own FileSytem ins...,dking,dking,Major,Closed,Fixed,29/Apr/10 22:36,05/Mar/12 02:49,,MAPREDUCE
4,Bug,,CASSANDRA-1038,12463375,,Not all column families are created,gdusbabek,brandon.williams,Normal,Resolved,Fixed,29/Apr/10 21:17,16/Apr/19 09:33,,CASSANDRA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56998,Bug,,KAFKA-7781,13207270,,Add validation check for Topic retention.ms pr...,ckamal,ckamal,Major,Resolved,Fixed,02/Jan/19 11:42,13/Jan/19 13:19,,KAFKA
56999,Bug,,FLINK-11251,13207253,,Incompatible metric name on prometheus reporter,tonywei,tonywei,Major,Closed,Fixed,02/Jan/19 10:06,09/Jan/19 08:25,,FLINK
57000,Bug,,MESOS-9505,13207222,,`make check` failed with linking errors when c...,chhsia0,jieyu,Major,Resolved,Fixed,02/Jan/19 07:03,10/Jan/19 23:44,,MESOS
57001,Bug,,FLINK-11246,13207200,,Fix distinct AGG visibility issues,dian.fu,sunjincheng121,Major,Closed,Fixed,02/Jan/19 02:39,02/Jan/19 05:48,,FLINK


In [4]:
# Number of issues per project, largest first.
issues_per_project = df.groupby(['project']).size().reset_index(name='count')
issues_per_project.sort_values('count', ascending=False)

Unnamed: 0,project,count
7,HIVE,7931
12,SPARK,7648
5,HBASE,7085
2,CASSANDRA,5358
3,FLINK,4166
6,HDFS,3672
1,CAMEL,3276
8,IGNITE,3256
9,KAFKA,3038
11,MESOS,2955


In [6]:
# Distinct project keys, taken from the sorted project column.
sorted_project_col = df['project'].sort_values()
projects = sorted_project_col.unique()
projects

array(['AMQ', 'CAMEL', 'CASSANDRA', 'FLINK', 'GROOVY', 'HBASE', 'HDFS',
       'HIVE', 'IGNITE', 'KAFKA', 'MAPREDUCE', 'MESOS', 'SPARK',
       'ZEPPELIN', 'ZOOKEEPER'], dtype=object)

In [7]:
# fixed issues with corresponding fixing commits on GitHub
# (one '<PROJECT>.csv' per project, each tagged with its project key)

per_project_frames = []
for p in projects:
    project_csv = os.path.join('../data', '{}.csv'.format(p))
    f = pd.read_csv(project_csv)
    f['project'] = p
    per_project_frames.append(f)

found = pd.concat(per_project_frames)
found

Unnamed: 0,issue_key,commit_id,project
0,AMQ-2556,eedda6f2d1a3fa3e06793ca9e0d3a0891415f75a,AMQ
1,AMQ-2560,2d9959a6f6f33f7138606073e425a74261ec3125,AMQ
2,AMQ-2563,50dc8803459187d84625553514c622040991eecc,AMQ
3,AMQ-2564,10a976674edc472181635c5a68f5446067b6216b,AMQ
4,AMQ-2566,a1a676647c5a1ccec17fa1b00a70979c5c73df51,AMQ
...,...,...,...
697,ZOOKEEPER-2442,0c4b4483338d9c71f313f66d3d5bd7912a8ea30a,ZOOKEEPER
698,ZOOKEEPER-2282,1e7fae31af634ccfb85cde17fa634d188145b9b7,ZOOKEEPER
699,ZOOKEEPER-2563,a6c36b69cc72d7d67e392dab5360007d6f737bef,ZOOKEEPER
700,ZOOKEEPER-3471,4951a090d7c946f57ac5ab09b5d48a5d7831001d,ZOOKEEPER


In [8]:
# Issues from the Jira export whose key does not appear in `found`.
has_fix_commit = df['Issue key'].isin(found['issue_key'])
notfound = (
    df[~has_fix_commit][['project', 'Issue key']]
    .rename(columns={'Issue key': 'issue_key'})
    .sort_values('project')
)
notfound

Unnamed: 0,project,issue_key
4069,AMQ,AMQ-3127
19214,AMQ,AMQ-5245
19216,AMQ,AMQ-5244
19260,AMQ,AMQ-5243
19291,AMQ,AMQ-5240
...,...,...
33551,ZOOKEEPER,ZOOKEEPER-2450
21872,ZOOKEEPER,ZOOKEEPER-1997
974,ZOOKEEPER,ZOOKEEPER-833
33225,ZOOKEEPER,ZOOKEEPER-2334


In [9]:
# Number of rows of `found` per project, largest first.
found_per_project = found.groupby(['project']).size().reset_index(name='count')
found_per_project.sort_values('count', ascending=False)

Unnamed: 0,project,count
7,HIVE,6733
12,SPARK,6457
5,HBASE,5798
2,CASSANDRA,4276
3,FLINK,3076
6,HDFS,3043
1,CAMEL,2913
8,IGNITE,2815
9,KAFKA,2314
4,GROOVY,1990


In [10]:
# remove MESOS because of bad commit message formats

excluded_project = 'MESOS'
df = df[df['project'] != excluded_project]
found = found[found['project'] != excluded_project]

In [11]:
# Rebuild the project list from the current contents of df.
project_col_sorted = df['project'].sort_values()
projects = project_col_sorted.unique()
projects

array(['AMQ', 'CAMEL', 'CASSANDRA', 'FLINK', 'GROOVY', 'HBASE', 'HDFS',
       'HIVE', 'IGNITE', 'KAFKA', 'MAPREDUCE', 'SPARK', 'ZEPPELIN',
       'ZOOKEEPER'], dtype=object)

# Commit Links

In [12]:
# Read one commit-link table per project, printing each table's row count.
dfs = []
for p in projects:
    links_csv = '../data/commit_links_{}.csv'.format(p)
    tdf = pd.read_csv(links_csv)
    print(len(tdf))
    dfs.append(tdf)

len(dfs)

4060
6355
14621
12318
5105
21856
8042
34070
14903
12004
3955
20586
2898
2045


14

In [13]:
# Stack the per-project link tables. ignore_index=True stores the fresh
# 0..n-1 index on commit_links itself, so the displayed frame and the stored
# variable are the same object (a bare reset_index() call only affects what
# is displayed, not the variable).
commit_links = pd.concat(dfs, ignore_index=True)
commit_links

Unnamed: 0,fix_hash,fix_date,bug_hash,bug_date,project
0,a1a676647c5a1ccec17fa1b00a70979c5c73df51,1263464274,a677c21240ac154a9fdd4cb1aca9001a5cb16472,1152262629,AMQ
1,7ceb4cbc432afeea89928a6333f1f215a351eee4,1263807801,16e060bd7ef6ecb8ce92b9f461d1886a806a8f37,1263380867,AMQ
2,7ceb4cbc432afeea89928a6333f1f215a351eee4,1263807801,48764becbdb72c94fd50601464f6e9e848490ed2,1259177709,AMQ
3,7ceb4cbc432afeea89928a6333f1f215a351eee4,1263807801,f9d5449f47c2067dfe9606133d085ea5e0da734d,1236945548,AMQ
4,7ceb4cbc432afeea89928a6333f1f215a351eee4,1263807801,c808bebd00064c6c1d2c702f10e249e6e2f99254,1258125727,AMQ
...,...,...,...,...,...
162813,efbd660e1c4b90a8f538f2cccb5dcb7094cf9a22,1606132394,0957b8404e1ecfc5703d7c2827752773b7dc23be,1256919553,ZOOKEEPER
162814,7fad7ea33365304f8c268279689a6cbeed6698bc,1610526351,dc40617ce0ab9161ca8ad22c6c010d198d9abae8,1404769424,ZOOKEEPER
162815,7fad7ea33365304f8c268279689a6cbeed6698bc,1610526351,62d5b08a2593c0ca772714cef7e933786208281f,1199927327,ZOOKEEPER
162816,f39caf6fd717acced2e8eb2bbdf98e92395858c5,1615066178,d6a12a80babf1e226cb56695bbc5479ea4f92c78,1538744553,ZOOKEEPER


# Filtering

## 1. Bug commit date before bug report

In [14]:
# Attach each issue's 'Created' value to the commit that fixes it.
issue_created = df[['Issue key', 'Created']]
fix_bugreport = pd.merge(
    found,
    issue_created,
    how='inner',
    left_on='issue_key',
    right_on='Issue key',
)[['commit_id', 'Created']]
fix_bugreport

Unnamed: 0,commit_id,Created
0,eedda6f2d1a3fa3e06793ca9e0d3a0891415f75a,04/Jan/10 14:01
1,2d9959a6f6f33f7138606073e425a74261ec3125,08/Jan/10 00:07
2,50dc8803459187d84625553514c622040991eecc,08/Jan/10 20:52
3,10a976674edc472181635c5a68f5446067b6216b,12/Jan/10 14:32
4,a1a676647c5a1ccec17fa1b00a70979c5c73df51,13/Jan/10 13:08
...,...,...
44197,0c4b4483338d9c71f313f66d3d5bd7912a8ea30a,03/Jun/16 23:05
44198,1e7fae31af634ccfb85cde17fa634d188145b9b7,28/Sep/15 17:19
44199,a6c36b69cc72d7d67e392dab5360007d6f737bef,08/Sep/16 06:19
44200,4951a090d7c946f57ac5ab09b5d48a5d7831001d,28/Jul/19 17:50


In [15]:
# Join every (fix, bug) link with the report data of its fixing commit and
# keep a single row per (fix_hash, bug_hash) pair.
links_with_created = pd.merge(
    commit_links,
    fix_bugreport,
    how='inner',
    left_on='fix_hash',
    right_on='commit_id',
)
link_bugreport = links_with_created.drop_duplicates(['fix_hash', 'bug_hash'])
link_bugreport

Unnamed: 0,fix_hash,fix_date,bug_hash,bug_date,project,commit_id,Created
0,a1a676647c5a1ccec17fa1b00a70979c5c73df51,1263464274,a677c21240ac154a9fdd4cb1aca9001a5cb16472,1152262629,AMQ,a1a676647c5a1ccec17fa1b00a70979c5c73df51,13/Jan/10 13:08
1,7ceb4cbc432afeea89928a6333f1f215a351eee4,1263807801,16e060bd7ef6ecb8ce92b9f461d1886a806a8f37,1263380867,AMQ,7ceb4cbc432afeea89928a6333f1f215a351eee4,18/Jan/10 09:37
2,7ceb4cbc432afeea89928a6333f1f215a351eee4,1263807801,48764becbdb72c94fd50601464f6e9e848490ed2,1259177709,AMQ,7ceb4cbc432afeea89928a6333f1f215a351eee4,18/Jan/10 09:37
3,7ceb4cbc432afeea89928a6333f1f215a351eee4,1263807801,f9d5449f47c2067dfe9606133d085ea5e0da734d,1236945548,AMQ,7ceb4cbc432afeea89928a6333f1f215a351eee4,18/Jan/10 09:37
4,7ceb4cbc432afeea89928a6333f1f215a351eee4,1263807801,c808bebd00064c6c1d2c702f10e249e6e2f99254,1258125727,AMQ,7ceb4cbc432afeea89928a6333f1f215a351eee4,18/Jan/10 09:37
...,...,...,...,...,...,...,...
166233,efbd660e1c4b90a8f538f2cccb5dcb7094cf9a22,1606132394,0957b8404e1ecfc5703d7c2827752773b7dc23be,1256919553,ZOOKEEPER,efbd660e1c4b90a8f538f2cccb5dcb7094cf9a22,09/Dec/19 21:13
166234,7fad7ea33365304f8c268279689a6cbeed6698bc,1610526351,dc40617ce0ab9161ca8ad22c6c010d198d9abae8,1404769424,ZOOKEEPER,7fad7ea33365304f8c268279689a6cbeed6698bc,11/Jun/19 19:11
166235,7fad7ea33365304f8c268279689a6cbeed6698bc,1610526351,62d5b08a2593c0ca772714cef7e933786208281f,1199927327,ZOOKEEPER,7fad7ea33365304f8c268279689a6cbeed6698bc,11/Jun/19 19:11
166236,f39caf6fd717acced2e8eb2bbdf98e92395858c5,1615066178,d6a12a80babf1e226cb56695bbc5479ea4f92c78,1538744553,ZOOKEEPER,f39caf6fd717acced2e8eb2bbdf98e92395858c5,14/Feb/17 03:33


In [16]:
# 'Created' holds text such as '30/Apr/10 15:54'. Spell the format out so the
# day / two-digit-year order never depends on per-value inference.
link_bugreport['Created'] = pd.to_datetime(link_bugreport['Created'], format='%d/%b/%y %H:%M')
# The commit dates are stored as epoch seconds.
# NOTE(review): 'Created' carries no time zone while the epoch columns are
# absolute instants -- confirm both are on the same clock before comparing.
for epoch_col in ('bug_date', 'fix_date'):
    link_bugreport[epoch_col] = pd.to_datetime(link_bugreport[epoch_col], unit='s')

In [17]:
# Keep only links whose bug commit predates both the issue report and the
# fixing commit.
before_report = link_bugreport['bug_date'] < link_bugreport['Created']
before_fix = link_bugreport['bug_date'] < link_bugreport['fix_date']
link_bugreport = link_bugreport[before_report & before_fix]
link_bugreport

Unnamed: 0,fix_hash,fix_date,bug_hash,bug_date,project,commit_id,Created
0,a1a676647c5a1ccec17fa1b00a70979c5c73df51,2010-01-14 10:17:54,a677c21240ac154a9fdd4cb1aca9001a5cb16472,2006-07-07 08:57:09,AMQ,a1a676647c5a1ccec17fa1b00a70979c5c73df51,2010-01-13 13:08:00
1,7ceb4cbc432afeea89928a6333f1f215a351eee4,2010-01-18 09:43:21,16e060bd7ef6ecb8ce92b9f461d1886a806a8f37,2010-01-13 11:07:47,AMQ,7ceb4cbc432afeea89928a6333f1f215a351eee4,2010-01-18 09:37:00
2,7ceb4cbc432afeea89928a6333f1f215a351eee4,2010-01-18 09:43:21,48764becbdb72c94fd50601464f6e9e848490ed2,2009-11-25 19:35:09,AMQ,7ceb4cbc432afeea89928a6333f1f215a351eee4,2010-01-18 09:37:00
3,7ceb4cbc432afeea89928a6333f1f215a351eee4,2010-01-18 09:43:21,f9d5449f47c2067dfe9606133d085ea5e0da734d,2009-03-13 11:59:08,AMQ,7ceb4cbc432afeea89928a6333f1f215a351eee4,2010-01-18 09:37:00
4,7ceb4cbc432afeea89928a6333f1f215a351eee4,2010-01-18 09:43:21,c808bebd00064c6c1d2c702f10e249e6e2f99254,2009-11-13 15:22:07,AMQ,7ceb4cbc432afeea89928a6333f1f215a351eee4,2010-01-18 09:37:00
...,...,...,...,...,...,...,...
166232,6a8728d98307f7d52cf6dbadb78149e01b1d0bf5,2020-07-28 08:29:28,12f70403c6fdc4a94e6bfa0a66ddabe7f81c3afc,2011-09-28 20:17:16,ZOOKEEPER,6a8728d98307f7d52cf6dbadb78149e01b1d0bf5,2018-08-07 00:46:00
166233,efbd660e1c4b90a8f538f2cccb5dcb7094cf9a22,2020-11-23 11:53:14,0957b8404e1ecfc5703d7c2827752773b7dc23be,2009-10-30 16:19:13,ZOOKEEPER,efbd660e1c4b90a8f538f2cccb5dcb7094cf9a22,2019-12-09 21:13:00
166234,7fad7ea33365304f8c268279689a6cbeed6698bc,2021-01-13 08:25:51,dc40617ce0ab9161ca8ad22c6c010d198d9abae8,2014-07-07 21:43:44,ZOOKEEPER,7fad7ea33365304f8c268279689a6cbeed6698bc,2019-06-11 19:11:00
166235,7fad7ea33365304f8c268279689a6cbeed6698bc,2021-01-13 08:25:51,62d5b08a2593c0ca772714cef7e933786208281f,2008-01-10 01:08:47,ZOOKEEPER,7fad7ea33365304f8c268279689a6cbeed6698bc,2019-06-11 19:11:00


## 2. Fix commits with less than mean+std fixes

In [18]:
# Number of link rows per fixing commit, largest first.
links_per_fix = link_bugreport.groupby('fix_hash').size().reset_index(name='fixcount')
fix_count = links_per_fix.sort_values('fixcount', ascending=False)
fix_count

Unnamed: 0,fix_hash,fixcount
11671,53e47e9191d717b3eec495e6246cd957a8d33c7d,589
28214,ca30235b01b41ba76c3f47236dd16ca6be8bedf0,433
22129,9e35993c0e7e5f2b032d5543c02cdd3401c64a6a,369
32390,e80b3092a638562c4e5070891ff3ce881d418ff6,344
3628,1945e2f67e5b09cdda40146b87e1ba492f897196,280
...,...,...
15178,6c5ea192c75072ba3f7369dfc23592d6ed0c319f,1
15177,6c5db69991ffbed6d3aba14a5e58be862bdd1cd4,1
15176,6c5be6edc25f71f8af439d1a7b6550bcb50b4718,1
24513,afe23b60bbbf4938cb7c471f20a59b51a7ad26a8,1


In [19]:
fixcounts = fix_count['fixcount']
fix_mean = fixcounts.mean()
fix_std = fixcounts.std()
print(fixcounts.median())
print(fix_mean)
print(fix_std)

# Cut-off: mean plus one standard deviation, truncated to an integer.
max_fix = int(fix_mean + fix_std)
print(max_fix)

2.0
4.132400370193791
9.734292413127971
13


In [20]:
# Absolute distance of every fixcount from the median fixcount.
fix_median = fix_count['fixcount'].median()
fix_count['abs_deviation'] = (fix_count['fixcount'] - fix_median).abs()
fix_count

Unnamed: 0,fix_hash,fixcount,abs_deviation
11671,53e47e9191d717b3eec495e6246cd957a8d33c7d,589,587.0
28214,ca30235b01b41ba76c3f47236dd16ca6be8bedf0,433,431.0
22129,9e35993c0e7e5f2b032d5543c02cdd3401c64a6a,369,367.0
32390,e80b3092a638562c4e5070891ff3ce881d418ff6,344,342.0
3628,1945e2f67e5b09cdda40146b87e1ba492f897196,280,278.0
...,...,...,...
15178,6c5ea192c75072ba3f7369dfc23592d6ed0c319f,1,1.0
15177,6c5db69991ffbed6d3aba14a5e58be862bdd1cd4,1,1.0
15176,6c5be6edc25f71f8af439d1a7b6550bcb50b4718,1,1.0
24513,afe23b60bbbf4938cb7c471f20a59b51a7ad26a8,1,1.0


In [21]:
# Median fixcount plus the median absolute deviation.
median_fixcount = fix_count['fixcount'].median()
mad_fixcount = fix_count['abs_deviation'].median()
upper_mad = median_fixcount + mad_fixcount
upper_mad

3.0

In [22]:
# Keep the links of fixing commits whose fixcount does not exceed max_fix.
kept_fix_hashes = fix_count.loc[fix_count['fixcount'] <= max_fix, 'fix_hash']
f2 = link_bugreport[link_bugreport['fix_hash'].isin(kept_fix_hashes)]
f2

Unnamed: 0,fix_hash,fix_date,bug_hash,bug_date,project,commit_id,Created
0,a1a676647c5a1ccec17fa1b00a70979c5c73df51,2010-01-14 10:17:54,a677c21240ac154a9fdd4cb1aca9001a5cb16472,2006-07-07 08:57:09,AMQ,a1a676647c5a1ccec17fa1b00a70979c5c73df51,2010-01-13 13:08:00
1,7ceb4cbc432afeea89928a6333f1f215a351eee4,2010-01-18 09:43:21,16e060bd7ef6ecb8ce92b9f461d1886a806a8f37,2010-01-13 11:07:47,AMQ,7ceb4cbc432afeea89928a6333f1f215a351eee4,2010-01-18 09:37:00
2,7ceb4cbc432afeea89928a6333f1f215a351eee4,2010-01-18 09:43:21,48764becbdb72c94fd50601464f6e9e848490ed2,2009-11-25 19:35:09,AMQ,7ceb4cbc432afeea89928a6333f1f215a351eee4,2010-01-18 09:37:00
3,7ceb4cbc432afeea89928a6333f1f215a351eee4,2010-01-18 09:43:21,f9d5449f47c2067dfe9606133d085ea5e0da734d,2009-03-13 11:59:08,AMQ,7ceb4cbc432afeea89928a6333f1f215a351eee4,2010-01-18 09:37:00
4,7ceb4cbc432afeea89928a6333f1f215a351eee4,2010-01-18 09:43:21,c808bebd00064c6c1d2c702f10e249e6e2f99254,2009-11-13 15:22:07,AMQ,7ceb4cbc432afeea89928a6333f1f215a351eee4,2010-01-18 09:37:00
...,...,...,...,...,...,...,...
166232,6a8728d98307f7d52cf6dbadb78149e01b1d0bf5,2020-07-28 08:29:28,12f70403c6fdc4a94e6bfa0a66ddabe7f81c3afc,2011-09-28 20:17:16,ZOOKEEPER,6a8728d98307f7d52cf6dbadb78149e01b1d0bf5,2018-08-07 00:46:00
166233,efbd660e1c4b90a8f538f2cccb5dcb7094cf9a22,2020-11-23 11:53:14,0957b8404e1ecfc5703d7c2827752773b7dc23be,2009-10-30 16:19:13,ZOOKEEPER,efbd660e1c4b90a8f538f2cccb5dcb7094cf9a22,2019-12-09 21:13:00
166234,7fad7ea33365304f8c268279689a6cbeed6698bc,2021-01-13 08:25:51,dc40617ce0ab9161ca8ad22c6c010d198d9abae8,2014-07-07 21:43:44,ZOOKEEPER,7fad7ea33365304f8c268279689a6cbeed6698bc,2019-06-11 19:11:00
166235,7fad7ea33365304f8c268279689a6cbeed6698bc,2021-01-13 08:25:51,62d5b08a2593c0ca772714cef7e933786208281f,2008-01-10 01:08:47,ZOOKEEPER,7fad7ea33365304f8c268279689a6cbeed6698bc,2019-06-11 19:11:00


## 3. Buggy commits with no more than mean+std fixes

In [23]:
# Number of remaining link rows per bug commit, largest first.
links_per_bug = f2.groupby('bug_hash').size().reset_index(name='bugcount')
bug_count = links_per_bug.sort_values('bugcount', ascending=False)
bug_count

Unnamed: 0,bug_hash,bugcount
26013,a196766ea07775f18ded69bd9e8d239f8cfd3ccc,841
35169,dbecbe5dfe50f834fc3b8401709079e9470cc517,333
18114,7016f154b20b9ba48fdb2f923d16de59fe5c7c92,251
39162,f52130953245d93489a9c700c3b00eedbe70fdb9,182
24930,9aadcffabd226557174f3ff566927f873c71672e,172
...,...,...
16992,6969dcc79a33d715250958b24361f2d43552d840,1
16991,6969402718c69609e8bc1c758085bfbe882f7293,1
16989,6968f68cd7c8d4a1ab4a6d4638c547e9de33a560,1
16988,6968a57a1a31a11b33bacd2c94d6559bcabd6eb9,1


In [24]:
bugcounts = bug_count['bugcount']
bug_mean = bugcounts.mean()
bug_std = bugcounts.std()
print(bugcounts.median())
print(bug_mean)
print(bug_std)

# Cut-off: mean plus one standard deviation, truncated to an integer.
max_bug = int(bug_mean + bug_std)
print(max_bug)

1.0
2.2812202097235463
6.0001131772246215
8


In [25]:
# Absolute distance of every bugcount from the median bugcount.
bug_median = bug_count['bugcount'].median()
bug_count['abs_deviation'] = (bug_count['bugcount'] - bug_median).abs()
bug_count

Unnamed: 0,bug_hash,bugcount,abs_deviation
26013,a196766ea07775f18ded69bd9e8d239f8cfd3ccc,841,840.0
35169,dbecbe5dfe50f834fc3b8401709079e9470cc517,333,332.0
18114,7016f154b20b9ba48fdb2f923d16de59fe5c7c92,251,250.0
39162,f52130953245d93489a9c700c3b00eedbe70fdb9,182,181.0
24930,9aadcffabd226557174f3ff566927f873c71672e,172,171.0
...,...,...,...
16992,6969dcc79a33d715250958b24361f2d43552d840,1,0.0
16991,6969402718c69609e8bc1c758085bfbe882f7293,1,0.0
16989,6968f68cd7c8d4a1ab4a6d4638c547e9de33a560,1,0.0
16988,6968a57a1a31a11b33bacd2c94d6559bcabd6eb9,1,0.0


In [26]:
# Median bugcount plus the median absolute deviation.
median_bugcount = bug_count['bugcount'].median()
mad_bugcount = bug_count['abs_deviation'].median()
upper_mad = median_bugcount + mad_bugcount
upper_mad

1.0

In [27]:
# Keep the links of bug commits whose bugcount does not exceed max_bug.
kept_bug_hashes = bug_count.loc[bug_count['bugcount'] <= max_bug, 'bug_hash']
f3 = f2[f2['bug_hash'].isin(kept_bug_hashes)]
f3

Unnamed: 0,fix_hash,fix_date,bug_hash,bug_date,project,commit_id,Created
0,a1a676647c5a1ccec17fa1b00a70979c5c73df51,2010-01-14 10:17:54,a677c21240ac154a9fdd4cb1aca9001a5cb16472,2006-07-07 08:57:09,AMQ,a1a676647c5a1ccec17fa1b00a70979c5c73df51,2010-01-13 13:08:00
1,7ceb4cbc432afeea89928a6333f1f215a351eee4,2010-01-18 09:43:21,16e060bd7ef6ecb8ce92b9f461d1886a806a8f37,2010-01-13 11:07:47,AMQ,7ceb4cbc432afeea89928a6333f1f215a351eee4,2010-01-18 09:37:00
2,7ceb4cbc432afeea89928a6333f1f215a351eee4,2010-01-18 09:43:21,48764becbdb72c94fd50601464f6e9e848490ed2,2009-11-25 19:35:09,AMQ,7ceb4cbc432afeea89928a6333f1f215a351eee4,2010-01-18 09:37:00
3,7ceb4cbc432afeea89928a6333f1f215a351eee4,2010-01-18 09:43:21,f9d5449f47c2067dfe9606133d085ea5e0da734d,2009-03-13 11:59:08,AMQ,7ceb4cbc432afeea89928a6333f1f215a351eee4,2010-01-18 09:37:00
4,7ceb4cbc432afeea89928a6333f1f215a351eee4,2010-01-18 09:43:21,c808bebd00064c6c1d2c702f10e249e6e2f99254,2009-11-13 15:22:07,AMQ,7ceb4cbc432afeea89928a6333f1f215a351eee4,2010-01-18 09:37:00
...,...,...,...,...,...,...,...
166230,a908001be9641d78040b1954acb0cd3a8e9e42c2,2020-05-17 13:15:54,4af0f828051ace307b2378c22f0c6294ae6fd29c,2012-12-22 00:43:53,ZOOKEEPER,a908001be9641d78040b1954acb0cd3a8e9e42c2,2014-07-29 16:27:00
166232,6a8728d98307f7d52cf6dbadb78149e01b1d0bf5,2020-07-28 08:29:28,12f70403c6fdc4a94e6bfa0a66ddabe7f81c3afc,2011-09-28 20:17:16,ZOOKEEPER,6a8728d98307f7d52cf6dbadb78149e01b1d0bf5,2018-08-07 00:46:00
166234,7fad7ea33365304f8c268279689a6cbeed6698bc,2021-01-13 08:25:51,dc40617ce0ab9161ca8ad22c6c010d198d9abae8,2014-07-07 21:43:44,ZOOKEEPER,7fad7ea33365304f8c268279689a6cbeed6698bc,2019-06-11 19:11:00
166235,7fad7ea33365304f8c268279689a6cbeed6698bc,2021-01-13 08:25:51,62d5b08a2593c0ca772714cef7e933786208281f,2008-01-10 01:08:47,ZOOKEEPER,7fad7ea33365304f8c268279689a6cbeed6698bc,2019-06-11 19:11:00


# Bug-inducing commits

In [28]:
# Remaining link rows per project, largest first, renumbered from 0.
# `drop` expects a boolean; the string 'True' only worked because any
# non-empty string is truthy.
project_count = (
    f3.groupby('project')
    .size()
    .reset_index(name='count')
    .sort_values('count', ascending=False)
    .reset_index(drop=True)
)
project_count

Unnamed: 0,project,count
0,SPARK,11243
1,HIVE,10517
2,HBASE,8620
3,CASSANDRA,6698
4,FLINK,6361
5,IGNITE,4980
6,CAMEL,4687
7,HDFS,4390
8,KAFKA,4081
9,GROOVY,3067


In [29]:
# One row per distinct bug commit, together with its project.
bug_commit_cols = ['bug_hash', 'project']
java_buggy = f3[bug_commit_cols].drop_duplicates('bug_hash')
java_buggy

Unnamed: 0,bug_hash,project
0,a677c21240ac154a9fdd4cb1aca9001a5cb16472,AMQ
1,16e060bd7ef6ecb8ce92b9f461d1886a806a8f37,AMQ
2,48764becbdb72c94fd50601464f6e9e848490ed2,AMQ
3,f9d5449f47c2067dfe9606133d085ea5e0da734d,AMQ
4,c808bebd00064c6c1d2c702f10e249e6e2f99254,AMQ
...,...,...
166212,68bc2d165e8b88f9f352b3b50bbae84bba844657,ZOOKEEPER
166214,a58353b4fd25d376dd7e32d7440404b9159126a4,ZOOKEEPER
166217,2d61e7ebf2a75e227a140644357a073fe758ddbf,ZOOKEEPER
166220,5141003acd8ea3307bc36f9fc8c3990105a6f195,ZOOKEEPER


In [30]:
# Every row of java_buggy is a bug_hash taken from the filtered links, so the
# whole table is labelled as buggy.
java_buggy['buggy'] = True
java_buggy

Unnamed: 0,bug_hash,project,buggy
0,a677c21240ac154a9fdd4cb1aca9001a5cb16472,AMQ,True
1,16e060bd7ef6ecb8ce92b9f461d1886a806a8f37,AMQ,True
2,48764becbdb72c94fd50601464f6e9e848490ed2,AMQ,True
3,f9d5449f47c2067dfe9606133d085ea5e0da734d,AMQ,True
4,c808bebd00064c6c1d2c702f10e249e6e2f99254,AMQ,True
...,...,...,...
166212,68bc2d165e8b88f9f352b3b50bbae84bba844657,ZOOKEEPER,True
166214,a58353b4fd25d376dd7e32d7440404b9159126a4,ZOOKEEPER,True
166217,2d61e7ebf2a75e227a140644357a073fe758ddbf,ZOOKEEPER,True
166220,5141003acd8ea3307bc36f9fc8c3990105a6f195,ZOOKEEPER,True


In [31]:
# Turn the upper-case project key into an 'apache/<lowercase key>' name.
lowercase_project = java_buggy['project'].str.lower()
java_buggy['project'] = 'apache/' + lowercase_project
java_buggy

Unnamed: 0,bug_hash,project,buggy
0,a677c21240ac154a9fdd4cb1aca9001a5cb16472,apache/amq,True
1,16e060bd7ef6ecb8ce92b9f461d1886a806a8f37,apache/amq,True
2,48764becbdb72c94fd50601464f6e9e848490ed2,apache/amq,True
3,f9d5449f47c2067dfe9606133d085ea5e0da734d,apache/amq,True
4,c808bebd00064c6c1d2c702f10e249e6e2f99254,apache/amq,True
...,...,...,...
166212,68bc2d165e8b88f9f352b3b50bbae84bba844657,apache/zookeeper,True
166214,a58353b4fd25d376dd7e32d7440404b9159126a4,apache/zookeeper,True
166217,2d61e7ebf2a75e227a140644357a073fe758ddbf,apache/zookeeper,True
166220,5141003acd8ea3307bc36f9fc8c3990105a6f195,apache/zookeeper,True


In [32]:
# Three project names are rewritten; every other name is kept unchanged.
repo_aliases = {
    'apache/amq': 'apache/activemq',
    'apache/hdfs': 'apache/hadoop-hdfs',
    'apache/mapreduce': 'apache/hadoop-mapreduce',
}
java_buggy['project'] = [repo_aliases.get(name, name) for name in java_buggy['project'].tolist()]

In [33]:
# Use the same key column name ('commit_id') as the `found` table.
column_renames = {'bug_hash': 'commit_id'}
java_buggy = java_buggy.rename(columns=column_renames)
java_buggy

Unnamed: 0,commit_id,project,buggy
0,a677c21240ac154a9fdd4cb1aca9001a5cb16472,apache/activemq,True
1,16e060bd7ef6ecb8ce92b9f461d1886a806a8f37,apache/activemq,True
2,48764becbdb72c94fd50601464f6e9e848490ed2,apache/activemq,True
3,f9d5449f47c2067dfe9606133d085ea5e0da734d,apache/activemq,True
4,c808bebd00064c6c1d2c702f10e249e6e2f99254,apache/activemq,True
...,...,...,...
166212,68bc2d165e8b88f9f352b3b50bbae84bba844657,apache/zookeeper,True
166214,a58353b4fd25d376dd7e32d7440404b9159126a4,apache/zookeeper,True
166217,2d61e7ebf2a75e227a140644357a073fe758ddbf,apache/zookeeper,True
166220,5141003acd8ea3307bc36f9fc8c3990105a6f195,apache/zookeeper,True


In [34]:
# fixing commits without bug-inducing commits based on SZZ

all_fix_ids = set(found['commit_id'])
linked_fix_ids = set(commit_links['fix_hash'])
unlinked_fix_ids = all_fix_ids - linked_fix_ids
remain = found[found['commit_id'].isin(unlinked_fix_ids)]
remain

Unnamed: 0,issue_key,commit_id,project
0,AMQ-2556,eedda6f2d1a3fa3e06793ca9e0d3a0891415f75a,AMQ
20,AMQ-2604,a65881cf4ec744847706225f140df48770fd147c,AMQ
30,AMQ-2630,f33e2190d0c481b78171749091d3d044f139e85d,AMQ
33,AMQ-2643,bfd1be561daf6be51b31e9b23dee900dbfa392fa,AMQ
40,AMQ-2663,9fcf16eaabb197b5c91d916a621243d6d141e5ae,AMQ
...,...,...,...
650,ZOOKEEPER-2785,0a3e2d1d4bdbb96f41da7111db3913ef08068722,ZOOKEEPER
659,ZOOKEEPER-2951,dcfbe45241855f2caccf3848be2e66e7aa23aa96,ZOOKEEPER
677,ZOOKEEPER-2413,21ae2cf7dfbfd12e8160503c43b5f379fd049d79,ZOOKEEPER
695,ZOOKEEPER-2647,dfcf4da75c35c12d08496ded2612031f887c0765,ZOOKEEPER


In [35]:
# One row per (commit, project), with the project written as an
# 'apache/<lowercase key>' name.
remain = remain[['commit_id', 'project']].drop_duplicates()
remain['project'] = 'apache/' + remain['project'].str.lower()

# Three project names are rewritten; every other name is kept unchanged.
repo_aliases = {
    'apache/amq': 'apache/activemq',
    'apache/hdfs': 'apache/hadoop-hdfs',
    'apache/mapreduce': 'apache/hadoop-mapreduce',
}
remain['project'] = [repo_aliases.get(name, name) for name in remain['project'].tolist()]
remain

Unnamed: 0,commit_id,project
0,eedda6f2d1a3fa3e06793ca9e0d3a0891415f75a,apache/activemq
20,a65881cf4ec744847706225f140df48770fd147c,apache/activemq
30,f33e2190d0c481b78171749091d3d044f139e85d,apache/activemq
33,bfd1be561daf6be51b31e9b23dee900dbfa392fa,apache/activemq
40,9fcf16eaabb197b5c91d916a621243d6d141e5ae,apache/activemq
...,...,...
650,0a3e2d1d4bdbb96f41da7111db3913ef08068722,apache/zookeeper
659,dcfbe45241855f2caccf3848be2e66e7aa23aa96,apache/zookeeper
677,21ae2cf7dfbfd12e8160503c43b5f379fd049d79,apache/zookeeper
695,dfcf4da75c35c12d08496ded2612031f887c0765,apache/zookeeper


In [36]:
# Every row of remain was selected from `found` (the fixing commits), so the
# whole table is labelled as a fix.
remain['fix'] = True
remain

Unnamed: 0,commit_id,project,fix
0,eedda6f2d1a3fa3e06793ca9e0d3a0891415f75a,apache/activemq,True
20,a65881cf4ec744847706225f140df48770fd147c,apache/activemq,True
30,f33e2190d0c481b78171749091d3d044f139e85d,apache/activemq,True
33,bfd1be561daf6be51b31e9b23dee900dbfa392fa,apache/activemq,True
40,9fcf16eaabb197b5c91d916a621243d6d141e5ae,apache/activemq,True
...,...,...,...
650,0a3e2d1d4bdbb96f41da7111db3913ef08068722,apache/zookeeper,True
659,dcfbe45241855f2caccf3848be2e66e7aa23aa96,apache/zookeeper,True
677,21ae2cf7dfbfd12e8160503c43b5f379fd049d79,apache/zookeeper,True
695,dfcf4da75c35c12d08496ded2612031f887c0765,apache/zookeeper,True


In [37]:
# Flag the fixing commits that are also listed as bug commits in java_buggy.
# Series.isin does one hashed membership pass; the earlier list comprehension
# rebuilt and linearly scanned java_buggy's id list for every row.
# .tolist() keeps the assignment positional, exactly like the original list.
buggy_fixes = remain['commit_id'].isin(java_buggy['commit_id']).tolist()
remain['buggy'] = buggy_fixes
remain

Unnamed: 0,commit_id,project,fix,buggy
0,eedda6f2d1a3fa3e06793ca9e0d3a0891415f75a,apache/activemq,True,True
20,a65881cf4ec744847706225f140df48770fd147c,apache/activemq,True,False
30,f33e2190d0c481b78171749091d3d044f139e85d,apache/activemq,True,True
33,bfd1be561daf6be51b31e9b23dee900dbfa392fa,apache/activemq,True,True
40,9fcf16eaabb197b5c91d916a621243d6d141e5ae,apache/activemq,True,False
...,...,...,...,...
650,0a3e2d1d4bdbb96f41da7111db3913ef08068722,apache/zookeeper,True,True
659,dcfbe45241855f2caccf3848be2e66e7aa23aa96,apache/zookeeper,True,False
677,21ae2cf7dfbfd12e8160503c43b5f379fd049d79,apache/zookeeper,True,False
695,dfcf4da75c35c12d08496ded2612031f887c0765,apache/zookeeper,True,False


In [38]:
# Flag the bug commits that are themselves fixing commits listed in `found`.
# Series.isin does one hashed membership pass; the earlier list comprehension
# rebuilt and linearly scanned found's id list for every row.
# .tolist() keeps the assignment positional, exactly like the original list.
fix_bugs = java_buggy['commit_id'].isin(found['commit_id']).tolist()
java_buggy['fix'] = fix_bugs
java_buggy

Unnamed: 0,commit_id,project,buggy,fix
0,a677c21240ac154a9fdd4cb1aca9001a5cb16472,apache/activemq,True,False
1,16e060bd7ef6ecb8ce92b9f461d1886a806a8f37,apache/activemq,True,False
2,48764becbdb72c94fd50601464f6e9e848490ed2,apache/activemq,True,False
3,f9d5449f47c2067dfe9606133d085ea5e0da734d,apache/activemq,True,False
4,c808bebd00064c6c1d2c702f10e249e6e2f99254,apache/activemq,True,False
...,...,...,...,...
166212,68bc2d165e8b88f9f352b3b50bbae84bba844657,apache/zookeeper,True,True
166214,a58353b4fd25d376dd7e32d7440404b9159126a4,apache/zookeeper,True,False
166217,2d61e7ebf2a75e227a140644357a073fe758ddbf,apache/zookeeper,True,False
166220,5141003acd8ea3307bc36f9fc8c3990105a6f195,apache/zookeeper,True,False


In [39]:
# Stack both tables, order by commit id and keep one row per commit.
combined = pd.concat([java_buggy, remain])
combined_sorted = combined.sort_values('commit_id')
apachejava = combined_sorted.drop_duplicates('commit_id', keep='first')
apachejava

Unnamed: 0,commit_id,project,buggy,fix
122023,0005467e3f5211378c2932f0ebc9b5b2c73d565a,apache/ignite,True,False
52876,000596f7c3074c7fbd490a47ac43be6229993941,apache/hbase,True,True
70726,00067895a01c66d53715b50bbcb3605efd6425f2,apache/hadoop-hdfs,True,False
71761,0007360c3344b3485fa17de0fd2015a628de947c,apache/hadoop-hdfs,True,True
2355,000a67839666bf7cb39d3955757bb05fa95f1b18,apache/hadoop-hdfs,False,True
...,...,...,...,...
164471,fffe0e0f1e2275899db903a925d68af53fd32dac,apache/zookeeper,True,False
135635,fffe4279217ebc8e1c09fc2808765bc65718ab0e,apache/hadoop-mapreduce,True,False
157450,fffeb6d7c37ee673a32584f3b2fd3afe86af793a,apache/spark,True,False
11937,fffecb84f9aeb55bd4576d914e87ad7834504023,apache/cassandra,True,False


In [4]:
from pydriller import Git

def find_date(commits, projects):
    """Return the committer timestamp (integer epoch seconds) of each commit.

    Parameters
    ----------
    commits : sequence of commit hashes.
    projects : parallel sequence of 'owner/repo' names; the text after the
        '/' names the repository directory under '../repos/'.

    Returns
    -------
    list of int, one entry per element of ``commits``, in the same order.

    A lookup that raises ValueError is retried against the directory named by
    the text before the first '-' of the repo name (used for the hadoop
    sub-projects). One Git handle is opened per repository path and reused,
    instead of constructing a new handle for every single commit.
    """
    handles = {}

    def _repo(dir_name):
        # Open each repository at most once and hand back the cached handle.
        path = '../repos/' + dir_name
        if path not in handles:
            handles[path] = Git(path)
        return handles[path]

    dates = []
    for i in range(len(commits)):
        c = commits[i]
        repo_name = projects[i].split('/')[1]
        try:
            commit = _repo(repo_name).get_commit(c)
        except ValueError:  # for hadoop repos
            commit = _repo(repo_name.split('-')[0]).get_commit(c)
        dates.append(int(commit.committer_date.timestamp()))
        # Progress message after every 2000 resolved commits.
        if len(dates) % 2000 == 0:
            print('{} passed.'.format(len(dates)))
    return dates

In [6]:
# Look up the commit date of every row and store it in a new 'date' column.
commit_ids = apachejava['commit_id'].tolist()
repo_names = apachejava['project'].tolist()
dates = find_date(commit_ids, repo_names)
apachejava['date'] = dates

2000 passed.
4000 passed.
6000 passed.
8000 passed.
10000 passed.
12000 passed.
14000 passed.
16000 passed.
18000 passed.
20000 passed.
22000 passed.
24000 passed.
26000 passed.
28000 passed.
30000 passed.
32000 passed.
34000 passed.
36000 passed.
38000 passed.
40000 passed.
42000 passed.
44000 passed.


In [None]:
# Write the final table to disk without the index column.
output_csv = '../data/apachejava.csv'
apachejava.to_csv(output_csv, index=False)