In [25]:
import elasticsearch
import pandas as pd
from elasticsearch_dsl import Search,Q
# Client for the local Elasticsearch node that stores both indices.
# NOTE(review): `maxsize` / `block` are passed through to the client as connection options — confirm they are still accepted by the installed client version.
es = elasticsearch.Elasticsearch(['http://localhost:9200/'], maxsize=500, block=False)
es_ma_index = 'grimoirelab_index'        # index queried for the commit data
es_bl_index = 'grimoirelab_blame_index'  # index queried for the blame data

# Using Elasticsearch DSL to read the Blame index
blame_es_data = Search(using=es, index=es_bl_index)
# scan() walks over every hit; each hit is turned into a plain dict,
# so `blame_dict` is a list of records (one dict per document).
blame_dict = [hit.to_dict() for hit in blame_es_data.scan()]

# Using Elasticsearch DSL to read the Commit index
commit_es_data = Search(using=es, index=es_ma_index)
# Same shape as above: a list with one dict per commit document.
commit_dict = [hit.to_dict() for hit in commit_es_data.scan()]

# Creating pandas dataframe for commit data
commit_frame = pd.DataFrame(commit_dict)
# Creating pandas dataframe for blame data
blame_frame = pd.DataFrame(blame_dict)

# Reduce the blame path to its last component so it can be compared with the
# commit frame's 'file_name'. An empty blame index yields a frame without any
# columns, so the column is only rewritten when it actually exists; otherwise
# this line would raise KeyError before the "no blame data" case can be handled.
if 'file' in blame_frame.columns:
    blame_frame['file'] = blame_frame['file'].apply(lambda x: x.split('/')[-1])

print(commit_frame.shape)
print(blame_frame.shape)
# Number of blame rows. An empty blame frame means every commit file is clean.
blame_count = blame_frame.shape[0]

if blame_count > 0:
    # Every row of the blame frame represents a buggy commit file.
    blame_frame['type'] = 'Buggy'
    # Left merge keeps every commit row; rows without a blame match get NaN in
    # the columns coming from the blame frame (including 'type').
    # The join uses 'file_name' because blame_frame['file'] holds only the last
    # path component; a join on the full 'file_path' is not used.
    comb_frame = pd.merge(
        commit_frame,
        blame_frame,
        how='left',
        left_on=['hash', 'file_name'],
        right_on=['blame_hash', 'file'],
    )
else:
    # No blame data: nothing to merge. Work on a copy so later changes to
    # comb_frame do not leak into commit_frame, and create an empty 'type'
    # column so the labelling below works instead of raising KeyError.
    comb_frame = commit_frame.copy()
    comb_frame['type'] = None

# Each commit file (one commit can contain more than one file) is categorised
# as either Buggy or Clean: report the buggy row count, then label the rest.
print(comb_frame[comb_frame['type'] == 'Buggy'].shape)
comb_frame['type'] = comb_frame['type'].fillna('Clean')

# Columns retained for the analysis; every other merged column is dropped.
refined_columns = [
    'hash', 'Author', 'Committer', 'Email', 'message',
    'committed_date', 'no._of_branches', 'merge_commit?',
    'no._of_mod_files', 'dmm_unit_size', 'dmm_unit_complexity', 'dmm_unit_interfacing',
    'file_path', 'file_name', 'complexity', 'functions', 'lines_added', 'lines_removed',
    'size', 'tokens',
    'type',
]


def _to_pacific(dates):
    """Return ``dates`` parsed as timestamps and converted to US/Pacific.

    Each value is stringified and parsed on its own, then converted with
    ``tz_convert``. ``tz_convert`` only works on timezone-aware timestamps,
    so every value is expected to carry a UTC offset.
    """
    return dates.astype('str').apply(lambda x: pd.to_datetime(x).tz_convert('US/Pacific'))


# NOTE: truncating 'hash' to its first ten characters was considered but is
# disabled; the full hash is kept.

# Same conversion on the source frame, as before, in case later cells read
# commit_frame['committed_date'] as timestamps.
commit_frame['committed_date'] = _to_pacific(commit_frame['committed_date'])

# Select the required columns, drop exact duplicate rows, convert the commit
# date of *this* frame to real timestamps, then sort newest first.
# Previously only commit_frame was converted, after this frame had already
# been derived from the merge result, so the sort ran on the raw values.
# `.assign` builds a new frame, so no write goes into a slice of comb_frame.
comb_frame_refined = (
    comb_frame[refined_columns]
    .drop_duplicates()
    .assign(committed_date=lambda frame: _to_pacific(frame['committed_date']))
    .sort_values('committed_date', ascending=False)
)


(1657, 22)
(533, 3)
(556, 26)


In [26]:
#print(commit_frame['file_path']==blame_frame['file'])
#blame_frame



613          docker/Dockerfile-full
618                            None
623       docker/entrypoint-full.sh
1227                  .dockerignore
1232                           None
1450      docker/Dockerfile-factory
1529    docker/Dockerfile-installed
1533               docker/README.md
Name: file_path, dtype: object
