In [1]:
import numpy
import requests
import json
import plotly.express as px
import pandas as pd
import datetime
from numpy import ones,vstack
from numpy.linalg import lstsq

In [2]:
# StarWS provider whose repositories are scored.
provider = 'github'
# Node the computed statistics are published to by save_data.
save_node = 'repositoryStats'
# Nodes requested from the metrics endpoint (use ['pulls', 'issues'] to skip commits).
nodes = ['pulls', 'issues', 'commits']

# Time window forwarded to the metrics endpoint.
start_date = '2017-09-01T00:00:00-03:00'
end_date = '2021-09-15T23:59:59-03:00'

# strptime pattern used by format_date to parse timestamps.
date_format = '%Y-%m-%dT%H:%M:%S%z'

In [18]:
def save_data(_provider, _data_api_params):
    """POST ``_data_api_params`` as a JSON body to the StarWS publish endpoint.

    The request targets the module-level ``save_node`` node of ``_provider``
    with ``createRawData=true``. Returns the decoded JSON body of the response.
    """
    _endpoint = (
        'http://githunter-bind-starws.labbs.com.br/publish/provider/'
        f'{_provider}/node/{save_node}?createRawData=true'
    )
    _response = requests.post(
        _endpoint,
        json=_data_api_params,
        headers={'content-type': 'application/json'},
    )
    return json.loads(_response.text)

In [19]:
def load_data(_provider, _node, _start_date, _end_date):
    """Fetch the records of ``_node`` for ``_provider`` from the StarWS metrics endpoint.

    ``_start_date`` and ``_end_date`` are forwarded as the ``startDateTime``
    and ``endDateTime`` query parameters. Returns the ``data`` entry of the
    JSON response, or ``None`` (after printing a notice) when the response
    object is falsy.
    """
    _query = dict(
        startDateTime=_start_date,
        endDateTime=_end_date,
        provider=_provider,
        node=_node,
    )
    _response = requests.get(
        'http://githunter-bind-starws.labbs.com.br/metrics',
        params=_query,
        headers={'content-type': 'application/json'},
    )

    # A requests Response evaluates to False when its status code signals an error.
    if not _response:
        print('No data found in StarWS')
        return None

    return json.loads(_response.text)['data']

In [14]:
def load_mongo_data(_repo_list):
    """Request code information for ``_repo_list`` from the local code-info service.

    The list is sent as the ``repoList`` field of a JSON body on a GET
    request. If the decoded response contains a ``data`` key its value is
    returned; otherwise the value under ``repositories`` if present;
    otherwise the decoded response itself.
    """
    _response = requests.get(
        'http://localhost:3333/code-info',
        json={'repoList': _repo_list},
        headers={'content-type': 'application/json'},
    )
    _payload = json.loads(_response.text)

    # Unwrap whichever envelope key is present, 'data' taking precedence.
    for _envelope_key in ('data', 'repositories'):
        if _envelope_key in _payload:
            return _payload[_envelope_key]
    return _payload


In [6]:
# Threshold tables consumed by get_metric_percent.
# Every entry is a list of [raw value, score] pairs.
metrics = {
    # Frequency: raw values are hours (the frequency helpers convert seconds
    # to hours before the lookup); smaller values map to higher scores.
    "commitFrequency":       [[366, 0], [168, 40], [48, 60], [6, 100]],
    "issuesRecent":          [[720, 0], [360, 20], [160, 50], [48, 80], [24, 100]],
    "issuesGettingClosed":   [[720, 0], [360, 40], [160, 80], [72, 100]],
    "issuesClosedQuickly":   [[720, 0], [504, 40], [240, 60], [120, 100]],
    "issuesResponseQuickly": [[336, 0], [168, 40], [48, 50], [24, 80], [2, 100]],
    "pullsResponseQuickly":  [[336, 0], [168, 40], [48, 50], [24, 80], [2, 100]],
    "pullsRecent":           [[720, 0], [360, 20], [160, 50], [48, 80], [24, 100]],
    "pullsGettingMerged":    [[720, 0], [360, 40], [160, 80], [72, 100]],

    # Definition of OSS: README size (presumably bytes - TODO confirm the unit).
    "readmeFileSize": [[0, 0], [3500, 100]],

    # Popularity: raw values are counts; larger values map to higher scores.
    "numberOfContributors": [[1, 0], [50, 30], [100, 70], [200, 100]],
    "numberOfIssues":       [[1, 0], [50, 30], [100, 70], [200, 100]],
    "numberOfPulls":        [[1, 0], [50, 30], [100, 70], [200, 100]],
}

def format_date(_date_string):
    """Parse ``_date_string`` with the module-level ``date_format`` pattern and return the datetime."""
    return datetime.datetime.strptime(_date_string, date_format)

def average(_list):
    """Return the arithmetic mean of ``_list``, or 0 when it has no elements."""
    _count = len(_list)
    return sum(_list) / _count if _count > 0 else 0

def get_metric_percent(_metric, _value, _thresholds=None):
    """Map a raw measurement onto the score scale of a threshold table.

    A table is a list of ``[value, score]`` pairs. The score is linearly
    interpolated between the two pairs that bracket ``_value`` and clamped to
    the score of the nearest end pair when ``_value`` lies outside the table.
    Tables may be listed in rising or falling value order; both are handled.

    Parameters:
        _metric: key of the table in the module-level ``metrics`` dict.
        _value: raw measurement to score.
        _thresholds: optional table to use instead of ``metrics[_metric]``.

    Returns:
        The score, taken verbatim from the table when clamped and computed
        by interpolation otherwise.

    Raises:
        KeyError: if ``_thresholds`` is not given and ``_metric`` is unknown.
        IndexError: if the table is empty.
    """
    _table = metrics[_metric] if _thresholds is None else _thresholds
    # Work on a value-ascending copy so rising and falling tables share one
    # code path. The previous scan assumed falling order, which scored every
    # rising table (readme size, popularity counts) upside down.
    _points = sorted(_table, key=lambda _pair: _pair[0])

    if _value <= _points[0][0]:
        return _points[0][1]
    if _value >= _points[-1][0]:
        return _points[-1][1]

    for (_x0, _y0), (_x1, _y1) in zip(_points, _points[1:]):
        if _x0 <= _value <= _x1:
            # Straight line through the two bracketing points.
            return _y0 + (_y1 - _y0) * (_value - _x0) / (_x1 - _x0)

    # Only reachable when _value compares false against everything (NaN);
    # keep the historical result for that case: the table's last listed score.
    return _table[-1][1]

In [7]:
def calc_frequency(_list):
    """Return the mean gap, in hours, between consecutive events up to now.

    ``_list`` is a sequence of rows whose first element is a timestamp string
    in ``date_format`` (the notebook passes ``frame[[column]].values``). The
    timestamps are ordered newest first, and the gaps ``now -> newest``,
    ``newest -> next``, ... are summed and divided by the number of rows.
    Returns 0 for an empty input.
    """
    if len(_list) == 0:
        return 0

    # Sort so the result does not depend on the order the rows arrive in.
    _dates = sorted((format_date(_row[0]) for _row in _list), reverse=True)

    # Use a genuinely UTC-aware "now"; relabelling the local wall clock as UTC
    # would shift the first gap by the machine's UTC offset.
    _previous = datetime.datetime.now(datetime.timezone.utc)
    _total_seconds = 0
    for _current in _dates:
        # abs() keeps a timestamp slightly in the future from reducing the total.
        _total_seconds += abs((_previous - _current).total_seconds())
        _previous = _current

    return _total_seconds / len(_dates) / 60 / 60

def frequency_calc_score(_list, _metric):
    """Score the value returned by ``calc_frequency(_list)`` against the ``_metric`` table."""
    return get_metric_percent(_metric, calc_frequency(_list))

def frequency_calc_score_close_time_issue(_list_of_dict):
    """Average the ``issuesClosedQuickly`` score over all issues that have comments.

    ``_list_of_dict`` must offer ``closedAt`` and ``comments.data`` entries
    sharing the same keys (the notebook passes a two-column DataFrame). For
    each issue the comments are walked newest first, and the gap between the
    close time and the first comment made at least 30 seconds before it is
    scored in hours. When no comment qualifies, the gap to the oldest comment
    is scored instead. Issues without comments are skipped.
    """
    _scores = []
    for _key, _issue_comments in _list_of_dict['comments.data'].items():
        if not _issue_comments:
            continue
        _closed_raw = _list_of_dict['closedAt'].get(_key)
        _newest_first = sorted(
            _issue_comments,
            key=lambda _comment: format_date(_comment['createdAt']),
            reverse=True,
        )
        _closed_at = format_date(_closed_raw)
        for _comment in _newest_first:
            _gap = _closed_at - format_date(_comment['createdAt'])
            # A comment right before closing is often just the author saying thanks.
            if _gap.total_seconds() >= 30:
                break
        _hours = _gap.total_seconds() / 60 / 60
        _scores.append(get_metric_percent('issuesClosedQuickly', _hours))
    return average(_scores)

def _first_response_score(_list_of_dict, _metric):
    """Average the ``_metric`` score of the first-response delay over all commented items.

    ``_list_of_dict`` must offer ``createdAt`` and ``comments.data`` entries
    sharing the same keys (the notebook passes a two-column DataFrame). For
    each item the comments are walked oldest first, and the delay from the
    item's creation to the first comment made at least 30 seconds later is
    scored in hours. When no comment qualifies, the delay to the latest
    comment is scored instead. Items without comments are skipped.
    """
    _scores = []
    for _key, _item_comments in _list_of_dict['comments.data'].items():
        if not _item_comments:
            continue
        _opened_at = format_date(_list_of_dict['createdAt'].get(_key))
        _oldest_first = sorted(format_date(_comment['createdAt']) for _comment in _item_comments)
        for _commented_at in _oldest_first:
            # Comment time minus creation time: the reversed subtraction was
            # always negative, so the threshold never fired and every item
            # ended up with the maximal score.
            _delay = _commented_at - _opened_at
            # Comments within 30 seconds of opening are not counted as a response
            # (presumably the author's own follow-up or a bot - TODO confirm).
            if _delay.total_seconds() >= 30:
                break
        _scores.append(get_metric_percent(_metric, _delay.total_seconds() / 60 / 60))
    return average(_scores)


def frequency_calc_score_response_time_issue(_list_of_dict):
    """Score how quickly issues receive their first response (``issuesResponseQuickly`` table)."""
    return _first_response_score(_list_of_dict, 'issuesResponseQuickly')


def frequency_calc_score_response_time_pull(_list_of_dict):
    """Score how quickly pull requests receive their first response (``pullsResponseQuickly`` table)."""
    return _first_response_score(_list_of_dict, 'pullsResponseQuickly')

In [8]:
def definition_oos_calc(_repo):
    """Return the "definition of OSS" score of ``_repo`` as the mean of four sub-scores.

    ``_repo`` is indexed by key (the notebook passes a DataFrame row) and must
    provide ``licenseInfo``, ``hasReadmeFile``, ``readmeFileSize``,
    ``hasContributingFile`` and ``hasCodeOfConductFile``.
    """
    # License: full marks unless the field is the empty string.
    _license_score = 100 if _repo['licenseInfo'] != "" else 0

    # Readme: scored by size (reference size: the Docker repo); a readme
    # whose size is NaN gets full marks, a missing readme gets none.
    if _repo['hasReadmeFile']:
        _readme_size = _repo['readmeFileSize']
        if numpy.isnan(_readme_size):
            _readme_score = 100
        else:
            _readme_score = get_metric_percent("readmeFileSize", _readme_size)
    else:
        _readme_score = 0

    # Contribution guide and code of conduct: 70 when present, 30 when absent.
    _contributing_score = 70 if _repo['hasContributingFile'] else 30
    _conduct_score = 70 if _repo['hasCodeOfConductFile'] else 30

    return average([_license_score, _readme_score, _contributing_score, _conduct_score])

In [9]:
def popularity_calc_contribution(_qty):
    """Score a contributor count against the ``numberOfContributors`` table."""
    return get_metric_percent("numberOfContributors", _qty)

def popularity_calc_issues(_qty):
    """Score an issue count against the ``numberOfIssues`` table."""
    return get_metric_percent("numberOfIssues", _qty)

def popularity_calc_pulls(_qty):
    """Score a pull-request count against the ``numberOfPulls`` table."""
    return get_metric_percent("numberOfPulls", _qty)

In [30]:
# Load every configured node into a DataFrame, keyed by node name.
# Nodes for which load_data returns nothing are left out of `data`.
data = {}
for node in nodes:
    raw_data = load_data(provider, node, start_date, end_date)
    if not raw_data:
        continue

    # json_normalize already returns a DataFrame.
    node_frame = pd.json_normalize(raw_data)

    if node != 'commits':
        # Keep only the latest snapshot of each issue / pull request.
        # Numbers restart in every repository, so the repository (owner, name)
        # must be part of the key; grouping by number alone would keep a single
        # row per number across all repositories.
        node_frame = (
            node_frame
            .sort_values('dateTime')
            .groupby(['owner', 'name', 'number'])
            .tail(1)
        )

    data[node] = node_frame

pulls
issues
commits


In [27]:
# Collect the (owner, name) columns of every loaded frame so the code
# information of each distinct repository can be requested from Mongo.
frames = [loaded_frame[['owner', 'name']] for loaded_frame in data.values()]

if frames:
    unique_repos = pd.concat(frames).drop_duplicates()
    result = unique_repos.to_json(orient="table", index=None)
    repoList = json.loads(result)['data']

    # Only query the service when there is at least one repository to ask about.
    if repoList:
        code_data = load_mongo_data(repoList)
        data['code'] = pd.DataFrame.from_dict(code_data)

In [29]:
stats_list = []

# Frames the scoring loop reads. A node that returned no data (or a failed
# code-info lookup) leaves its key out of `data`, so check for all of them
# up front instead of failing with a KeyError halfway through.
required_frames = ('code', 'issues', 'pulls', 'commits')
missing_frames = [frame_key for frame_key in required_frames if frame_key not in data]

if missing_frames:
    print('Skipping repository scoring; no data loaded for:', ', '.join(missing_frames))
else:
    for _, repo in data['code'].iterrows():
        # Rows of each node that belong to the current repository.
        repoIssues = data['issues'].loc[(data['issues']['owner'] == repo['owner']) & (data['issues']['name'] == repo['name'])]
        repoPulls = data['pulls'].loc[(data['pulls']['owner'] == repo['owner']) & (data['pulls']['name'] == repo['name'])]
        repoCommits = data['commits'].loc[(data['commits']['owner'] == repo['owner']) & (data['commits']['name'] == repo['name'])]

        ############################# POPULARITY ####################################
        popularity_list = []

        # How many open pull requests are there?
        openPullsQty = repoPulls.loc[repoPulls['state'] == 'OPEN'].count()['dateTime']
        popularity_list.append(popularity_calc_pulls(openPullsQty))

        # How many open issues are there?
        open_issues_qty = repoIssues.loc[repoIssues['state'] == 'OPEN'].count()['dateTime']
        popularity_list.append(popularity_calc_issues(open_issues_qty))

        # How many contributors does the project have?
        # This sums the participant counts of the repository's issues.
        participantsTotalCount = repoIssues['participants.totalCount'].astype(int).sum()
        popularity_list.append(popularity_calc_contribution(participantsTotalCount))

        popularity = average(popularity_list)

        ############################# FREQUENCY ####################################
        frequency_list = []

        # When was the latest commit?
        frequency_list.append(
            frequency_calc_score(repoCommits[['committedDate']].values, 'commitFrequency'))

        # Are the issues recent?
        frequency_list.append(
            frequency_calc_score(repoIssues[['createdAt']].values, 'issuesRecent'))

        # Are issues getting closed?
        issuesClosed = repoIssues.loc[repoIssues['state'] == 'CLOSED']
        frequency_list.append(
            frequency_calc_score(issuesClosed[['closedAt']].values, 'issuesGettingClosed'))

        # Are issues closed quickly?
        frequency_list.append(
            frequency_calc_score_close_time_issue(issuesClosed[['closedAt', 'comments.data']]))

        # Do maintainers respond quickly to issues when they are opened?
        frequency_list.append(
            frequency_calc_score_response_time_issue(repoIssues[['createdAt', 'comments.data']]))

        # Do maintainers respond quickly to pull requests when they are opened?
        frequency_list.append(
            frequency_calc_score_response_time_pull(repoPulls[['createdAt', 'comments.data']]))

        # Are the pull requests recent?
        frequency_list.append(
            frequency_calc_score(repoPulls[['createdAt']].values, 'pullsRecent'))

        # How recently were any pull requests merged?
        pullsMerged = repoPulls.loc[repoPulls['state'] == 'MERGED']
        frequency_list.append(
            frequency_calc_score(pullsMerged[['mergedAt']].values, 'pullsGettingMerged'))

        frequency = average(frequency_list)

        ############################# DEFINITION OSS ####################################
        definitionOSS = definition_oos_calc(repo)

        ############################# QUALITY ####################################
        # Placeholder: no quality metric is computed yet.
        quality = 0.5

        ############################# FRIENDLY ####################################
        # Placeholder: no friendliness metric is computed yet.
        friendly = 0.5

        stats = {
            'frequency': round(frequency, 2),
            'definitionOSS': round(definitionOSS, 2),
            'popularity': round(popularity, 2),
            'friendly': round(friendly, 2),
            'quality': round(quality, 2),
            'name': repo['name'],
            'owner': repo['owner'],
            'provider': provider,
            'language': repo['languages']
        }
        stats_list.append(stats)

        # Radar chart of the five dimensions. Friendly and Quality are plotted
        # from the same variables that are saved, so chart and stats agree.
        df = pd.DataFrame(dict(
            r=[frequency, definitionOSS, friendly, popularity, quality],
            theta=['Frequency', 'Definition of OSS', 'Friendly',
                   'Popularity', 'Quality']))
        fig = px.line_polar(df, r='r', theta='theta', line_close=True)
        fig.update_layout(
            title=repo['owner'] + '/' + repo['name'],
            polar=dict(
                radialaxis=dict(
                    visible=True,
                    range=[0, 101]
                )),
            showlegend=False
        )
        fig.show()

    # Publish the statistics of all repositories in a single request.
    sr = save_data(provider, stats_list)
    print(sr)