In [37]:
import numpy
import requests
import json
import plotly.express as px
import pandas as pd
import datetime
from numpy import ones,vstack
from numpy.linalg import lstsq

In [50]:
provider = "521xueweihan_GitHub520"
nodes = ['pulls', 'issues', 'commits', 'comments']
save_node = "repositoryStats"
# nodes = ['pulls', 'issues']
start_date = '2017-09-01T00:00:00-03:00'
end_date = '2021-09-15T23:59:59-03:00'

date_format = '%Y-%m-%dT%H:%M:%S%z'

In [39]:
def save_data(_provider, _data_api_params):
    _data_api_url = f'https://githunter-bind-starws-r3.labbs.com.br/publish/provider/'+ _provider + '/node/' + save_node + '?createRawData=true'
    _data_api_headers = {
        'content-type': 'application/json'
    }
    _data_api_request = requests.post(_data_api_url, json=_data_api_params, headers=_data_api_headers)
    _data = json.loads(_data_api_request.text)
    return _data

In [40]:
def load_data(_provider, _node, _start_date, _end_date):
    _data_api_url = f'https://githunter-bind-starws-r3.labbs.com.br/metrics'
    _data_api_params = {
        'startDateTime': _start_date,
        'endDateTime': _end_date,
        'provider': _provider,
        'node': _node
    }
    _data_api_headers = {
        'content-type': 'application/json'
    }
    _data_api_request = requests.get(_data_api_url, params=_data_api_params, headers=_data_api_headers)

    if _data_api_request.status_code != 200:
        print('No data found in StarWS')
        return None
    else:
        _data = json.loads(_data_api_request.text)['data']
        # data_source = json.dumps(data, indent=4, sort_keys=True)
        return _data

In [41]:
def load_mongo_data(_repo_list):
    _data_api_url = f'https://githunter-data-provider-r3.labbs.com.br/code-info'
    _data_api_params = {
        'repoList': _repo_list
    }
    _data_api_headers = {
        'content-type': 'application/json'
    }
    _data_api_request = requests.get(_data_api_url, json=_data_api_params, headers=_data_api_headers)
    _data = json.loads(_data_api_request.text)
    # _data_source = json.dumps(_data, indent=4, sort_keys=True)
    if 'data' in _data:
        return _data['data']
    elif 'repositories' in _data:
        return _data['repositories']
    return _data


In [42]:
metrics = {
    # frequency
    "commitFrequency": [[366,0], [168,40], [48,60], [6,100]],
    "issuesRecent": [[720,0],[360,20],[160,50],[48,80],[24,100]],
    "issuesGettingClosed": [[720,0], [360,40], [160,80], [72,100]],
    "issuesClosedQuickly": [[720,0], [504,40], [240,60], [120,100]],
    "issuesResponseQuickly": [[336,0], [168,40], [48,50], [24,80],[2,100]],
    "pullsResponseQuickly": [[336,0], [168,40], [48,50], [24,80],[2,100]],
    "pullsRecent": [[720,0],[360,20],[160,50],[48,80],[24,100]],
    "pullsGettingMerged": [[720,0], [360,40], [160,80], [72,100]],

    #definition oos
    "readmeFileSize": [[0,0], [3500,100]],

    # popularity
    "numberOfContributors": [[1,0], [50,30] ,[100,70], [200,100]],
    "numberOfIssues": [[1,0], [50,30] ,[100,70], [200,100]],
    "numberOfPulls": [[1,0], [50,30] ,[100,70], [200,100]]
}

def format_date(_date_string):
    _date_obj = datetime.datetime.strptime(_date_string, date_format)
    return _date_obj

def average(_list):
    if len(_list) > 0:
        return sum(_list) / len(_list)
    return 0

def get_metric_percent(_metric, _value):
    _btw_index = -1 # Last array position
    for i, v in enumerate(metrics[_metric]):
        if _value >= v[0]:
            _btw_index = i
            break
    if _btw_index == 0:
        return metrics[_metric][0][1] # returning minimal score
    elif _btw_index == -1:
        return metrics[_metric][-1][1] # returning maximal score

    _point_0 = (metrics[_metric][i-1][0], metrics[_metric][i-1][1])
    _point_1 = (metrics[_metric][i][0], metrics[_metric][i][1])
    _points = [_point_0,_point_1]
    _x_coords, _y_coords = zip(*_points)
    _A = vstack([_x_coords,ones(len(_x_coords))]).T
    _x, _c = lstsq(_A, _y_coords)[0]

    _score = _x * _value + _c

    return _score

In [43]:
def calc_frequency(_list):
#     sort by date
#     print(_list)
    if len(_list) == 0:
        return 0
    _frequency = 0
    for index, value in enumerate(_list):
        if index == 0:
            _date_a = datetime.datetime.now().replace(tzinfo=datetime.timezone.utc).astimezone(tz=None)
        else:
            _date_a = format_date(_list[index-1][0])
        _date_b = format_date(_list[index][0])
        _delta = _date_a - _date_b
        if _delta.total_seconds() < 0:
            _delta = _date_b - _date_a
        # print("Delta: ",_date_a, _date_b, _delta)
        _frequency = _frequency + _delta.total_seconds()
    _frequency = (_frequency/len(_list)) / 60 / 60
    return _frequency

def frequency_calc_score(_list, _metric):
    _frequency = calc_frequency(_list)
    _score = get_metric_percent(_metric, _frequency)
    return _score

def frequency_calc_score_close_time_issue(_list_of_dict):
    _comments = _list_of_dict['comments.data']
    _frequency = []
    for _i, _c in _comments.items():
        if not _c:
            continue
        _closed_at = _list_of_dict['closedAt'].get(_i)
        _sorted = sorted(_c, key=lambda x: format_date(x['createdAt']), reverse=True)
        for _s in _sorted:
            _diff = format_date(_closed_at) - format_date(_s['createdAt'])
            # sometime the user say thanks then close the issue
            if _diff.total_seconds() >= 30:
                break
        _score = get_metric_percent('issuesClosedQuickly', (_diff.total_seconds() / 60 / 60))
        _frequency.append(_score)
    return average(_frequency)

def frequency_calc_score_response_time_issue(_list_of_dict):
    _comments = _list_of_dict['comments.data']
    _frequency = []
    for _i, _c in _comments.items():
        if not _c:
            continue
        _created_at = _list_of_dict['createdAt'].get(_i)
        _sorted = sorted(_c, key=lambda x: format_date(x['createdAt']))
        for _s in _sorted:
            _diff = format_date(_created_at) - format_date(_s['createdAt'])
            # sometime the user say thanks then close the issue
            if _diff.total_seconds() >= 30:
                break
        _score = get_metric_percent('issuesResponseQuickly', (_diff.total_seconds() / 60 / 60))
        _frequency.append(_score)
    return average(_frequency)

def frequency_calc_score_response_time_pull(_list_of_dict):
    _comments = _list_of_dict['comments.data']
    _frequency = []
    for _i, _c in _comments.items():
        if not _c:
            continue
        _created_at = _list_of_dict['createdAt'].get(_i)
        _sorted = sorted(_c, key=lambda x: format_date(x['createdAt']))
        for _s in _sorted:
            _diff = format_date(_created_at) - format_date(_s['createdAt'])
            # sometime the user say thanks then close the issue
            if _diff.total_seconds() >= 30:
                break
        _score = get_metric_percent('issuesResponseQuickly', (_diff.total_seconds() / 60 / 60))
        _frequency.append(_score)
    return average(_frequency)

In [44]:
def definition_oos_calc(_repo):
    _definitionOSS = []

    # Does it have a license?
    if _repo['licenseInfo'] != "":
        _definitionOSS.append(100)
    else:
        _definitionOSS.append(0)

    # Does it have a Readme? Check the size (reference size repo Docker)
    if _repo['hasReadmeFile']:
        _score = 100
        if not numpy.isnan(_repo['readmeFileSize']):
            _score = get_metric_percent("readmeFileSize", _repo['readmeFileSize'])
        _definitionOSS.append(_score)
    else:
        _definitionOSS.append(0)

    # Does it have a Contribution?
    if _repo['hasContributingFile']:
        _definitionOSS.append(70)
    else:
        _definitionOSS.append(30)

    # Does it have a Code of Conduct?
    if _repo['hasCodeOfConductFile']:
        _definitionOSS.append(70)
    else:
        _definitionOSS.append(30)

    return average(_definitionOSS)

In [45]:
def popularity_calc_contribution(_qty):
    _score = get_metric_percent("numberOfContributors", _qty)
    return _score

def popularity_calc_issues(_qty):
    _score = get_metric_percent("numberOfIssues", _qty)
    return _score

def popularity_calc_pulls(_qty):
    _score = get_metric_percent("numberOfPulls", _qty)
    return _score

In [46]:
data = {}
for node in nodes:
    raw_data = load_data(provider, node, start_date, end_date)
    # Keeping last version of itens
    if raw_data:
        if node in 'commits':
            data[node] = pd.DataFrame.from_dict(pd.json_normalize(raw_data))
        else:
            data[node] = pd.DataFrame.from_dict(pd.json_normalize(raw_data)).sort_values('dateTime').groupby('number').tail(1)
    else:
        data[node] = pd.DataFrame()

No data found in StarWS
No data found in StarWS


In [47]:
# Getting owner and name from all nodes to request data from Mongo
frames = []
for k in data:
    if len(data[k]) > 0:
        frames.append(data[k][['owner', 'name']])

if frames:
    result = pd.concat(frames).drop_duplicates().to_json(orient="table", index=None)
    repoList = json.loads(result)['data']

    if repoList:
        code_data = load_mongo_data(repoList)
        data['code'] = pd.DataFrame.from_dict(code_data)

In [59]:
stats_list = []

print (data)

# We have data[node] with a clear data frame
if data:
    for _, repo in data['code'].iterrows():
        # if repo['name'] != 'thefuck':
        #    continue
        # print(repo)
        if len(data['issues']) > 0:
            repoIssues = data['issues'].loc[(data['issues'].owner == repo['owner']) &  (data['issues'].name == repo['name'])]
        else:
            repoIssues = None
        # print(repoIssues)
        if len(data['pulls']) > 0:
            repoPulls = data['pulls'].loc[(data['pulls'].owner == repo['owner']) &  (data['pulls'].name == repo['name'])]
        else:
            repoPulls = None
        # print(repoPulls)
        if len(data['commits']) > 0:
            repoCommits = data['commits'].loc[(data['commits'].owner == repo['owner']) &  (data['commits'].name == repo['name'])]
        else:
            repoCommits = None
        # print("Commits:", repoCommits[['committedDate']].values)

        if repoIssues is not None:
            issuesClosed = repoIssues.loc[repoIssues['state'] == 'CLOSED']
        else:
            issuesClosed = None

        ############################# POPULARITY ####################################
        popularity_list = []

        # How many open pull requests are there?
        if repoPulls is not None:
            openPullsQty = repoPulls.loc[repoPulls['state'] == 'OPEN'].count()['dateTime']
        else:
            openPullsQty = 0
        # print("openPullsQty:", openPullsQty)
        p = popularity_calc_pulls(openPullsQty)
        popularity_list.append(p)

        # How many open issues are there?
        if repoIssues is not None:
            open_issues_qty = repoIssues.loc[repoIssues['state'] == 'OPEN'].count()['dateTime']
        else:
            open_issues_qty = 0
        # print("open issuesQty:", open_issues_qty)
        p = popularity_calc_issues(open_issues_qty)
        popularity_list.append(p)

        # How many contributors does the project have?
        # if repoIssues is not None and repoIssues['participants.totalCount'] is not None:
        #     participantsTotalCount = repoIssues['participants.totalCount'].astype(int).sum()
        # else:
        #     participantsTotalCount = 0
        # # print("participantsTotalCount:", participantsTotalCount)
        # p = popularity_calc_contribution(participantsTotalCount)
        # popularity_list.append(p)

        print(popularity_list)
        popularity = average(popularity_list)
        # print("Popularity:", popularity)
        # print('##########################')


        ############################# FREQUENCY ####################################
        frequency_list = []
        # When was the latest commit?
        if repoCommits is not None:
            f = frequency_calc_score(repoCommits[['committedDate']].values, 'commitFrequency' )
        else:
            f = 0
        frequency_list.append(f)

        # Are the issues recent?
        if repoIssues is not None:
            f = frequency_calc_score(repoIssues[['createdAt']].values, 'issuesRecent' )
        else:
            f = 0
        frequency_list.append(f)

        # Are issues getting closed?
        if repoIssues is not None:
            f = frequency_calc_score(repoIssues.loc[repoIssues['state'] == 'CLOSED'][['closedAt']].values, 'issuesGettingClosed' )
        else:
            f = 0
        frequency_list.append(f)

        # Are issues closed quickly?
        if issuesClosed is not None:
            print (issuesClosed)
            f = frequency_calc_score_close_time_issue(issuesClosed[['closedAt', 'comments.data']])
        else:
            f = 0
        frequency_list.append(f)

        # Do maintainers respond quickly to issues when they are opened?
        if repoIssues is not None:
            f = frequency_calc_score_response_time_issue(repoIssues[['createdAt', 'comments.data']])
        else:
            f = 0
        frequency_list.append(f)

        # Do maintainers respond quickly to pull requests when they are opened?
        # print(repoPulls[['createdAt', 'comments.data']])
        if repoPulls is not None:
            f = frequency_calc_score_response_time_pull(repoPulls[['createdAt', 'comments.data']])
        else:
            f = 0
        frequency_list.append(f)

        # Are the pull requests recent?
        if repoPulls is not None:
            f = frequency_calc_score(repoPulls[['createdAt']].values, 'pullsRecent' )
        else:
            f = 0
        frequency_list.append(f)

        # How recently were any pull requests merged?
        if repoPulls is not None:
            f = frequency_calc_score(repoPulls.loc[repoPulls['state'] == 'MERGED'][['mergedAt']].values, 'pullsGettingMerged' )
        else:
            f = 0
        frequency_list.append(f)

        frequency = average(frequency_list)
        # print("Frequency:", frequency)
        # print('##########################')


        ############################# DEFINITION OOS ####################################
        definitionOSS = definition_oos_calc(repo)
        # print(definitionOSS)

        ############################# QUALITY ####################################
        quality = 0.5

        ############################# FRIENDLY ####################################
        friendly = 0.5


        stats = {
            'dateTime': datetime.datetime.now().strftime(date_format),
            'frequency': round(frequency, 2),
            'definitionOSS': round(definitionOSS, 2),
            'popularity': round(popularity, 2),
            'friendly': round(friendly, 2),
            'quality': round(quality, 2),
            'name': repo['name'],
            'owner': repo['owner'],
            'provider': provider,
            'language': repo['languages']
        }
        stats_list.append(stats)

        df = pd.DataFrame(dict(
            r=[frequency, definitionOSS, 5, popularity, 5],
            theta=['Frequency','Definition of OSS','Friendly',
                   'Popularity', 'Quality']))
        fig = px.line_polar(df, r='r', theta='theta', line_close=True)
        fig.update_layout(
            title = repo['owner'] + '/' + repo['name'],
            polar=dict(
                radialaxis=dict(
                visible=True,
                  range=[0, 101]
                )),
            showlegend=False
        )
        fig.show()
        # break

{'pulls':                dateTime  number   state             createdAt  \
2  2020-07-28T02:32:46Z      16  CLOSED  2021-02-20T07:57:54Z   
3  2021-01-12T12:32:00Z      32  CLOSED  2021-02-14T11:23:53Z   
7  2021-01-19T02:53:19Z      36  MERGED  2021-01-06T02:40:46Z   
5  2021-01-24T08:37:26Z      40  MERGED  2021-01-19T02:51:30Z   
6  2021-01-25T01:27:39Z      41  MERGED  2021-02-06T15:29:45Z   
8  2021-02-01T12:16:09Z      44  MERGED  2021-01-24T09:41:01Z   
9  2021-02-01T12:17:40Z      46  MERGED  2021-01-29T13:49:21Z   
0  2021-02-07T06:05:06Z      48  CLOSED  2021-01-24T03:02:05Z   
1  2021-02-14T11:30:48Z      49  MERGED  2020-07-27T09:00:21Z   
4  2021-02-20T08:08:35Z      50  MERGED  2021-01-31T02:45:04Z   

               closedAt merged                       mergedAt      author  \
2  2021-02-20T07:58:24Z   true  2020-07-28T02:32:46.000+00:00   duandaxei   
3  2021-02-14T11:30:48Z   true  2021-01-12T12:32:00.000+00:00     agooday   
7  2021-01-12T12:32:00Z   true           20


`rcond` parameter will change to the default of machine precision times ``max(M, N)`` where M and N are the input matrix dimensions.



In [61]:
    print (stats_list)
    sr = save_data(provider, stats_list)
    print(sr)

[{'dateTime': '2021-03-22T12:30:59', 'frequency': 50.32, 'definitionOSS': 15.0, 'popularity': 50.0, 'friendly': 0.5, 'quality': 0.5, 'name': 'GitHub520', 'owner': '521xueweihan', 'provider': '521xueweihan_GitHub520', 'language': ['Python']}]
{'code': 'DATA_CREATED', 'message': 'State point batch created', 'error': '', 'data': [{'dateTime': '2021-03-22T12:30:59.000+00:00', 'tags': {'category': 'CXF', 'dono': 'o:521xueweihan', 'name': 'n:GitHub520', 'provider': '521xueweihan_GitHub520'}, 'fields': {'definitionOSS': '15', 'frequency': '50.32', 'friendly': '0.5', 'popularity': '50', 'quality': '0.5', 'rawData': 'https://agrows-file-server-r3.labbs.com.br/owner/githunter/thing/521xueweihan_GitHub520/node/json/datajson/b01d3d39-f548-49ef-9f6d-467d6939fb27', 'type': 'repositoryStats'}}]}
