# Issue JSON reducer

`issues.pickle` holds issue data downloaded from the GitHub API. kubernetes/kubernetes is a rich source of issues (about 84k as of 2019-11-03), but the raw data is too large for this use case. In this notebook I reduce it so that it is easier to process in Power BI.

In [None]:
# Install scalpl, the package that provides the `Cut` wrapper imported below.
!pip install scalpl

In [None]:
# List the pickle files under ../resources together with their sizes.
!ls -alh ../resources/*.pickle

# Base

In [None]:
import pickle
from scalpl import Cut
import json
from tqdm.notebook import trange, tqdm

In [None]:
def clear_key_pattern(c_dict, key_pattern):
    """Return a copy of ``c_dict`` without any key that contains ``key_pattern``.

    The match is a plain substring test on the key name and is applied at
    every nesting level: nested dicts are cleaned recursively, and so is every
    dict found inside a list. The input is never modified.

    Args:
        c_dict: Dictionary to clean. A falsy value (``None``, ``{}``) yields
            an empty dict.
        key_pattern: Substring that marks a key for removal.

    Returns:
        A new dict holding the surviving keys, with nested containers rebuilt.
    """
    if not c_dict:
        return {}

    cleaned = {}
    for key, value in c_dict.items():
        # Drop a matching key outright. The previous version deleted it and
        # then re-inserted it whenever its value was a dict or a list.
        if isinstance(key, str) and key_pattern in key:
            continue
        cleaned[key] = _clear_nested(value, key_pattern)
    return cleaned


def _clear_nested(value, key_pattern):
    """Clean ``value`` when it is a dict or a list; return anything else as is."""
    if isinstance(value, dict):
        return clear_key_pattern(value, key_pattern)
    if isinstance(value, list):
        # Scalars inside a list are kept untouched instead of raising.
        return [_clear_nested(item, key_pattern) for item in value]
    return value

In [None]:
def extract_list_items(data, key_list, id_field='number'):
    """Flatten the ``key_list`` list of every entry in ``data`` into one list.

    Each extracted element is tagged with the key of the entry it came from,
    stored under ``id_field``, so it can be related back to its parent.

    Args:
        data: Mapping of entry key -> entry dict.
        key_list: Name of the list-valued field to extract from each entry.
        id_field: Field name under which the parent key is stored in every
            extracted element. An existing field with that name is replaced.

    Returns:
        A list of dicts, one per element found across all entries. Elements
        are shallow copies, so the entries in ``data`` are left unmodified.
    """
    extracted_elements = []

    for parent_key, entry in tqdm(data.items()):
        if key_list not in entry:
            continue
        # A field that is present but set to None contributes nothing.
        for element in entry[key_list] or []:
            # Tag a copy: writing into the original element would silently
            # add `id_field` to the caller's data as a side effect.
            tagged = dict(element)
            tagged[id_field] = parent_key
            extracted_elements.append(tagged)

    return extracted_elements


In [None]:
def extract_element_w_key(data, origin_key, target_key):
    """Return ``data[origin_key][target_key]``, or ``None`` when unavailable.

    ``None`` is returned when ``origin_key`` is missing from ``data``, when
    its value is not a plain dict, or when that dict has no ``target_key``.
    """
    if origin_key not in data:
        return None

    nested = data[origin_key]
    if type(nested) is not dict or target_key not in nested:
        return None

    return nested[target_key]

# Reduce Issue Info

In [None]:
# Load the raw issue data from its pickle file.
# NOTE: unpickling can execute arbitrary code; only load pickles you created yourself.
with open('../resources/issues.pickle', 'rb') as pickle_file:
    issues = pickle.load(pickle_file)

In [None]:
# Strip every key containing "url", "node_id" or "body" from each issue,
# keeping the result indexed by the same keys as `issues`.
reduced_issues = {}

for key, value in tqdm(issues.items()):
    c_issue = value
    for pattern in ("url", "node_id", "body"):
        c_issue = clear_key_pattern(c_issue, pattern)
    reduced_issues[key] = c_issue

In [None]:
# Write the reduced issues as one JSON array (the dict keys are not exported).
with open("../resources/issues_list_reduced.json", "w") as json_file:
    json.dump(list(reduced_issues.values()), json_file)

In [None]:
# Collect the labels of all issues into one flat list, each tagged with its issue key.
labels = extract_list_items(reduced_issues, key_list='labels')

In [None]:
# Write the flattened issue labels to their own JSON file.
with open("../resources/issues_labels.json", "w") as json_file:
    json.dump(labels, json_file)

In [None]:
# Collect the assignees of all issues into one flat list, each tagged with its issue key.
assignees = extract_list_items(reduced_issues, key_list='assignees')

In [None]:
# Write the flattened issue assignees to their own JSON file.
with open("../resources/issues_assignees.json", "w") as json_file:
    json.dump(assignees, json_file)

In [None]:
# Flatten each reduced issue into a single-level record: the nested `user`,
# `closed_by`, `assignee` and `milestone` objects are replaced by one
# identifying field each, and `labels`, `assignees` and `pull_request` are
# dropped (labels and assignees are extracted separately with
# extract_list_items).
curated_issues = {}

for key, issue in tqdm(reduced_issues.items()):
    # Work on a shallow copy so the entries of `reduced_issues` stay intact.
    # Editing them in place meant that running this cell a second time turned
    # the already flattened fields into None.
    value = issue.copy()

    value['user'] = extract_element_w_key(value, 'user', 'login')
    value['closed_by'] = extract_element_w_key(value, 'closed_by', 'login')
    value['assignee'] = extract_element_w_key(value, 'assignee', 'login')
    value['milestone'] = extract_element_w_key(value, 'milestone', 'title')

    for dropped in ('labels', 'assignees', 'pull_request'):
        value.pop(dropped, None)

    curated_issues[key] = value
    

In [None]:
# Write the curated (flattened) issues as one JSON array.
with open("../resources/issues_curated.json", "w") as json_file:
    json.dump(list(curated_issues.values()), json_file)

<h3 style="color:red;">Clear memory</h3>

In [None]:
# Drop the references to the issue data so its memory can be reclaimed.
del curated_issues, reduced_issues, issues

# Reduce Pull Requests Data

In [None]:
# Load the raw pull request data from its pickle file.
# NOTE: unpickling can execute arbitrary code; only load pickles you created yourself.
with open('../resources/pull_requests.pickle', 'rb') as pickle_file:
    pull_requests = pickle.load(pickle_file)

In [None]:
# Strip every key containing "url", "node_id" or "body" from each pull
# request, keeping the result indexed by the same keys as `pull_requests`.
reduced_pulls = {}

for key, value in tqdm(pull_requests.items()):
    c = value
    for pattern in ("url", "node_id", "body"):
        c = clear_key_pattern(c, pattern)
    reduced_pulls[key] = c

In [None]:
# Pull the list-valued fields out of every pull request, tagging each element
# with the key of the pull request it belongs to.
# NOTE(review): none of these three lists is written to disk or deleted in the
# cells shown in this notebook - confirm whether an export step is missing.
pull_requests_labels = extract_list_items(reduced_pulls, key_list='labels')
pull_requests_assignees = extract_list_items(reduced_pulls, key_list='assignees')
pull_requests_requested_reviewers = extract_list_items(
    reduced_pulls, key_list='requested_reviewers'
)
#pull_requests_requested_head = extract_list_items(reduced_pulls,'head')
#pull_requests_base = extract_list_items(reduced_pulls,'base')

In [None]:
# Flatten each reduced pull request into a single-level record: the nested
# `user`, `assignee` and `milestone` objects are replaced by one identifying
# field each, and the list fields `labels`, `assignees` and
# `requested_reviewers` are dropped (they are extracted separately with
# extract_list_items).
curated_pull_requets = {}

for key, pull in tqdm(reduced_pulls.items()):
    # Work on a shallow copy so the entries of `reduced_pulls` stay intact.
    # Editing them in place meant that running this cell a second time turned
    # the already flattened fields into None.
    value = pull.copy()

    value['user'] = extract_element_w_key(value, 'user', 'login')
    value['assignee'] = extract_element_w_key(value, 'assignee', 'login')
    value['milestone'] = extract_element_w_key(value, 'milestone', 'title')

    for dropped in ('labels', 'assignees', 'requested_reviewers'):
        value.pop(dropped, None)
    #if 'head' in value:
    #    del value['head']
    #if 'base' in value:
    #    del value['base']

    curated_pull_requets[key] = value

In [None]:
# Write the curated (flattened) pull requests as one JSON array.
with open("../resources/curated_pull_requets.json", "w") as json_file:
    json.dump(list(curated_pull_requets.values()), json_file)

<h3 style="color:red;">Clear memory</h3>

In [None]:
# Drop the references to the pull request data so its memory can be reclaimed.
del pull_requests, reduced_pulls, curated_pull_requets

# Reduce Releases Info

In [None]:
# Load the raw release data from its pickle file.
# NOTE: unpickling can execute arbitrary code; only load pickles you created yourself.
with open('../resources/releases.pickle', 'rb') as pickle_file:
    releases = pickle.load(pickle_file)

In [None]:
# Strip every key containing "url", "node_id" or "body" from each release,
# keeping the result indexed by the same keys as `releases`.
reduced_releases = {}

for key, value in tqdm(releases.items()):
    c = value
    for pattern in ("url", "node_id", "body"):
        c = clear_key_pattern(c, pattern)
    reduced_releases[key] = c

In [None]:
# Collect the assets of all releases into one flat list, each tagged with its release key.
# NOTE(review): `assets` is not written to disk or deleted in the cells shown
# in this notebook - confirm whether an export step is missing.
assets = extract_list_items(reduced_releases, key_list='assets')

In [None]:
# Flatten each reduced release into a single-level record: the nested
# `author` object is replaced by its login and the `assets` list is dropped
# (assets are extracted separately with extract_list_items).
curated_releases = {}

for key, release in tqdm(reduced_releases.items()):
    # Work on a shallow copy so the entries of `reduced_releases` stay intact.
    # Editing them in place meant that running this cell a second time turned
    # the already flattened `author` field into None.
    value = release.copy()

    value['author'] = extract_element_w_key(value, 'author', 'login')
    value.pop('assets', None)

    curated_releases[key] = value

In [None]:
# Write the curated (flattened) releases as one JSON array.
with open("../resources/curated_releases.json", "w") as json_file:
    json.dump(list(curated_releases.values()), json_file)

<h3 style="color:red;">Clear memory</h3>

In [None]:
# Drop the references to the release data so its memory can be reclaimed.
del releases, reduced_releases, curated_releases

# Reduce Commits Info

In [None]:
# Load the raw commit data from its pickle file.
# NOTE: unpickling can execute arbitrary code; only load pickles you created yourself.
with open('../resources/commits.pickle', 'rb') as pickle_file:
    commits = pickle.load(pickle_file)

In [None]:
# Strip every key containing "url", "node_id" or "body" from each commit,
# keeping the result indexed by the same keys as `commits`.
reduced_commits = {}

for key, value in tqdm(commits.items()):
    c = value.copy()
    for pattern in ("url", "node_id", "body"):
        c = clear_key_pattern(c, pattern)
    reduced_commits[key] = c
    

In [None]:
#commits_parents = extract_list_items(reduced_commits,'parents')
#commits_files = extract_list_items(reduced_commits,'files')

# Build one row per (commit, parent) pair from the raw commits.
# NOTE(review): "parent_sha" receives the key of the commit being iterated,
# while "sha" receives the sha of the entry found in its `parents` list - the
# two field names look swapped. Confirm against the downstream model before
# changing the exported schema.
commits_parents = [
    {"parent_sha": commit_key, "sha": parent['sha']}
    for commit_key, commit in tqdm(commits.items())
    for parent in commit['parents']
]


In [None]:
# Build one row per file entry of every raw commit, tagged with the key of
# the commit it belongs to.
commits_files = [
    {
        "commit": commit_key,
        "key": changed_file['sha'],
        "filename": changed_file['filename'],
        "status": changed_file['status'],
        "changes": changed_file['changes'],
        "additions": changed_file['additions'],
        "deletions": changed_file['deletions'],
    }
    for commit_key, commit in tqdm(commits.items())
    for changed_file in commit['files']
]

In [None]:
# Flatten each reduced commit into a single-level record: selected values of
# the nested `commit` and `stats` objects are promoted to top-level fields,
# then `parents`, `commit`, `files` and `stats` are dropped.
curated_commits = {}

for key, commit in tqdm(reduced_commits.items()):
    # Shallow copy so the entries of `reduced_commits` are left untouched.
    value = commit.copy()
    # Cut gives dotted-path access ("commit.author.name") into the nested dict.
    proxy = Cut(value)

    # Read the account logins from the top-level `author` / `committer`
    # objects BEFORE those keys are overwritten with plain names below.
    # In GitHub's commit payload the login lives on those objects, while the
    # nested `commit.author` / `commit.committer` only carry name, email and
    # date - so, assuming the pickle follows that schema, the lookup of
    # `commit.author.login` alone always fell back to the email.
    author_account = extract_element_w_key(value, 'author', 'login')
    committer_account = extract_element_w_key(value, 'committer', 'login')

    value['author'] = proxy['commit.author.name']
    value['date'] = proxy['commit.author.date']
    value['message'] = proxy['commit.message']
    value['committer'] = proxy['commit.committer.name']
    # NOTE(review): `login` duplicates `committer` (same source path);
    # possibly meant to hold an account login - confirm before changing.
    value['login'] = proxy['commit.committer.name']
    value['stats_total'] = proxy['stats.total']
    value['stats_additions'] = proxy['stats.additions']
    value['stats_deletions'] = proxy['stats.deletions']

    # Login priority: the original nested path (kept for backward
    # compatibility), then the top-level account login, then the git email.
    if 'commit.author.login' in proxy:
        value['author_login'] = proxy['commit.author.login']
    elif author_account is not None:
        value['author_login'] = author_account
    else:
        value['author_login'] = proxy['commit.author.email']

    # The 'commiter_login' spelling is kept: it is the exported field name.
    if 'commit.committer.login' in proxy:
        value['commiter_login'] = proxy['commit.committer.login']
    elif committer_account is not None:
        value['commiter_login'] = committer_account
    else:
        value['commiter_login'] = proxy['commit.committer.email']

    for dropped in ('parents', 'commit', 'files', 'stats'):
        value.pop(dropped, None)

    curated_commits[key] = value

In [None]:
# Write the curated (flattened) commits as one JSON array.
with open("../resources/curated_commits.json", "w") as json_file:
    json.dump(list(curated_commits.values()), json_file)

In [None]:
# Write the commit/parent rows to their own JSON file.
with open("../resources/commits_parents.json", "w") as json_file:
    json.dump(commits_parents, json_file)

In [None]:
# Write the per-commit file rows to their own JSON file.
with open("../resources/curated_files.json", "w") as json_file:
    json.dump(commits_files, json_file)

<h3 style="color:red;">Clear memory</h3>

In [None]:
# Drop the references to the commit data so its memory can be reclaimed.
del commits, reduced_commits, curated_commits, commits_parents, commits_files