# Issue JSON reducer

issues.pickle has downloaded info from github api. kubernetes/kubernetes is a big resource of issues (about 84k  on 2019.11.03) but is huge for use case. In this notebook i will reduce it to make easier to process into PowerBI

In [None]:
!pip install scalpl

In [1]:
import pickle
from scalpl import Cut
import json
from tqdm.notebook import trange, tqdm

In [2]:
with open('issues.pickle', 'rb') as handle:
    issues = pickle.load(handle)

In [3]:
def clear_key_pattern(c_dict, key_pattern):
    # Proxy object to walk over it
    proxy_dict = Cut(c_dict)
    # Proxy object to operate data and return it
    data_dict = proxy_dict.copy()
    
    for i in proxy_dict.items():
        key = i[0]
        value = i[1]
        if key_pattern in key:
            #print(f"|{key}|{type(i[1])}")
            del data_dict[key]
            
        if type(value) is dict:
            data_dict[key] = clear_key_pattern(value, key_pattern)
            
        if type(value) is list:
            n_ele = []
            for c_value in value:
                n_ele.append(clear_key_pattern(c_value, key_pattern))
            data_dict[key] = n_ele
            
    return data_dict

In [71]:
reduced_issues = {}

for i in tqdm(issues.items()):
    key = i[0]
    value = i[1]

    c_issue = clear_key_pattern(value, "url")
    c_issue = clear_key_pattern(c_issue, "node_id")
    c_issue = clear_key_pattern(c_issue, "body")
    reduced_issues[key] = c_issue

HBox(children=(IntProgress(value=0, max=84572), HTML(value='')))




In [None]:
with open("issues_list_reduced.json", "w") as handle:
    json.dump(list(reduced_issues.values()), handle)

In [29]:
def extract_list_items(data, key_list):
    extracted_elements = []

    for i in tqdm(data.items()):
        key = i[0]
        value = i[1]

        if key_list in value:
            for c_value_j in value[key_list]:
                c_value_j['number'] = key
                extracted_elements.append(c_value_j)

    return extracted_elements


In [30]:
labels = extract_list_items(reduced_issues,'labels')

HBox(children=(IntProgress(value=0, max=84572), HTML(value='')))




In [32]:
with open("issues_labels.json", "w") as handle:
    json.dump(labels, handle)

In [39]:
assignees = extract_list_items(reduced_issues,'assignees')

HBox(children=(IntProgress(value=0, max=84572), HTML(value='')))




In [40]:
with open("issues_assignees.json", "w") as handle:
    json.dump(assignees, handle)

In [84]:
def extract_element_w_key(data, origin_key, target_key):
    if origin_key in data and type(data[origin_key]) is dict:
        if target_key in data[origin_key]:
            return data[origin_key][target_key]
    return None

In [86]:
#print(reduced_issues[84639])
#print(extract_element_w_key(reduced_issues[84639], 'assignee', 'login'))

In [101]:
curated_issues = {}

for i in tqdm(reduced_issues.copy().items()):
    key = i[0]
    value = i[1]

    value['user'] = extract_element_w_key(value, 'user', 'login')
    value['assignee'] = extract_element_w_key(value, 'assignee', 'login')
    value['milestone'] = extract_element_w_key(value, 'milestone', 'title')
    if 'labels' in value:
        del value['labels']
    if 'assignees' in value:
        del value['assignees']
    if 'pull_request' in value:
        del value['pull_request']        
    curated_issues[key] = value
    

HBox(children=(IntProgress(value=0, max=84572), HTML(value='')))




In [91]:
with open("issues_curated.json", "w") as handle:
    json.dump(curated_issues, handle)

In [98]:
!ls -alh

total 2531216
drwxr-xr-x  12 jdayllon  staff   384B  3 nov 23:41 [1m[36m.[m[m
drwxr-xr-x   7 jdayllon  staff   224B  3 nov 11:40 [1m[36m..[m[m
-rw-r--r--@  1 jdayllon  staff    51B  3 nov 12:30 .env
drwxr-xr-x   4 jdayllon  staff   128B  3 nov 12:42 [1m[36m.ipynb_checkpoints[m[m
-rw-r--r--   1 jdayllon  staff   3,5K  3 nov 12:42 GitHub API Downloader.ipynb
-rw-r--r--   1 jdayllon  staff   7,9K  3 nov 23:41 Reduce Issues Data.ipynb
-rw-r--r--   1 jdayllon  staff    77M  3 nov 11:40 issues.json
-rw-r--r--   1 jdayllon  staff   496M  2 nov 20:43 issues.pickle
-rw-r--r--   1 jdayllon  staff   9,8M  3 nov 23:23 issues_assignees.json
-rw-r--r--   1 jdayllon  staff    46M  3 nov 23:41 issues_curated.json
-rw-r--r--   1 jdayllon  staff    31M  3 nov 23:22 issues_labels.json
-rw-r--r--   1 jdayllon  staff   566M  3 nov 11:30 issues_list.json


In [100]:
curated_issues

{84641: {'id': 516044254,
  'number': 84641,
  'title': 'ServiceAntiAffinityPriority as score plugin',
  'user': None,
  'state': 'open',
  'locked': False,
  'assignee': None,
  'milestone': None,
  'comments': 3,
  'created_at': '2019-11-01T10:54:03Z',
  'updated_at': '2019-11-01T11:02:05Z',
  'closed_at': None,
  'author_association': 'CONTRIBUTOR',
  'closed_by': None},
 84640: {'id': 516041743,
  'number': 84640,
  'title': 'Use log functions of core framework on test/e2e/framework/kubelet',
  'user': None,
  'state': 'open',
  'locked': False,
  'assignee': None,
  'milestone': None,
  'comments': 5,
  'created_at': '2019-11-01T10:48:19Z',
  'updated_at': '2019-11-01T11:13:54Z',
  'closed_at': None,
  'author_association': 'CONTRIBUTOR',
  'closed_by': None},
 84639: {'id': 516033308,
  'number': 84639,
  'title': 'scheduler: improve some comments and validation messages',
  'user': None,
  'state': 'open',
  'locked': False,
  'assignee': None,
  'milestone': None,
  'comments':