In [17]:
import sys
!{sys.executable} -m pip install python-dotenv

Collecting python-dotenv
  Using cached https://files.pythonhosted.org/packages/32/2e/e4585559237787966aad0f8fd0fc31df1c4c9eb0e62de458c5b6cde954eb/python_dotenv-0.15.0-py2.py3-none-any.whl
Installing collected packages: python-dotenv
Successfully installed python-dotenv-0.15.0


In [1]:
# Jupyter Notebook for GitHub issue data analysis
import os
import pandas as pd

from dotenv import load_dotenv
load_dotenv()
ROOT = os.environ.get("ROOT")  # remember to set your root path in `.env`; refer to installation.

# English repo - train sets
df_tensorflow = pd.read_json(f'{ROOT}/data/eng_labelled/remove_duplicates/tensorflow_removed_dups.json')
df_rust = pd.read_json(f'{ROOT}/data/eng_labelled/remove_duplicates/rust_removed_dups.json')
df_kubernetes = pd.read_json(f'{ROOT}/data/eng_labelled/remove_duplicates/kubernetes_removed_dups.json')

# English repo - test sets
df_flutter = pd.read_json(f'{ROOT}/data/eng_labelled/remove_duplicates/flutter_removed_dups.json')
df_ohmyzsh = pd.read_json(f'{ROOT}/data/eng_labelled/remove_duplicates/ohmyzsh_removed_dups.json')
df_electron = pd.read_json(f'{ROOT}/data/eng_labelled/remove_duplicates/electron_removed_dups.json')

num_row_tensorflow = len(df_tensorflow.index)
num_row_rust = len(df_rust.index)
num_row_kubernetes = len(df_kubernetes.index)
num_row_flutter = len(df_flutter.index)
num_row_ohmyzsh = len(df_ohmyzsh.index)
num_row_electron = len(df_electron.index)

print('Overall')
print('-------')
print('Total number of tensorflow issues:' + str(num_row_tensorflow))
print('Total number of rust issues:' + str(num_row_rust))
print('Total number of kubernetes issues:' + str(num_row_kubernetes))
print('Total number of flutter issues:' + str(num_row_flutter))
print('Total number of ohmyzsh issues:' + str(num_row_ohmyzsh))
print('Total number of electron issues:' + str(num_row_electron))

Overall
-------
Total number of tensorflow issues:19438
Total number of rust issues:13657
Total number of kubernetes issues:19255
Total number of flutter issues:12574
Total number of ohmyzsh issues:1370
Total number of electron issues:4943


In [19]:
import nltk
from nltk.tokenize import RegexpTokenizer
from nltk.probability import FreqDist
from nltk.corpus import stopwords
import string

stop = set(stopwords.words('english'))
tokenizer = RegexpTokenizer(r'\w+')

In [20]:
feature_data = []
doc_data = []
bug_data = []
other_data = []

for _, row in df_kubernetes.iterrows():
    # feature
    if row['labels'] == 'kind/feature' or row['labels'] == 'kind/api-change': 
        feature_data.extend([w for w in tokenizer.tokenize(row['title']) if w not in stop])
        feature_data.extend([w for w in tokenizer.tokenize(row['body']) if w not in stop])
    elif row['labels'] == 'kind/bug' or row['labels'] == 'kind/failing-test':
        bug_data.extend([w for w in tokenizer.tokenize(row['title']) if w not in stop])
        bug_data.extend([w for w in tokenizer.tokenize(row['body']) if w not in stop])
    elif row['labels'] == 'kind/documentation': 
        doc_data.extend([w for w in tokenizer.tokenize(row['title']) if w not in stop])
        doc_data.extend([w for w in tokenizer.tokenize(row['body']) if w not in stop])
    else:
        other_data.extend([w for w in tokenizer.tokenize(row['title']) if w not in stop])
        other_data.extend([w for w in tokenizer.tokenize(row['body']) if w not in stop])

for _, row in df_rust.iterrows():
    if row['labels'] == 'C-feature-request' or row['labels'] == 'C-feature-accepted' or row['labels'] == 'C-enhancement': 
        feature_data.extend([w for w in tokenizer.tokenize(row['title']) if w not in stop])
        feature_data.extend([w for w in tokenizer.tokenize(row['body']) if w not in stop])
    elif row['labels'] == 'C-bug': 
        bug_data.extend([w for w in tokenizer.tokenize(row['title']) if w not in stop])
        bug_data.extend([w for w in tokenizer.tokenize(row['body']) if w not in stop])
    elif row['labels'] == 'T-doc': 
        doc_data.extend([w for w in tokenizer.tokenize(row['title']) if w not in stop])
        doc_data.extend([w for w in tokenizer.tokenize(row['body']) if w not in stop])
    else:
        other_data.extend([w for w in tokenizer.tokenize(row['title']) if w not in stop])
        other_data.extend([w for w in tokenizer.tokenize(row['body']) if w not in stop])

for _, row in df_tensorflow.iterrows():
    if row['labels'] == 'type:feature':
        feature_data.extend([w for w in tokenizer.tokenize(row['title']) if w not in stop])
        feature_data.extend([w for w in tokenizer.tokenize(row['body']) if w not in stop])
    elif row['labels'] == 'type:docs-feature' or row['labels'] == 'type:docs-bug':
        doc_data.extend([w for w in tokenizer.tokenize(row['title']) if w not in stop])
        doc_data.extend([w for w in tokenizer.tokenize(row['body']) if w not in stop])
    elif row['labels'] == 'type:others':
        other_data.extend([w for w in tokenizer.tokenize(row['title']) if w not in stop])
        other_data.extend([w for w in tokenizer.tokenize(row['body']) if w not in stop])
    else:
        bug_data.extend([w for w in tokenizer.tokenize(row['title']) if w not in stop])
        bug_data.extend([w for w in tokenizer.tokenize(row['body']) if w not in stop])
        
print("Feature word frequencies:")
feature_freq = FreqDist(feature_data)
feature_freq.most_common(80)

Feature word frequencies:


[('S', 102839),
 ('std', 30577),
 ('I', 23962),
 ('1', 20116),
 ('0', 19339),
 ('rust', 16952),
 ('Ratio', 16915),
 ('num_rational', 16900),
 ('https', 16294),
 ('error', 15932),
 ('mut', 15331),
 ('io', 13541),
 ('fn', 12151),
 ('com', 11544),
 ('type', 11210),
 ('2', 10817),
 ('iter', 10749),
 ('use', 10397),
 ('would', 10074),
 ('rs', 10064),
 ('kubernetes', 9504),
 ('github', 9172),
 ('src', 8564),
 ('x', 8488),
 ('The', 8314),
 ('version', 8283),
 ('Take', 8144),
 ('like', 8060),
 ('tensorflow', 7972),
 ('This', 7863),
 ('code', 7697),
 ('let', 7690),
 ('feature', 7464),
 ('T', 7082),
 ('3', 7050),
 ('self', 6957),
 ('tf', 6904),
 ('lang', 6448),
 ('impl', 6440),
 ('main', 6112),
 ('test', 5773),
 ('str', 5445),
 ('4', 5252),
 ('note', 5235),
 ('_', 5161),
 ('rustc', 5155),
 ('pub', 5154),
 ('using', 5117),
 ('It', 4857),
 ('foo', 4808),
 ('time', 4778),
 ('one', 4750),
 ('trait', 4745),
 ('5', 4712),
 ('could', 4580),
 ('org', 4553),
 ('name', 4338),
 ('e', 4336),
 ('Empty', 4320

In [21]:
print("Doc word frequencies:")
doc_freq = FreqDist(doc_data)
doc_freq.most_common(80)

Doc word frequencies:


[('https', 6039),
 ('tensorflow', 5905),
 ('0', 4509),
 ('I', 4366),
 ('1', 4238),
 ('tf', 4108),
 ('org', 3080),
 ('com', 2965),
 ('docs', 2482),
 ('documentation', 2449),
 ('rust', 2393),
 ('github', 2373),
 ('issue', 2341),
 ('2', 2271),
 ('python', 2174),
 ('www', 2160),
 ('The', 2140),
 ('kubernetes', 1975),
 ('example', 1901),
 ('version', 1716),
 ('code', 1659),
 ('doc', 1609),
 ('use', 1545),
 ('lang', 1448),
 ('keras', 1274),
 ('master', 1230),
 ('issues', 1224),
 ('TensorFlow', 1211),
 ('10', 1202),
 ('3', 1162),
 ('io', 1159),
 ('x', 1143),
 ('guide', 1138),
 ('blob', 1125),
 ('py', 1120),
 ('std', 1101),
 ('defined', 1038),
 ('name', 1015),
 ('html', 975),
 ('api_docs', 950),
 ('error', 922),
 ('This', 901),
 ('link', 901),
 ('md', 897),
 ('would', 887),
 ('source', 883),
 ('node', 879),
 ('go', 869),
 ('using', 864),
 ('line', 864),
 ('self', 855),
 ('model', 854),
 ('API', 843),
 ('needs', 822),
 ('src', 820),
 ('v1', 808),
 ('lib', 783),
 ('get', 737),
 ('examples', 723)

In [22]:
print("Bug word frequencies:")
bug_freq = FreqDist(bug_data)
bug_freq.most_common(80)

Bug word frequencies:


[('0', 249166),
 ('1', 191927),
 ('tensorflow', 167661),
 ('I', 99559),
 ('2', 86060),
 ('tf', 83433),
 ('lib', 79166),
 ('version', 77218),
 ('_', 76315),
 ('go', 69267),
 ('kubernetes', 66144),
 ('10', 61516),
 ('3', 61310),
 ('src', 60296),
 ('python', 56982),
 ('py', 56140),
 ('io', 54143),
 ('https', 51810),
 ('error', 48827),
 ('com', 48499),
 ('k8s', 44949),
 ('packages', 44434),
 ('cc', 43561),
 ('line', 43438),
 ('6', 41978),
 ('std', 41567),
 ('rustc', 40748),
 ('core', 40293),
 ('7', 39673),
 ('File', 38906),
 ('4', 38125),
 ('self', 38008),
 ('rs', 37914),
 ('model', 36816),
 ('5', 36499),
 ('build', 36375),
 ('x86_64', 36124),
 ('C', 35133),
 ('name', 34814),
 ('TensorFlow', 34267),
 ('code', 34256),
 ('home', 33767),
 ('site', 33185),
 ('github', 33094),
 ('16', 31958),
 ('8', 31602),
 ('11', 31256),
 ('local', 31231),
 ('python3', 31154),
 ('use', 30915),
 ('usr', 30484),
 ('12', 30310),
 ('rust', 30112),
 ('test', 29384),
 ('v1', 29291),
 ('keras', 29257),
 ('bug', 2832

In [23]:
print("Other word frequencies:")
other_freq = FreqDist(other_data)
other_freq.most_common(80)

Other word frequencies:


[('0', 18760),
 ('1', 13986),
 ('kubernetes', 11471),
 ('go', 9528),
 ('I', 8421),
 ('io', 6878),
 ('https', 6510),
 ('10', 6122),
 ('k8s', 5792),
 ('com', 5703),
 ('2', 5242),
 ('kubelet', 4985),
 ('master', 4871),
 ('kube', 4782),
 ('github', 4093),
 ('tensorflow', 3991),
 ('v1', 3963),
 ('version', 3914),
 ('3', 3777),
 ('src', 3427),
 ('01', 3308),
 ('name', 2955),
 ('use', 2878),
 ('11', 2874),
 ('cluster', 2827),
 ('16', 2749),
 ('kubectl', 2685),
 ('4', 2625),
 ('node', 2616),
 ('error', 2509),
 ('pod', 2408),
 ('service', 2287),
 ('tf', 2264),
 ('pkg', 2109),
 ('rust', 2078),
 ('19', 2053),
 ('The', 1995),
 ('x', 1990),
 ('18', 1979),
 ('test', 1961),
 ('5', 1942),
 ('15', 1935),
 ('code', 1925),
 ('get', 1906),
 ('apiserver', 1894),
 ('7', 1891),
 ('default', 1891),
 ('20', 1868),
 ('14', 1859),
 ('server', 1837),
 ('api', 1833),
 ('12', 1825),
 ('lib', 1816),
 ('8', 1810),
 ('Kubernetes', 1807),
 ('docker', 1757),
 ('6', 1750),
 ('file', 1735),
 ('etc', 1733),
 ('17', 1705),


In [2]:
train_mappings = {
    "tf": {
        "feature": ["type:feature"],
        "bug": ["type:bug"],
        "doc": ["type:docs-feature", "type:docs-bug"],
        "repo": df_tensorflow
    },
    "rust": {
        "feature": ["C-feature-request", "C-feature-accepted", "C-enhancement"],
        "bug": ["C-bug"],
        "doc": ["T-doc"],
        "repo": df_rust
    },
    "kubernetes": {
        "feature": ["kind/feature", "kind/api-change"],
        "bug": ["kind/bug"],
        "doc": ["kind/documentation"],
        "repo": df_kubernetes
    }
}

test_mappings = {
    "flutter": {
        "feature": ['severe: new feature'],
        "bug": ["severe: crash", "severe: fatal crash", "severe: rendering"],
        "doc": ["documentation"],
        "repo": df_flutter
    },
    "ohmyzsh": {
        "feature": ["Feature", "Enhancement"],
        "bug": ["Bug"],
        "doc": ["Type: documentation"],
        "repo": df_ohmyzsh,
    },
    "electron": {
        "feature": ["enhancement :sparkles:"],
        "bug": ["bug :beetle:", "crash :boom:"],
        "doc": ["documentation :notebook:"],
        "repo": df_electron
    }
}

In [3]:
def analysis(mappings):
    total_features = 0
    total_bugs = 0
    total_docs = 0
    for repo_label, repo in mappings.items():
        repo_features = 0
        repo_bugs = 0
        repo_docs = 0
        for _, row in repo["repo"].iterrows():
            if row['labels'] in repo["feature"]: repo_features += 1
            elif row['labels'] in repo["bug"]: repo_bugs += 1
            elif row['labels'] in repo["doc"]: repo_docs += 1
        
        print(f'{repo_label} issue analysis')
        print('-------------------------')
        print('Number of feature issues:' + str(repo_features))
        print('Number of bug issues:' + str(repo_bugs))
        print('Number of doc issues:' + str(repo_docs))
        print('Total issues:' + str(repo_features + repo_bugs + repo_docs))        
        print('-------------------------')
        
        total_features += repo_features
        total_bugs += repo_bugs
        total_docs += repo_docs
        
    total = total_features + total_bugs + total_docs
    print(f'Overall issue analysis')
    print('-------------------------')
    print('Number of feature issues:' + str(total_features))
    print('Number of bug issues:' + str(total_bugs))
    print('Number of doc issues:' + str(total_docs))
    print('Total issues:' + str(total))
    
    print('-------------------------')
    print('% of feature issues:' + str(total_features / total))
    print('% of bug issues:' + str(total_bugs / total))
    print('% of doc issues:' + str(total_docs / total))
    print('-------------------------')
        
        
analysis(train_mappings)
analysis(test_mappings)

tf issue analysis
-------------------------
Number of feature issues:2314
Number of bug issues:6063
Number of doc issues:1431
Total issues:9808
-------------------------
rust issue analysis
-------------------------
Number of feature issues:5376
Number of bug issues:6843
Number of doc issues:301
Total issues:12520
-------------------------
kubernetes issue analysis
-------------------------
Number of feature issues:4061
Number of bug issues:10172
Number of doc issues:878
Total issues:15111
-------------------------
Overall issue analysis
-------------------------
Number of feature issues:11751
Number of bug issues:23078
Number of doc issues:2610
Total issues:37439
-------------------------
% of feature issues:0.3138705627821256
% of bug issues:0.6164160367531184
% of doc issues:0.069713400464756
-------------------------
flutter issue analysis
-------------------------
Number of feature issues:4197
Number of bug issues:4044
Number of doc issues:933
Total issues:9174
-------------------

In [None]:
# German repo
df_corona_widget = pd.read_json('./data/de_unlabelled/corona-widget.json')
df_open_wb = pd.read_json('./data/de_unlabelled/openWB.json')

num_corona_widget = len(df_corona_widget.index)
num_open_wb = len(df_open_wb)

print('German repo issue analysis')
print('--------------------------')
print('Total number of corona widget issues:' + str(num_corona_widget))
print('Total number of openWB issues:' + str(num_open_wb))

In [19]:
# French repo
df_dvf_app = pd.read_json('./data/fr_unlabelled/DVF-app.json')
df_grafikart = pd.read_json('./data/fr_unlabelled/Grafikart.fr.json')
df_azure_docs = pd.read_json('./data/fr_unlabelled/azure-docs.fr-fr.json')
df_bcdlibre = pd.read_json('./data/fr_unlabelled/bcdlibre.json')

num_row_dvf_app = len(df_dvf_app.index)
num_grafikart = len(df_grafikart.index)
num_azure_docs = len(df_azure_docs.index)
num_bcdlibre = len(df_bcdlibre.index)

print('French repo issue analysis')
print('--------------------------')
print('Total number of DVF-app issues:' + str(num_row_dvf_app))
print('Total number of Grafikart issues:' + str(num_grafikart))
print('Total number of azure docs issues:' + str(num_azure_docs))
print('Total number of bcdlibre issues:' + str(num_bcdlibre))

French repo issue analysis
--------------------------
Total number of DVF-app issues:104
Total number of Grafikart issues:313
Total number of azure docs issues:247
Total number of bcdlibre issues:36
