In [1]:
import json
import pandas as pd
import gc
import fasttext

In [3]:
# Maps each GitHub label that was downloaded (one dump file per label) to the
# coarse category stored in the "label_cat" column. Key order matters: it is
# the order in which the dump files are read.
labels_category = {
    "bug": "bug",
    "enhancement": "feature",
    "support": "other",
    "docs": "other",
    "documentation": "other",
    "question": "other",
}

# Top-level issue fields copied verbatim into every row.
first_level_columns = [
    'repository_url',
    'number',
    'title',
    'body',
    'locked',
    'comments',
    'created_at',
    'updated_at',
    'closed_at',
    'author_association',
]

# Keys read from each issue's "reactions" object; stored as "reaction_<key>".
reaction_columns = [
    "total_count",
    "+1",
    "-1",
    "laugh",
    "hooray",
    "confused",
    "heart",
    "rocket",
    "eyes",
]

In [4]:
def extract_data(dicts_list, key):
    """Collect the value stored under ``key`` from every dict in ``dicts_list``.

    Args:
        dicts_list: iterable of dicts.
        key: key looked up in each dict.

    Returns:
        A list with one value per input dict, in input order.

    Raises:
        KeyError: if any dict does not contain ``key``.
    """
    values = []
    for entry in dicts_list:
        values.append(entry[key])
    return values

In [5]:
# Flatten the per-label issue dumps into one flat record per issue and build
# the working DataFrame. An issue present in several dump files yields one
# record per file.
issues_list = []

for label, label_cat in labels_category.items():
    # Explicit encoding so decoding does not depend on the platform default.
    # The file is closed as soon as it is parsed rather than held open while
    # the rows are built.
    with open(f"data/{label}_issues.txt", encoding="utf-8") as f:
        issues = json.load(f)
    print(label, len(issues))

    for issue in issues:
        issue_dict = {c: issue[c] for c in first_level_columns}
        issue_dict["author_login"] = issue["user"]["login"]
        # The "pull_request" key is only present on issues that are PRs.
        issue_dict['is_pull_request'] = 1 if 'pull_request' in issue else 0
        # Identity comparison is the correct test for a JSON null.
        issue_dict['has_milestone'] = 0 if issue["milestone"] is None else 1
        issue_dict["has_assignee"] = 0 if issue["assignee"] is None else 1
        # All label names joined with '|' into a single string column.
        issue_dict['labels'] = '|'.join(extract_data(issue['labels'], 'name'))
        # Only the count is needed, so no intermediate list of logins is built.
        issue_dict['num_of_assignees'] = len(issue['assignees'])
        for c in reaction_columns:
            issue_dict["reaction_" + c] = issue["reactions"][c]
        issue_dict["label"] = label
        issue_dict["label_cat"] = label_cat
        issues_list.append(issue_dict)

    # Release the parsed dump before loading the next (large) file.
    issues.clear()
    gc.collect()

df = pd.DataFrame(issues_list)

bug 471031
enhancement 430849
support 4712
docs 4809
documentation 26652
question 77785


In [6]:
# (rows, columns) of the combined frame, before any de-duplication.
df.shape

(1015838, 27)

In [7]:
# Rows repeating an earlier (repository_url, number, label) triple, i.e. the
# same issue recorded more than once under the same label. True = repeat.
df.duplicated(subset=["repository_url","number","label"], keep='first').value_counts()

False    1015670
True         168
dtype: int64

In [8]:
# Rows repeating an earlier (repository_url, number) pair regardless of label,
# i.e. the same issue seen again (under the same or a different label).
df.duplicated(subset=["repository_url","number"], keep='first').value_counts()

False    992740
True      23098
dtype: int64

In [9]:
# Keep one row per issue. With the default keep='first', an issue that appears
# under several labels keeps the label whose rows come first in the frame.
# NOTE(review): that makes "label"/"label_cat" depend on row order for
# multi-label issues - confirm this is intended.
# Reassigning instead of inplace=True keeps the step explicit.
df = df.drop_duplicates(subset=["repository_url", "number"])

In [10]:
# Numeric rank per author_association value; any value not listed here
# (including a missing one) is ranked 4.
_ASSOCIATION_RANK = {"NONE": 0, "CONTRIBUTOR": 1, "MEMBER": 2, "OWNER": 3}

# Character lengths; a missing title/body counts as length 0.
df['title_len'] = df['title'].str.len().fillna(0).astype(int)
df['body_len'] = df['body'].str.len().fillna(0).astype(int)

df['numeric_association'] = df['author_association'].map(_ASSOCIATION_RANK).fillna(4).astype(int)
df["author_core_team"] = df["author_association"].isin(["OWNER", "MEMBER"]).astype(int)
df["author_has_association"] = df["author_association"].ne("NONE").astype(int)

# "labels" is a '|'-joined string. ''.split('|') is [''] (length 1), so an
# issue with no labels must be special-cased to count as 0 rather than 1.
# NOTE(review): a label name that itself contains '|' is still over-counted.
df['labels_count'] = df["labels"].apply(lambda x: len(x.split('|')) if x else 0)

In [11]:
# Coerce both text columns to str, then collapse every run of whitespace
# (including newlines and tabs) to a single space and trim the ends.
# NOTE(review): astype(str) presumably turns a missing title/body into the
# literal text "None", which is then language-detected below - confirm intended.
for text_col in ("title", "body"):
    df[text_col] = df[text_col].astype(str)
for text_col in ("title", "body"):
    df[text_col] = df[text_col].apply(lambda text: " ".join(text.split()))

In [12]:
def _predicted_labels(lid_model, texts):
    """Run ``lid_model.predict`` on every entry of ``texts`` and return element 0 of its result."""
    return lid_model.predict(texts.tolist())[0]


# Language-identification model loaded from the local data directory.
model = fasttext.load_model('data/lid.176.bin')
title_langs = _predicted_labels(model, df["title"])
body_langs = _predicted_labels(model, df["body"])




In [None]:
# Keep only the first predicted label for each title / body.
df["title_lang"] = [labels[0] for labels in title_langs]
df["body_lang"] = [labels[0] for labels in body_langs]

# The raw prediction lists are no longer needed; empty them to free memory.
for predictions in (title_langs, body_langs):
    predictions.clear()

In [16]:
# Shapes of the subsets with an English title, an English body, and both.
title_is_en = df["title_lang"] == "__label__en"
body_is_en = df["body_lang"] == "__label__en"
for english_mask in (title_is_en, body_is_en, title_is_en & body_is_en):
    print(df[english_mask].shape)

(886439, 34)
(920696, 34)
(871878, 34)


In [18]:
# Number of issues per predicted title language, most frequent first.
df["title_lang"].value_counts()

__label__en    886439
__label__zh     19575
__label__es     12245
__label__de     11811
__label__fr      8746
                ...  
__label__tk         1
__label__cv         1
__label__mg         1
__label__rm         1
__label__ku         1
Name: title_lang, Length: 134, dtype: int64

In [19]:
# Keep only issues whose title and body were both tagged as English.
english_label = "__label__en"
df = df[df["title_lang"].eq(english_label) & df["body_lang"].eq(english_label)]

In [32]:
# The language tags were only needed for filtering. Reassign rather than use
# inplace=True: df is a filtered selection of the original frame at this point,
# and in-place mutation of such a selection is what pandas' copy warnings
# guard against.
df = df.drop(columns=["title_lang", "body_lang"])

In [33]:
# Persist the English-only issues; the row index is not written.
df.to_csv("data/english_issues.csv", index=False)

In [5]:
# Count missing bodies.
# NOTE(review): "body" was cast with astype(str) earlier, which should leave
# no NaN in the in-memory frame, and this cell's execution count is out of
# sequence. The recorded output presumably comes from a frame re-read from the
# saved CSV in another session - confirm and add the loading cell.
df.body.isna().value_counts()

False    747058
True     124820
Name: body, dtype: int64