In [1]:
import ast
import json
import os
import time
from datetime import datetime

import numpy as np
import pandas as pd

In [16]:
def process(reponame, input_dir="../data/repos/0/", output_dir="../data/repos/1/"):
    """Flatten one repository's raw issue dump into a per-issue feature table.

    Reads ``<input_dir>/<reponame>.csv``, decodes the JSON text stored in its
    ``author_obj``, ``comments_obj``, ``events_obj``, ``issue_obj`` and
    ``closer_obj`` columns, appends flat columns derived from those payloads,
    and writes the resulting frame (original columns included, no index) to
    ``<output_dir>/<reponame>.csv``.

    Parameters
    ----------
    reponame : str
        File name of the CSV without its ``.csv`` extension; also stored
        verbatim in the new ``repository`` column.
    input_dir : str, optional
        Directory holding the raw CSV. Defaults to ``"../data/repos/0/"``.
    output_dir : str, optional
        Directory the processed CSV is written to. Defaults to
        ``"../data/repos/1/"``. This function does not create it.

    Returns
    -------
    None
        The result is only written to disk.

    Raises
    ------
    KeyError
        If an expected ``*_obj`` column is absent, or a decoded payload lacks
        one of the keys read below.
    TypeError, ValueError
        Propagated from ``json.loads`` when a cell of ``author_obj``,
        ``comments_obj``, ``events_obj`` or ``issue_obj`` is missing or is not
        valid JSON.
    """
    df = pd.read_csv(os.path.join(input_dir, reponame + ".csv"))
    df['repository'] = reponame

    # These four payloads are decoded unconditionally, so every cell must hold
    # JSON text; a missing value makes json.loads raise.
    parsed = {
        name: df[name + '_obj'].apply(json.loads)
        for name in ('author', 'comments', 'events', 'issue')
    }
    issues = parsed['issue']
    authors = parsed['author']
    # closer_obj is the only payload that may be empty; missing cells stay NaN.
    closers = df['closer_obj'].apply(
        lambda raw: np.nan if pd.isnull(raw) else json.loads(raw)
    )

    # --- issue info -------------------------------------------------------
    issue_columns = ['title', 'body', 'state', 'locked', 'assignee', 'milestone',
                     'comments', 'created_at', 'updated_at', 'closed_at']
    for field in issue_columns:
        # The lambda is consumed immediately by apply(), so capturing the loop
        # variable is safe here.
        df[field] = issues.apply(lambda issue: issue[field])

    df['author_association'] = issues.apply(lambda issue: issue['author_association'])
    df['is_pull_request'] = issues.apply(lambda issue: 1 if 'pull_request' in issue else 0)
    df['labels'] = issues.apply(lambda issue: _join_field(issue['labels'], 'name'))
    df['assignees'] = issues.apply(lambda issue: _join_field(issue['assignees'], 'login'))

    # A falsy title/body (None or empty string) counts as length 0.
    df['title_len'] = df['title'].apply(lambda text: len(text) if text else 0)
    df['body_len'] = df['body'].apply(lambda text: len(text) if text else 0)

    df['num_comments'] = parsed['comments'].apply(len)
    df['num_events'] = parsed['events'].apply(len)

    # --- author and closer info -------------------------------------------
    user_columns = ['login', 'location', 'followers', 'following', 'public_repos',
                    'public_gists', 'created_at', 'updated_at', 'bio', 'site_admin', 'type']
    for field in user_columns:
        # author_*/closer_* are written pairwise to keep the output column order.
        df["author_" + field] = authors.apply(lambda user: user[field])
        df["closer_" + field] = closers.apply(
            lambda user: np.nan if pd.isnull(user) else user[field]
        )

    df["author_core_team"] = df["author_association"].isin(["OWNER", "MEMBER"]).astype(int)
    df["author_has_association"] = df["author_association"].ne("NONE").astype(int)

    # Number of issues in this file opened by the same author login.
    df["author_issue_counts"] = df["author_login"].map(df["author_login"].value_counts())

    df.to_csv(os.path.join(output_dir, reponame + ".csv"), index=False)


def _join_field(records, key):
    """Join ``record[key]`` for each record with ``|``; None if ``records`` is empty or None."""
    return '|'.join(record[key] for record in records) if records else None

In [18]:
# Run process() once for each repository listed below.
for reponame in (
    "okhttp",
    "retrofit",
    "RxJava",
    "guava",
    "spring-boot",
    "spring-framework",
    "elasticsearch",
):
    process(reponame)