In [1]:
import pandas as pd
import json
import ast
import numpy as np
from datetime import datetime
import time
import re
from statistics import mean 

In [2]:
def find_important_events(events_obj, author_login):
    """Collect the first timestamp of each important event and count commits.

    Args:
        events_obj: JSON string holding a list of event objects. Every event
            is read for its "commit_id" and "event" keys; "actor" is read for
            "subscribed" events and "created_at" when an event is recorded.
        author_login: login of the issue author. A "subscribed" event whose
            actor has this login is never recorded.

    Returns:
        A ``(events_json, commit_count)`` tuple: ``events_json`` is a JSON
        object string mapping event name to the "created_at" of the first
        recorded occurrence (in list order), and ``commit_count`` is the
        number of distinct truthy "commit_id" values seen.
    """
    first_seen = {}
    commit_ids = set()

    for entry in json.loads(events_obj):
        sha = entry["commit_id"]
        kind = entry["event"]
        if sha:
            commit_ids.add(sha)

        if kind == "subscribed" and entry["actor"]:
            # A subscription only counts when it comes from someone other
            # than the issue author.
            by_someone_else = entry["actor"]["login"] != author_login
            should_record = by_someone_else and kind not in first_seen
        else:
            # Everything else (including actor-less subscriptions) is judged
            # against the module-level list of important event names.
            should_record = kind in important_events_name and kind not in first_seen

        if should_record:
            first_seen[kind] = entry["created_at"]

    return json.dumps(first_seen), len(commit_ids)

def compute_time_interval(t1, t2):
    """Return the time elapsed from ``t1`` to ``t2`` in minutes.

    Both arguments are strings in the ``%Y-%m-%dT%H:%M:%SZ`` format (the
    trailing ``Z`` is matched literally). The result is a float and is
    negative when ``t2`` is earlier than ``t1``.
    """
    stamp_format = "%Y-%m-%dT%H:%M:%SZ"
    start, end = (datetime.strptime(stamp, stamp_format) for stamp in (t1, t2))
    elapsed = end - start
    # timedelta normalises to (days, seconds); whole days become minutes and
    # the remaining seconds are added as a fraction of a minute.
    return elapsed.days * 24 * 60 + elapsed.seconds / 60

def find_determinative_events(events_obj, comments_obj):
    """Filter recorded events down to the determinative ones.

    Args:
        events_obj: JSON object string mapping event name to timestamp.
        comments_obj: JSON string holding a list of comment objects; only the
            first element's "created_at" is read.

    Returns:
        A JSON object string with the entries of ``events_obj`` whose name is
        in ``determinative_events_name``. When that selection is non-empty and
        ``comments_obj`` is a non-empty list, a "comment" entry carrying the
        first comment's "created_at" is added. Returns ``'{}'`` when
        ``events_obj`` is ``'{}'`` or nothing is selected; in that case the
        comments are not consulted.
    """
    nothing = '{}'
    if events_obj == nothing:
        return nothing

    recorded = json.loads(events_obj)
    comment_list = json.loads(comments_obj)

    selected = {
        name: stamp
        for name, stamp in recorded.items()
        if name in determinative_events_name
    }
    if not selected:
        return nothing

    if comment_list:
        selected["comment"] = comment_list[0]["created_at"]

    return json.dumps(selected)

def find_reaction_time(events_obj, created_at):
    """Return the minutes between ``created_at`` and the earliest event.

    Args:
        events_obj: JSON object string mapping event name to timestamp.
        created_at: reference timestamp, in the same format as the event
            timestamps (the one ``compute_time_interval`` parses).

    Returns:
        ``np.nan`` when ``events_obj`` decodes to an empty object; otherwise
        the interval in minutes from ``created_at`` to the smallest timestamp
        among the events, as computed by ``compute_time_interval``.
    """
    events = json.loads(events_obj)

    if not events:
        return np.nan

    # The timestamps are compared as strings, exactly as before; for the
    # fixed-width format parsed by compute_time_interval that ordering matches
    # chronological ordering.
    earliest = min(events.values())

    # Delegate to the shared helper instead of repeating its parsing and
    # arithmetic here, so the two cannot drift apart.
    return compute_time_interval(created_at, earliest)

In [3]:
# Event names that find_important_events records (first occurrence only)
# for each issue.
important_events_name = [
    "assigned",
    "closed",
    "labeled",
    "mentioned",
    "merged",
    "milestoned",
    "referenced",
    "review_requested",
    "added_to_project",
    "converted_note_to_issue",
    "moved_columns_in_project",
]
# "subscribed" is not listed here: find_important_events handles it in a
# separate branch and records it only when the actor is someone other than
# the issue author.

# Event names that find_determinative_events keeps from the recorded
# important events.
determinative_events_name = [
    "milestoned",
    "assigned",
    "merged",
    "referenced",
    "added_to_project",
    "converted_note_to_issue",
    "moved_columns_in_project",
]
# "comment" is not listed here: find_determinative_events adds it itself from
# the first comment it is given, and only when at least one of the events
# above is present.

In [4]:
def process(repo_name):
    """Derive per-issue features for one repository and rewrite its CSV.

    Reads ``data/{repo_name}.csv``, keeps the rows whose ``state`` is
    ``"closed"``, adds the derived columns below, drops the raw ``*_obj``
    columns and writes the result back to the same path (without the index).

    Input columns read: ``state``, ``events_obj``, ``author_login``,
    ``comments_obj``, ``num_comments``, ``created_at``; the columns
    ``author_obj``, ``comments_obj``, ``events_obj``, ``issue_obj`` and
    ``closer_obj`` must exist because they are dropped.

    Columns added, in order: ``important_events``, ``commits_count``,
    ``has_commit``, ``cm_developers_obj``, ``cm_developers_number``,
    ``cm_developers_ratio``, ``cm_developers_unique``, ``cm_authors_unique``,
    ``cm_developers_ratio_unique``, ``cm_mean_len``, ``time_to_discuss``,
    ``determinative_events``, ``reaction_time``.

    NOTE: the input file is overwritten and the rewritten file no longer has
    ``events_obj`` / ``comments_obj``, so calling this a second time on the
    same repository raises ``KeyError``.

    Args:
        repo_name: file stem of the CSV inside ``data/``.
    """
    csv_path = f"data/{repo_name}.csv"
    df = pd.read_csv(csv_path)

    # Explicit copy: the new columns below are assigned onto an independent
    # frame rather than onto the result of a boolean selection.
    df = df[df["state"] == "closed"].copy()

    # Plain list comprehensions are used instead of DataFrame.apply(axis=1):
    # on a frame with no closed issues apply() returns a DataFrame rather than
    # per-row results, which made the tuple unpacking here fail.
    event_results = [
        find_important_events(events_obj, author_login)
        for events_obj, author_login in zip(df["events_obj"], df["author_login"])
    ]
    df["important_events"] = [result[0] for result in event_results]
    df["commits_count"] = [result[1] for result in event_results]
    df["has_commit"] = [1 if result[1] != 0 else 0 for result in event_results]

    # Decode every comment list once and reuse it for all comment features.
    comments_per_issue = [json.loads(raw) for raw in df["comments_obj"]]
    # "Developer" comments are those whose author_association is not "NONE".
    developer_comments = [
        [cm for cm in comments if cm["author_association"] != "NONE"]
        for comments in comments_per_issue
    ]

    df["cm_developers_obj"] = [json.dumps(cms) for cms in developer_comments]
    df["cm_developers_number"] = [len(cms) for cms in developer_comments]
    df["cm_developers_ratio"] = [
        0 if total == 0 else developers / total
        for developers, total in zip(df["cm_developers_number"], df["num_comments"])
    ]
    df["cm_developers_unique"] = [
        len({cm["user"]["login"] for cm in cms}) for cms in developer_comments
    ]
    df["cm_authors_unique"] = [
        len({cm["user"]["login"] for cm in comments}) for comments in comments_per_issue
    ]
    df["cm_developers_ratio_unique"] = [
        0 if authors == 0 else developers / authors
        for developers, authors in zip(df["cm_developers_unique"], df["cm_authors_unique"])
    ]
    df["cm_mean_len"] = [
        mean([len(cm["body"]) for cm in comments]) if comments else 0
        for comments in comments_per_issue
    ]

    # Time lapse (in minutes) between issue opening and the last comment
    # posted in the issue discussion; 0 when there are no comments.
    df["time_to_discuss"] = [
        compute_time_interval(created_at, comments[-1]["created_at"]) if comments else 0
        for created_at, comments in zip(df["created_at"], comments_per_issue)
    ]

    df["determinative_events"] = [
        find_determinative_events(important, developers_json)
        for important, developers_json in zip(df["important_events"], df["cm_developers_obj"])
    ]

    df["reaction_time"] = [
        find_reaction_time(determinative, created_at)
        for determinative, created_at in zip(df["determinative_events"], df["created_at"])
    ]

    df = df.drop(columns=["author_obj", "comments_obj", "events_obj", "issue_obj", "closer_obj"])
    df.to_csv(csv_path, index=False)

In [None]:
# Load the repositories to process; repos.txt is parsed as JSON
# (presumably a list of repository names - confirm against the file).
with open("repos.txt") as f:
    repos = json.load(f)

In [5]:
# Run the feature extraction for every repository, printing each name before
# it is processed.
for repo in repos:
    print(repo)
    process(repo)
print("finished")