In [1]:
import subprocess

import h5py
import numpy as np
import pandas as pd

In [2]:
# Open the aggregated GHArchive HDF5 file strictly read-only. The mode is given
# explicitly because h5py releases before 3.0 defaulted to an append/create mode,
# which could create or modify a file on a mistyped path.
file = h5py.File("/Users/jpivarski/storage/data/GHArchive/GHArchive-2022-aggregated.h5", "r")

In [3]:
# Build the event table: each DataFrame column is taken from one dataset of the HDF5 file.
column_to_dataset = {
    "actor": "actor_id",
    "repo": "repo_id",
    "type": "event_type_id",
    "count": "count",
}
df = pd.DataFrame({column: file[dataset] for column, dataset in column_to_dataset.items()})

In [4]:
# Numeric IDs of bot accounts: every line of the actor table that contains the
# literal text "[bot]"; the ID is the first tab-separated field of that line.
#
# The file is scanned directly in Python rather than piped through an external
# `fgrep`, whose exit status was never checked: a missing or unreadable file used
# to produce an empty array silently, so nothing would be filtered as a bot.
# Now such a failure raises here.
with open("/Users/jpivarski/storage/data/GHArchive/actor_id_name.txt", "rb") as actor_table:
    bot_actors = np.array(
        [int(line.split(b"\t")[0]) for line in actor_table if b"[bot]" in line]
    )

In [5]:
# Keep only the rows whose actor is not one of the bot IDs.
df2 = df.loc[~df["actor"].isin(bot_actors)]

In [7]:
# Event-type name -> numeric ID.
# Event types listed only in comments are deliberately left out of the mapping.
event_type_to_id = {
    "CommitCommentEvent": 1,
    "CreateEvent": 2,
    "DeleteEvent": 3,
    "DownloadEvent": 4,
    # left out: 5 FollowEvent, 6 ForkApplyEvent, 7 ForkEvent, 8 GistEvent, 9 GollumEvent
    "IssueCommentEvent": 10,
    "IssuesEvent": 11,
    # left out: 12 MemberEvent, 13 PublicEvent
    "PullRequestEvent": 14,
    "PullRequestReviewCommentEvent": 15,
    "PullRequestReviewEvent": 16,
    "PushEvent": 17,
    "ReleaseEvent": 18,
    # left out: 19 TeamAddEvent, 20 WatchEvent
}

In [8]:
# Restrict to the event types in event_type_to_id, then total the counts
# for each (actor, repo) pair.
df3 = (
    df2.loc[df2["type"].isin(event_type_to_id.values())]
    .drop(columns=["type"])
    .groupby(["actor", "repo"])
    .sum()
)

In [9]:
# Display df3; the rendered output below elides the middle rows.
df3

Unnamed: 0_level_0,Unnamed: 1_level_0,count
actor,repo,Unnamed: 2_level_1
0,32611596,3
0,54378638,1
0,56915933,1
0,65711522,1
0,79607905,1
...,...,...
121737330,584019097,2
121737333,584019433,2
121737340,584019333,2
121737357,584019416,1


In [10]:
# Seed repository IDs: the first comma-separated field of every row after the
# header line. The file is opened in a `with` block so the handle is closed, and
# blank lines are skipped instead of crashing int().
with open("/Users/jpivarski/talks/2023-05-09-chep23-analysis-of-physicists/analysis/list-of-scientific-python-repo_id.csv") as seed_csv:
    next(seed_csv, None)  # skip the header row (no error if the file is empty)
    seed_repos = [int(row.rstrip("\n").split(",")[0]) for row in seed_csv if row.strip()]

In [12]:
# Move df3's index levels back into ordinary columns so they can be filtered on below.
df4 = df3.reset_index()

In [15]:
# Rows whose repo is one of the seed repositories.
df4_repos_0 = df4.loc[df4["repo"].isin(seed_repos)]

In [20]:
# All rows (any repo) belonging to an actor that appears in df4_repos_0.
df4_repos_1 = df4.loc[df4["actor"].isin(df4_repos_0["actor"].values)]

In [22]:
# Row counts of the seed-repo selection, the one-step expansion, and the full table.
tuple(len(frame) for frame in (df4_repos_0, df4_repos_1, df4))

(27666, 594790, 86449452)

In [23]:
# Number of distinct repos in each of the same three tables.
tuple(len(np.unique(frame["repo"])) for frame in (df4_repos_0, df4_repos_1, df4))

(210, 446117, 73892130)

In [25]:
# Write the distinct repo IDs of df4_repos_1, one per line (no trailing newline).
# The `with` block guarantees the file is flushed and closed; the character count
# returned by write() is kept so the cell still displays it.
with open("list-of-degree1-repos.txt", "w") as out:
    n_chars_written = out.write("\n".join(str(x) for x in np.unique(df4_repos_1.repo)))
n_chars_written

4407171

In [26]:
# Distinct repo IDs occurring in df4_repos_1, followed by how many there are.
degree1 = set(df4_repos_1["repo"])
len(degree1)

446117

In [27]:
# Map each repo ID in `degree1` to its name by scanning the tab-separated
# "<id>\t<name>" table once.
degree1_id2name = {}

# `with` closes the file when the scan ends (the bare open() never did).
with open("/Users/jpivarski/storage/data/GHArchive/repo_id_name.txt") as repo_table:
    for line in repo_table:
        # Strip only the newline: slicing off the last character would cut a
        # real character from the name if the final line has no trailing newline.
        idstr, name = line.rstrip("\n").split("\t")
        idnum = int(idstr)
        if idnum in degree1:
            degree1_id2name[idnum] = name

In [28]:
# True when the IDs that received a name are exactly the IDs in degree1.
degree1_id2name.keys() == degree1

True

In [40]:
# Write one "id,name,count" line per entry of degree1_id2name.
#
# NOTE(review): the count written is that of the FIRST df4_repos_1 row for the
# repo (a single actor's count), exactly as before. It is not the repo's total
# over all actors. Confirm this is what is wanted.
#
# The first-row counts are collected in one pass; previously the whole frame was
# re-scanned with a boolean mask for every repo.
first_count_by_repo = (
    df4_repos_1.drop_duplicates("repo", keep="first").set_index("repo")["count"].to_dict()
)
# The `with` block guarantees the file is flushed and closed; the character count
# returned by write() is kept so the cell still displays it.
with open("list-of-degree1-repos.csv", "w") as out:
    n_chars_written = out.write(
        "".join(
            f"{idnum},{name},{first_count_by_repo[idnum]}\n"
            for idnum, name in degree1_id2name.items()
        )
    )
n_chars_written

17093691

In [7]:
# Boolean mask over df selecting these event-type IDs.
is_comment = df["type"].isin(
    [
        10,  # IssueCommentEvent
        11,  # IssuesEvent
        15,  # PullRequestReviewCommentEvent
        16,  # PullRequestReviewEvent
    ]
)

In [9]:
# Rows selected by is_comment, without the now-redundant type column.
df_comment = df.loc[is_comment].drop(columns="type")

In [10]:
# Display df_comment; the rendered output below elides the middle rows.
df_comment

Unnamed: 0,actor,repo,count
22,41347805,1,1
27,46025304,1,2
74,116427844,1,2
111,1595356,27,2
112,1595356,27,1
...,...,...,...
241048242,99094815,584016253,2
241048842,35613825,584017175,2
241048843,40209326,584017175,1
241048966,1580039,584017373,8


In [16]:
# Save the bot actor IDs to bot-actors.csv using Series.to_csv defaults.
# NOTE(review): with the defaults the positional index is presumably written as a
# first column too — confirm that is wanted by whoever reads this file.
pd.Series(bot_actors).to_csv("bot-actors.csv")

In [18]:
# Repository IDs, each annotated with the repository it is labelled as.
seed = [
    658518,  # ipython/ipython
    1385122,  # matplotlib/matplotlib
    890377,  # networkx/networkx
    858127,  # pandas-dev/pandas
    2014929,  # scikit-image/scikit-image
    843222,  # scikit-learn/scikit-learn
    13221727,  # pydata/xarray
    1460385,  # scipy/scipy
]

In [19]:
# Inspect the rows recorded under actor ID 0.
df.loc[df["actor"] == 0]

Unnamed: 0,actor,repo,type,count
11062163,0,32611596,16,3
16277189,0,54378638,16,1
16737843,0,56915933,16,1
18599358,0,65711522,16,1
21445990,0,79607905,16,1
26520038,0,105711679,16,1
32682954,0,137451403,16,1
41114131,0,180085554,16,1
62819371,0,260273356,16,1
86317163,0,379570608,16,1
