In [None]:
import json

import pandas as pd

In [None]:
class Config:
    """Central place for the file paths and selection thresholds used by this notebook."""

    # Input: repository metadata, loaded with pandas.read_json.
    git_metadata = "./datasets/repo_metadata.json"
    # Input: language -> file-extension table, taken from
    # https://gist.github.com/ppisarczyk/43962d06686722d26d176fad46879d41
    lang_extensions = "./datasets/lang_extensions.json"

    # Minimum requirements a repository has to meet to be selected.
    min_reqs = dict(
        # Lower bounds (inclusive) on the popularity counters.
        stars=25,
        watchers=25,
        forks=25,
        # Languages of interest; used for the primary language, the per-repo
        # language lists and the extension table.
        target_languages={
            "Python", "Java", "Go", "JavaScript",
            "C++", "TypeScript", "PHP", "C",
            "C#", "Rust", "Scala", "Kotlin",
        },
        # Cumulative share of a repository's code that its largest languages
        # must exceed before the remaining languages are dropped.
        size_thr=0.5,
        # Lower bound on the summed size of the languages kept per repository
        # (presumably bytes of code - confirm against the metadata source).
        languages_size_sum=100000,
        # Lower bound on the size of each individual kept language.
        language_size=10000,
    )
    # Accepted values of the "license" column.
    licenses = ["MIT License", "MIT No Attribution", "Apache License 2.0"]
    # Output: CSV file the top-150 selection is written to.
    out_file = "./datasets/top_150.csv"

In [None]:
# Build a {language name: [extensions]} mapping restricted to the target languages.
# The file is decoded explicitly as UTF-8 (the JSON interchange encoding) so the
# result does not depend on the platform's locale encoding.
with open(Config.lang_extensions, "r", encoding="utf-8") as f:
    # Build the mapping straight from the parsed JSON so that `lang_extensions`
    # is only ever bound to the final dict, never to the raw list of entries.
    lang_extensions = {
        d["name"]: d["extensions"]
        for d in json.load(f)
        # Entries without an "extensions" key are skipped.
        if d["name"] in Config.min_reqs["target_languages"] and "extensions" in d
    }
# Sanity check: number of target languages that have an extension entry.
print(f"{len(lang_extensions)} from {len(Config.min_reqs['target_languages'])}")

In [None]:
# Load the raw repository metadata into a DataFrame and preview the first five rows.
git_data = pd.read_json(path_or_buf=Config.git_metadata)
git_data.head()

In [None]:
# Narrow the frame down to the columns the selection below actually reads.
git_data = git_data[
    [
        "owner",
        "name",
        "stars",
        "forks",
        "watchers",
        "isFork",
        "languages",
        "primaryLanguage",
        "license",
    ]
]


def filter_langs(x, size_thr=None, target_languages=None):
    """Return the dominant languages of a repository that are also target languages.

    The languages are visited from largest to smallest and collected until
    their cumulative share of the total size strictly exceeds ``size_thr``;
    the collected entries are then restricted to ``target_languages``.

    Args:
        x: List of dicts, each with a ``"name"`` and a numeric ``"size"`` key.
            Anything that is not a list/tuple (e.g. ``None`` or NaN for a
            repository without language data) is treated as "no languages".
        size_thr: Cumulative size share to exceed. Defaults to
            ``Config.min_reqs["size_thr"]``.
        target_languages: Container of accepted language names. Defaults to
            ``Config.min_reqs["target_languages"]``.

    Returns:
        A list with the kept language dicts, ordered by descending size. Empty
        when there are no languages, the total size is zero, or none of the
        dominant languages is a target language.
    """
    # Resolve the defaults lazily so explicit arguments never touch Config.
    if size_thr is None:
        size_thr = Config.min_reqs["size_thr"]
    if target_languages is None:
        target_languages = Config.min_reqs["target_languages"]

    # Missing or empty language data: nothing to keep.
    if not isinstance(x, (list, tuple)) or len(x) == 0:
        return []

    size_sum = sum(lang["size"] for lang in x)
    if size_sum == 0:
        # Avoid dividing by zero below.
        return []

    share = 0.0
    dominant = []
    for lang in sorted(x, key=lambda entry: -entry["size"]):
        share += lang["size"] / size_sum
        dominant.append(lang)
        # Strict comparison: a share exactly equal to the threshold keeps going.
        if share > size_thr:
            break

    return [lang for lang in dominant if lang["name"] in target_languages]


# Replace each repository's language list with its dominant target languages,
# then record the combined size of the languages that were kept.
# NOTE: the "languages" column is overwritten in place, so re-running this cell
# filters the already-filtered lists again; restart and run all for a clean result.
git_data["languages"] = git_data["languages"].apply(filter_langs)
git_data["languages_size_sum"] = git_data["languages"].apply(
    lambda kept: sum(entry["size"] for entry in kept)
)
git_data.head()

In [None]:
# Keep only the repositories that meet every minimum requirement and carry an accepted license.
processed_data = git_data[
    (git_data["stars"] >= Config.min_reqs["stars"])
    & (git_data["watchers"] >= Config.min_reqs["watchers"])
    & (git_data["forks"] >= Config.min_reqs["forks"])
    & (git_data["primaryLanguage"].isin(Config.min_reqs["target_languages"]))
    & (git_data["languages"].apply(len) > 0)
    & (git_data["languages_size_sum"] >= Config.min_reqs["languages_size_sum"])
    & (~git_data["isFork"])  # useless because it contains no forks
    & (git_data["license"].isin(Config.licenses))
]

# Derived columns are added with `assign`, which returns a new frame, instead of
# item assignment on the filtered/de-duplicated slice; the latter triggers
# pandas' SettingWithCopyWarning.
processed_data = (
    # Most popular repositories first, so that de-duplication keeps the best-ranked row.
    processed_data.sort_values(by=["stars", "forks", "watchers"], ascending=False)
    # NOTE(review): duplicates are detected on the repository name alone, so
    # same-named repositories of different owners collapse into one row -
    # confirm this is intended.
    .drop_duplicates(["name"])
    # Remember the full list of kept language names before the list is exploded.
    .assign(all_languages=lambda df: df["languages"].apply(lambda langs: [lang["name"] for lang in langs]))
    # One row per (repository, kept language).
    .explode("languages")
    .reset_index(drop=True)
    .assign(
        language_size=lambda df: df["languages"].apply(lambda lang: lang["size"]),
        language=lambda df: df["languages"].apply(lambda lang: lang["name"]),
    )
)

# Drop the (repository, language) rows whose language is too small on its own.
processed_data = processed_data[processed_data["language_size"] >= Config.min_reqs["language_size"]]
processed_data

In [None]:
# First 150 rows of every language group (in processed_data's current row order),
# collapsed back to one row per repository name.
top_150 = (
    processed_data.groupby("language")
    .head(150)
    .drop(columns=["language", "language_size"])
    .drop_duplicates(["name"])
    .reset_index(drop=True)
)
# Add the GitHub URL of each repository.
top_150["repo"] = top_150.apply(
    lambda x: f"https://github.com/{x['owner']}/{x['name']}",
    axis=1,
)
top_150

In [None]:
# First 25 rows of every language group (in processed_data's current row order),
# collapsed back to one row per repository name.
top_25 = (
    processed_data.groupby("language")
    .head(25)
    .drop(columns=["language", "language_size"])
    .drop_duplicates(["name"])
    .reset_index(drop=True)
)
# Add the GitHub URL of each repository.
top_25["repo"] = top_25.apply(
    lambda x: f"https://github.com/{x['owner']}/{x['name']}",
    axis=1,
)
top_25

In [None]:
# Persist the top-150 selection (the row index is written as the first CSV column).
top_150.to_csv(path_or_buf=Config.out_file)

In [None]:
# Show the first selected repository as a plain dict to inspect the record layout.
top_150.to_dict(orient="records")[0]