In [1]:
import pymongo
import pandas as pd

from collections import Counter
from scripts import MONGO_URL
from scripts.analysis import data, plot, checkpoint

# Project helper from scripts.analysis.plot; presumably configures matplotlib
# defaults for the figures in this notebook -- confirm in that module.
plot.init_matplotlib()

# Shared figure dimensions (presumably width and height in inches -- TODO
# confirm against the plotting cells that consume them).
W = 6
H = 2.5

In [2]:
# Results exposed by the project's data helpers (scripts.analysis.data).
repos = data.get_fake_star_repos()
stars = data.get_fake_stars_by_month()
repos_with_campaign = data.get_repos_with_campaign()

# Event records for the checkpoint repositories, read from a local CSV; the
# cells below use its `type`, `repo` and `actor` columns.
checkpoint_repo_events = pd.read_csv("data/checkpoint_repo_events.csv")

In [3]:
# Only consider repositories with at least this many stars (WatchEvents) in
# the checkpoint event file. Kept as a constant so the filter and the printed
# label below cannot drift apart.
CHECKPOINT_MIN_STARS = 50

checkpoint_watch_events = checkpoint_repo_events[
    checkpoint_repo_events.type == "WatchEvent"
]
checkpoint_repos = {
    repo
    for repo, n_stars in Counter(checkpoint_watch_events.repo).items()
    if n_stars >= CHECKPOINT_MIN_STARS
}

# Compute each count exactly once so that the "hits/total" fraction and the
# ratio printed next to it always come from the same numbers.
n_checkpoint_repos = len(checkpoint_repos)
n_recalled = len(repos_with_campaign & checkpoint_repos)
# `repos.repo_name` is the set compared under the "wo. postprocessing" label.
n_recalled_raw = len(set(repos.repo_name) & checkpoint_repos)

# Recall is undefined for an empty evaluation set; report nan rather than
# aborting the cell with ZeroDivisionError.
recall = n_recalled / n_checkpoint_repos if n_checkpoint_repos else float("nan")
recall_raw = (
    n_recalled_raw / n_checkpoint_repos if n_checkpoint_repos else float("nan")
)

print(len(set(checkpoint_repo_events.repo)), "repos in malware campaign in total")
print(n_checkpoint_repos, f"repos with >= {CHECKPOINT_MIN_STARS} stars")
print(f"Recall: {n_recalled}/{n_checkpoint_repos} = {recall:.4f}")
print(
    f"Recall wo. postprocessing: {n_recalled_raw}/{n_checkpoint_repos}"
    f" = {recall_raw:.4f}"
)

1315 repos in malware campaign in total
847 repos with >= 50 stars
Recall: 688/847 = 0.8123
Recall wo. postprocessing: 694/847 = 0.8194


In [4]:
# Actors that starred (WatchEvent) any checkpoint repo that passed the star
# threshold above.
# Alternative actor source, kept for reference:
# checkpoint_actors = checkpoint.CHECKPOINT_ACCOUNTS
checkpoint_actors = checkpoint_repo_events[
    (checkpoint_repo_events.type == "WatchEvent")
    & checkpoint_repo_events.repo.isin(checkpoint_repos)
].actor.unique()

# Build the `$in` filter once and reuse it for both collections.
actor_filter = {"actor": {"$in": list(checkpoint_actors)}}

# NOTE(review): this rebinds `stars`, which an earlier cell assigned from
# data.get_fake_stars_by_month() -- confirm that later cells expect this list
# of MongoDB documents rather than the earlier value.
with pymongo.MongoClient(MONGO_URL) as client:
    fake_stars_db = client["fake_stars"]
    # Same order as before: clustered stars first, then low-activity stars.
    stars = [
        star
        for collection_name in ("clustered_stars", "low_activity_stars")
        for star in fake_stars_db[collection_name].find(actor_filter)
    ]

# Derive each actor set once; documents missing a flag count as not flagged.
low_activity_actors = {s["actor"] for s in stars if s.get("low_activity", False)}
clustered_actors = {s["actor"] for s in stars if s.get("clustered", False)}

n_actors_low_activity = len(low_activity_actors)
n_actors_clustered = len(clustered_actors)
# Despite the name, this counts actors flagged by EITHER detector (set union),
# not actors flagged by both; the name is kept for backward compatibility.
n_actors_both = len(low_activity_actors | clustered_actors)

print(
    n_actors_low_activity,
    n_actors_clustered,
    n_actors_both,
    len(checkpoint_actors),
)

# Recall is undefined when there are no checkpoint actors; report nan rather
# than aborting the cell with ZeroDivisionError.
actor_recall = (
    n_actors_both / len(checkpoint_actors) if len(checkpoint_actors) else float("nan")
)
print(f"Recall: {actor_recall:.4f}")

314 11589 11903 15672
Recall: 0.7595
