In [19]:
import random
import pymongo
import matplotlib.pyplot as plt

from scripts import MONGO_URL
from scripts.analysis import data


def get_unique_actors(collection: str, query: dict) -> set:
    """Return the distinct ``actor`` values of documents matching ``query``.

    Runs a ``$match`` + ``$group`` aggregation against the given collection
    of the ``fake_stars`` database, so de-duplication happens server-side.

    Args:
        collection: Name of the collection inside the ``fake_stars`` database.
        query: MongoDB filter document used for the ``$match`` stage.

    Returns:
        The set of ``_id`` values produced by grouping on ``$actor``.
    """
    pipeline = [
        {"$match": query},
        {"$group": {"_id": "$actor"}},
    ]
    # A fresh client is opened per call and closed when the block exits.
    with pymongo.MongoClient(MONGO_URL) as client:
        grouped = client.fake_stars[collection].aggregate(pipeline)
        return {doc["_id"] for doc in grouped}

In [9]:
# Load the inputs shared by the cells below, one source per line.
repos = data.get_fake_star_repos()
# `stars` is used below as a DataFrame with `repo`, `month`, `n_stars` and
# `n_stars_*` columns.
stars = data.get_fake_stars_by_month()
# Used below with `&` and `isin`, so presumably a set of repo names -- TODO confirm.
repos_with_campaign = data.get_repos_with_campaign()

In [28]:
# bean counting
# Summary statistics: overall totals first, then the same figures restricted
# to repos flagged as having a fake star campaign.

# Build the Mongo `$in` filter once; it is shared by both campaign queries.
campaign_repo_filter = {"$in": list(repos_with_campaign)}

print("# repos with suspected fake stars:", len(repos))
print(
    "# suspected fake stars:",
    sum(stars.n_stars_fake),
    ", # low activity, ",
    sum(stars.n_stars_low_activity),
    ", # clustered, ",
    sum(stars.n_stars_clustered),
)
print("# repos with suspected fake star campaigns:", len(repos_with_campaign))
print(
    "# suspected fake stars in those campaigns:",
    sum(stars[stars.repo.isin(repos_with_campaign)].n_stars_fake),
)
# Actors are the union over both detectors, so an actor flagged by both is
# counted once.
print(
    "# actors in suspected fake stars:",
    len(
        get_unique_actors("low_activity_stars", {"low_activity": True})
        | get_unique_actors("clustered_stars", {"clustered": True})
    ),
)
# Same union, restricted to campaign repos. The label is distinct from the
# one above so the two counts can be told apart in the output.
print(
    "# actors in suspected fake star campaigns:",
    len(
        get_unique_actors(
            "low_activity_stars",
            {"low_activity": True, "repo": campaign_repo_filter},
        )
        | get_unique_actors(
            "clustered_stars",
            {"clustered": True, "repo": campaign_repo_filter},
        )
    ),
)

# repos with suspected fake stars: 22915
# suspected fake stars: 4525461.0 , # low activity,  946858.0 , # clustered,  3578603.0
# repos with suspected fake star campaigns: 15835
# suspected fake stars in those campaigns: 3076427.0
# actors in suspected fake stars: 1070551
# actors in suspected fake stars: 201873


In [18]:
%%capture
# visual evaluation of post processing
long_lived_repos = set(
    stars.groupby("repo").filter(lambda x: len(x) >= 3).repo.unique()
)
sample = random.sample(list(repos_with_campaign & long_lived_repos), 100)

fig, axes = plt.subplots(20, 5, figsize=(50, 80))
for i, repo in enumerate(sample):
    df = stars[stars["repo"] == repo]
    ax = axes[i // 5, i % 5]
    df.plot(kind="bar", x="month", y="n_stars", ax=ax, color="C0")
    df.plot(kind="bar", x="month", y="n_stars_fake", ax=ax, color="C1")
    ax.set_xticklabels([])
    ax.set_title(repo)
fig.savefig("plots/fake_star_campaigns.pdf")