In [1]:
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from collections import defaultdict

from scripts import END_DATE
from scripts.analysis import data, plot

from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
from sklearn.cluster import SpectralClustering, KMeans
from sklearn.metrics import silhouette_score

plot.init_matplotlib()

W, H = 6, 2.5

In [2]:
repos, stars = data.get_fake_star_repos(), data.get_fake_stars_by_month()
repos_with_campaign = data.get_repos_with_campaign()
repo_campaign_months = set(zip(stars[stars.anomaly].repo, stars[stars.anomaly].month))

In [3]:
# bean counting
print("# repos with suspected fake stars:", len(repos))
print(
    "# suspected fake stars:",
    sum(stars.n_stars_fake),
    ", # low activity, ",
    sum(stars.n_stars_low_activity),
    ", # clustered, ",
    sum(stars.n_stars_clustered),
)
print("# repos with suspected fake star campaigns:", len(repos_with_campaign))
print(
    "# suspected fake stars in those campaigns:",
    sum(stars[stars.repo.isin(repos_with_campaign)].n_stars_fake),
)

low_activity_actors = data.get_unique_actors("low_activity")
clustered_actors = data.get_unique_actors("clustered")
print(
    "# actors in suspected fake stars:",
    len(clustered_actors | low_activity_actors),
)

low_activity_actors_in_campaign = data.get_unique_actors_in_campaign("low_activity")
print("# low activity stars in campaign:", len(low_activity_actors_in_campaign))

clustered_actors_in_campaign = data.get_unique_actors_in_campaign("clustered")
print("# clustered actors in campaign:", len(clustered_actors_in_campaign))
print(
    "# actors in suspected fake star campaigns:",
    len(clustered_actors_in_campaign | low_activity_actors_in_campaign),
)

# repos with suspected fake stars: 26254
# suspected fake stars: 5999002.0 , # low activity,  1066934.0 , # clustered,  4932068.0
# repos with suspected fake star campaigns: 18617
# suspected fake stars in those campaigns: 3807118.0
# actors in suspected fake stars: 1501957
# low activity stars in campaign: 96309
# clustered actors in campaign: 206355
# actors in suspected fake star campaigns: 301096


In [4]:
# face validity check: how many of the repos and actors are deleted?
deleted = repos.repo_id.isnull()
low_act = repos.n_stars_low_activity > 0
clustered = repos.n_stars_clustered > 0
campaign = repos.repo_name.isin(repos_with_campaign)
sample_repo_ids = pd.read_csv(f"data/{END_DATE}/sample_repo_ids.csv")
print(
    f"Baseline % deletion: {len(sample_repo_ids[sample_repo_ids.id.isnull()]) / len(repos) * 100:.2f}%"
)
print(
    f"% deleted in repos with fake stars: {len(repos[deleted]) / len(repos) * 100:.2f}\n"
    f"% deleted in low activity repos: {len(repos[deleted & low_act]) / len(repos[low_act]) * 100:.2f}\n"
    f"% deleted in clustered repos: {len(repos[deleted & clustered]) / len(repos[clustered]) * 100:.2f}\n"
)
print(
    f"% deleted in repos with fake star campaigns: {len(repos[deleted & campaign]) / len(repos[campaign]) * 100:.2f}\n"
    f"% deleted in low activity repos with campaign: {len(repos[deleted & campaign & low_act]) / len(repos[campaign & low_act]) * 100:.2f}\n"
    f"% deleted in clustered repos with campaign: {len(repos[deleted & campaign & clustered]) / len(repos[campaign & clustered]) * 100:.2f}\n"
)

fake_user_info = pd.read_csv(f"data/{END_DATE}/fake_user_info.csv")
sample_user_info = pd.read_csv(f"data/{END_DATE}/sample_user_info.csv")

deleted = fake_user_info.error.notnull()
low_act = fake_user_info.actor.isin(low_activity_actors)
clustered = fake_user_info.actor.isin(clustered_actors)
campaign = fake_user_info.actor.isin(
    clustered_actors_in_campaign | low_activity_actors_in_campaign
)
print(
    f"Baseline % deletion: {len(sample_user_info[sample_user_info.error.notnull()]) / len(repos) * 100:.2f}%"
)
print(
    f"% deleted in repos with fake stars: {len(fake_user_info[deleted]) / len(fake_user_info) * 100:.2f}\n"
    f"% deleted in low activity repos: {len(fake_user_info[deleted & low_act]) / len(fake_user_info[low_act]) * 100:.2f}\n"
    f"% deleted in clustered repos: {len(fake_user_info[deleted & clustered]) / len(fake_user_info[clustered]) * 100:.2f}\n"
)
print(
    f"% deleted in repos with fake star campaigns: {len(fake_user_info[deleted & campaign]) / len(fake_user_info[campaign]) * 100:.2f}\n"
    f"% deleted in low activity repos with campaign: {len(fake_user_info[deleted & campaign & low_act]) / len(fake_user_info[campaign & low_act]) * 100:.2f}\n"
    f"% deleted in clustered repos with campaign: {len(fake_user_info[deleted & campaign & clustered]) / len(fake_user_info[campaign & clustered]) * 100:.2f}\n"
)

Baseline % deletion: 5.03%
% deleted in repos with fake stars: 70.05
% deleted in low activity repos: 14.38
% deleted in clustered repos: 82.03

% deleted in repos with fake star campaigns: 90.42
% deleted in low activity repos with campaign: 79.36
% deleted in clustered repos with campaign: 90.70

Baseline % deletion: 3.54%
% deleted in repos with fake stars: 18.77
% deleted in low activity repos: 19.19
% deleted in clustered repos: 23.03

% deleted in repos with fake star campaigns: 57.07
% deleted in low activity repos with campaign: 72.29
% deleted in clustered repos with campaign: 48.83

