In [20]:
import os
import re
import random
import pandas as pd
from collections import Counter, defaultdict
from scripts.analysis import data

In [15]:
# Collect every GitHub repo linked from the daily trending-archive pages, bucketed
# by month, then keep only those that also appear in `repos_in_campaign`.
repos_in_campaign = data.get_repos_with_campaign()
TRENDING_REPO = "data/trending_archive"
# Maps "YYYY-MM" (first 7 chars of the file name) -> set of "owner/name" slugs.
MONTH_TO_TRENDING = defaultdict(set)
# Only files named like 2024-01-31.md are parsed; everything else is skipped.
pattern = re.compile(r'^\d{4}-\d{2}-\d{2}\.md$')
# The repository part allows dots: names such as "next.js" are legal on GitHub and
# were previously truncated at the first dot.
# NOTE(review): a URL directly followed by sentence punctuation ("...owner/repo.")
# would now keep the trailing dot -- confirm the archive pages only contain
# markdown-style links.
repo_link = re.compile(r'https://github\.com/([\w-]+/[\w.-]+)')
# `_dirs` instead of `dir`: a top-level loop variable named `dir` would shadow the
# builtin for every later cell in the kernel.
for root, _dirs, files in os.walk(TRENDING_REPO):
    for file_name in files:
        if not pattern.match(file_name):
            continue
        month = file_name[:7]
        # Explicit encoding so decoding does not depend on the machine's locale.
        with open(os.path.join(root, file_name), 'r', encoding='utf-8') as file:
            content = file.read()
        repos = repo_link.findall(content)
        # Guarded so that months without any link do not get an empty entry.
        if repos:
            MONTH_TO_TRENDING[month].update(repos)
trending_repos = set()
for repos in MONTH_TO_TRENDING.values():
    trending_repos.update(repos & repos_in_campaign)

In [16]:
# Load the package table and report how repositories are spread across domains.
packages = pd.read_csv("data/packages_fake.csv")
# Keep only the part of the domain label that precedes the first ":".
packages["domain"] = packages["domain"].map(lambda label: label.partition(":")[0])
# One domain per repository: the value found on the first row of each repo's group.
domains = [group["domain"].iloc[0] for _, group in packages.groupby("repo_name")]
print(len(packages["repo_name"].unique()), "repositories with packages")
total = len(domains)
# most_common() orders by descending count and keeps first-seen order for ties.
for domain, count in Counter(domains).most_common():
    percentage = count / total * 100
    print(f"  {domain}: {count} ({percentage:.2f}%)")

229 repositories with packages
  web: 60 (26.20%)
  blockchain: 41 (17.90%)
  ai: 37 (16.16%)
  suspicious: 23 (10.04%)
  basic-utilty: 22 (9.61%)
  other: 18 (7.86%)
  database: 12 (5.24%)
  deleted: 10 (4.37%)
  tool/application: 6 (2.62%)


In [22]:
# Build the per-repository label table and write it to data/repo_labels.csv.
# Rows come from three sources, in priority order:
#   1. every repo that has packages (domain taken from the first row of its group),
#   2. trending repos not already covered by (1),
#   3. a fixed-size random sample of the remaining repos listed in the README summary.
N_RANDOM_REPOS = 299  # size of the random sample drawn in step 3
SAMPLE_SEED = 42      # fixed seed so the exported CSV is identical on every run

pkg_repo_domains = [
    (name, group.domain.values[0]) for name, group in packages.groupby("repo_name")
]
repo_with_readmes = pd.read_csv("data/readmes/summary.csv")

# Keyed by repo so later sources never overwrite an earlier, higher-priority row.
# Kept under its own name so `repo_df` is only ever bound to the final DataFrame.
repo_records = dict()
for repo, domain in pkg_repo_domains:
    repo_records[repo] = {
        "repo": repo,
        "packages": True,
        "trending": repo in trending_repos,
        "domain": domain,
    }
for repo in trending_repos:
    if repo not in repo_records:
        repo_records[repo] = {
            "repo": repo,
            "packages": False,
            "trending": True,
            "domain": "",
        }

remaining_repos = set(repo_with_readmes.repo) - set(repo_records.keys())
print(len(remaining_repos))

# Set iteration order varies between interpreter runs (string hash randomisation),
# so the candidates are sorted and drawn from a dedicated seeded generator.
# key=str keeps the sort from failing should the column hold a non-string value.
# random.sample raises ValueError if fewer than N_RANDOM_REPOS candidates remain.
rng = random.Random(SAMPLE_SEED)
for repo in rng.sample(sorted(remaining_repos, key=str), N_RANDOM_REPOS):
    repo_records[repo] = {
        "repo": repo,
        "packages": False,
        "trending": False,
        "domain": "",
    }

repo_df = pd.DataFrame(repo_records.values()).sort_values("repo")
repo_df.to_csv("data/repo_labels.csv", index=False)

1345


In [18]:
# Iterating `repo_df` yields its keys. Once `repo_df` is a DataFrame those keys are
# the column labels, so this evaluates to the number of columns.
# NOTE(review): if the intent was to count distinct repositories, use
# `repo_df["repo"].nunique()` instead -- confirm.
len(set(repo_df))


4