See `README` file for more information about the process performed by this notebook.

In [1]:
import pandas as pd
import requests
from tqdm import tqdm 

import subprocess
import shutil
from multiprocessing import Pool
from pathlib import Path
from functools import partial
import re

In [2]:
# Path to clone repositories
DATA_DIR = Path('../data')
WORKING = Path('../data/actions')
WORKERS = 40

In [3]:
repositories_action = (
    pd.read_csv(DATA_DIR / 'action_name.csv.gz')
    [['action']]
    .values
    .tolist()
)


In [4]:
from collections import Counter 

In [5]:
a = [action[0].strip('/') for action in repositories_action]
print(len(Counter(a)))

2468


In [6]:
print(f'There are {len(repositories_action)} action repositories.')

There are 2472 action repositories.


Let's define a function that will do most of the job for a given repository and its default branch. Inline comments explain the process. 

In [28]:
def download_workflows(base_path: Path, repo: str, skip_existing=False, timeout=None):
    """
    Download the action.yml/yaml files for given repository. 
    
    :param base_path: target path for downloading the repository. 
    :param repo: name of the repository on GitHub. 
    :param branch: name of the branch to consider.
    :param timeout: timeout for individual command (in seconds).
    """
    # Create repository folder
    path = base_path / repo.replace('/', '---')
    try:
        path.mkdir(parents=True)
    except FileExistsError:
        if not skip_existing:
            raise
       
    # Check if given repository exists
    url = f'https://github.com/{repo}'
    r = requests.head(url)
    if r.status_code == 404:
        # Remove folder
        shutil.rmtree(path)
        return
    
    # Quick helper
    cmd = partial(subprocess.run, cwd=path, capture_output=True, text = True, timeout=timeout)
    
    # Initialize git repository
    out = cmd(['git', 'init'])
    
    # Configure git repository
    out = cmd(['git', 'remote', 'add', 'origin', 'https://github.com/' + repo])

    out = cmd(['git', 'config', 'core.sparsecheckout', 'true'])
    (path / '.git/info/sparse-checkout').write_text('action.yml\naction.yaml\nREADME.md\n')
    
    # Pull default branch
    try:
        branch = cmd(['git', 'remote', 'show', 'origin']).stdout      
        branch = re.search('HEAD branch: (.+)', branch).group(1)
        print(branch)
        out = cmd(['git', 'pull', 'origin', branch])
        out.check_returncode()
    except subprocess.CalledProcessError:
        # Remove folder
        shutil.rmtree(path)
        raise


Let's define a thin wrapper to handle errors.

In [29]:
def job(repository):
    repo = repository[0]
    try:
        download_workflows(WORKING, repo, skip_existing=True, timeout=None)
        return repo
    except Exception as e: 
        return repo, e

In [62]:
output = []
inputs = repositories_action[1800:2000]

with Pool(processes=WORKERS) as pool:
    jobs = pool.imap_unordered(job, inputs)
    for r in tqdm(jobs, total=len(inputs)):
        output.append(r)

  1%|▊                                                                                  | 2/200 [00:00<00:28,  6.97it/s]

master
main
main
master

  2%|█▏                                                                                 | 3/200 [00:00<00:57,  3.44it/s]


master
master
mastermastermaster


main
main
master
mastermaster

main
main
master
main
main
main
mastermastermaster


master
mastermain



  2%|██                                                                                 | 5/200 [00:00<00:34,  5.60it/s]

main
main
mastermainmain


dev
master
1.33.x
master
master
main


  5%|████                                                                              | 10/200 [00:01<00:13, 13.79it/s]

main
1.23.x
main


 20%|████████████████▊                                                                 | 41/200 [00:01<00:03, 45.84it/s]

main
main
master
master
master
master
main
dev
v1.x
master
mainmain

main
master
master
master
main
main
master
main
master
old-repo
main
master
master


 24%|███████████████████▋                                                              | 48/200 [00:02<00:05, 25.46it/s]

master
master
master
main
main
mastermaster

master
main
master


 26%|█████████████████████▋                                                            | 53/200 [00:02<00:05, 28.01it/s]

master
master
master


 32%|██████████████████████████▏                                                       | 64/200 [00:02<00:03, 39.78it/s]

master
master


 42%|██████████████████████████████████▊                                               | 85/200 [00:02<00:01, 59.20it/s]

master
main
main


 46%|██████████████████████████████████████▏                                           | 93/200 [00:02<00:02, 45.93it/s]

main
master
master
main
master
mainmaster

master
main
master
master
main
main


 50%|████████████████████████████████████████▌                                        | 100/200 [00:03<00:02, 34.07it/s]

master
master
mainmain

master
mastermaster

master
master
master
master
master
main


 52%|██████████████████████████████████████████▌                                      | 105/200 [00:03<00:02, 33.59it/s]

master
main
main
master


 56%|████████████████████████████████████████████▉                                    | 111/200 [00:03<00:02, 37.40it/s]

main
master
master


 60%|█████████████████████████████████████████████████                                | 121/200 [00:03<00:01, 48.97it/s]

main
main
main
master


 64%|███████████████████████████████████████████████████▊                             | 128/200 [00:03<00:01, 46.43it/s]

main
master


 68%|██████████████████████████████████████████████████████▋                          | 135/200 [00:03<00:01, 47.30it/s]

master
main
master
main
main
master


 70%|█████████████████████████████████████████████████████████                        | 141/200 [00:04<00:01, 38.92it/s]

master
master
main
master
master
main
main
main
main
master
master
main
mastermaster



 73%|███████████████████████████████████████████████████████████▏                     | 146/200 [00:04<00:01, 32.23it/s]

main
master
master


 77%|██████████████████████████████████████████████████████████████▎                  | 154/200 [00:04<00:01, 38.81it/s]

main
master
master
master
main
main
master
main
master


 80%|████████████████████████████████████████████████████████████████▊                | 160/200 [00:04<00:00, 41.49it/s]

master
master
master
master
master


 82%|██████████████████████████████████████████████████████████████████▊              | 165/200 [00:04<00:00, 39.39it/s]

develop


 86%|██████████████████████████████████████████████████████████████████████           | 173/200 [00:04<00:00, 47.25it/s]

main
master
main
master
master
master


 90%|████████████████████████████████████████████████████████████████████████▍        | 179/200 [00:05<00:00, 38.57it/s]

main
master
main
main
master
mainline
master
master

 92%|██████████████████████████████████████████████████████████████████████████▌      | 184/200 [00:05<00:00, 32.43it/s]


master
main
main
main
v1


 94%|████████████████████████████████████████████████████████████████████████████▏    | 188/200 [00:05<00:00, 29.13it/s]

master


 98%|███████████████████████████████████████████████████████████████████████████████▍ | 196/200 [00:05<00:00, 38.25it/s]

main


100%|█████████████████████████████████████████████████████████████████████████████████| 200/200 [00:05<00:00, 34.08it/s]


In [63]:
failures = [x for x in output if len(x) == 2]
print(f'There are {len(failures)} failing repositories out of {len(output)}.')
print('\n'.join(map(str, failures)))

There are 2 failing repositories out of 200.
('johnwbyrd/update-release', CalledProcessError(128, ['git', 'pull', 'origin', 'master'], '', "From https://github.com/johnwbyrd/update-release\n * tag               master     -> FETCH_HEAD\nfatal: update_ref failed for ref 'HEAD': cannot update ref 'refs/heads/master': trying to write non-commit object d2eeee14cc6eca1dde7b33127961c6fbf67670b5 to branch 'refs/heads/master'\n"))
('ljharb/actions-js-build/commit', AttributeError("'NoneType' object has no attribute 'group'"))
