# Collect workflow & job reruns for all branches in every repository

In [1]:
import sys
sys.path.append('..')
import pandas as pd
import time
import pickle
import copy
from datetime import datetime
from github import Github
import requests
from tqdm import tqdm
from tools.empirical import *

from ast import literal_eval

## 1. Filter out repositories without GHA enabled

In [2]:
# Read the raw repository listing.
all_repositories = pd.read_csv('../data/repositories_original.csv', encoding='latin-1')

# Exclude very large repositories, for disk space purposes: only rows whose
# `size` value is at most 500 * 1024 are kept.
small_repositories = all_repositories.query('size <= 500 * 1024')

# [name, defaultBranch] pairs, ordered by repository name.
repositories = (
    small_repositories[['name', 'defaultBranch']]
    .sort_values('name')
    .values
    .tolist()
)

print(f'There are {len(repositories)} repositories in total.')

There are 5217 repositories in total.


In [None]:
# Keep only the repositories whose default branch exposes a
# `.github/workflows` directory on github.com.
repositories_with_workflows = []

for repo_index, repository in enumerate(tqdm(repositories)):

    repo, branch = repository
    # Check if given repository has a .github/workflows directory.
    # A HEAD request is enough because only the status code is inspected.
    url = f'https://github.com/{repo}/tree/{branch}/.github/workflows'

    try:
        # The timeout keeps one unresponsive connection from stalling the
        # whole crawl.
        r = requests.head(url, timeout=30)
    except requests.RequestException as error:
        # Best effort: report the failing repository and carry on. Only
        # request errors are caught, so Ctrl-C and programming errors still
        # surface instead of being swallowed.
        print(f'Error occurred with repository #{repo_index}: {error}')
        continue

    # NOTE(review): every status other than 404 (including redirects, rate
    # limiting or server errors) counts as "has workflows" -- confirm that
    # this is intended.
    if r.status_code != 404:
        repositories_with_workflows.append(repository)

In [None]:
# Persist the filtered repositories as a gzip-compressed CSV with the
# `name` / `defaultBranch` columns, i.e. a file that matches its `.csv.gz`
# extension and can be read back with `pd.read_csv`. (Pickling the list into
# a `.csv.gz` path produces a file that CSV readers cannot parse.)
# NOTE(review): section 2 reads `repositories_with_workflows.csv.gz` (without
# the `_3324` suffix) -- confirm how that file is derived from this one.
repository_path = '../data/repositories_with_workflows_3324.csv.gz'

(
    pd.DataFrame(repositories_with_workflows, columns=['name', 'defaultBranch'])
    .to_csv(repository_path, index=False, compression='gzip')
)

## 2. Load data and collect reruns 

In [3]:
# Load the [name, defaultBranch] pairs of the repositories with workflows,
# ordered by repository name.
workflow_repo_table = pd.read_csv('../data/repositories_with_workflows.csv.gz')
repositories = (
    workflow_repo_table[['name', 'defaultBranch']]
    .sort_values('name')
    .values
    .tolist()
)

# Drop duplicates: the entries at positions 420, 1800, 2409 and 2410 of the
# sorted list are removed.
duplicate_positions = {420, 1800, 2409, 2410}
repos = [
    entry
    for position, entry in enumerate(repositories)
    if position not in duplicate_positions
]
print(f'There are {len(repos)} repositories in total with GHA enabled.')

3320

In [5]:
# Load the workflow runs and the commit hashes collected for every branch of
# all 3320 repositories. These .pkl files are very large and can take some
# time to load.
# NOTE: unpickling can execute arbitrary code -- only load files you trust.

with open('../data/all_branch_workflow_runs2023_final.pkl', 'rb') as runs_file:
    workflow_runs_all_branch = pickle.load(runs_file)

with open('../data/all_branch_commit_hashs2023_final.pkl', 'rb') as hashes_file:
    commit_hashs_all_branch = pickle.load(hashes_file)

In [9]:
# For each repository, walk over its branches and
#   * collect the workflow and job reruns returned by `find_reruns`,
#   * count the distinct workflow files seen on any branch, on the default
#     branch, and with at least one run whose conclusion is 'failure'.

all_rerun_workflow = []
all_rerun_job = []
num_workflow_file = []
num_workflow_file_default = []
num_workflow_file_failure = []

for repo_index, repo_branches in enumerate(workflow_runs_all_branch):

    repo_workflow_reruns = []
    repo_job_reruns = []
    # Sets hold each workflow file once, so their sizes are the distinct counts.
    files_any_branch = set()
    files_default_branch = set()
    files_with_failure = set()

    for branch_index, branch_entry in enumerate(repo_branches):

        # Each entry is indexed as [branch name, runs keyed by workflow file].
        branch = branch_entry[0]
        runs_by_file = branch_entry[1]

        rerun_workflow, rerun_job = find_reruns(
            runs_by_file,
            commit_hashs_all_branch[repo_index][branch_index][1],
        )
        repo_workflow_reruns.append([branch, rerun_workflow])
        repo_job_reruns.append([branch, rerun_job])

        for file in runs_by_file:
            files_any_branch.add(file)

            # repos[repo_index][1] is the repository's default branch name.
            if branch == repos[repo_index][1]:
                files_default_branch.add(file)

            # One failed run is enough to flag the workflow file.
            if any(run_detail['conclusion'] == 'failure' for run_detail in runs_by_file[file]):
                files_with_failure.add(file)

    all_rerun_workflow.append(repo_workflow_reruns)
    all_rerun_job.append(repo_job_reruns)
    num_workflow_file.append(len(files_any_branch))
    num_workflow_file_default.append(len(files_default_branch))
    num_workflow_file_failure.append(len(files_with_failure))

In [10]:
# Tally the workflow and job reruns over every repository / branch / workflow
# position:
#   * number_*_rerun     -- how many positions hold a non-empty rerun entry,
#   * number_*_rerun_set -- the summed length of those non-empty entries.
# The [repo_index, branch_index, workflow_index] position of every non-empty
# entry is stored in `w_rerun_index` (workflow reruns) and `j_rerun_index`
# (job reruns).

number_workflow_rerun = 0
number_workflow_rerun_set = 0

number_job_rerun = 0
number_job_rerun_set = 0

number_workflow_rerun_times = 0

w_rerun_index = []
j_rerun_index = []

for repo_index in tqdm(range(len(all_rerun_workflow))):

    for branch_index, workflow_branch_entry in enumerate(all_rerun_workflow[repo_index]):

        # Element 1 of a branch entry holds the per-workflow rerun data.
        workflow_reruns = workflow_branch_entry[1]

        for workflow_index in range(len(workflow_reruns)):

            workflow_entry_size = len(workflow_reruns[workflow_index])
            if workflow_entry_size > 0:
                w_rerun_index.append([repo_index, branch_index, workflow_index])
                number_workflow_rerun += 1
                number_workflow_rerun_set += workflow_entry_size

            # Job reruns are looked up at the same position as the workflow reruns.
            job_entry_size = len(all_rerun_job[repo_index][branch_index][1][workflow_index])
            if job_entry_size > 0:
                j_rerun_index.append([repo_index, branch_index, workflow_index])
                number_job_rerun += 1
                number_job_rerun_set += job_entry_size

100%|████████████████████████████████████████████████████████████████████████████| 3320/3320 [00:00<00:00, 11288.34it/s]


In [11]:
# Report the totals gathered above. The number of workflow files with a
# failed run is the denominator of both rerun ratios, so sum it once.
total_failed_files = sum(num_workflow_file_failure)

summary_lines = (
    f'There exists {sum(num_workflow_file)} workflow files in total.',
    f'There exists {sum(num_workflow_file_default)} workflow files in default branches.',
    f'There exists {total_failed_files} workflow files failed in total.',
    f'{number_workflow_rerun}/{total_failed_files} of them have workflow reruns.',
    f'{number_job_rerun}/{total_failed_files} of them have job reruns.',
    f'{number_workflow_rerun_set}',
    f'{number_job_rerun_set}',
)
for summary_line in summary_lines:
    print(summary_line)

There exists 9202 workflow files in total.
There exists 6565 workflow files in default branches.
There exists 5576 workflow files failed in total.
1406/5576 of them have workflow reruns.
4351/5576 of them have job reruns.
6512
17362


### 2.1 Build rerun dataset for workflow reruns

In [15]:
df_workflow_rerun = build_rerun_df(repos, w_rerun_index, 'workflow', all_rerun_workflow)

# Drop duplicate records: rows are compared on the string form of the key
# columns, and the first occurrence of each key is kept.
workflow_key_columns = ['repo path', 'workflow file path', 'reruns ids', 'branch']
workflow_keys_as_text = df_workflow_rerun[workflow_key_columns].astype(str)
first_workflow_rows = workflow_keys_as_text.drop_duplicates().index

df_workflow_rerun_distinct = (
    df_workflow_rerun
    .loc[first_workflow_rows]
    .reset_index(drop=True)
)
df_workflow_rerun_distinct.to_csv(
    '../data/dataframes/dataframe_workflow_rerun_distinct.csv.gz',
    compression='gzip',
)
df_workflow_rerun_distinct

1406 workflow files have workflow reruns.
844/1406(60.0%) reruns occurred in default branches.


Unnamed: 0,File index,repo path,workflow file path,branch,default,event,reruns,reruns ids,# avg rerun,# max rerun,# min rerun,# rerun sets,last rerun's conclusion,# success,# failure,content,rerun conclusion
0,"[3, 3, 1]",1c-syntax/bsl-language-server,.github/workflows/qa.yml,develop,True,"[[workflow_run, workflow_run, workflow_run], [...","[2, 19, 7, 3, 3, 5, 6, 8, 2, 9, 1, 1]","[[6174675977, 6174737838, 6174898363], [644200...",5.500000,19,1,12,"[success, success, success, success, success, ...",7,5,[[https://api.github.com/repos/1c-syntax/bsl-l...,"[[failure, success, success], [failure, failur..."
1,"[3, 14, 0]",1c-syntax/bsl-language-server,.github/workflows/gradle.yml,feature/osClassModuleType,False,"[[push, pull_request]]",[1],"[[5694924874, 5694924932]]",1.000000,1,1,1,[failure],0,1,[[https://api.github.com/repos/1c-syntax/bsl-l...,"[[failure, failure]]"
2,"[4, 1, 0]",1c-syntax/sonar-bsl-plugin-community,.github/workflows/night_build.yml,develop,True,"[[schedule, schedule, schedule], [schedule, sc...","[2, 3, 7, 6, 3, 2, 4, 18, 1, 13, 1]","[[3827335175, 3835710330, 3844383153], [385315...",5.454545,18,1,11,"[failure, failure, failure, failure, failure, ...",0,11,[[https://api.github.com/repos/1c-syntax/sonar...,"[[failure, failure, failure], [failure, failur..."
3,"[8, 2, 0]",3arthqu4ke/3arthh4ck,.github/workflows/docker.yml,main,True,"[[workflow_run, workflow_run]]",[1],"[[4076411830, 4091899320]]",1.000000,1,1,1,[success],1,0,[[https://api.github.com/repos/3arthqu4ke/3art...,"[[failure, success]]"
4,"[12, 11, 1]",94fzb/zrlog,.github/workflows/codeql-analysis.yml,master,True,"[[schedule, schedule, schedule, schedule, sche...","[8, 2, 3, 3]","[[3872519179, 3929066488, 3985101351, 40424906...",4.000000,8,2,4,"[success, success, success, success]",4,0,[[https://api.github.com/repos/94fzb/zrlog/con...,"[[failure, success, success, failure, success,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1401,"[3308, 4, 0]",zhouhailin/freeswitch-externals,dynamic/pages/pages-build-deployment,v1.7-dev,True,"[[dynamic, dynamic, dynamic]]",[2],"[[5123044637, 5123054258, 6582462821]]",2.000000,2,2,1,[success],1,0,[[https://api.github.com/repos/zhouhailin/free...,"[[failure, success, success]]"
1402,"[3310, 2, 1]",zinggAI/zingg,.github/workflows/codeql.yml,0.3.5,False,"[[push, push]]",[1],"[[4581579825, 4626574290]]",1.000000,1,1,1,[failure],0,1,[[https://api.github.com/repos/zinggAI/zingg/c...,"[[failure, failure]]"
1403,"[3310, 10, 1]",zinggAI/zingg,.github/workflows/codeql.yml,main,True,"[[push, schedule], [push, push], [push, schedu...","[1, 1, 1, 7, 2, 1, 2, 2, 3, 3, 4, 3, 8, 4]","[[4375112804, 4381093887], [4550952813, 455098...",3.000000,8,1,14,"[failure, failure, failure, failure, failure, ...",0,14,[[https://api.github.com/repos/zinggAI/zingg/c...,"[[failure, failure], [failure, failure], [fail..."
1404,"[3314, 29, 0]",zonkyio/embedded-database-spring-test,.github/workflows/test.yml,flyway-9.9-plus,False,"[[push, pull_request]]",[1],"[[7534009017, 7534061061]]",1.000000,1,1,1,[failure],0,1,[[https://api.github.com/repos/zonkyio/embedde...,"[[failure, failure]]"


### 2.2 Build rerun dataset for job reruns

In [17]:
df_job_rerun = build_rerun_df(repos, j_rerun_index, 'job', all_rerun_job)

# Drop duplicate records: rows are compared on the string form of the key
# columns, and the first occurrence of each key is kept.
job_key_columns = ['repo path', 'reruns ids', 'branch', 'event']
job_keys_as_text = df_job_rerun[job_key_columns].astype(str)
first_job_rows = job_keys_as_text.drop_duplicates().index

df_job_rerun_distinct = (
    df_job_rerun
    .loc[first_job_rows]
    .reset_index(drop=True)
)
df_job_rerun_distinct.to_csv(
    '../data/dataframes/dataframe_job_rerun_distinct.csv.gz',
    compression='gzip',
)
df_job_rerun_distinct

4351 workflow files have job reruns.
2346/4351(53.9%) reruns occurred in default branches.


Unnamed: 0,File index,repo path,workflow file path,branch,default,event,reruns,reruns ids,# avg rerun,# max rerun,# min rerun,# rerun sets,last rerun's conclusion,# success,# failure
0,"[1, 5, 0]",19MisterX98/SeedcrackerX,.github/workflows/build.yml,master,True,[push],[1],[5205081138],1.000000,1,1,1,[success],1,0
1,"[2, 8, 1]",1C-Company/v8-code-style,.github/workflows/fork-pull.yml,feature/116-name-contains-unallowed-letter,False,[pull_request_target],[1],[4061614246],1.000000,1,1,1,[success],1,0
2,"[2, 13, 0]",1C-Company/v8-code-style,.github/workflows/ci-build.yml,master,True,"[push, push, push, push]","[7, 1, 3, 1]","[4383819350, 4402684116, 5051597175, 7162937806]",3.000000,7,1,4,"[failure, failure, success, failure]",1,3
3,"[3, 3, 4]",1c-syntax/bsl-language-server,.github/workflows/publish-to-sonatype.yml,develop,True,"[push, push, push]","[1, 1, 1]","[6416810239, 6422031572, 6854676215]",1.000000,1,1,3,"[failure, success, success]",2,1
4,"[3, 3, 5]",1c-syntax/bsl-language-server,.github/workflows/javadoc.yml,develop,True,[push],[1],[6422031570],1.000000,1,1,1,[success],1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4346,"[3318, 0, 0]",zxing/zxing,.github/workflows/codeql-analysis.yml,master,True,"[pull_request, pull_request, pull_request, pul...","[1, 1, 1, 1, 1]","[5355746619, 5616171182, 5616288815, 587506197...",1.000000,1,1,5,"[success, success, success, failure, success]",4,1
4347,"[3318, 0, 1]",zxing/zxing,.github/workflows/test_java_17.yml,master,True,"[pull_request, pull_request, pull_request, pul...","[1, 1, 1, 1, 1]","[5355746626, 5616171186, 5616288816, 587506197...",1.000000,1,1,5,"[success, failure, success, failure, success]",3,2
4348,"[3318, 0, 2]",zxing/zxing,.github/workflows/test_java_8.yml,master,True,[push],[1],[7719613165],1.000000,1,1,1,[failure],0,1
4349,"[3318, 0, 3]",zxing/zxing,dynamic/pages/pages-build-deployment,master,True,[dynamic],[1],[5444760221],1.000000,1,1,1,[success],1,0


### 2.3 Collect the 'conclusions' for all the reruns for each job rerun set

In [28]:
# GitHub API endpoint template for getting a repository's workflow runs.
# `{owner}` and `{repo}` are placeholders to be filled in (e.g. with
# `str.format`) before the URL is used.
# NOTE(review): the other cells visible in this notebook build their run URLs
# directly and do not reference this template -- confirm it is still needed.

workflow_runs_url = "https://api.github.com/repos/{owner}/{repo}/actions/runs"

In [28]:
import os

# Read the GitHub personal access token from the GITHUB_TOKEN environment
# variable instead of pasting it into the notebook, so the secret is never
# saved alongside the code or its outputs.
access_token = os.environ.get('GITHUB_TOKEN', '')
if not access_token:
    print('Warning: GITHUB_TOKEN is not set; authenticated GitHub API requests will fail.')

# Headers sent with every GitHub API request.
headers = {
    "Authorization": f"Bearer {access_token}",
    "Accept": "application/vnd.github.v3+json"
}

In [29]:
# Reload the de-duplicated job-rerun table. The 'reruns ids' column is parsed
# from its text form back into Python objects with `literal_eval`.
df_job_rerun = pd.read_csv(
    '../data/dataframes/dataframe_job_rerun_distinct.csv.gz',
    index_col=0,
    converters={'reruns ids': literal_eval},
)

# One [repo path, reruns ids] pair per row of the table.
repo_and_rerun_ids = df_job_rerun[['repo path', 'reruns ids']]
job_reruns_repo_ids_list = repo_and_rerun_ids.values.tolist()

In [None]:
# For every row of the job-rerun table, collect the conclusion of every
# attempt of each rerun run id, ordered from the oldest attempt to the latest.

# Row offset at which the crawl starts. 0 processes the whole table, so that
# `outcome_list` ends up with exactly one entry per row of `df_job_rerun`, as
# the 'rerun conclusion' column assignment requires. (A hard-coded non-zero
# offset left over from resuming an interrupted crawl makes that assignment
# fail on a clean top-to-bottom run.) When resuming from a later row, the
# outcomes collected earlier must be prepended to `outcome_list` by hand.
start_index = 0

outcome_list = []

for repo_ids in tqdm(job_reruns_repo_ids_list[start_index:]):

    # Each item is a [repo path, reruns ids] pair.
    repo_path, rerun_ids = repo_ids
    repo_runs_url = f'https://api.github.com/repos/{repo_path}/actions/runs/'
    repo_outcomes = []

    for rerun_id in rerun_ids:

        rerun_outcomes = []
        run_details_url = repo_runs_url + str(rerun_id)
        # Project helper from tools.empirical; presumably guards the API rate
        # limit of the token -- TODO confirm.
        check_access_token(headers)

        try:
            # Latest attempt first.
            run_details = api_request(run_details_url, headers)
            rerun_outcomes.append(run_details['conclusion'])

            # Follow the chain of previous attempts back to attempt #1.
            while run_details['run_attempt'] != 1:
                check_access_token(headers)
                run_details = api_request(run_details['previous_attempt_url'], headers)
                rerun_outcomes.append(run_details['conclusion'])

        except Exception as e:
            # Best effort: report the failure and keep whatever conclusions
            # were collected for this run before the error.
            print(f'Error {e} occurred when extracting outcome info for {repo_path}/actions/runs/{rerun_id}')

        # Conclusions were gathered newest-first; store them oldest-first.
        repo_outcomes.append(rerun_outcomes[::-1])

    outcome_list.append(repo_outcomes)

In [60]:
# Attach the collected conclusions as a new 'rerun conclusion' column.
# `outcome_list` must hold exactly one entry per row of `df_job_rerun`, in row
# order; pandas raises a ValueError when the lengths differ.
df_job_rerun['rerun conclusion'] = outcome_list
# Bare expression: displays the updated table as the cell output.
df_job_rerun

In [62]:
# Save the job-rerun table, now including the 'rerun conclusion' column, as a
# gzip-compressed CSV.
df_job_rerun.to_csv(
    '../data/dataframes/dataframe_job_rerun_distinct_with_conclusion.csv.gz',
    compression='gzip',
)