# Collect Waiting & Computational time for workflow & job reruns

In [1]:
import sys
sys.path.append('..')
import pandas as pd
import pickle
from tqdm import tqdm
from tools.empirical import *

from ast import literal_eval

In [3]:
# URL template of the GitHub REST endpoint that lists a repository's
# workflow runs; fill in `owner` and `repo` with str.format().
workflow_runs_url = (
    "https://api.github.com"
    "/repos/{owner}/{repo}/actions/runs"
)

In [4]:
import os

# Read the GitHub access token from the GITHUB_TOKEN environment variable so
# that the secret never has to be pasted into (and committed with) the notebook.
access_token = os.environ.get("GITHUB_TOKEN", "")

if not access_token:
    print("Warning: GITHUB_TOKEN is not set; GitHub API requests will not be authenticated correctly.")

# HTTP headers sent with every GitHub API request made below.
headers = {
    "Authorization": f"Bearer {access_token}",
    "Accept": "application/vnd.github.v3+json"
}

# 1. Waiting time
## 1.1 Collect waiting time (represented by 'total duration') for workflow reruns.

In [5]:
# Load the workflow-rerun dataframe. The 'reruns ids' column is stored as text
# in the CSV, so it is parsed back into Python objects with literal_eval.
df_workflow_rerun = pd.read_csv(
    '../data/dataframes/dataframe_workflow_rerun_distinct.csv.gz',
    index_col=0,
    converters={'reruns ids': literal_eval},
)

# One [repo path, reruns ids] pair per dataframe row.
wf_reruns_repo_ids_list = df_workflow_rerun[['repo path', 'reruns ids']].values.tolist()

In [None]:
# Collect the total duration ("run_duration_ms" of the /timing endpoint) of
# every workflow rerun. The result mirrors the nesting of 'reruns ids':
# one list per repository row -> one list per rerun set -> one value per run.
wf_results_timing = []

for repo_ids in tqdm(wf_reruns_repo_ids_list):

    repo = repo_ids[0]
    repo_runs_url = f'https://api.github.com/repos/{repo}/actions/runs/'
    timing_list = []

    for rerun_set_ids in repo_ids[1]:

        rerun_set_timing = []

        for rerun_id in rerun_set_ids:

            # check if is approaching the token's access limit.
            check_access_token(headers)

            timing_url = repo_runs_url + str(rerun_id) + '/timing'

            try:
                timing = api_request(timing_url, headers).get("run_duration_ms")

            except Exception as e:
                print(f'Error occurred when extracting timing data for {repo}/actions/runs/{rerun_id}')
                print(e)
                # Record the failure explicitly. Without this, `timing` would be
                # unbound (first failure) or still hold the previous run's value,
                # which would then be stored for this run.
                timing = None

            # Always append exactly one entry per run id so the result stays
            # aligned with 'reruns ids' (None when the value is unavailable).
            rerun_set_timing.append(timing)

        timing_list.append(rerun_set_timing)

    wf_results_timing.append(timing_list)

In [None]:
# Store the collected durations as a new 'total duration' column (one nested
# list per row), then show the dataframe as the cell output.
df_workflow_rerun['total duration'] = wf_results_timing
df_workflow_rerun

In [None]:
# Persist the dataframe with the 'total duration' column. The file name matches
# the one that section 2.1 reads back in ('..._workflow_rerun_totalduration_...').
df_workflow_rerun.to_csv(
    '../data/dataframes/dataframe_workflow_rerun_totalduration_distinct.csv.gz',
    compression='gzip',
)

# 2. Computational Time
## 2.1 Collect computational times for workflow reruns.

In [31]:
# Collect the jobs ("jobs" field of the /jobs endpoint) of every workflow rerun.
# The result mirrors the nesting of 'reruns ids':
# one list per repository row -> one list per rerun set -> one entry per run.
workflow_jobs_details = []

# Iterate the list built when loading the workflow-rerun dataframe
# (`reruns_repo_ids_list` is not defined anywhere in this notebook).
for repo_ids in tqdm(wf_reruns_repo_ids_list):

    repo = repo_ids[0]
    repo_runs_url = f'https://api.github.com/repos/{repo}/actions/runs/'
    repo_jobs_details = []

    for rerun_set_ids in repo_ids[1]:

        rerun_set_jobs_details = []

        for rerun_id in rerun_set_ids:

            # check if is approaching the token's access limit.
            check_access_token(headers)

            # Build the URL from the run currently being processed. It used to be
            # built once per set from the last id, so every run in the set was
            # given the jobs of the last run.
            jobs_details_url = repo_runs_url + str(rerun_id) + '/jobs'

            try:
                jobs_details = api_request(jobs_details_url, headers).get("jobs")

            except Exception as e:
                print(f'There occurs an error when extracting jobs details for {repo}/actions/runs/{rerun_id}')
                print(e)
                # Record the failure explicitly instead of re-using the previous
                # run's jobs (or hitting a NameError on the first failure).
                jobs_details = None

            # Always append exactly one entry per run id (None when unavailable).
            rerun_set_jobs_details.append(jobs_details)

        repo_jobs_details.append(rerun_set_jobs_details)

    workflow_jobs_details.append(repo_jobs_details)


100%|█████████████████████████████████████████████████████████████████████████████| 1372/1372 [1:29:45<00:00,  3.92s/it]


In [26]:
# Derive the run times from the collected job details. extract_runtime_wf is
# not defined in this notebook; presumably it comes from the star import of
# tools.empirical -- TODO confirm its expected input/output shape there.
run_time = extract_runtime_wf(workflow_jobs_details)

In [28]:
# Reload the dataframe that already carries the total durations and attach
# the extracted run times as a new 'run time' column.
df_workflow_rerun_total = pd.read_csv(
    '../data/dataframes/dataframe_workflow_rerun_totalduration_distinct.csv.gz',
    index_col=0,
)
df_workflow_rerun_total['run time'] = run_time

In [31]:
# Persist the workflow-rerun dataframe including the 'run time' column.
df_workflow_rerun_total.to_csv(
    '../data/dataframes/dataframe_workflow_rerun_totalduration_runtime_distinct.csv.gz',
    compression='gzip',
)

## 2.2 Collect computational time for job reruns.

In [36]:
# Load the job-rerun dataframe. The 'reruns ids' column is stored as text in
# the CSV, so it is parsed back into Python objects with literal_eval.
df_job_rerun = pd.read_csv(
    '../data/dataframes/dataframe_job_rerun_distinct.csv.gz',
    index_col=0,
    converters={'reruns ids': literal_eval},
)

# One [repo path, reruns ids] pair per dataframe row.
job_reruns_repo_ids_list = df_job_rerun[['repo path', 'reruns ids']].values.tolist()

Unnamed: 0,File index,repo path,workflow file path,branch,default,reruns,reruns ids,# avg rerun,# max rerun,# min rerun,# rerun sets,last rerun's conclusion,# success,# failure
0,"[1, 5, 0]",19MisterX98/SeedcrackerX,.github/workflows/build.yml,master,True,[2],[5205081138],2.000000,2,2,1,['success'],1,0
1,"[2, 8, 1]",1C-Company/v8-code-style,.github/workflows/fork-pull.yml,feature/116-name-contains-unallowed-letter,False,[2],[4061614246],2.000000,2,2,1,['success'],1,0
2,"[2, 13, 0]",1C-Company/v8-code-style,.github/workflows/ci-build.yml,master,True,"[8, 2, 4, 2]","[4383819350, 4402684116, 5051597175, 7162937806]",4.000000,8,2,4,"['failure', 'failure', 'success', 'failure']",1,3
3,"[3, 3, 4]",1c-syntax/bsl-language-server,.github/workflows/publish-to-sonatype.yml,develop,True,"[2, 2, 2]","[6416810239, 6422031572, 6854676215]",2.000000,2,2,3,"['failure', 'success', 'success']",2,1
4,"[3, 3, 5]",1c-syntax/bsl-language-server,.github/workflows/javadoc.yml,develop,True,[2],[6422031570],2.000000,2,2,1,['success'],1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4346,"[911, 0, 0]",zxing/zxing,.github/workflows/codeql-analysis.yml,master,True,"[2, 2, 2, 2, 2]","[5355746619, 5616171182, 5616288815, 587506197...",2.000000,2,2,5,"['success', 'success', 'success', 'failure', '...",4,1
4347,"[911, 0, 1]",zxing/zxing,.github/workflows/test_java_17.yml,master,True,"[2, 2, 2, 2, 2]","[5355746626, 5616171186, 5616288816, 587506197...",2.000000,2,2,5,"['success', 'failure', 'success', 'failure', '...",3,2
4348,"[911, 0, 2]",zxing/zxing,.github/workflows/test_java_8.yml,master,True,[2],[7719613165],2.000000,2,2,1,['failure'],0,1
4349,"[911, 0, 3]",zxing/zxing,dynamic/pages/pages-build-deployment,master,True,[2],[5444760221],2.000000,2,2,1,['success'],1,0


In [None]:
# For every rerun (run id) of every repository row, collect:
#   * the jobs of the run and of each earlier attempt, walking back through
#     'previous_attempt_url' until 'run_attempt' is 1,
#   * the 'conclusion' of the first attempt (the initial state),
#   * the 'updated_at' value of the run as first fetched.
# All three result lists hold one list per repository row with one entry per
# run id, so they stay aligned with each other and with 'reruns ids'.
job_details_list = []
initial_state = []
job_date_info = []

for repo_ids in tqdm(job_reruns_repo_ids_list):

    repo = repo_ids[0]
    repo_runs_url = f'https://api.github.com/repos/{repo}/actions/runs/'
    repo_job_details = []
    repo_initial_state = []
    repo_date_info = []

    for rerun_id in repo_ids[1]:

        rerun_job_details = []
        # Placeholders kept when the API calls below fail part-way through.
        date_info = None
        set_initial_state = None

        run_details_url = repo_runs_url + str(rerun_id)
        jobs_details_url = run_details_url + '/jobs'

        check_access_token(headers)

        try:
            run_details = api_request(run_details_url, headers)
            date_info = run_details['updated_at']

            jobs_details = api_request(jobs_details_url, headers).get("jobs")
            rerun_job_details.append(jobs_details)

            # Walk back through the earlier attempts of this run.
            while run_details['run_attempt'] != 1:

                check_access_token(headers)

                previous_attempt_run_url = run_details['previous_attempt_url']
                previous_attempt_jobs_url = previous_attempt_run_url + '/jobs'

                run_details = api_request(previous_attempt_run_url, headers)
                jobs_details = api_request(previous_attempt_jobs_url, headers).get("jobs")

                rerun_job_details.append(jobs_details)

            # run_details now describes attempt 1, i.e. the initial execution.
            set_initial_state = run_details['conclusion']

        except Exception as e:
            print(f'There occurs an error when extracting timing data for {repo}/actions/runs/{rerun_id}')
            print(e)

        # Append one entry per run id to each per-repo list, even after a
        # failure, so the three lists cannot drift out of alignment.
        repo_job_details.append(rerun_job_details)
        repo_date_info.append(date_info)
        repo_initial_state.append(set_initial_state)

    job_details_list.append(repo_job_details)
    # Store the per-repo list of initial states; this used to append only the
    # scalar left over from the last run and never used repo_initial_state.
    initial_state.append(repo_initial_state)
    job_date_info.append(repo_date_info)

In [None]:
# Derive the run times from the collected job details. The list built by the
# collection loop is `job_details_list`; `jobs_details_list` does not exist.
# extract_runtime_jobs is not defined in this notebook; presumably it comes
# from the star import of tools.empirical -- TODO confirm.
runtime = extract_runtime_jobs(job_details_list)

In [42]:
# Attach the extracted run times as a new 'run time' column; the bare
# expression on the last line makes the notebook render the dataframe.
df_job_rerun['run time'] = runtime
df_job_rerun

In [44]:
# Persist the job-rerun dataframe including the 'run time' column.
df_job_rerun.to_csv(
    '../data/dataframes/dataframe_job_rerun_runtime_distinct.csv.gz',
    compression='gzip',
)