# In this notebook we build and save our dataset for actions.

In [1]:
import os
import pandas as pd
import yaml
from pathlib import Path
from strictyaml import load, dirty_load

In [2]:
#  create a dictionary from the downloaded action files with key : value = relative path of the actions : content of the actions

DATA_DIR = Path('.')

# Create an empty dictionary to hold the results
results = {}
fail_count = 0
# Iterate over the subdirectories in the given directory
for subdir in os.listdir(DATA_DIR / 'actions_downloaded'):
    subdir_path = os.path.join(DATA_DIR / 'actions_downloaded', subdir)

    # If the current item is a directory and contains an action.yml or action.yaml file
    if os.path.isdir(subdir_path) and ("action.yml" in os.listdir(subdir_path) or "action.yaml" in os.listdir(subdir_path)):
        action_file_path = os.path.join(subdir_path, "action.yml") if "action.yml" in os.listdir(subdir_path) else os.path.join(subdir_path, "action.yaml")
        with open(action_file_path) as f:
            try:
                action_contents = dirty_load(Path(action_file_path).read_text(),allow_flow_style=True).data
            except Exception:
                print(action_file_path)
                fail_count += 1
                pass
            
            results[subdir.replace('---','/')] = action_contents

print(f'Successfully loaded {len(results)} actions, {fail_count} failed.')

actions_downloaded/acud---openapi-dockerized/action.yml
actions_downloaded/shaunlwm---action-release-debugapk/action.yml
Successfully loaded 2229 actions, 2 failed.


In [5]:
# show an example of the results.

results['appleboy/gitlab-ci-action']

{'name': 'Trigger GitLab CI Job',
 'description': 'Triggering GitLab CI Job through the API',
 'author': 'Bo-Yi Wu',
 'inputs': {'host': {'description': 'gitlab-ci base url',
   'default': 'https://gitlab.com'},
  'token': {'description': 'gitlab-ci token', 'required': 'true'},
  'ref': {'description': 'gitlab-ci valid refs are only the branches and tags'},
  'project_id': {'description': 'gitlab project id', 'required': 'true'},
  'debug': {'description': 'debug mode'}},
 'runs': {'using': 'docker', 'image': 'Dockerfile'},
 'branding': {'icon': 'check-circle', 'color': 'orange'}}

In [6]:
# extract useful information (relative path, official name, official description) and generate dict for each action.

list_action_features = []
for key, value in results.items():
    action_feature = {}
    action_feature['action'] = key
    action_feature['name'] = results[key]['name'] if 'name' in results[key].keys() else ''
    action_feature['description'] = results[key]['description'] if 'description' in results[key].keys() else ''
    list_action_features.append(action_feature)
    
list_action_features[:5]

[{'action': 'nuget/setup-nuget',
  'name': 'Setup NuGet.exe for use with actions',
  'description': 'Official NuGet.exe setup action that supports cross-platform installation of specific NuGet.exe versions.'},
 {'action': 'derjuulsn/todo-issue',
  'name': 'todo-issue',
  'description': 'Converts TODOs to Issues'},
 {'action': 'shilman/linear-action',
  'name': 'Linear export',
  'description': 'Export github issues to linear on label'},
 {'action': 'seanmiddleditch/gha-setup-ninja',
  'name': 'Install ninja-build tool',
  'description': 'Installs ninja and adds it to the path'},
 {'action': 'anaynayak/python-vulture-action',
  'name': 'Python Vulture Action',
  'description': 'Run vulture to analyse dead code in python projects'}]

In [9]:
# generate dataframe from the list of actions. (d_2)

df_action_name_description_official = pd.DataFrame.from_records(list_action_features)
df_action_name_description_official.sort_values(by=['action'])

df_action_name_description_official.to_csv(DATA_DIR / 'action_official_names_descriptions.csv.gz', compression='gzip')

In [10]:
# load the dataframe which contains the user-assigned names. (d_1)
dict_action_name_users = (
    pd.read_csv(DATA_DIR / 'action_user_assigned_names.csv.gz')
    .rename(columns={'0':'names'})
    .sort_values(by=['action'])
    .to_dict('records')
)

print(f'There are {len(dict_action_name_users} actions in d_1.')

2472

In [11]:
# only keep the actions with at least 3 distinct use-assigned names.
# divide the whole dataset into train and test&valid sets with a ratio of 2:1.

import copy

dict_action_selected_train = []
dict_action_selected_test = []

for action in dict_action_name_users:
    
    list_name = list(set(action['names'].lower().split(',')))
    names_number = len(list_name)
    train_n = int(names_number*2/3)
    test_n = names_number - train_n
    
    action_train = copy.deepcopy(action)
    action_test = copy.deepcopy(action)
    
    action_train['names'] = ','.join(list_name[:train_n])
    action_test['names'] = ','.join(list_name[train_n:])
    action_train['names_number'] = train_n
    action_test['names_number'] = test_n
    
    if names_number > 2:
        dict_action_selected_train.append(action_train)
        dict_action_selected_test.append(action_test)

df_action_train = pd.DataFrame.from_records(dict_action_selected_train)
df_action_test = pd.DataFrame.from_records(dict_action_selected_test)


'''
dict_action_selected_all = []

for action in dict_action_name_users:
    list_name = list(set(action['names'].lower().split(',')))

    action['names'] = ','.join(list_name)
    action['names_number'] = len(list_name)
    
    names_number = len(list_name)
      
    if names_number > 2:
        dict_action_selected_all.append(action)


df_action_all = pd.DataFrame.from_records(dict_action_selected_all)
df_merged_users_official_all = (
    df_action_name_description_official.merge(df_action_all, how='inner', on='action')
    #.sort_values(by=['action'])
    .rename(columns={'names':'names_users'})
    .rename(columns={'name':'name_official'})
    .rename(columns={'description':'description_official'})
    .sort_values(by=['names_number'],ascending=False)
)

df_merged_users_official_all
'''

In [16]:
df_merged_users_official_train = (
    df_action_name_description_official.merge(df_action_train, how='inner', on='action')
    #.sort_values(by=['action'])
    .rename(columns={'names':'names_users'})
    .rename(columns={'name':'name_official'})
    .rename(columns={'description':'description_official'})
)

df_merged_users_official_test = (
    df_action_name_description_official.merge(df_action_test, how='inner', on='action')
    #.sort_values(by=['action'])
    .rename(columns={'names':'names_users'})
    .rename(columns={'name':'name_official'})
    .rename(columns={'description':'description_official'})
)

In [17]:
df_merged_users_official_test

Unnamed: 0,action,name_official,description_official,names_users,names_number
0,nuget/setup-nuget,Setup NuGet.exe for use with actions,Official NuGet.exe setup action that supports ...,"nuget - setup,get nuget,setup nuget for use wi...",6
1,seanmiddleditch/gha-setup-ninja,Install ninja-build tool,Installs ninja and adds it to the path,"install ninja (macos/windows),setup ninja.,ins...",3
2,github/super-linter,Super-Linter,"It is a simple combination of various linters,...","run super-linter,lint with github super linter...",5
3,passiverecords/chrome-extension-upload-action,Chrome Extension upload & publish,Action for uploading chrome extensions,publish bsc extension to chrome web store,1
4,ethomson/send-tweet-action,Send Tweet Action,Post a tweet to Twitter during a GitHub Action...,"notify twitter,send tweet",2
...,...,...,...,...,...
751,scacap/action-surefire-report,Surefire Report,Report Surefire test results as annotations on...,"publish unit test results,publish aggregated t...",3
752,anchore/scan-action,Anchore Container Scan,Scan docker containers with Grype for vulnerab...,"scan image dev,scan image alpine,anchore scann...",4
753,azure/functions-action,Azure Functions Action,Deploy Function App to Azure Functions,"build & deploy function,deploy textanalytics,d...",3
754,wzieba/appcenter-github-action,App Center,GitHub Action that uploads artefacts for Visua...,"deploy to app center,upload artefact to app ce...",2


In [18]:
dict_action_test_valid = (
    df_merged_users_official_test
    .sort_values(by=['names_number'],ascending=False)
    .head(100)
    .to_dict('records')
)

In [19]:
# divide the test&valid sets again into test and valid sets with a ratio of 1:1.

dict_action_test = []
dict_action_valid = []

for action in dict_action_test_valid:
    
    list_name = list(action['names_users'].split(','))
    names_number = action['names_number']
    test_n = int(names_number/2)

    action_test = copy.deepcopy(action)
    action_valid = copy.deepcopy(action)
    
    action_test['names_users'] = ','.join(list_name[:test_n])
    action_valid['names_users'] = ','.join(list_name[test_n:])
    action_test['names_number'] = test_n
    action_valid['names_number'] = names_number - test_n
    
    dict_action_test.append(action_test)
    dict_action_valid.append(action_valid)

df_action_test = pd.DataFrame.from_records(dict_action_test)
df_action_valid = pd.DataFrame.from_records(dict_action_valid)

In [21]:
df_action_test

Unnamed: 0,action,name_official,description_official,names_users,names_number
0,actions/upload-artifact,Upload a Build Artifact,Upload a build artifact that can be used by su...,"upload dh-make-golang test run as artifact,upl...",736
1,actions/cache,Cache,Cache artifacts like dependencies and build ou...,"cache conan data,handle yarn cache,restore nod...",363
2,actions/checkout,Checkout,Checkout a Git repository at a particular version,"checkout ref commit,checkout the source code,c...",359
3,actions/download-artifact,Download a Build Artifact,Download a build artifact that was previously ...,"download external libs,download ${{ matrix.nam...",228
4,actions/upload-release-asset,Upload a Release Asset,Upload a release asset to an existing release ...,"upload node modules package,uploading release ...",218
...,...,...,...,...,...
95,juliangruber/read-file-action,Read file,Read file contents,"read benchmark output,read benchmark output 1,...",4
96,papeloto/action-zip,Easy Zip Files,Action for zipping files and folders easily,"zip cmd linux release,compress artifacts,zip l...",4
97,crazy-max/ghaction-docker-meta,Docker Metadata action,"GitHub Action to extract metadata (tags, label...","docker meta cgo,build docker metadata,gather d...",4
98,andymckay/labeler,Simple Issue Labeler,Adds and removes labels from issues.,"remove triage,needs-triage labeling,label iss...",4


In [22]:
df_action_test.to_csv(DATA_DIR / 'test.csv.gz', compression='gzip')
df_action_valid.to_csv(DATA_DIR / 'valid.csv.gz', compression='gzip')

df_merged_users_official_train.to_csv(DATA_DIR / 'train.csv.gz', compression='gzip')
#df_merged_users_official_test.to_csv('../data/action_features_test_valid.csv.gz', compression='gzip')