# In this notebook we test and evaluate RoBERTa without finetune.

In [1]:
import pandas as pd
from tqdm import tqdm 
from pathlib import Path
import csv
import pprint
import numpy as np

from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Path to data folder

DATA_DIR = Path('../../data')

# Load data and model

In [2]:
# load test data from action_test.cev.gz file

df_action_name = (
    pd.read_csv(DATA_DIR / 'test.csv.gz', index_col = [0])
    .sort_values(by=['names_number'],ascending=False)
    .head(100)
)

df_action_name

Unnamed: 0,action,name_official,description_official,names_users,names_number
0,actions/upload-artifact,Upload a Build Artifact,Upload a build artifact that can be used by su...,archive ruby package artifact [ruby-pkg_3.1.0_...,736
1,actions/cache,Cache,Cache artifacts like dependencies and build ou...,"cache gdcm,load cached .local,restore node_mod...",363
2,actions/checkout,Checkout,Checkout a Git repository at a particular version,checkout amplitude-ios gh-pages for building d...,359
3,actions/download-artifact,Download a Build Artifact,Download a build artifact that was previously ...,"download vs2022 uwp lite,fetch artifacts for a...",228
4,actions/upload-release-asset,Upload a Release Asset,Upload a release asset to an existing release ...,"upload archlinux package x86_64,upload plugin-...",218
...,...,...,...,...,...
92,juliangruber/read-file-action,Read file,Read file contents,"read package.json,read the pr_num file,read sd...",4
90,aquasecurity/trivy-action,Aqua Security Trivy,Scans container images for vulnerabilities wit...,"trivy scan,run trivy on deployment.yaml,trivy ...",4
89,r0adkll/sign-android-release,Sign Android release,An action to sign an Android release APK or AAB,"sign app apk,sign debug apk,sign premium apk,s...",4
88,crazy-max/ghaction-docker-meta,Docker Metadata action,"GitHub Action to extract metadata (tags, label...","get docker tags,docker tomcat meta,gather dock...",4


In [3]:
# initialize the RoBERTa based model. 

import torch
from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup,
                          BertConfig, BertForMaskedLM, BertTokenizer,
                          GPT2Config, GPT2LMHeadModel, GPT2Tokenizer,
                          OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer,
                          RobertaConfig, RobertaModel, RobertaTokenizer,
                          DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer)

MODEL_CLASSES = {
    'gpt2': (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer),
    'openai-gpt': (OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer),
    'bert': (BertConfig, BertForMaskedLM, BertTokenizer),
    'roberta': (RobertaConfig, RobertaModel, RobertaTokenizer),
    'distilbert': (DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer)
}


config_class, model_class, tokenizer_class = MODEL_CLASSES['roberta']

config = config_class.from_pretrained("roberta-base")
tokenizer = tokenizer_class.from_pretrained("roberta-base")
encoder = model_class.from_pretrained("roberta-base", config=config)


2023-07-05 22:49:11.406351: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weight

In [4]:
# load model's parameters from checkpoint.

from model import Model

#input_dir = './saved_models/action_model_finetuned/checkpoint-best-f1/model.bin'

args = ''

encoder=Model(encoder,config,tokenizer,args)
#encoder.load_state_dict(torch.load(input_dir))

# Text embedding 

In [5]:
def phrase_tokenization(phrase, tokenizer, max_token_length):
    
    '''This function tokenize the phrases into ids'''
    
    phrase_tokens = tokenizer.tokenize(phrase)
    phrase_tokens = phrase_tokens[:max_token_length-2]
    phrase_tokens = [tokenizer.cls_token]+phrase_tokens+[tokenizer.sep_token]
    
    phrase_ids=tokenizer.convert_tokens_to_ids(phrase_tokens)
    padding_length = max_token_length - len(phrase_ids)
    phrase_ids+=[tokenizer.pad_token_id]*padding_length
    
    return phrase_ids

def phrases_embedding(list_phrases, tokenizer, encoder, max_token_length = 128):
    
    '''This function tansfer phrases' ids into tensors and apply our model's encoder to embed the phrases' ids into high-dimensional vectors'''
    
    max_token_length = max_token_length
    
    phrases_tokens = [phrase_tokenization(phrase, tokenizer, max_token_length) for phrase in list_phrases]
    phrases_tensors = [torch.tensor(token) for token in phrases_tokens]
    phrases_embeddings = [encoder.encode(input_ids=phrase.unsqueeze(0)) for phrase in tqdm(phrases_tensors, position=0, leave=True)]
    

    return phrases_embeddings

In [6]:
# extract test data from dataframe 'df_action_name'.

list_actions = df_action_name.to_dict('records')


list_names = [str(action['name_official']).lower() for action in list_actions]
list_descriptions = [str(action['description_official']).lower() for action in list_actions]
list_names_users = [list(action['names_users'].lower().split(',')) for action in list_actions]

list_names_users = [[name.strip() for name in names] for names in list_names_users]

In [7]:
# show data.

print(f'Top {len(list_names)} most frequently applied GitHub Actions has been selected! \n' )
n_example = 0
print(f'e.g. Action number {n_example}: \n')
print(f'Official name: \t\t {list_names[n_example]}')
print(f'Official description:\t {list_descriptions[n_example]} \n')
print(f'With {len(list_names_users[n_example])} User-assigned names, give 10 examples:')
pprint.pprint(list_names_users[n_example][:10])

Top 100 most frequently applied GitHub Actions has been selected! 

e.g. Action number 0: 

Official name: 		 upload a build artifact
Official description:	 upload a build artifact that can be used by subsequent workflow steps 

With 736 User-assigned names, give 10 examples:
['archive ruby package artifact [ruby-pkg_3.1.0_ubuntu-20.04_malloctrim] to '
 'github',
 'upload source artifact',
 'upload built site as artifacts',
 'upload diff output',
 'upload intellij build',
 'archive artifacts (floodgate velocity)',
 'store tarball as artifact',
 'store source distribution',
 'upload build mapping',
 'storing test data artifacts']


In [8]:
# embed test data. (official names + official descriptions)

block_size = 128

names_official_embeddings = phrases_embedding(list_names, tokenizer, encoder, block_size)
descriptions_official_embeddings = phrases_embedding(list_descriptions, tokenizer, encoder, block_size)

# different way to aggregate the embedding of names and descriptions: Arithmetic mean & L2-Norm

combined_official_embeddings = [torch.div((name+description),2) for name,description in zip(names_official_embeddings,descriptions_official_embeddings)]
combined_official_embeddings_l2 = [torch.sqrt(torch.square(name)+torch.square(description)) for name,description in zip(names_official_embeddings,descriptions_official_embeddings)]

100%|███████████████████████████████████████████████████████████████████████████████| 100/100 [00:13<00:00,  7.40it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 100/100 [00:10<00:00,  9.60it/s]


In [9]:
# embed test data. (user-assigned names)

list_names_users_embeddings = [phrases_embedding(name[:10],tokenizer, encoder, block_size) for name in list_names_users]

100%|█████████████████████████████████████████████████████████████████████████████████| 10/10 [00:01<00:00,  9.64it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 10/10 [00:01<00:00,  9.74it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 10/10 [00:01<00:00,  9.83it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 10.00it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 10/10 [00:01<00:00,  9.86it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 10/10 [00:01<00:00,  9.79it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 10.07it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 10.09it/s]
100%|███████████████████████████████████████████

# Model testing and evaluation

In [10]:
def predict(list_names_users_embeddings,official_embeddings,top_n = 3):
    
    '''
    This function calculates the cosine similarities between each embedding vector of the user-assigned names 
    and every vectors of official name/description, etc., then sorts the results from the highest value to the lowest value
    along with their indexes(labels), finally returns the top_n indexes(labels) as the output suggested_labels.
    '''

    
    predicted_labels = []

    for names_users in tqdm(list_names_users_embeddings, position=0, leave=True):
        predicted_label= []

        for name_user in names_users:
            #print(name_user.shape)
            #print(names_official_embeddings[0].shape)
            cos_sim = [[F.cosine_similarity(official[0].view(1,-1).float(), name_user.view(1,-1).float()),index] for index,official in enumerate(official_embeddings)]
            
            predicted_label.append(sorted(cos_sim,reverse=True)[:top_n])


        predicted_labels.append(predicted_label)
        
    suggested_labels = [[[label[1] for label in labels] for labels in one_predicted_labels] for one_predicted_labels in predicted_labels]
    
    return suggested_labels


def predict_cs(list_names_users_embeddings,names_official_embeddings,descriptions_official_embeddings,top_n = 3):
    
    '''
    This function is slightly different from the previous predict function. It combines the cos_similarities calculated from 
    (user-assigned names, official names) & (user-assigned names, descriptions) by taking the arithmetic mean value.
    '''
    
    predicted_labels = []

    for names_users in tqdm(list_names_users_embeddings, position=0, leave=True):
        predicted_label= []

        for name_user in names_users:
            cos_sim_name = [F.cosine_similarity(name_official[0].view(1,-1).float(), name_user.view(1,-1).float()) for name_official in names_official_embeddings]
            cos_sim_description = [F.cosine_similarity(description_official[0].view(1,-1).float(), name_user.view(1,-1).float()) for description_official in descriptions_official_embeddings]

            divide = lambda x,y: (x+y) /2
            cos_sim = [[divide(cos_sim_name[index],cos_sim_description[index]),index] for index in range(len(cos_sim_name))]
            
            predicted_label.append(sorted(cos_sim,reverse=True)[:top_n])


        predicted_labels.append(predicted_label)
        
    suggested_labels = [[[label[1] for label in labels] for labels in one_predicted_labels] for one_predicted_labels in predicted_labels]
    
    return suggested_labels

In [11]:
def coverage_rate_(predicted_labels, top_n):
    '''
    This function calculates the coverage rate of the model's suggestions.
    '''
    covered_names = []

    for action in predicted_labels:
        all_name_per_action = []
        for name in action:
            all_name_per_action += name[:top_n]
        
        all_name_per_action = list(set(all_name_per_action))

        covered_names += all_name_per_action
    
    covered_names = list(set(covered_names))
    coverage = round(len(covered_names)/100,3)

    return coverage

def find_false_cases(predicted_labels, top_n):
    '''
    This function finds the failed cases, i.e. the expected action is not been suggested by the model.
    '''
    false_cases = []
    for action_index,predicted_labels_action in enumerate(predicted_labels):
        
        false_cases_action = []
    
        for name_index, labels in enumerate(predicted_labels_action):
            if action_index not in labels[:top_n]:
                false_cases_action.append([action_index,name_index])

        if false_cases_action:
            false_cases.append(false_cases_action)
    
    return false_cases
    
def overall_success_rate(suggested_labels, prediction_type, top_n):
    '''
    This function calculates the average success rate of the model, and prints it.
    '''
    success = 0

    for action_index,predicted_labels in enumerate(suggested_labels):
        success += len([n for n in predicted_labels if action_index in n[:top_n]])

    number_names = sum([len(suggestion) for suggestion in suggested_labels])
    average_acc = round(success/number_names,3)
    print(prediction_type+'.')

    return average_acc

def count_number(list_samples):
    '''
    This function counts the overall number of samples in a list of list.
    It's been used to count the failed samples and also the number of samples existing in test set.
    '''
    return sum([len(samples) for samples in list_samples])


def output_to_df(list_names_users, suggested_labels):

    list_selected_user_names = [name[:10] for name in list_names_users]

    list_all_output = []
    for action_index,name in enumerate(list_selected_user_names):
        for n in range(len(name)):
            dict_output = {}
            dict_output['name'] = name[n]
            dict_output['actual_action'] = action_index
            dict_output['suggested_actions'] = suggested_labels[action_index][n]
            list_all_output.append(dict_output)
    
    df_output = pd.DataFrame.from_records(list_all_output)
    
    return df_output

In [12]:
# compute the suggested labels in different ways.

import torch.nn.functional as F

top_n = 5

suggested_labels_combined_embedding = predict(list_names_users_embeddings, combined_official_embeddings,top_n)
#suggested_labels_combined_cs = predict_cs(list_names_users_embeddings,names_official_embeddings,descriptions_official_embeddings,top_n)
#suggested_labels_name = predict(list_names_users_embeddings,names_official_embeddings,top_n)
#suggested_labels_description = predict(list_names_users_embeddings,descriptions_official_embeddings,top_n)

100%|███████████████████████████████████████████████████████████████████████████████| 100/100 [00:41<00:00,  2.44it/s]


In [13]:
top_n = 5
suggested_labels = suggested_labels_combined_embedding
prediction_type = 'Both w/ vector mean'
#prediction_type = 'Both w/ mean similarities'
#prediction_type = 'Names only'
#prediction_type = 'Description only'

success_rate = overall_success_rate(suggested_labels, prediction_type, top_n)
coverage_rate = coverage_rate_(suggested_labels,top_n)
false_cases = find_false_cases(suggested_labels,top_n)

n_fail = count_number(false_cases)
n_predict = count_number([name[:10] for name in list_names_users])

print(f'Success rate: {success_rate}')
print(f'Coverage rate: {coverage_rate}')
print(f'Failure: {len(false_cases)}/100 actions has at least one failed suggestion; {n_fail}/{n_predict} fail attempts in total.')

Both w/ vector mean.
Success rate: 0.19
Coverage rate: 0.95
Failure: 100/100 actions has at least one failed suggestion; 635/784 fail attempts in total.


In [None]:
#store the output results into .csv files.

df_output = output_to_df(list_names_users, suggested_labels)
df_output.to_csv('../results/results_t'+str(Tau)+'.csv.gz', compression='gzip')

# Failure investigation

In [14]:
def predict_single(name, tokenizer, names_official_embeddings, list_actions, max_token_length = 128, top_n = 3):
    
    name_tokens = phrase_tokenization(name, tokenizer, max_token_length)
    #print(name_tokens)
    name_tensors = torch.tensor(name_tokens)
    name_embedding = encoder.encode(input_ids=name_tensors.unsqueeze(0))
    #print(name_embedding.shape)
    

    cos_sim = [[F.cosine_similarity(name_official[0].view(1,-1).float(), name_embedding.view(1,-1).float()),index] for index,name_official in enumerate(names_official_embeddings)]
            
    predicted_label = sorted(cos_sim,reverse=True)[:top_n]

    suggested_labels = [labels[1] for labels in predicted_label]
    
    print(f'Top {top_n} suggested GitHub Actions:')
    
    for i in range(len(suggested_labels)):
        print(f'{i}: {list_actions[suggested_labels[i]]["action"]}')
    
    return 

#predict_single('Download a Build Artifact',tokenizer,names_official_embeddings,list_actions,128,5)

In [15]:
false_cases[:10]

[[[6, 2], [6, 9]],
 [[7, 0], [7, 1], [7, 2], [7, 3], [7, 4], [7, 6], [7, 7], [7, 8], [7, 9]],
 [[8, 1], [8, 2], [8, 3], [8, 4], [8, 5], [8, 6], [8, 7], [8, 8], [8, 9]],
 [[10, 0], [10, 7]],
 [[11, 8]],
 [[12, 0]],
 [[13, 2], [13, 7], [13, 9]],
 [[14, 0],
  [14, 1],
  [14, 2],
  [14, 3],
  [14, 4],
  [14, 5],
  [14, 6],
  [14, 8],
  [14, 9]],
 [[15, 0]],
 [[16, 0]]]

In [16]:
n_action, n_name = 0,1

print(f"One failure example(user-assigned name):'{list_names_users[n_action][n_name]}'\n")
print(f"Expect action: {list_actions[n_action]['action']}\n")

predict_single(list_names_users[0][1],tokenizer,names_official_embeddings,list_actions,128,top_n)

One failure example(user-assigned name):'upload source artifact'

Expect action: actions/upload-artifact

Top 5 suggested GitHub Actions:
0: actions/upload-artifact
1: actions/upload-release-asset
2: brandedoutcast/publish-nuget
3: drusellers/publish-nuget
4: jakejarvis/s3-sync-action


# Vector visualization

In [17]:
list_names_users_visualization = [phrases_embedding(name[:30],tokenizer, encoder, block_size) for name in list_names_users[30:60]]

100%|█████████████████████████████████████████████████████████████████████████████████| 16/16 [00:03<00:00,  4.30it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 15/15 [00:03<00:00,  3.79it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 14/14 [00:02<00:00,  6.81it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 14/14 [00:01<00:00,  9.03it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 14/14 [00:01<00:00,  7.57it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 13/13 [00:01<00:00,  8.60it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 13/13 [00:01<00:00,  8.73it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 13/13 [00:01<00:00,  8.68it/s]
100%|███████████████████████████████████████████

In [None]:
# data visualization.
from sklearn.manifold import TSNE

data_visualization = list_names_users_visualization[9:14]
group_tensors = [torch.cat(inner_list, dim=0) for inner_list in data_visualization]

with torch.no_grad():
    flattened_data = torch.cat(group_tensors, dim=0)
    flattened_data = flattened_data.view(flattened_data.size(0), -1).numpy()

labels = np.repeat(np.arange(len(data_visualization)), [len(inner_list) for inner_list in data_visualization])


# Perform t-SNE dimension reduction
tsne = TSNE(n_components=2, random_state=42)
embedded_data = tsne.fit_transform(flattened_data)

# plot the embedded data with different colors for each group
for group_idx in np.unique(labels):
    group_mask = (labels == group_idx)
    plt.scatter(embedded_data[group_mask, 0], embedded_data[group_mask, 1], label=f"{list_actions[39+group_idx]['name_official']}")

plt.legend(bbox_to_anchor=(1, 1), loc="upper left")
ax = plt.gca()
ax.axes.xaxis.set_ticklabels([])
ax.axes.yaxis.set_ticklabels([])

plt.show()