In [18]:
from sklearn.neural_network import MLPClassifier
from pathlib import Path
import pandas as pd
import pprint

In [10]:
# Relative path to the directory containing the input data files.
DATA_DIR = Path('../data')

In [15]:
# Load the action features (column 0 is used as the index) and keep the
# 100 rows with the highest `names_number`, ordered from highest to lowest.
df_action_name = pd.read_csv(
    DATA_DIR / 'action_features_train.csv.gz',
    index_col=[0],
)
df_action_name = df_action_name.sort_values(by=['names_number'], ascending=False)
df_action_name = df_action_name.head(100)

In [13]:
# Initialize the RoBERTa-based encoder and its tokenizer.

import torch
from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup,
                          BertConfig, BertForMaskedLM, BertTokenizer,
                          GPT2Config, GPT2LMHeadModel, GPT2Tokenizer,
                          OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer,
                          RobertaConfig, RobertaModel, RobertaTokenizer,
                          DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer)

# Registry mapping a model-family key to its (config, model, tokenizer) classes.
MODEL_CLASSES = {
    'gpt2': (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer),
    'openai-gpt': (OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer),
    'bert': (BertConfig, BertForMaskedLM, BertTokenizer),
    'roberta': (RobertaConfig, RobertaModel, RobertaTokenizer),
    'distilbert': (DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer)
}

# The model selection lives in one place so the family key and the checkpoint
# name cannot drift apart across the three `from_pretrained` calls below.
MODEL_TYPE = 'roberta'
PRETRAINED_MODEL_NAME = "roberta-base"

config_class, model_class, tokenizer_class = MODEL_CLASSES[MODEL_TYPE]

# 'roberta' maps to RobertaModel rather than a language-model-head class, so
# loading reports the checkpoint's `lm_head.*` weights as unused.
config = config_class.from_pretrained(PRETRAINED_MODEL_NAME)
tokenizer = tokenizer_class.from_pretrained(PRETRAINED_MODEL_NAME)
encoder = model_class.from_pretrained(PRETRAINED_MODEL_NAME, config=config)


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [14]:
def phrase_tokenization(phrase, tokenizer, max_token_length):
    '''Convert one phrase into a fixed-length list of token ids.

    The phrase is tokenized, truncated so that the two special tokens still
    fit, wrapped as ``[cls] tokens [sep]``, converted to ids and right-padded
    with ``tokenizer.pad_token_id``.

    Args:
        phrase: Text to tokenize.
        tokenizer: Object providing ``tokenize``, ``convert_tokens_to_ids``,
            ``cls_token``, ``sep_token`` and ``pad_token_id``.
        max_token_length: Length of the returned list, special tokens and
            padding included. Must be at least 2.

    Returns:
        A list of ``max_token_length`` token ids.

    Raises:
        ValueError: If ``max_token_length`` is smaller than 2.
    '''
    # Two slots are reserved for the cls/sep tokens. Below 2 the slice bound
    # would go negative (silently dropping tokens from the end) and the result
    # would be longer than the requested length.
    if max_token_length < 2:
        raise ValueError(
            f'max_token_length must be at least 2 to fit the cls and sep tokens, '
            f'got {max_token_length}'
        )

    body_tokens = tokenizer.tokenize(phrase)[:max_token_length - 2]
    phrase_tokens = [tokenizer.cls_token] + body_tokens + [tokenizer.sep_token]

    phrase_ids = tokenizer.convert_tokens_to_ids(phrase_tokens)
    # Right-pad so every phrase yields a list of the same length.
    phrase_ids += [tokenizer.pad_token_id] * (max_token_length - len(phrase_ids))

    return phrase_ids

def phrases_tokenization(list_phrases, tokenizer, encoder, max_token_length = 128):
    '''Tokenize every phrase into a fixed-length list of token ids.

    Each phrase is passed to ``phrase_tokenization`` with the same tokenizer
    and ``max_token_length``. No tensors are built and no embeddings are
    computed here.

    Args:
        list_phrases: Iterable of phrases to tokenize.
        tokenizer: Tokenizer forwarded to ``phrase_tokenization``.
        encoder: Currently unused; kept so existing callers keep working.
        max_token_length: Length of each returned id list (default 128).
            ``phrase_tokenization`` truncates phrases whose tokens do not fit
            in ``max_token_length - 2`` positions.

    Returns:
        A list with one list of token ids per input phrase, in input order.
    '''
    # TODO: the embedding step that would use `encoder` is not implemented yet.
    return [
        phrase_tokenization(phrase, tokenizer, max_token_length)
        for phrase in list_phrases
    ]

In [16]:
# Extract the test data from the dataframe 'df_action_name'.

def _split_user_names(raw_names):
    """Split a comma-separated string of user-assigned names into cleaned names.

    Returns the lower-cased, whitespace-stripped names in their original
    order; duplicates are kept. A non-string value (for example the NaN that
    pandas uses for an empty cell) yields an empty list instead of raising
    AttributeError.
    """
    if not isinstance(raw_names, str):
        return []
    return [name.strip() for name in raw_names.lower().split(',')]


list_actions = df_action_name.to_dict('records')

list_names = [str(action['name_official']).lower() for action in list_actions]
list_descriptions = [str(action['description_official']).lower() for action in list_actions]
list_names_users = [_split_user_names(action['names_users']) for action in list_actions]

In [19]:
# Show the data: how many actions were selected, then one example action
# with its official name, official description and user-assigned names.

print(f'Top {len(list_names)} most frequently applied GitHub Actions has been selected! \n' )
n_example = 0  # index of the action used as the example below
print(f'e.g. Action number {n_example}: \n')
print(f'Official name: \t\t {list_names[n_example]}')
print(f'Official description:\t {list_descriptions[n_example]} \n')
print(f'With {len(list_names_users[n_example])} User-assigned names, give 10 examples:')
# Only the first 10 user-assigned names are printed to keep the output short.
pprint.pprint(list_names_users[n_example][:10])

Top 100 most frequently applied GitHub Actions has been selected! 

e.g. Action number 0: 

Official name: 		 upload a build artifact
Official description:	 upload a build artifact that can be used by subsequent workflow steps 

With 2944 User-assigned names, give 10 examples:
['upload wheelhouse artifact',
 'upload pytest log as artifact',
 '📤 upload artifact: html',
 'archive artifacts (geyser velocity)',
 'upload binary files (linux_arm64)',
 'upload entitygraphql.aspnet',
 'upload test results on failure',
 'upload clang-format patch as artifact',
 'archive jemalloc binary artifact [centos-8] to github',
 'upload test result artifact']
