In [66]:
from sklearn.neural_network import MLPClassifier
from pathlib import Path
import pandas as pd
import pprint
from tqdm import tqdm 

In [2]:
DATA_DIR = Path('../data')

In [4]:
df_action_name_train = (
                pd.read_csv(DATA_DIR / 'action_features_train.csv.gz', index_col = [0])
                .sort_values(by=['names_number'],ascending=False)
                #.head(100)
            )

df_action_name_test = (
                pd.read_csv(DATA_DIR / 'action_test.csv.gz', index_col = [0])
                .sort_values(by=['names_number'],ascending=False)
                .head(100)
            )

In [5]:
# initialize the RoBERTa based model. 

import torch
from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup,
                          BertConfig, BertForMaskedLM, BertTokenizer,
                          GPT2Config, GPT2LMHeadModel, GPT2Tokenizer,
                          OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer,
                          RobertaConfig, RobertaModel, RobertaTokenizer,
                          DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer)

MODEL_CLASSES = {
    'gpt2': (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer),
    'openai-gpt': (OpenAIGPTConfig, OpenAIGPTLMHeadModel, OpenAIGPTTokenizer),
    'bert': (BertConfig, BertForMaskedLM, BertTokenizer),
    'roberta': (RobertaConfig, RobertaModel, RobertaTokenizer),
    'distilbert': (DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer)
}


config_class, model_class, tokenizer_class = MODEL_CLASSES['roberta']

config = config_class.from_pretrained("roberta-base")
tokenizer = tokenizer_class.from_pretrained("roberta-base")
encoder = model_class.from_pretrained("roberta-base", config=config)


Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
def phrase_tokenization(phrase, tokenizer, max_token_length):
    
    '''This function tokenize the phrases into ids'''
    
    phrase_tokens = tokenizer.tokenize(phrase)
    phrase_tokens = phrase_tokens[:max_token_length-2]
    phrase_tokens = [tokenizer.cls_token]+phrase_tokens+[tokenizer.sep_token]
    
    phrase_ids=tokenizer.convert_tokens_to_ids(phrase_tokens)
    padding_length = max_token_length - len(phrase_ids)
    phrase_ids+=[tokenizer.pad_token_id]*padding_length
    
    return phrase_ids

def phrases_tokenization(list_phrases, tokenizer, max_token_length = 128):
    
    '''This function tansfer phrases' ids into tensors and apply our model's encoder to embed the phrases' ids into high-dimensional vectors'''
    
    max_token_length = max_token_length
    #print(max_token_length)
    
    ##### -----------> Here is a problem: what about the descriptions with more than 25 words?

    
    phrases_tokens = [phrase_tokenization(phrase, tokenizer, max_token_length) for phrase in list_phrases]
    
    #phrases_tensors = [torch.tensor(token) for token in phrases_tokens]
    
    #phrases_embeddings = [encoder.encode(input_ids=phrase.unsqueeze(0)) for phrase in tqdm(phrases_tensors, position=0, leave=True)]
    

    return phrases_tokens

In [22]:
# extract test data from dataframe 'df_action_name'.

def extract_data(df_action_name, tokenizer, block_size = 128):
    
    list_actions = df_action_name.to_dict('records')

    list_names = [str(action['name_official']).lower() for action in list_actions]
    list_descriptions = [str(action['description_official']).lower() for action in list_actions]
    list_names_users = [list(action['names_users'].lower().split(',')) for action in list_actions]
    #list_names_users = [list(set(action['names_users'].lower().split(','))) for action in list_actions]

    list_names_users = [[name.strip() for name in names] for names in list_names_users]
    
    

    names_tokens = phrases_tokenization(list_names, tokenizer, block_size)
    descriptions_tokens = phrases_tokenization(list_descriptions, tokenizer, block_size)
    list_names_users_tokens = [phrases_tokenization(name[:10], tokenizer, block_size) for name in list_names_users]
    
    return list_actions,names_tokens,descriptions_tokens,list_names_users_tokens

In [7]:
# show data.

print(f'Top {len(list_names)} most frequently applied GitHub Actions has been selected! \n' )
n_example = 0
print(f'e.g. Action number {n_example}: \n')
print(f'Official name: \t\t {list_names[n_example]}')
print(f'Official description:\t {list_descriptions[n_example]} \n')
print(f'With {len(list_names_users[n_example])} User-assigned names, give 10 examples:')
pprint.pprint(list_names_users[n_example][:10])

Top 756 most frequently applied GitHub Actions has been selected! 

e.g. Action number 0: 

Official name: 		 upload a build artifact
Official description:	 upload a build artifact that can be used by subsequent workflow steps 

With 2944 User-assigned names, give 10 examples:
['upload wheelhouse artifact',
 'upload pytest log as artifact',
 '📤 upload artifact: html',
 'archive artifacts (geyser velocity)',
 'upload binary files (linux_arm64)',
 'upload entitygraphql.aspnet',
 'upload test results on failure',
 'upload clang-format patch as artifact',
 'archive jemalloc binary artifact [centos-8] to github',
 'upload test result artifact']


In [25]:
block_size = 128
actions_train,names_train,descriptions_train,names_users_train = extract_data(df_action_name_train,tokenizer,block_size)
actions_test,names_test,descriptions_test,names_users_test = extract_data(df_action_name_test,tokenizer,block_size)

In [27]:
# create a dictionary for with key:value = action:index based on the training dataset.

action_index = {}
for i in range(len(actions_train)):
    action_index[actions_train[i]['action']] = i
    

In [46]:
def create_dataset(actions,names_tokens,descriptions_tokens,names_users_tokens,action_index,train=False):
    
    X = []
    y = []

    for i in range(len(actions)):
        
        if train == True:
            X.append(names_tokens[i])
            X.append(descriptions_tokens[i])
            y.append(action_index[actions[i]['action']])
            y.append(action_index[actions[i]['action']])

        for j in range(len(names_users_tokens[i])):
            X.append(names_users_tokens[i][j])
            y.append(action_index[actions[i]['action']])
        
    return X,y

In [47]:
X_train, y_train = create_dataset(actions_train,names_train,descriptions_train,names_users_train,action_index,True)
X_test, y_test = create_dataset(actions_test,names_test,descriptions_test,names_users_test,action_index,False)


In [184]:
from sklearn.tree import DecisionTreeClassifier
clf_dtc = DecisionTreeClassifier(random_state=0)

clf_dtc.fit(X_train, y_train)
clf_dtc_proba = clf.predict_proba(X_test)

In [185]:
from sklearn.ensemble import RandomForestClassifier
clf_rfc = RandomForestClassifier(random_state=0)
clf_rfc.fit(X_train, y_train)
clf_rfc_proba = clf_rfc.predict_proba(X_test)

In [186]:
def predict_labels(predict_proba,top_n = 3):
    
    '''
    This function sorts the predicted probabilities for each X_test from the highest value to the lowest value
    along with their indexes(labels), then returns the top_n indexes(labels) as the output suggested_labels.
    '''
    
    predicted_labels = []

    for probas in tqdm(predict_proba, position=0, leave=True):
        
        predicted_label= []
        probas_index = [[proba,index] for index,proba in enumerate(probas)]
        predicted_labels.append(sorted(probas_index,reverse=True)[:top_n])
        
    suggested_labels = [[label[1] for label in labels] for labels in predicted_labels]
    
    return suggested_labels

def blurry_accuracy(predicted_labels, action_index):
    '''
    This function calculates the performance of the model, it's called 'blurry' because
    the output result is considered positive if the real label is predicted correctly in the top_n suggestions.
    (the first suggestion doesn't have to be the right one if top_n != 1)
    '''
    correct = 0
    for i in range(len(predicted_labels)):
        if action_index[i] in predicted_labels[i]:
            correct += 1
    return round(correct / len(predicted_labels),4)

In [194]:
clf_rfc_predicted_labels = predict_labels(clf_rfc_proba,top_n = 3)
clf_dtc_predicted_labels = predict_labels(clf_dtc_proba,top_n = 3)

100%|█████████████████████████████████████████████████████████████████████████████████| 784/784 [00:00<00:00, 3863.76it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 784/784 [00:00<00:00, 3471.33it/s]


In [195]:
blurry_accuracy(clf_dtc_predicted_labels, y_test)

0.3265