In [2]:
from models.model import *
from code2seq.code2seq import predict as predict_embeddings
import data_aggregation.get_features
from models.catboost_model import load_catboost_model
from embeddings.match_embeddings_with_methods import match_embeddings_with_methods_from_df
from data_aggregation.union_predictions_and_features import union_preds_features

class BugLocalizationModelAPI:
    """Single entry point for bug-localization predictions.

    Wraps an optional LSTM model (`BugLocalizationModel`) and an optional
    CatBoost model. `predict` scores the stack frames described by
    `methods_data` with the LSTM and, for `pred_type='all'`, feeds the LSTM
    scores plus code features into the CatBoost model.

    `methods_data` is iterated, and every item is read through the keys
    'code' and 'meta'.
    """

    # Width of each zero vector appended as padding in `get_embeddings`.
    # NOTE(review): presumably this must equal the embedding width produced by
    # `match_embeddings_with_methods_from_df` — confirm.
    EMBEDDING_DIM = 384

    # Values of `pred_type` that `predict` knows how to serve.
    PRED_TYPES = ('lstm', 'all')

    def __init__(self, lstm_model_path='', cb_model_path='', frames_limit=384):
        """Load whichever models have a path supplied.

        Args:
            lstm_model_path: passed to `BugLocalizationModel.load_model`;
                when empty no LSTM model is loaded.
            cb_model_path: passed to `load_catboost_model`; when empty no
                CatBoost model is loaded.
            frames_limit: number of frames the embedding sequence is padded
                up to in `get_embeddings`.
        """
        self.model = None
        if lstm_model_path:
            self.model = BugLocalizationModel()
            self.model.load_model(lstm_model_path)

        # Always define the attribute so a missing CatBoost model is
        # detectable instead of raising AttributeError later.
        self.cb_model = None
        if cb_model_path:
            self.cb_model = load_catboost_model(cb_model_path)

        self.code2seq_predictor = None
        self.feature_extractor = None
        # Honour the caller's value (it used to be overwritten with 384).
        self.frames_limit = frames_limit

    def get_code_features(self, methods_data):
        """Run a fresh `FeatureExtractor` over every method and return its
        `to_pandas()` result.
        """
        self.feature_extractor = data_aggregation.get_features.FeatureExtractor()
        for method in methods_data:
            self.feature_extractor.get_feature_from_code(method['code'])
            # NOTE(review): the 'meta' entry is passed to the same
            # code-feature call as 'code' — confirm this is intended.
            self.feature_extractor.get_feature_from_code(method['meta'])
        return self.feature_extractor.to_pandas()

    def collect_data_for_catboost(self, methods_data, lstm_prediction):
        """Join code features with LSTM scores into the CatBoost input frame.

        The helper columns 'label', 'method_name', 'report_id' and 'indices'
        are dropped from the joined frame before it is returned.
        """
        code_features_df = self.get_code_features(methods_data)
        frames_len = len(methods_data)
        df_preds = self.model_prediction_to_df(lstm_prediction, frames_len)
        df_all = union_preds_features(df_preds, code_features_df)
        return df_all.drop(['label', 'method_name', 'report_id', 'indices'], axis=1)

    def predict_bug_lstm(self, embeddings, top_k=3):
        """Score frames with the LSTM model.

        Returns:
            (indices of the `top_k` highest scores, flattened scores).

        Raises:
            RuntimeError: if no LSTM model was loaded.
        """
        if self.model is None:
            raise RuntimeError(
                'LSTM model is not loaded; pass lstm_model_path to BugLocalizationModelAPI'
            )
        # The model output is indexed on three axes; index 1 of the last
        # axis is used as the score.
        prediction = self.model.model(FloatTensor(embeddings))[:, :, 1]
        prediction = prediction.flatten()
        return (-prediction).argsort()[:top_k], prediction

    def predict_bug_cb(self, catboost_data, top_k=3):
        """Score frames with the CatBoost model.

        Returns:
            (indices of the `top_k` highest scores, flattened scores).

        Raises:
            RuntimeError: if no CatBoost model was loaded.
        """
        if self.cb_model is None:
            raise RuntimeError(
                'CatBoost model is not loaded; pass cb_model_path to BugLocalizationModelAPI'
            )
        probabilities = np.asarray(self.cb_model.predict_proba(catboost_data))
        # Take index 1 on the LAST axis: works for a 2-D class-probability
        # matrix as well as for the 3-D layout the old `[:, :, 1]` required.
        prediction = probabilities[..., 1].flatten()
        return (-prediction).argsort()[:top_k], prediction

    def model_prediction_to_df(self, prediction, frames_len):
        """Build the per-frame prediction frame consumed by
        `union_preds_features`.

        Only the first `frames_len` scores are kept, so a prediction that also
        covers padding frames does not produce columns of unequal length.
        """
        return pd.DataFrame({
            'report_id': np.zeros(frames_len),
            'method_stack_position': np.arange(0, frames_len),
            'lstm_prediction': prediction[:frames_len],
        })

    def predict(self, methods_data, pred_type='lstm', top_k=3):
        """Rank the frames in `methods_data`.

        Args:
            methods_data: sequence of items with 'code' and 'meta' entries.
            pred_type: 'lstm' for LSTM scores only, 'all' for CatBoost scores
                computed on top of the LSTM scores.
            top_k: how many top-ranked indices to return.

        Returns:
            (top-k indices, flattened scores) from the selected model.

        Raises:
            ValueError: if `pred_type` is not one of `PRED_TYPES`.
            RuntimeError: if the model needed for `pred_type` is not loaded.
        """
        # Fail before any expensive work instead of silently returning None.
        if pred_type not in self.PRED_TYPES:
            raise ValueError(
                'pred_type must be one of {}, got {!r}'.format(self.PRED_TYPES, pred_type)
            )

        embeddings = self.get_embeddings(methods_data)
        top_k_pred, lstm_prediction = self.predict_bug_lstm(embeddings, top_k)
        if pred_type == 'lstm':
            return top_k_pred, lstm_prediction

        catboost_data = self.collect_data_for_catboost(methods_data, lstm_prediction)
        return self.predict_bug_cb(catboost_data, top_k)

    def get_embeddings(self, methods_data):
        """Embed every method and zero-pad the sequence to `frames_limit`.

        The code2seq predictor is created on first use and reused afterwards.
        Methods whose 'code' entry is falsy are matched against `None`.
        No truncation happens when there are more methods than `frames_limit`.
        """
        if self.code2seq_predictor is None:
            self.code2seq_predictor = predict_embeddings()

        methods_embeddings = []
        for method in methods_data:
            embeddings_df = None
            if method['code']:
                embeddings_df = self.code2seq_predictor.predict_by_code(method['code'])
            methods_embeddings.append(
                match_embeddings_with_methods_from_df(embeddings_df, method['meta'])
            )

        # range() is empty when the sequence is already long enough.
        for _ in range(len(methods_data), self.frames_limit):
            methods_embeddings.append(np.zeros(self.EMBEDDING_DIM))

        return np.array(methods_embeddings)

    
# Build the API with default arguments: no LSTM or CatBoost path is supplied,
# so the constructor does not load either model.
# NOTE(review): `predict` goes through `self.model.model(...)`, so this call is
# expected to fail until a `lstm_model_path` is passed here — confirm.
api = BugLocalizationModelAPI()
# NOTE(review): `methods_data` is not defined anywhere in this cell; it has to
# exist in the kernel namespace already. The class iterates it and reads the
# 'code' and 'meta' entries of every item.
api.predict(methods_data)



  0%|          | 0/26 [00:00<?, ?it/s]

Parameters(lr=0.01, epoch=10, optim=<class 'torch.optim.adam.Adam'>, anneal_coef=0.5, anneal_epoch=5, dim=60)


[1 / 10] Train: Loss = 0.68975, Accuracy = 39.08%, Previous code mean = 20.00%: 100%|██████████| 26/26 [00:22<00:00,  1.14it/s]
[1 / 10]   Val: Loss = 0.68855, Accuracy = 46.72%, Previous code mean = 48.15%: 100%|██████████| 7/7 [00:02<00:00,  2.85it/s]
[2 / 10] Train: Loss = 0.68808, Accuracy = 52.19%, Previous code mean = 18.75%: 100%|██████████| 26/26 [00:20<00:00,  1.27it/s]
[2 / 10]   Val: Loss = 0.68808, Accuracy = 52.36%, Previous code mean = 41.38%: 100%|██████████| 7/7 [00:02<00:00,  3.06it/s]
[3 / 10] Train: Loss = 0.68765, Accuracy = 55.82%, Previous code mean = 14.00%: 100%|██████████| 26/26 [00:19<00:00,  1.33it/s]
[3 / 10]   Val: Loss = 0.68759, Accuracy = 56.37%, Previous code mean = 45.45%: 100%|██████████| 7/7 [00:02<00:00,  3.22it/s]
[4 / 10] Train: Loss = 0.68735, Accuracy = 59.00%, Previous code mean = 28.00%: 100%|██████████| 26/26 [00:20<00:00,  1.27it/s]
[4 / 10]   Val: Loss = 0.68773, Accuracy = 55.87%, Previous code mean = 45.16%: 100%|██████████| 7/7 [00:02<00

In [1]:
#from code2seq.code2seq import predict
from embeddings import match_embeddings_with_methods
import numpy as np
import sys
import os

# Sibling project directories that are added to the import search path.
# Each directory has its own name; '../embeddings' used to overwrite
# PACKAGE_DATA_AGG.
PACKAGE_PARENT = '../code2seq'
PACKAGE_DATA_AGG = '../data_aggregation'
PACKAGE_EMBEDDINGS = '../embeddings'

# Append each directory at most once so that re-running the cell does not keep
# growing sys.path.
# NOTE(review): these entries are added after the `from embeddings import ...`
# line above has already executed, so they cannot affect that import.
for _package_dir in (PACKAGE_PARENT, PACKAGE_DATA_AGG, PACKAGE_EMBEDDINGS):
    _normalized_dir = os.path.normpath(_package_dir)
    if _normalized_dir not in sys.path:
        sys.path.append(_normalized_dir)

def embed_files(path_to_files, path_to_methods, embedding_dim=320):
    """Compute method embeddings and shape them as a single-item batch.

    Args:
        path_to_files: forwarded to `match_embeddings_with_methods.process_data`.
        path_to_methods: forwarded to `match_embeddings_with_methods.process_data`.
        embedding_dim: size of the last axis of the returned array. Defaults
            to 320, the value that was previously hard-coded.

    Returns:
        A numpy array of shape (1, -1, embedding_dim); the middle axis is
        inferred by `reshape`.

    Raises:
        ValueError: from `reshape` when the total number of values is not a
            multiple of `embedding_dim`.
    """
    embeddings = match_embeddings_with_methods.process_data(path_to_files, path_to_methods)
    return np.array(embeddings).reshape(1, -1, embedding_dim)


def predict_bug(model, embeddings, top_k=3):
    """Score frames with `model` and return the best-ranked positions.

    Args:
        model: object whose `.model` attribute is callable on a FloatTensor.
            Its output is indexed as `[:, :, 1]`, so it must have three axes.
        embeddings: data convertible by `FloatTensor`.
        top_k: number of top-ranked indices to return. Defaults to 3, the
            value that was previously hard-coded.

    Returns:
        (indices of the `top_k` highest scores in descending score order,
        flattened scores).
    """
    prediction = model.model(FloatTensor(embeddings))[:, :, 1].flatten()
    # Negate so that an ascending argsort yields the highest scores first.
    return (-prediction).argsort()[:top_k], prediction


# Example inputs handed to `embed_files`.
# NOTE(review): both are absolute paths inside one user's home directory, so
# this cell only runs on that machine — consider a configurable base path.
path_to_files = '/Users/e.poslovskaya/bug_ml/ex_reports/4'
path_to_methods = '/Users/e.poslovskaya/bug_ml/ex_reports/stacktrace_ex.json'

embeddings = embed_files(path_to_files, path_to_methods)
# NOTE(review): `model` is never assigned in this cell and the recorded output
# is `NameError: name 'model' is not defined`. An object exposing a callable
# `.model` attribute has to be created before this call.
predict_bug(model, embeddings)

NameError: name 'model' is not defined