In [1]:
import os
import shutil

import pandas as pd
import logpred_method as experiment

from sklearn.model_selection import train_test_split


# Value passed as `fraction=` to experiment.load_dataset when the main dataset is read.
# Use "FRACTION = None" for full dataset
# NOTE(review): the annotation says `float` but the default is None;
# Optional[float] would describe the accepted values more accurately.
FRACTION: float = None


# Model identifiers; every entry is passed as the first argument of experiment.run.
# NOTE(review): the labels are plotted as a binary classification task, so "lr"
# is presumably Logistic Regression rather than Linear Regression — confirm
# against logpred_method.
# lr: Linear Regression
# ab: Ada Boost
# rf: Random Forest
# dt: Decision Tree
# et: Extra Trees
MODELS = ["lr", "ab", "rf", "dt", "et"]


# You can ignore features on the experiment
# (passed as `drops=` to experiment.load_dataset for every dataset that is loaded)
IGNORED_FEATURES = ["tryCatchQty_class", "tryCatchQty_method"]


# Hyperparameter tuning
# Forwarded as `tuning_enabled=` to every experiment.run call and embedded in
# the name of the output directory.
TUNING_ENABLED = True


# Stores estimators and feature importances across experiments
# Both dicts are filled by the rq* functions, keyed by a run identifier
# such as "rf", "rf-smote" or "rf-<project>".
ESTIMATORS = {}
FEATURE_IMPORTANCES = {}

# Utilities

In [2]:
def merge_scores(scores):
    """
    Returns a merged score from a sequence of scores.
    This is useful to see scores as Pandas DataFrames.

    The keys of the result are taken from the first score; each value is the
    list of that key's values across all scores, in input order.

    Args:
        scores: sequence of dicts sharing the same keys. May be empty.

    Returns:
        A dict mapping each key to a list of values. An empty input yields
        an empty dict instead of failing on ``scores[0]``.

    Raises:
        KeyError: if a later score contains a key the first score lacks.

    Example:
        in  - [{"a": 1, "b": 2}, {"a": 10, "b": 20}]
        out - {"a": [1, 10], "b": [2, 20]}
    """
    # Nothing to merge: there is no first score to take the keys from.
    if len(scores) == 0:
        return {}

    merged = {k: [] for k in scores[0]}
    for score in scores:
        for k, v in score.items():
            merged[k].append(v)

    return merged

# Experiment CSV and Output directory

In [3]:
# Location of the main dataset, relative to the notebook's working directory.
csv_path = os.path.abspath(os.path.join("data", "dataset.csv"))

# Load features/labels (dropping IGNORED_FEATURES) and hold out 20% for testing.
# The split is stratified on y and seeded with experiment.RANDOM_SEED.
X, y = experiment.load_dataset(csv_path, drops=IGNORED_FEATURES, fraction=FRACTION)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=experiment.RANDOM_SEED
)

# Results go to out/ml/evaluation-tuning-<TUNING_ENABLED>.
# WARNING: an existing directory with that name is deleted first, so logs and
# CSVs from a previous run with the same tuning setting are lost on re-run.
output_dir = os.path.abspath(os.path.join("out", "ml", f"evaluation-tuning-{TUNING_ENABLED}"))
if os.path.exists(output_dir):
    shutil.rmtree(output_dir)
os.makedirs(output_dir)

# RQ 1. What is the performance of machine learning models in predicting log placement in a large-scale enterprise system?

In [4]:
def rq1():
    """
    Runs the experiment once per entry of MODELS on the global train/test split.

    No balancing argument is passed. Each run logs to "rq1-<model>.log" inside
    output_dir. The estimator and feature importances of every run are stored
    in the ESTIMATORS and FEATURE_IMPORTANCES globals under the model name.

    Returns:
        List with the score of each run, in MODELS order.
    """
    collected = []
    for name in MODELS:
        log_file = os.path.join(output_dir, f"rq1-{name}.log")
        fitted, metrics, importances = experiment.run(
            name,
            X_train=X_train,
            X_test=X_test,
            y_train=y_train,
            y_test=y_test,
            output_to=log_file,
            tuning_enabled=TUNING_ENABLED
        )

        # Keep this run in the global state
        ESTIMATORS[name] = fitted
        FEATURE_IMPORTANCES[name] = importances

        collected.append(metrics)

    return collected

rq1_scores = rq1()

## Results

In [5]:
# One row per model: merge the per-run score dicts and index the table by "model".
results_rq1 = pd.DataFrame.from_dict(merge_scores(rq1_scores)).set_index(["model"])
# Persist the table; reset_index() turns "model" back into a regular column
# so it is written even though index=False.
results_rq1.reset_index().to_csv(
    os.path.join(output_dir, "rq1-results.csv"),
    index=False,
)
# Display only the headline metrics and the confusion-matrix counts.
results_rq1["acc prec recall tn fp fn tp total".split(" ")]

Unnamed: 0_level_0,acc,prec,recall,tn,fp,fn,tp,total
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
lr,0.678597,0.656053,0.373446,56232,929,2973,1772,61906
ab,0.712096,0.645349,0.444468,56002,1159,2636,2109,61906
rf,0.803219,0.814496,0.618124,56493,668,1812,2933,61906
dt,0.746284,0.585751,0.523288,55405,1756,2262,2483,61906
et,0.777034,0.740093,0.570706,56210,951,2037,2708,61906


# RQ 2. What is the impact of different class balancing strategies on prediction?

In [6]:
# Same procedure as rq1, with a class balancing strategy added to each run.
def rq2():
    """
    Runs the experiment for every (model, balancing) pair.

    Models come from MODELS; the balancing values are "smote" and "rus".
    Each run logs to "rq2-<model>-<balancing>.log" inside output_dir. The
    estimator and feature importances are stored in the ESTIMATORS and
    FEATURE_IMPORTANCES globals under the key "<model>-<balancing>".

    Returns:
        List with the score of each run, ordered by model and then balancing.
    """
    collected = []
    for name in MODELS:
        for strategy in ("smote", "rus"):
            run_id = f"{name}-{strategy}"
            fitted, metrics, importances = experiment.run(
                name,
                X_train=X_train,
                X_test=X_test,
                y_train=y_train,
                y_test=y_test,
                balancing=strategy,
                output_to=os.path.join(output_dir, f"rq2-{run_id}.log"),
                tuning_enabled=TUNING_ENABLED
            )

            # Keep this run in the global state
            ESTIMATORS[run_id] = fitted
            FEATURE_IMPORTANCES[run_id] = importances

            collected.append(metrics)

    return collected

rq2_scores = rq2()

## Results

In [7]:
# One row per (model, balancing) pair, indexed by both columns.
results_rq2 = pd.DataFrame.from_dict(merge_scores(rq2_scores)).set_index(["model", "balancing"])
# Persist the table; reset_index() turns the two index levels back into
# regular columns so they are written even though index=False.
results_rq2.reset_index().to_csv(
    os.path.join(output_dir, "rq2-results.csv"),
    index=False,
)
# Metric columns shown here; also reused by the comparison against the RQ1 baseline.
relevant_cols = "acc prec recall tn fp fn tp".split(" ")
results_rq2[relevant_cols]

Unnamed: 0_level_0,Unnamed: 1_level_0,acc,prec,recall,tn,fp,fn,tp
model,balancing,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
lr,smote,0.886724,0.385021,0.891675,50403,6758,514,4231
lr,rus,0.887904,0.382456,0.89589,50297,6864,494,4251
ab,smote,0.888355,0.314637,0.948156,47361,9800,246,4499
ab,rus,0.900482,0.371991,0.931507,49699,7462,325,4420
rf,smote,0.909537,0.50417,0.891886,52999,4162,513,4232
rf,rus,0.92386,0.412009,0.961644,50649,6512,182,4563
dt,smote,0.879321,0.389187,0.872287,50665,6496,606,4139
dt,rus,0.893832,0.323624,0.953003,47710,9451,223,4522
et,smote,0.909087,0.457025,0.907692,52044,5117,438,4307
et,rus,0.918232,0.408215,0.950896,50620,6541,233,4512


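Comparative result to the baseline (no balancing): each cell is the RQ2 value minus the RQ1 value for the same model. For acc, prec, recall, tn and tp a positive value indicates improvement; for the error counts fp and fn a negative value indicates improvement.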

In [8]:
# Difference between each balanced run (RQ2) and the unbalanced baseline (RQ1)
# of the same model. results_rq2 is indexed by (model, balancing) while
# results_rq1 is indexed by model only; the displayed output has one row per
# (model, balancing) pair, i.e. the baseline row is reused for both strategies.
results_rq2_rel = results_rq2.loc[MODELS, relevant_cols] - results_rq1.loc[MODELS, relevant_cols]
# Persist the deltas with the index levels written as regular columns.
results_rq2_rel.reset_index().to_csv(
    os.path.join(output_dir, "rq2-results-relative.csv"),
    index=False
)
results_rq2_rel

Unnamed: 0_level_0,Unnamed: 1_level_0,acc,prec,recall,tn,fp,fn,tp
model,balancing,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
lr,smote,0.208127,-0.271032,0.51823,-5829,5829,-2459,2459
lr,rus,0.209308,-0.273597,0.522445,-5935,5935,-2479,2479
ab,smote,0.176259,-0.330711,0.503688,-8641,8641,-2390,2390
ab,rus,0.188386,-0.273358,0.487039,-6303,6303,-2311,2311
rf,smote,0.106318,-0.310326,0.273762,-3494,3494,-1299,1299
rf,rus,0.120641,-0.402487,0.343519,-5844,5844,-1630,1630
dt,smote,0.133038,-0.196565,0.348999,-4740,4740,-1656,1656
dt,rus,0.147548,-0.262127,0.429715,-7695,7695,-2039,2039
et,smote,0.132052,-0.283068,0.336986,-4166,4166,-1599,1599
et,rus,0.141198,-0.331878,0.38019,-5590,5590,-1804,1804


# RQ 3. What are the most recurring relevant features across models?

In [9]:
def rank_to_df(rank, top=3):
    """
    Turns a feature rank into a DataFrame limited to the first `top` positions.

    Args:
        rank: dict mapping a feature name to a list of counters, where the
            counter at index i is how often the feature was ranked at
            position i + 1.
        top: number of leading positions to keep.

    Returns:
        DataFrame with the columns "feature", "total" and 1..top, where
        "total" is the sum of the kept counters. Features whose total is 0
        are removed and rows are sorted in descending order by "total" and
        then by each position column.
    """
    position_cols = list(range(1, top + 1))
    sort_cols = ["total", *position_cols]

    records = []
    for name, count in rank.items():
        head = count[:top]
        records.append((name, sum(head), *head))

    table = pd.DataFrame.from_records(records, columns=["feature", *sort_cols])
    appears_in_top = table["total"] > 0
    return table[appears_in_top].sort_values(by=sort_cols, ascending=False)


def feature_importance_rank(selected_models):
    """
    Counts, per feature, how often it lands at each importance position.

    For every key in `selected_models`, the pairs stored in
    FEATURE_IMPORTANCES[key] are ordered by the absolute value of their second
    element, largest first. The first element of each pair is the feature name.

    Args:
        selected_models: iterable of keys of the FEATURE_IMPORTANCES global.

    Returns:
        Dict mapping a feature name to a list of counters; the counter at
        index i is the number of models that ranked the feature at position i.
    """
    rank = {}
    for model in selected_models:
        by_magnitude = sorted(
            FEATURE_IMPORTANCES[model],
            key=lambda pair: abs(pair[1]),
            reverse=True
        )
        n_positions = len(by_magnitude)
        for pos, entry in enumerate(by_magnitude):
            # The counter list is sized by the first model in which the feature shows up.
            # NOTE(review): a later model with more features would index past it — confirm
            # all models report the same number of features.
            counters = rank.setdefault(entry[0], [0] * n_positions)
            counters[pos] += 1
    return rank

## Results

In [10]:
# Rank features over every run stored so far in FEATURE_IMPORTANCES
# and keep the counts for the five highest positions.
fi = rank_to_df(
    feature_importance_rank(
        FEATURE_IMPORTANCES.keys()
    ),
    top=5
)
# Persist the ranking; the row index is not written.
fi.to_csv(
    os.path.join(output_dir, "rq3-results.csv"),
    index=False
)
fi

Unnamed: 0,feature,total,1,2,3,4,5
3,maxNestedBlocks,15,11,1,0,2,1
23,loc_method,9,1,2,2,2,2
10,uniqueWordsQty_method,7,0,2,2,2,1
49,methodsInvokedQty,6,0,0,1,1,4
11,cbo_method,5,0,2,2,0,1
29,wmc_method,4,0,4,0,0,0
16,cbo_class,4,0,1,0,3,0
0,constructor_True,3,1,0,1,1,0
1,type_interface,2,1,1,0,0,0
4,constructor_False,2,0,1,0,0,1


# RQ 4. How well can a model trained with open-source data generalize to the context of a large-scale enterprise system?

In [11]:
def selected_apache_projects():
    """
    Returns the name of the selected Apache projects as listed in the "out/selection" directory.

    A project is selected when the directory holds a file named
    "<project>.sh". Only the trailing ".sh" is removed, so a project name
    that itself contains ".sh" is returned intact.

    Returns:
        Sorted list of project names.

    Raises:
        FileNotFoundError: if the "out/selection" directory does not exist.
    """
    suffix = ".sh"
    selection_dir = os.path.abspath(os.path.join("out", "selection"))
    return sorted(
        # Slice the suffix off instead of str.replace, which would also
        # remove ".sh" occurring in the middle of a name.
        entry[:-len(suffix)]
        for entry in os.listdir(selection_dir)
        if entry.endswith(suffix)
    )


def load_X_y(project: str):
    """
    Loads the dataset of a single Apache project.

    The file is read from "out/dataset/<project>/dataset_full.csv" with the
    same IGNORED_FEATURES dropped as for the main dataset.

    Args:
        project: project name, used as the directory name under "out/dataset".

    Returns:
        Tuple (X_apache, y_apache) as returned by experiment.load_dataset.

    Raises:
        AssertionError: if the project dataset does not have the same number
            of feature columns as the main dataset held in the global X.
    """
    dataset_path = os.path.abspath(
        os.path.join("out", "dataset", project, "dataset_full.csv")
    )
    X_apache, y_apache = experiment.load_dataset(
        dataset_path, drops=IGNORED_FEATURES
    )
    # Compare the project against the main dataset. The previous check compared
    # X.shape[1] with itself and therefore could never fail.
    assert X_apache.shape[1] == X.shape[1], (
        f"{project}: expected {X.shape[1]} feature columns, got {X_apache.shape[1]}"
    )

    return X_apache, y_apache


# Maps each selected project name to its (X, y) pair; all datasets are loaded eagerly here.
APACHE_PROJECTS = {
    project: load_X_y(project)
    for project in selected_apache_projects()
}

# Guard against a changed selection: exactly 29 projects are expected.
assert len(APACHE_PROJECTS) == 29

In [12]:
# Print one line per project: the name left-aligned in 20 characters, then the
# shape of its feature matrix (v[0]) right-aligned in 15 characters.
for k, v in APACHE_PROJECTS.items():
    print(f"{k:20} {str(v[0].shape):>15}")

accumulo                 (25458, 63)
ambari                   (21997, 63)
archiva                   (5995, 63)
bookkeeper               (12711, 63)
cloudstack               (52390, 63)
commons-beanutils         (1176, 63)
cxf                      (33589, 63)
fluo                      (2094, 63)
giraph                    (8039, 63)
helix                     (6790, 63)
ignite                   (65181, 63)
jmeter                    (8599, 63)
knox                      (6821, 63)
lens                      (6231, 63)
metamodel                 (4122, 63)
myfaces-tobago            (3866, 63)
nutch                     (3321, 63)
oodt                      (6933, 63)
oozie                     (8821, 63)
openmeetings              (4839, 63)
reef                      (6150, 63)
sqoop                     (3080, 63)
storm                    (24208, 63)
syncope                  (14915, 63)
tez                       (8947, 63)
thrift                    (1797, 63)
tomcat                   (23793, 63)
z

## Learning from all Apache projects

In [13]:
# Stack the labels and the features of every project, in the same project
# order, renumbering the rows from zero.
y_apache_all = pd.concat(
    [labels for _, labels in APACHE_PROJECTS.values()],
    ignore_index=True,
)
X_apache_all = pd.concat(
    [features for features, _ in APACHE_PROJECTS.values()],
    ignore_index=True,
)

# The stacked frame must hold exactly as many rows as all projects combined
assert X_apache_all.shape[0] == sum(
    features.shape[0] for features, _ in APACHE_PROJECTS.values()
)

# apache dataset size, all together
X_apache_all.shape

(388095, 63)

In [14]:
def rq4():
    """
    Trains the "rf" model on all Apache projects together and tests it on the
    global X_test / y_test split.

    The run logs to "rq4-rf-apache-all.log" inside output_dir. The score is
    extended with the keys "project" ("apache-all") and "training_size"
    (number of rows of X_apache_all). The estimator and feature importances
    are stored in the ESTIMATORS and FEATURE_IMPORTANCES globals under
    "rf-apache-all".

    Returns:
        List holding the single score of this run.
    """
    model = "rf"
    run_id = f"{model}-apache-all"

    estimator, score, fi = experiment.run(
        model,
        X_train=X_apache_all,
        X_test=X_test,
        y_train=y_apache_all,
        y_test=y_test,
        output_to=os.path.join(output_dir, f"rq4-{run_id}.log"),
        tuning_enabled=TUNING_ENABLED
    )

    # Tag the score so it can be told apart from the per-project runs
    score["project"] = "apache-all"
    score["training_size"] = X_apache_all.shape[0]

    # Keep this run in the global state
    ESTIMATORS[run_id] = estimator
    FEATURE_IMPORTANCES[run_id] = fi

    return [score]


rq4_scores_all = rq4()

## Learning from Projects Individually

In [15]:
def rq4_individual():
    """
    Trains the "rf" model on each Apache project separately and tests every
    model on the global X_test / y_test split.

    Each run logs to "rq4-rf-<project>.log" inside output_dir. Every score is
    extended with the keys "project" and "training_size" (number of rows of
    the project's feature matrix). Estimators and feature importances are
    stored in the ESTIMATORS and FEATURE_IMPORTANCES globals under
    "rf-<project>".

    Returns:
        List with one score per project, in APACHE_PROJECTS order.
    """
    model = "rf"
    collected = []
    for project, (X_project, y_project) in APACHE_PROJECTS.items():
        run_id = f"{model}-{project}"
        # NOTE(review): unlike rq4(), the "type" column is dropped from the
        # features here, and the same drop(columns=...) is applied to the
        # labels. Whether that is intended, and what it does to the labels,
        # is not visible from this notebook — confirm against
        # experiment.load_dataset.
        estimator, score, fi = experiment.run(
            model,
            X_train=X_project.drop(columns=["type"]),
            X_test=X_test.drop(columns=["type"]),
            y_train=y_project.drop(columns=["type"]),
            y_test=y_test.drop(columns=["type"]),
            output_to=os.path.join(output_dir, f"rq4-{run_id}.log"),
            tuning_enabled=TUNING_ENABLED
        )

        # Tag the score with the project it was trained on
        score["project"] = project
        score["training_size"] = X_project.shape[0]
        collected.append(score)

        # Keep this run in the global state
        ESTIMATORS[run_id] = estimator
        FEATURE_IMPORTANCES[run_id] = fi

    return collected


rq4_scores_individual = rq4_individual()

## Results

In [16]:
# One row per RQ4 run: the "apache-all" run first, then one row per project.
results_rq4 = pd.DataFrame.from_dict(
    merge_scores(
        rq4_scores_all + rq4_scores_individual
    )
)
# Persist the full table, including the "model" and "balancing" columns.
results_rq4.to_csv(
    os.path.join(output_dir, "rq4-results.csv"),
    index=False
)
# Display without the "model" and "balancing" columns, best runs first
# (sorted by acc, then prec, then recall, all descending).
results_rq4.drop(columns=["model", "balancing"]).sort_values(by="acc prec recall".split(" "), ascending=False)

Unnamed: 0,prec,recall,acc,tn,fp,fn,tp,total,mean_fit_time,std_fit_time,mean_test_score,std_test_score,project,training_size
5,0.621649,0.513172,0.743622,55679,1482,2310,2435,61906,34.136095,0.453911,0.701172,0.049714,cloudstack,52390
28,0.56647,0.464278,0.717391,55475,1686,2542,2203,61906,5.086968,0.294842,0.608371,0.052231,zeppelin,10953
18,0.537485,0.463857,0.715361,55267,1894,2544,2201,61906,2.874667,0.0153,0.687929,0.05163,oodt,6933
3,0.512434,0.429926,0.697985,55220,1941,2705,2040,61906,2.429385,0.068277,0.57541,0.027376,archiva,5995
17,0.548102,0.404636,0.688471,55578,1583,2825,1920,61906,1.505548,0.049739,0.743647,0.036559,nutch,3321
10,0.46471,0.403793,0.682592,54954,2207,2829,1916,61906,3.089493,0.042276,0.610663,0.069676,helix,6790
22,0.609648,0.375553,0.677796,56020,1141,2963,1782,61906,0.308523,0.002308,0.637366,0.061589,sqoop,3080
25,0.539005,0.369863,0.671802,55660,1501,2990,1755,61906,4.123706,0.057962,0.689929,0.028932,tez,8947
23,0.556629,0.35216,0.664438,55830,1331,3074,1671,61906,1.305398,0.04908,0.614461,0.034391,storm,24208
9,0.433325,0.356797,0.659032,54947,2214,3052,1693,61906,0.345735,0.018126,0.696688,0.048262,giraph,8039
