In [20]:
import sys
sys.path.append("../")

In [21]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


Options: 

* Selects one app from each cluster: 1

* Select all possible 3 train apps and 9 test apps combinations: 2


In [82]:
import pandas as pd
import os
from pathlib import Path
import json 
import logging
logging.basicConfig(format='%(asctime)s %(levelname)-7s %(message)s',
                    stream=sys.stderr, level=logging.INFO)
mpl_logger = logging.getLogger('matplotlib')
mpl_logger.setLevel(logging.INFO)

from sklearn.semi_supervised import LabelPropagation, LabelSpreading

#General ML 
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score, silhouette_score,confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2
from modules.clustering_helpers import select_labeled_samples
from itertools import combinations
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from modules.clustering_helpers import select_labeled_samples_unseen_inputs

#Active Learning
from modAL.models import ActiveLearner
from modAL.uncertainty import uncertainty_sampling, margin_sampling, entropy_sampling


#In-house Module Imports
from config import Configuration 
from datasets import EclipseSampledDataset, VoltaSampledDataset
from utils import *

def random_sampling(classifier, X_pool):
    n_samples = len(X_pool)
    query_idx = np.random.choice(range(n_samples))
    return query_idx, X_pool[query_idx]


def call_FAR_function(false_alarm_rates,anomaly_miss_rates, test_label, y_pred, conf):
    false_alarm_rate, anom_miss_rate = FAR_AMR_Calculate(
            true_label= test_label['anom'].to_numpy(),
            pred_label= y_pred,
            result_dir= str(conf['results_dir']),
            save_name= "",
            save=False,
            verbose=False,
    )
    false_alarm_rates.append(false_alarm_rate)
    anomaly_miss_rates.append(anom_miss_rate)

query_strategy_dict = {
                       "uncertainty": uncertainty_sampling, 
                       "margin": margin_sampling, 
                       "entropy": entropy_sampling,
                       "random": random_sampling
                      }

In [23]:
user = "aksar"
logging.warning(f'Are you sure that you are: {user}?')



In [66]:
MODEL_CONFIG = "exp_3_active_learning"  # change this
SYSTEM = 'volta'  # volta or eclipse
FE_NAME = 'tsfresh' #tsfresh, or mvts => It will set the EXP_NAME. Be careful. 
NUM_FEATURE = 2000  # example: 250 ,2000, 4000
query_strategy = "random"  # "uncertainty", "margin", "entropy", "random"
CV_INDEX = 0  # it can be integer value within the range 0 1 2 3 4
repeat_num = 0
query_size = 250
classifier_name = 'rf'

In [67]:
#Constants
FS_NAME = "CHI"
method = "random" if query_strategy == 'random' else "active_learning"
num_samples_per_pair = 1
OUTPUT_DIR = f'/projectnb/peaclab-mon/{user}/active_learning_experiments'
EXP_NAME = f'{FE_NAME}_experiments'
FEATURE_SELECTION = False
SCALER = 'None' #For now, do the scaling inside the notebook, then you can move that to the class function

logging.warning('Results will be generated in {}, double check please!'.format(MODEL_CONFIG))



In [68]:
conf = Configuration(ipython=True,
                     overrides={
                         'output_dir': Path(OUTPUT_DIR), #change
                         'system' : SYSTEM,
                         'exp_name':EXP_NAME,                                                  
                         'cv_fold':CV_INDEX, 
                         'model_config': MODEL_CONFIG,
                     })

with open(str(conf['experiment_dir']) + '/anom_dict.json') as f:
    ANOM_DICT = json.load(f)
with open(str(conf['experiment_dir']) + '/app_dict.json') as f:
    APP_DICT = json.load(f) 
    
APP_REVERSE_DICT = {}
for app_name, app_encoding in APP_DICT.items():
    APP_REVERSE_DICT[app_encoding] = app_name    

ANOM_REVERSE_DICT = {}
for anom_name, anom_encoding in ANOM_DICT.items():
    ANOM_REVERSE_DICT[anom_encoding] = anom_name

2022-04-24 19:12:47,626 INFO    Setting directory names
2022-04-24 19:12:47,632 INFO    Model config folder already exists, be careful, otherwise it will overwrite!
2022-04-24 19:12:47,635 INFO    Saving configuration as CSV


# The configuration used for this run:
# {'cv_fold': 0,
#  'exp_name': 'tsfresh_experiments',
#  'experiment_dir': PosixPath('/projectnb/peaclab-mon/aksar/active_learning_experiments/volta/tsfresh_experiments'),
#  'feature_extract': False,
#  'feature_select': False,
#  'hdf_data_path': PosixPath('/projectnb/peaclab-mon/aksar/datasets/tpds_data_hdfs'),
#  'metadata_path': None,
#  'model_config': 'exp_3_active_learning',
#  'model_config_dir': PosixPath('/projectnb/peaclab-mon/aksar/active_learning_experiments/volta/tsfresh_experiments/CV_0/exp_3_active_learning'),
#  'model_dir': PosixPath('/projectnb/peaclab-mon/aksar/active_learning_experiments/volta/tsfresh_experiments/CV_0/exp_3_active_learning/model'),
#  'num_split': 5,
#  'operation': 'read',
#  'output_dir': PosixPath('/projectnb/peaclab-mon/aksar/active_learning_experiments/volta'),
#  'plots_dir': PosixPath('/projectnb/peaclab-mon/aksar/active_learning_experiments/volta/tsfresh_experiments/CV_0/exp_3_active_learning/model/p

In [69]:
if SYSTEM == 'eclipse':
    eclipseDataset = EclipseSampledDataset(conf)
    train_data, train_label, test_data, test_label = eclipseDataset.load_dataset(scaler=SCALER,
                                                                                 cv_fold=CV_INDEX,
                                                                                 borghesi=False, 
                                                                                 mvts=True if FE_NAME == 'mvts' else False, 
                                                                                 tsfresh=True if FE_NAME == 'tsfresh' else False)

elif SYSTEM == 'volta':
    voltaDataset = VoltaSampledDataset(conf)
    train_data, train_label, test_data, test_label = voltaDataset.load_dataset(scaler=SCALER,
                                                                               cv_fold=CV_INDEX,
                                                                               borghesi=False,
                                                                               mvts=True if FE_NAME == 'mvts' else False,
                                                                               tsfresh=True if FE_NAME == 'tsfresh' else False)

assert list(train_data.index) == list(train_label.index) #check the order of the labels     
assert list(test_data.index) == list(test_label.index) #check the order of the labels    

if FEATURE_SELECTION:
    selected_features = pd.read_csv(conf['experiment_dir'] / 'selected_features.csv')
    train_data = train_data[list(selected_features['0'].values)]
    test_data = test_data[list(selected_features['0'].values)]
    
train_label['anom_names'] = train_label.apply(lambda x: ANOM_REVERSE_DICT[x['anom']], axis=1)
train_label['app_names']=train_label['app'].apply(lambda x: APP_REVERSE_DICT[x])
test_label['anom_names'] = test_label.apply(lambda x: ANOM_REVERSE_DICT[x['anom']], axis=1)
test_label['app_names']=test_label['app'].apply(lambda x: APP_REVERSE_DICT[x])

all_data = pd.concat([train_data, test_data])
all_data = all_data.dropna(axis=1, how='any')
all_label = pd.concat([train_label,test_label])

train_data = all_data.loc[train_label.index]
test_data = all_data.loc[test_label.index]
    
logging.info("Train data shape %s",train_data.shape)
logging.info("Train label shape %s",train_label.shape)
logging.info("Test data shape %s",test_data.shape)  
logging.info("Test label shape %s",test_label.shape)

logging.info("Train data label dist: \n%s",train_label['anom'].value_counts())
logging.info("Test data label dist: \n%s",test_label['anom'].value_counts())  

2022-04-24 19:12:48,519 INFO    BaseDataset Class Initialization
2022-04-24 19:12:48,520 INFO    HPCDataset Class Initialization
2022-04-24 19:12:48,521 INFO    VoltaSampledDataset Class Initialization
2022-04-24 19:14:38,361 INFO    Train data shape (6326, 102311)
2022-04-24 19:14:38,363 INFO    Train label shape (6326, 2)
2022-04-24 19:14:38,363 INFO    Test data shape (14589, 102311)
2022-04-24 19:14:38,364 INFO    Test label shape (14589, 2)
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


2022-04-24 19:15:33,399 INFO    Train data shape (6326, 99169)
2022-04-24 19:15:33,400 INFO    Train label shape (6326, 4)
2022-04-24 19:15:33,401 INFO    Test data shape (14589, 99169)
2022-04-24 19:15:33,402 INFO    Test label shape (14589, 4)
2022-04-24 19:15:33,406 INFO    Train data label dist: 
0    5694
2     159
4     159
1     158
3     156
Name: anom, dtype: int64
2022-04-24 19:15:33,409 INFO    Test data label dist: 
0    13286
1      3

In [70]:
SCALER = 'MinMax'

if SCALER == 'MinMax':
    
    minmax_scaler = MinMaxScaler().fit(train_data)
    train_data = pd.DataFrame(minmax_scaler.transform(train_data),columns=train_data.columns,index=train_data.index)
    test_data = pd.DataFrame(minmax_scaler.transform(test_data),columns=test_data.columns,index=test_data.index)
    
elif SCALER == 'Standard':
    
    # Standardize data (per feature Z-normalization, i.e. zero-mean and unit variance)        
    scaler = StandardScaler().fit(train_data)
    train_data = pd.DataFrame(scaler.transform(train_data),columns=train_data.columns,index=train_data.index)
    test_data = pd.DataFrame(scaler.transform(test_data),columns=test_data.columns,index=test_data.index)  
    
#Implement new feature selection strategies below
if FS_NAME == 'CHI':
    
    selector = SelectKBest(chi2, k=NUM_FEATURE)
    selector.fit(train_data,train_label['anom'])
    train_data = train_data[train_data.columns[selector.get_support(indices=True)]]
    selected_columns = train_data.columns
    test_data = test_data[test_data.columns & selected_columns]
    
elif FS_NAME == 'TSFRESH':
    logging.warning("NUM_FEATURE parameter will be overwritten by the automatic selection process")
    
    y_train = train_label['anom']
    X_train = train_data

    relevant_features = set()

    for label in y_train.unique():
        y_train_binary = y_train == label
        X_train_filtered = tsfresh.select_features(X_train, y_train_binary)
        print("Number of relevant features for class {}: {}/{}".format(label, X_train_filtered.shape[1], X_train.shape[1]))
        relevant_features = relevant_features.union(set(X_train_filtered.columns))    
    train_data = train_data[relevant_features]
    test_data = test_data[relevant_features]
    NUM_FEATURE = len(relevant_features)
    
elif FS_NAME == 'NONE':
    logging.info("No feature selection strategy is specified, will be using all features")
    NUM_FEATURE = len(train_data.columns)
    
logging.info(train_data.shape)
logging.info(test_data.shape)

2022-04-24 19:16:12,994 INFO    (6326, 2000)
2022-04-24 19:16:12,995 INFO    (14589, 2000)


In [71]:
#Read the node_ids considered labeled
labeled_train_label = pd.read_csv(conf['experiment_dir'] / f'CV_{CV_INDEX}'/ f'labeled_train_label_{num_samples_per_pair}.csv', index_col=['node_id'])
labeled_test_label = pd.read_csv(conf['experiment_dir'] / f'CV_{CV_INDEX}'/ f'labeled_test_label_{num_samples_per_pair}.csv', index_col=['node_id'])
node_indices_labeled = list(labeled_train_label['anom'].index.values)

logging.info("Labeled data label dist: \n%s",labeled_train_label['anom'].value_counts())
logging.info("Unlabeled data label dist: \n%s",labeled_test_label['anom'].value_counts())

#Set a new column for label status
node_indices_unlabeled = []
for node in train_label.index:
    if node not in node_indices_labeled:
        node_indices_unlabeled.append(node)
train_label['label_status'] = train_label['anom'] # for the full data case
train_label['label_status'] = np.where(train_label.index.get_level_values('node_id').isin(node_indices_unlabeled), -1,train_label['label_status'])

2022-04-24 19:16:13,070 INFO    Labeled data label dist: 
2    12
4    11
3    11
1    11
0    11
Name: anom, dtype: int64
2022-04-24 19:16:13,073 INFO    Unlabeled data label dist: 
0    5683
4     148
2     147
1     147
3     145
Name: anom, dtype: int64


In [72]:
metadata = pd.read_csv(conf['hdf_data_path'] / f'{SYSTEM}_metadata.csv', index_col=['node_id'])
metadata = metadata.loc[all_label.index]

In [73]:
metadata.head()

Unnamed: 0_level_0,anomaly,anomaly_start,app,healthy,input,intensity,node_no,platform,runID,unwanted
node_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
59e500e757f3f44ead63f370,leak,0,ft,A,Y,50,1,volta,59e500e857f3f44ead63f373,0.0
59e500e857f3f44ead63f372,none,0,ft,U,Y,50,3,volta,59e500e857f3f44ead63f373,0.0
59e500e857f3f44ead63f377,none,0,ft,U,Y,50,3,volta,59e500e857f3f44ead63f378,0.0
59e500e957f3f44ead63f37a,leak,0,ft,A,Y,50,1,volta,59e500e957f3f44ead63f37d,0.0
59e500ea57f3f44ead63f384,leak,0,ft,A,Y,50,1,volta,59e500ea57f3f44ead63f387,0.0


In [74]:
train_label['input'] = metadata.loc[train_label.index]['input'].values
test_label['input'] = metadata.loc[test_label.index]['input'].values
labeled_train_label['input'] = metadata.loc[labeled_train_label.index]['input'].values
labeled_test_label['input'] = metadata.loc[labeled_test_label.index]['input'].values

In [75]:
if classifier_name == 'rf':
    selected_classifier = RandomForestClassifier()
elif classifier_name == 'lr':
    selected_classifier = LogisticRegression()
else:
    selected_classifier = RandomForestClassifier()

In [76]:
if SYSTEM == 'volta':
    unique_app_inputs = ['X','Y','Z']
elif SYSTEM == 'eclipse':
    unique_app_inputs = None

In [77]:
all_test_input_groups = []
all_train_input_groups = []

for temp_input in list(combinations(unique_app_inputs,1)):
    
    all_train_input_groups.append([temp_input[0]])
    all_test_input_groups.append(list(set(unique_app_inputs) - set(temp_input)))    
    
for temp_input in list(combinations(unique_app_inputs,2)):
    
    all_train_input_groups.append([temp_input[0],temp_input[1]])
    all_test_input_groups.append(list(set(unique_app_inputs) - set(temp_input)))       

In [79]:
%%time

scores = pd.DataFrame()

for test_inputs, train_inputs in zip(all_test_input_groups, all_train_input_groups):
    
        
        logging.info("Test inputs: %s", test_inputs)
        logging.info("Train inputs: %s", train_inputs)
        
        test_inputs_label = test_label[test_label['input'].isin(test_inputs)]
        assert set(test_inputs_label['input'].unique()) == set(test_inputs)
        
        test_inputs_data = test_data.loc[test_inputs_label.index]
        assert list(test_inputs_data.index) == list(test_inputs_label.index)  
        
        
        
        initial_labeled_pool_labels, unlabeled_pool_labels, node_indices_labeled = select_labeled_samples_unseen_inputs(train_label, train_inputs)
        logging.info("Labeled data label dist: \n%s",labeled_train_label['anom'].value_counts())
        logging.info("Labeled data label dist: \n%s",labeled_train_label['app'].value_counts())
        logging.info("Labeled data label dist: \n%s",labeled_train_label['input'].value_counts())
        logging.info("########################")
        logging.info("Unlabeled data label dist: \n%s",unlabeled_pool_labels['anom'].value_counts())
        logging.info("Unlabeled data label dist: \n%s",unlabeled_pool_labels['app'].value_counts())
        logging.info("Unlabeled data label dist: \n%s",unlabeled_pool_labels['input'].value_counts())
        logging.info("########################")        
        

        #Create the label and data for the starting condition composed of selected apps 
        y_initial = initial_labeled_pool_labels
        x_initial = train_data[train_data.index.get_level_values('node_id').isin(y_initial.index)]

        y_initial = initial_labeled_pool_labels['anom'].to_numpy()
        x_initial = x_initial.to_numpy()

        x_unlabeled = train_data[train_data.index.get_level_values('node_id').isin(unlabeled_pool_labels.index)]
        y_unlabeled = unlabeled_pool_labels#['anom'].to_numpy()
        x_unlabeled = x_unlabeled.to_numpy()
                
        #Initializations
        macro_f1_scores = []
        anomaly_miss_rates = []
        false_alarm_rates = []

        if query_strategy != "random":         
            selected_indices_apps = []
            selected_indices_anoms = []        
            selected_indices_inputs = []

        X_pool = x_unlabeled.copy()
        y_pool = y_unlabeled.copy() 
        y_pool_anom = y_pool['anom'].to_numpy()
        y_pool_app = y_pool['app_names'].to_numpy()                
        y_pool_input = y_pool['input'].to_numpy()

        learner = ActiveLearner(
            estimator=RandomForestClassifier(),
            query_strategy=query_strategy_dict[query_strategy],
            X_training=x_initial, y_training=y_initial
        )        

        #logging.info("Test inputs label check: %s", test_inputs_label['input'].unique())
        y_pred = learner.predict(test_inputs_data.to_numpy())        
        report_dict = classification_report(test_inputs_label['anom'].to_numpy(), y_pred, output_dict = True)
        macro_f1_scores.append(report_dict['macro avg']['f1-score'])                                        
        call_FAR_function(false_alarm_rates,anomaly_miss_rates, test_inputs_label, y_pred, conf)
        logging.info("Active learning query starts")
        
        for i in range(query_size):
            query_idx, query_sample = learner.query(X_pool)

            if query_strategy != "random":
                selected_indices_apps.append(y_pool_app[query_idx][0])
                selected_indices_anoms.append(y_pool_anom[query_idx][0])
                selected_indices_inputs.append(y_pool_input[query_idx][0])

            learner.teach(
                X=X_pool[query_idx].reshape(1,-1),
                y=y_pool_anom[query_idx].reshape(1,)
            )

            X_pool, y_pool_anom, y_pool_app = np.delete(X_pool, query_idx, axis=0), np.delete(y_pool_anom, query_idx, axis=0), np.delete(y_pool_app, query_idx, axis=0)
            y_pred = learner.predict(test_inputs_data.to_numpy())              

            report_dict = classification_report(test_inputs_label['anom'].to_numpy(), y_pred, output_dict = True)
            macro_f1_scores.append(report_dict['macro avg']['f1-score'])                                        
            call_FAR_function(false_alarm_rates,anomaly_miss_rates, test_inputs_label, y_pred, conf)    

        for j in range(len(macro_f1_scores)):
            scores = scores.append({'query_iter':j,
                                    'macro_avg_f1_score':macro_f1_scores[j],
                                    'false_alarm_rate':false_alarm_rates[j],
                                    'anomaly_miss_rate':anomaly_miss_rates[j], 
                                    'repeat_num':repeat_num},
                                   ignore_index = True)

        scores['fold'] = CV_INDEX
        scores['method'] = method
        scores['query_strategy'] = query_strategy
        scores['model'] = selected_classifier.__class__.__name__
        scores['dataset'] = SYSTEM
        scores['fe'] = FE_NAME
        scores['feature_count'] = NUM_FEATURE   
        scores['query_size'] = query_size

        scores = scores.sort_values(by = ['query_iter']).reset_index(drop = True)

        train_input_names = '-'.join(train_inputs)
        test_input_names = '-'.join(test_inputs)    

        filename = f'train:{train_input_names}#test:{test_input_names}#{FE_NAME}#{NUM_FEATURE}#{method}#{query_strategy}#{query_size}#{selected_classifier.__class__.__name__}#{repeat_num}.csv'
        scores.to_csv(Path(conf["results_dir"]) / filename)   

        logging.info("Saving: %s", filename)

        if query_strategy != "random":
            selected_app_anom_df = pd.DataFrame()
            selected_app_anom_df['apps'] = selected_indices_apps
            selected_app_anom_df['anoms'] = selected_indices_anoms
            selected_app_anom_df['inputs'] = selected_indices_inputs
            selected_app_anom_df.to_csv(
            Path(conf["results_dir"])
            / f"train:{train_input_names}#test:{test_input_names}#{FE_NAME}#{NUM_FEATURE}#{method}#{query_strategy}#{query_size}#{selected_classifier.__class__.__name__}#{repeat_num}#app-anom-selection.csv",
            index=False)
            logging.info("Saved selected apps and anoms")    
                
        #break


2022-04-24 19:16:13,447 INFO    Test inputs: ['Z', 'Y']
2022-04-24 19:16:13,448 INFO    Train inputs: ['X']
2022-04-24 19:16:13,882 INFO    Labeled data label dist: 
2    12
4    11
3    11
1    11
0    11
Name: anom, dtype: int64
2022-04-24 19:16:13,886 INFO    Labeled data label dist: 
3     6
10    5
9     5
8     5
7     5
6     5
5     5
4     5
2     5
1     5
0     5
Name: app, dtype: int64
2022-04-24 19:16:13,888 INFO    Labeled data label dist: 
X    19
Z    19
Y    18
Name: input, dtype: int64
2022-04-24 19:16:13,890 INFO    ########################
2022-04-24 19:16:13,892 INFO    Unlabeled data label dist: 
0    5683
2     148
4     148
1     147
3     145
Name: anom, dtype: int64
2022-04-24 19:16:13,895 INFO    Unlabeled data label dist: 
9     582
10    581
0     581
3     580
2     580
1     580
4     580
8     579
6     554
5     549
7     525
Name: app, dtype: int64
2022-04-24 19:16:13,898 INFO    Unlabeled data label dist: 
Y    2120
X    2107
Z    2044
Name: input, dt

CPU times: user 12min 44s, sys: 1min 9s, total: 13min 53s
Wall time: 13min 53s


In [80]:
scores

Unnamed: 0,anomaly_miss_rate,false_alarm_rate,macro_avg_f1_score,query_iter,repeat_num,fold,method,query_strategy,model,dataset,fe,feature_count,query_size
0,0.233220,0.898713,0.186846,0.0,0.0,0,random,random,RandomForestClassifier,volta,tsfresh,2000,250
1,0.207763,0.474932,0.186357,0.0,0.0,0,random,random,RandomForestClassifier,volta,tsfresh,2000,250
2,0.188439,0.916573,0.093996,0.0,0.0,0,random,random,RandomForestClassifier,volta,tsfresh,2000,250
3,0.209906,0.545354,0.157772,0.0,0.0,0,random,random,RandomForestClassifier,volta,tsfresh,2000,250
4,0.103248,0.727537,0.157635,0.0,0.0,0,random,random,RandomForestClassifier,volta,tsfresh,2000,250
5,0.208617,0.849161,0.063806,0.0,0.0,0,random,random,RandomForestClassifier,volta,tsfresh,2000,250
6,0.265306,0.708439,0.100443,1.0,0.0,0,random,random,RandomForestClassifier,volta,tsfresh,2000,250
7,0.205479,0.456865,0.204659,1.0,0.0,0,random,random,RandomForestClassifier,volta,tsfresh,2000,250
8,0.191038,0.472832,0.186067,1.0,0.0,0,random,random,RandomForestClassifier,volta,tsfresh,2000,250
9,0.161253,0.645742,0.153744,1.0,0.0,0,random,random,RandomForestClassifier,volta,tsfresh,2000,250
