In [None]:
!pip install imbalanced-learn
from imblearn.over_sampling import SMOTE
from collections import Counter
import pandas as pd
import numpy as np




Binary

In [None]:
def process_alarms_bi(df):
    """Binarize every alarm column of ``df`` in place.

    Each column whose name starts with ``'alarm'`` is rewritten so that values
    strictly greater than 0 become 1 and every other value becomes 0. Columns
    with any other name are left untouched.

    Args:
        df: DataFrame to transform. It is modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    def _to_flag(value):
        # Anything that is not strictly positive collapses to 0.
        if value > 0:
            return 1
        return 0

    # Resolve the target columns up front, before any column is reassigned.
    targets = tuple(name for name in df.columns if name.startswith('alarm'))
    for name in targets:
        df[name] = df[name].apply(_to_flag)
    return df

# Load the alarm dataset; the relative path is resolved against the working directory.
df = pd.read_csv('dataset_alarms_anonymized.csv')

# process_alarms_bi rewrites the alarm columns of `df` in place and returns the
# same object, so `df_processed` and `df` refer to the same DataFrame.
df_processed = process_alarms_bi(df)

# Display the column at integer position 14 to inspect the binarized values.
df.iloc[:, 14]

0       0
1       1
2       0
3       1
4       1
       ..
1664    1
1665    0
1666    1
1667    1
1668    1
Name: alarm_14, Length: 1669, dtype: int64

Categorical

In [None]:
def process_alarms_cat(df, low=45, high=450):
    """Bucket every alarm column of ``df`` into four ordinal categories, in place.

    Each column whose name starts with ``'alarm'`` is rewritten as follows:

        * 0 -- value is not strictly positive (also used for NaN)
        * 1 -- 0 < value <= low
        * 2 -- low < value <= high
        * 3 -- value > high

    Columns with any other name are left untouched.

    Args:
        df: DataFrame to transform. It is modified in place.
        low: Upper edge (inclusive) of category 1. Defaults to 45.
        high: Upper edge (inclusive) of category 2. Defaults to 450.

    Returns:
        The same DataFrame object that was passed in.

    Raises:
        ValueError: If the thresholds do not satisfy ``0 <= low <= high``.
    """
    if not 0 <= low <= high:
        raise ValueError(
            f"thresholds must satisfy 0 <= low <= high, got low={low}, high={high}"
        )

    def _bucket(value):
        # Checked from the highest bucket down so each test only needs one bound.
        if value > high:
            return 3
        if value > low:
            return 2
        if value > 0:
            return 1
        return 0

    alarm_columns = [col for col in df.columns if col.startswith('alarm')]
    for col in alarm_columns:
        df[col] = df[col].apply(_bucket)
    return df

# Reload the raw dataset so the categorical encoding starts from the original values.
df = pd.read_csv('dataset_alarms_anonymized.csv')


# process_alarms_cat rewrites the alarm columns of `df` in place and returns the
# same object, so `df_processed` and `df` refer to the same DataFrame.
df_processed = process_alarms_cat(df)

# Display the column at integer position 14.
df.iloc[:, 14]



Unnamed: 0,alarm_0,alarm_1,alarm_2,alarm_3,alarm_4,alarm_5,alarm_6,alarm_7,alarm_8,alarm_9,...,alarm_155,alarm_156,alarm_157,alarm_158,alarm_159,alarm_160,alarm_161,alarm_162,alarm_163,label
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1664,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,3
1665,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,3
1666,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1667,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3


In [None]:
import time
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
import hyperopt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier

def optimize_KNN(X_train, y_train, X_valid, y_valid, data_gen_func):
    """Tune a KNN classifier with hyperopt, then refit it on the full training set.

    The function:
        * searches ``n_neighbors`` and ``weights`` with hyperopt (TPE), scoring
          each candidate by 5-fold cross-validation on the training set;
        * prints the best hyperparameters, the best cross-validation accuracy
          and the search duration;
        * trains a new KNN model with the best hyperparameters on the entire
          training set and prints the training duration.

    Args:
        X_train: Training set (features).
        y_train: Training set (ground truth labels).
        X_valid: Accepted for interface compatibility; not used by this function.
        y_valid: Accepted for interface compatibility; not used by this function.
        data_gen_func: Accepted for interface compatibility; not used by this function.

    Returns:
        The KNeighborsClassifier fitted on (X_train, y_train) with the best
        hyperparameters found.
    """
    #F: define the search space for your hyperparameters
    space4knn = {
        'neighb': hp.choice('neighb', [1, 5, 10, 20, 50, 100]),
        'wgts': hp.choice('wgts', ['uniform', 'distance']),
    }

    def hyperopt_train_test(params):
        model = KNeighborsClassifier(n_neighbors=params['neighb'], weights=params['wgts'])

        #F: cross_val_score is from scikit learn (https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_score.html)
        #F: will use the estimator's default score (for a classifier such as KNN it is accuracy)
        #F: this includes also training; cv=5 (5-folds crossvalidation)
        #F: .mean() is taken as cross_val_score returns an array of scores (one for each fold)
        return cross_val_score(model, X_train, y_train, cv=5).mean()

    def f(params):  #F: this function is used below, as a parameter to fmin
        acc = hyperopt_train_test(params)
        #F: loss is the opposite (negative) of accuracy because fmin only minimizes,
        #   so minimizing the loss maximizes the accuracy
        return {'loss': -acc, 'status': STATUS_OK}

    #F: an object that keeps track of all trials (i.e., combinations of hyperparameters) tested during the optimization
    trials = Trials()

    ta = time.time()
    best_params = fmin(f, space4knn, algo=tpe.suggest, max_evals=10, trials=trials)
    #F: see: https://github.com/hyperopt/hyperopt/blob/master/hyperopt/fmin.py
    #F: at this point, best_params is a dictionary where each key maps to the index
    #   of the corresponding best value in space4knn
    tb = time.time()
    print(best_params)

    #F: translate the indexes in best_params into the actual values from space4knn
    best_params = hyperopt.space_eval(space4knn, best_params)
    print(best_params)

    best_cv_acc = -round(trials.best_trial['result']['loss'], 2)  #F: best across trials
    print('best_cv_acc: ' + str(best_cv_acc))
    print('Crossvalidation duration for KNN is {} s\n'.format(round(tb - ta)))

    #######################################################################################################
    #F: Now that the best hyperparameters are known, train a new model on the
    #   entire training set with those hyperparameters and return it
    knn = KNeighborsClassifier(n_neighbors=best_params['neighb'], weights=best_params['wgts'])

    t0 = time.time()
    knn.fit(X_train, y_train)
    t1 = time.time()

    print('Best number of neighbors: {}\n'.format(best_params['neighb']))
    print('Best weight function: {}\n'.format(best_params['wgts']))
    print('Crossvalidation accuracy: {}\n'.format(best_cv_acc))
    print('Training duration for KNN is {} s\n'.format(round(t1 - t0)))

    return knn

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 36)

In [None]:
#Add a fixed number of datapoints per class WITH SMOTE (e.g., add 300 data points per each class)
#used for DataGen-2.2
#n_samples:the number points we want to add
def add_datapoints_SMOTE(data, label_column='label', n_samples=300, random_state=42):
    """Append ``n_samples`` SMOTE-generated rows to every class of ``data``.

    For each distinct label, SMOTE is run on the whole dataset with a
    ``sampling_strategy`` that raises only that class to its current size plus
    ``n_samples``. Only the newly generated rows are kept and appended to the
    original data. The class distribution is printed before and after.

    Args:
        data: DataFrame holding the features and the label column.
        label_column: Name of the label column. Defaults to ``'label'``.
        n_samples: Number of synthetic rows to add per class. Defaults to 300.
        random_state: Seed passed to every SMOTE instance. Defaults to 42.

    Returns:
        A new DataFrame with the original rows followed by the synthetic rows
        and a fresh 0..n-1 index. If ``data`` contains no classes, the result
        is a re-indexed copy of ``data``.
    """
    print(f"label distribution before adding points: {Counter(data[label_column])}")

    synthetic_frames = []

    # get all the classes
    classes = data[label_column].unique()

    for class_label in classes:
        class_data = data[data[label_column] == class_label]
        other_data = data[data[label_column] != class_label]
        n_original = len(class_data)

        # Put the rows of the current class first, followed by all other rows.
        X_smote = pd.concat([class_data.drop(columns=label_column),
                             other_data.drop(columns=label_column)])
        y_smote = pd.concat([class_data[label_column], other_data[label_column]])

        # add points: oversample only the current class
        smote = SMOTE(sampling_strategy={class_label: n_original + n_samples},
                      random_state=random_state)
        X_resampled, y_resampled = smote.fit_resample(X_smote, y_smote)

        # Keep only the rows of this class that come after the original ones,
        # i.e. the rows SMOTE generated.
        in_class = y_resampled == class_label
        new_class_data = X_resampled[in_class].iloc[n_original:]
        new_class_labels = y_resampled[in_class].iloc[n_original:]

        synthetic_frames.append(
            pd.concat([new_class_data, new_class_labels.rename(label_column)], axis=1)
        )

    # Always bind result_data, even when no class was found, so the summary
    # print and the return below cannot hit an unbound variable.
    result_data = pd.concat([data] + synthetic_frames, axis=0).reset_index(drop=True)

    print(f"label distribution after adding points: {Counter(result_data[label_column])}")

    return result_data

# NOTE(review): in the visible notebook `df_resampled` is only assigned in the
# balance_smote cell further down; on a fresh kernel that cell must run first.
result_data = add_datapoints_SMOTE(df_resampled, n_samples=300)

label distribution before adding points: Counter({1: 611, 0: 611, 3: 611, 2: 611})




label distribution after adding points: Counter({1: 911, 0: 911, 3: 911, 2: 911})




In [None]:
#Remove X% of the data points of class Y from the training set
#used for DataGen-2
#class_label:0/1/2/3
def remove_class_percentage(data, class_label, percentage, label_column='label', random_state=None):
    """Randomly drop a percentage of the rows belonging to one class.

    The class distribution is printed before and after the removal.

    Args:
        data: DataFrame holding the features and the label column.
        class_label: Label value of the class to thin out.
        percentage: Share of that class's rows to remove, from 0 to 100.
            The number of rows removed is truncated towards zero.
        label_column: Name of the label column. Defaults to ``'label'``.
        random_state: Optional seed for a reproducible selection. When ``None``
            (the default) NumPy's global random state is used.

    Returns:
        A new DataFrame without the removed rows; ``data`` is not modified and
        the surviving rows keep their original index labels.

    Raises:
        ValueError: If ``percentage`` is outside the range 0..100.
    """
    if not 0 <= percentage <= 100:
        raise ValueError(f"percentage must be between 0 and 100, got {percentage}")

    print(f"label distribution before remove points: {Counter(data[label_column])}")

    class_data = data[data[label_column] == class_label]
    # find the number of rows that need to be deleted
    num_to_remove = int(len(class_data) * (percentage / 100))

    # A dedicated generator keeps seeded runs independent of the global state;
    # without a seed, fall back to the global state.
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    indices_to_remove = rng.choice(class_data.index, size=num_to_remove, replace=False)

    data_remaining = data.drop(indices_to_remove)
    print(f"label distribution after remove points: {Counter(data_remaining[label_column])}")

    return data_remaining

# Drop 80% of the class-2 rows of df_processed (the rows are chosen at random).
# NOTE(review): this rebinds `result_data`, which the SMOTE cell above also assigns.
result_data = remove_class_percentage(df_processed, class_label=2, percentage=80)



label distribution before remove points: Counter({1: 611, 0: 515, 3: 336, 2: 207})
label distribution after remove points: Counter({1: 611, 0: 515, 3: 336, 2: 42})


In [None]:
#used for DataGen-1,DataGen-2.1
#input:the data needed to be rebalance
def balance_smote(data, label_column='label', random_state=42):
    """Rebalance the classes of ``data`` with SMOTE.

    The label column is split from the features, SMOTE (with its default
    sampling strategy) is fitted and applied, and the resampled features and
    labels are assembled into a single DataFrame. The class distribution is
    printed before and after resampling.

    Args:
        data: DataFrame holding the features and the label column.
        label_column: Name of the label column. Defaults to ``'label'``.
        random_state: Seed passed to SMOTE. Defaults to 42.

    Returns:
        A DataFrame with the resampled feature columns followed by the label column.
    """
    features = data.drop(columns=label_column)
    target = data[label_column]
    print(f"label distribution before: {Counter(target)}")

    sampler = SMOTE(random_state=random_state)
    features_bal, target_bal = sampler.fit_resample(features, target)
    print(f"label distribution after: {Counter(target_bal)}")

    # Rebuild one frame: feature columns first, label column last.
    features_frame = pd.DataFrame(features_bal, columns=features.columns)
    target_frame = pd.DataFrame(target_bal, columns=[label_column])
    return pd.concat([features_frame, target_frame], axis=1)
# NOTE(review): `data` is not assigned anywhere in the visible cells; confirm which
# DataFrame is meant to be balanced here (presumably the processed alarm frame).
df_resampled = balance_smote(data)
# Show the balanced DataFrame as the cell output.
df_resampled


label distribution before: Counter({1: 611, 0: 515, 3: 336, 2: 207})
label distribution after: Counter({1: 611, 0: 611, 3: 611, 2: 611})


Unnamed: 0,alarm_0,alarm_1,alarm_2,alarm_3,alarm_4,alarm_5,alarm_6,alarm_7,alarm_8,alarm_9,...,alarm_155,alarm_156,alarm_157,alarm_158,alarm_159,alarm_160,alarm_161,alarm_162,alarm_163,label
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2439,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3
2440,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3
2441,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3
2442,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,3
