# Base algorithm for features' importance classification for MLP

In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
import pickle
from lime.lime_tabular import LimeTabularExplainer

labels = ["NoEvents", "Attack", "Natural"]

X = pd.read_csv("Data/data%d.csv"%1) #read file

X = X.replace(np.inf, np.finfo(np.float32).max) #replacing 'inf' with its equivalent in float32 datatype

#preparing the label converter
le = preprocessing.LabelEncoder()
le.fit(labels)

#assigning the training data and the labels into variables
y = le.transform(X['marker'])
X = X.drop(columns='marker')

features = list(X.columns)

clf = MLPClassifier(hidden_layer_sizes=(20,), max_iter=1000, early_stopping=True)
X=X.values

clf.fit(X,y)

explainer = LimeTabularExplainer(X, training_labels = y, feature_names = features, class_names = labels)


Xall = []
for i in range(2,16):
    Xall.append(pd.read_csv("Data/data%d.csv"%i))

Xall = pd.concat(Xall)
Xall = Xall.replace(np.inf, np.finfo(np.float32).max) #replacing 'inf' with its equivalent in float32 datatype

#assigning the training data and the labels into variables
yall = le.transform(Xall['marker'])
Xall = Xall.drop(columns='marker')

boole = (yall != clf.predict(Xall))

res = []

for act_class in [0, 1, 2]:
    faulty = boole & (yall == act_class)
    X_test = Xall[faulty]
    y_test = yall[faulty]
    lst = []
    for idx in range(0, 100):
        exp = explainer.explain_instance(X_test.iloc[idx], clf.predict_proba, num_features=128, labels=[0, 1, 2])
        lst.append(exp.as_list(label=0))
    lst = np.array(lst)
    clst = np.concatenate(lst, axis=0)
    dtfr = pd.DataFrame(clst, columns=['feature', 'importance'])
    dtfr["importance"] = pd.to_numeric(dtfr["importance"])
    dtfr = dtfr.groupby(['feature']).mean()
    res.append(dtfr.sort_values(by="importance"))
  


In [2]:
res 

[                       importance
 feature                          
 R1-PA9:VH > 0.00        -0.155915
 R3-PM1:V <= 128600.80   -0.151731
 R2-PA:Z > 12.11         -0.120849
 R3-PM9:V <= 0.00        -0.075508
 R4-PM8:V <= 0.00        -0.071103
 ...                           ...
 R1-PM7:V > 132060.91     0.042774
 R1:S > 0.00              0.053059
 R3-PM9:V > 0.00          0.066984
 R3-PM3:V <= 128676.02    0.082892
 R4-PM8:V > 0.00          0.111197
 
 [316 rows x 1 columns],                                    importance
 feature                                      
 R3-PM1:V <= 128600.80               -0.149137
 R2-PA:Z > 12.11                     -0.118818
 R3-PM9:V <= 0.00                    -0.075927
 R4-PM8:V <= 0.00                    -0.070603
 R2-PM1:V <= 128762.21               -0.066452
 ...                                       ...
 R3-PM7:V <= 128600.80                0.041327
 128600.80 < R3-PM1:V <= 129704.03    0.041568
 129704.03 < R3-PM1:V <= 130631.74    0.052262
 R

# Choosen features values modification

In [3]:
from sklearn.metrics import classification_report

print(classification_report(yall, clf.predict(Xall), labels=[0,1,2], target_names=labels))

precision    recall  f1-score   support

    NoEvents       0.71      0.99      0.83     51797
      Attack       0.57      0.05      0.09     17382
     Natural       0.00      0.00      0.00      4232

    accuracy                           0.71     73411
   macro avg       0.43      0.35      0.30     73411
weighted avg       0.64      0.71      0.60     73411



In [4]:
Xmod = Xall.copy()

def modify(feat, val):
    Xmod[feat] =Xmod[feat].apply(lambda x: x + val)


modify("R4-PM8:V", 0)
modify("R3-PM3:V", 128676.02)
modify("R3-PM9:V", 0)
modify("R1:S", 0)
modify("R1-PM7:V", -132060.91)

modify("R3-PM3:V", 128676.02)
modify("R3-PM1:V", -130631.74)
modify("R3-PM7:V", 128600.80)

modify("R3-PM3:V", 128676.02)
modify("R2-PA:Z", 4)


In [5]:
print(classification_report(yall, clf.predict(Xmod), labels=[0,1,2], target_names=labels))

precision    recall  f1-score   support

    NoEvents       0.74      0.07      0.12     51797
      Attack       0.24      0.93      0.38     17382
     Natural       0.00      0.00      0.00      4232

    accuracy                           0.27     73411
   macro avg       0.33      0.33      0.17     73411
weighted avg       0.58      0.27      0.17     73411



# Distance calculation 

In [6]:
class Distance:
    def __init__(self):
        self.noevents = None
        self.attack = None
        self.natural = None

    def distance(self, X1, X2):
        return np.abs(((X1 - X2).sum()))

    def important(self, X):
        return X[["R4-PM8:V","R3-PM3:V","R3-PM9:V","R1:S","R1-PM7:V", "R3-PM3:V","R3-PM1:V", "R3-PM7:V","R3-PM3:V","R2-PA:Z"]]
   
    def fit(self, X, y):
        Xnew = self.important(X)
        self.noevents = Xnew[y == 0].mean(axis=0)
        self.attack = Xnew[y == 1].mean(axis=0)
        self.natural = Xnew[y == 2].mean(axis=0)     
        return self

    def transform(self, X):
        Xnew = self.important(X)
        res = np.c_[np.apply_along_axis(lambda x: self.distance(x, self.noevents), axis=1, arr=Xnew), np.apply_along_axis(lambda x: self.distance(x, self.attack), axis=1, arr=Xnew), np.apply_along_axis(lambda x: self.distance(x, self.natural), axis=1, arr=Xnew)]
        return np.c_[Xnew, np.argmin(res, axis=1)]


In [8]:
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
import numpy as np
from sklearn import preprocessing

labels = ["NoEvents", "Attack", "Natural"]

X = pd.read_csv("Data/data%d.csv"%1) #read file

X = X.replace(np.inf, np.finfo(np.float32).max) #replacing 'inf' with its equivalent in float32 datatype

#preparing the label converter
le = preprocessing.LabelEncoder()
le.fit(labels)

#assigning the training data and the labels into variables
y = le.transform(X['marker'])
X = X.drop(columns='marker')

pipe = Pipeline([('dist', Distance()) , ('MLP', MLPClassifier(hidden_layer_sizes=(20,), max_iter=1000, early_stopping=True))])
pipe.fit(X,y)

Pipeline(steps=[('dist', <__main__.Distance object at 0x0000023C52FBC9C8>),
                ('MLP',
                 MLPClassifier(early_stopping=True, hidden_layer_sizes=(20,),
                               max_iter=1000))])

In [9]:
Xall = []
for i in range(2,16):
    Xall.append(pd.read_csv("Data/data%d.csv"%i))

Xall = pd.concat(Xall)
Xall = Xall.replace(np.inf, np.finfo(np.float32).max) #replacing 'inf' with its equivalent in float32 datatype

#assigning the training data and the labels into variables
yall = le.transform(Xall['marker'])
Xall = Xall.drop(columns='marker')

from sklearn.metrics import classification_report
print(classification_report(yall, pipe.predict(Xall), labels=[0,1,2], target_names=labels))

precision    recall  f1-score   support

    NoEvents       0.70      0.98      0.82     51797
      Attack       0.18      0.01      0.02     17382
     Natural       0.00      0.00      0.00      4232

    accuracy                           0.69     73411
   macro avg       0.29      0.33      0.28     73411
weighted avg       0.54      0.69      0.58     73411

