# Base algorithm for features' importance classification for RandomForest

In [7]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
import pickle
from lime.lime_tabular import LimeTabularExplainer

labels = ["NoEvents", "Attack", "Natural"]

X = pd.read_csv("Data/data%d.csv"%1) #read file

X = X.replace(np.inf, np.finfo(np.float32).max) #replacing 'inf' with its equivalent in float32 datatype

#preparing the label converter
le = preprocessing.LabelEncoder()
le.fit(labels)

#assigning the training data and the labels into variables
y = le.transform(X['marker'])
X = X.drop(columns='marker')

features = list(X.columns)

clf = RandomForestClassifier(n_estimators=100, max_features = 'log2')
X=X.values

clf.fit(X,y)

explainer = LimeTabularExplainer(X, training_labels = y, feature_names = features, class_names = labels)


Xall = []
for i in range(2,16):
    Xall.append(pd.read_csv("Data/data%d.csv"%i))

Xall = pd.concat(Xall)
Xall = Xall.replace(np.inf, np.finfo(np.float32).max) #replacing 'inf' with its equivalent in float32 datatype

#assigning the training data and the labels into variables
yall = le.transform(Xall['marker'])
Xall = Xall.drop(columns='marker')

boole = (yall != clf.predict(Xall))

res = []

for act_class in [0, 1, 2]:
    faulty = boole & (yall == act_class)
    X_test = Xall[faulty]
    y_test = yall[faulty]
    lst = []
    for idx in range(0, 100):
        exp = explainer.explain_instance(X_test.iloc[idx], clf.predict_proba, num_features=128, labels=[0, 1, 2])
        lst.append(exp.as_list(label=0))
    lst = np.array(lst)
    clst = np.concatenate(lst, axis=0)
    dtfr = pd.DataFrame(clst, columns=['feature', 'importance'])
    dtfr["importance"] = pd.to_numeric(dtfr["importance"])
    dtfr = dtfr.groupby(['feature']).mean()
    res.append(dtfr.sort_values(by="importance"))
  


In [8]:
res 

[                       importance
 feature                          
 R4-PM8:V > 0.00         -0.012310
 R3:F > 60.00            -0.011923
 R1-PA3:VH > 78.00       -0.011004
 R3-PA7:VH <= -101.22    -0.010964
 R3-PA10:IH > 102.93     -0.010511
 ...                           ...
 R4-PA4:IH <= -97.95      0.005985
 R2-PM4:I <= 320.95       0.006062
 R1-PA:Z > 12.43          0.006171
 R3-PM2:V <= 128425.29    0.007994
 R4-PA2:VH > 117.68       0.010149
 
 [368 rows x 1 columns],                               importance
 feature                                 
 R3:F > 60.00                   -0.010821
 R1-PA3:VH > 78.00              -0.010649
 R3-PA1:VH <= -101.21           -0.009657
 R3-PA7:VH <= -101.22           -0.009517
 R3-PA10:IH > 102.93            -0.009034
 ...                                  ...
 R2-PA2:VH > 114.00              0.009784
 R2-PA6:IH <= -114.38            0.010825
 -97.40 < R1-PA1:VH <= -35.86    0.010908
 R4-PA2:VH > 117.68              0.010962
 -97.43 < R1-PA

# Choosen features values modification

In [9]:
from sklearn.metrics import classification_report

print(classification_report(yall, clf.predict(Xall), labels=[0,1,2], target_names=labels))

precision    recall  f1-score   support

    NoEvents       0.72      0.87      0.78     51797
      Attack       0.29      0.16      0.21     17382
     Natural       0.25      0.05      0.08      4232

    accuracy                           0.65     73411
   macro avg       0.42      0.36      0.36     73411
weighted avg       0.59      0.65      0.61     73411



In [10]:
Xmod = Xall.copy()

def modify(feat, val):
    Xmod[feat] =Xmod[feat].apply(lambda x: x + val)


modify("R4-PA2:VH", -117.68)
modify("R3-PM2:V", 128525.29)
modify("R1-PA:Z", -12.43)
modify("R2-PM4:I", 320.95)
modify("R4-PA4:IH", -97.95)

modify("R1-PA7:VH", 61.58)
modify("R4-PA2:VH", 117.68)
modify("R1-PA1:VH", 61.54)
modify("R2-PA6:IH", 114.38)
modify("R2-PA2:VH", -114)

modify("R3-PM2:V", 128425.29)
modify("R1-PA:Z", -12.43)
modify("R2-PM4:I", 320.95)
modify("R3-PA7:VH", -65.92)
modify("R3-PA10:IH", 97.29)


In [11]:
print(classification_report(yall, clf.predict(Xmod), labels=[0,1,2], target_names=labels))

precision    recall  f1-score   support

    NoEvents       0.71      0.89      0.79     51797
      Attack       0.28      0.13      0.18     17382
     Natural       0.26      0.05      0.08      4232

    accuracy                           0.66     73411
   macro avg       0.42      0.36      0.35     73411
weighted avg       0.58      0.66      0.61     73411



# Distance calculation 

In [2]:
class Distance:
    def __init__(self):
        self.noevents = None
        self.attack = None
        self.natural = None

    def distance(self, X1, X2):
        return np.abs(((X1 - X2).sum()))

    def important(self, X):
        return X[["R4-PA4:IH" , "R2-PM4:I", "R1-PA:Z", "R3-PM2:V", "R4-PA2:VH", "R2-PA2:VH", "R2-PA6:IH", "R1-PA1:VH", "R4-PA2:VH", "R1-PA7:VH", "R3-PA10:IH", "R3-PA7:VH", "R2-PM4:I", "R1-PA:Z", "R3-PM2:V"]]

    def fit(self, X, y):
        Xnew = self.important(X)
        self.noevents = Xnew[y == 0].mean(axis=0)
        self.attack = Xnew[y == 1].mean(axis=0)
        self.natural = Xnew[y == 2].mean(axis=0)     
        return self

    def transform(self, X):
        Xnew = self.important(X)
        res = np.c_[np.apply_along_axis(lambda x: self.distance(x, self.noevents), axis=1, arr=Xnew), np.apply_along_axis(lambda x: self.distance(x, self.attack), axis=1, arr=Xnew), np.apply_along_axis(lambda x: self.distance(x, self.natural), axis=1, arr=Xnew)]
        return np.c_[Xnew, np.argmin(res, axis=1)]


In [6]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np
from sklearn import preprocessing

labels = ["NoEvents", "Attack", "Natural"]

X = pd.read_csv("Data/data%d.csv"%1) #read file

X = X.replace(np.inf, np.finfo(np.float32).max) #replacing 'inf' with its equivalent in float32 datatype

#preparing the label converter
le = preprocessing.LabelEncoder()
le.fit(labels)

#assigning the training data and the labels into variables
y = le.transform(X['marker'])
X = X.drop(columns='marker')

pipe = Pipeline([('dist', Distance()) , ('RandomForest', RandomForestClassifier(n_estimators=100, max_features = 'log2'))])
pipe.fit(X,y)

Pipeline(steps=[('dist', <__main__.Distance object at 0x00000262202DFF88>),
                ('RandomForest', RandomForestClassifier(max_features='log2'))])

In [7]:
Xall = []
for i in range(2,16):
    Xall.append(pd.read_csv("Data/data%d.csv"%i))

Xall = pd.concat(Xall)
Xall = Xall.replace(np.inf, np.finfo(np.float32).max) #replacing 'inf' with its equivalent in float32 datatype

#assigning the training data and the labels into variables
yall = le.transform(Xall['marker'])
Xall = Xall.drop(columns='marker')

from sklearn.metrics import classification_report
print(classification_report(yall, pipe.predict(Xall), labels=[0,1,2], target_names=labels))

precision    recall  f1-score   support

    NoEvents       0.71      0.83      0.77     51797
      Attack       0.27      0.19      0.22     17382
     Natural       0.18      0.05      0.07      4232

    accuracy                           0.63     73411
   macro avg       0.39      0.36      0.36     73411
weighted avg       0.58      0.63      0.60     73411



# Hidden Markov Models

In [16]:
from hmmlearn.hmm import GaussianHMM
clf2 = GaussianHMM(3)
clf2.fit(X)

GaussianHMM(n_components=3)

In [17]:
coefs = clf2.get_stationary_distribution()

In [18]:
ctest = RandomForestClassifier(n_estimators = 100, max_features = 'log2', class_weight = {0: coefs[0], 1:coefs[1], 2:coefs[2]})
ctest1 = RandomForestClassifier(n_estimators = 100, max_features = 'log2' ,class_weight = {0: coefs[0], 1:coefs[1], 2:coefs[2]}, criterion="entropy")
ctest2= RandomForestClassifier(n_estimators = 100, max_features = 'log2', class_weight = "balanced")
ctest3= RandomForestClassifier(n_estimators = 100, max_features = 'log2', class_weight = "balanced", criterion= "entropy")

In [19]:
ctest.fit(X,y)
ctest1.fit(X,y)
ctest2.fit(X,y)
ctest3.fit(X,y)

clf3= RandomForestClassifier(n_estimators = 100, max_features = 'log2', criterion="entropy")
clf3.fit(X,y)


from sklearn.metrics import classification_report
print(classification_report(yall, clf.predict(Xall), labels=[0,1,2], target_names=labels))
print(classification_report(yall, clf3.predict(Xall), labels=[0,1,2], target_names=labels))
print(classification_report(yall, ctest.predict(Xall), labels=[0,1,2], target_names=labels))
print(classification_report(yall, ctest1.predict(Xall), labels=[0,1,2], target_names=labels))
print(classification_report(yall, ctest2.predict(Xall), labels=[0,1,2], target_names=labels))
print(classification_report(yall, ctest3.predict(Xall), labels=[0,1,2], target_names=labels))

precision    recall  f1-score   support

    NoEvents       0.72      0.87      0.78     51797
      Attack       0.29      0.16      0.21     17382
     Natural       0.25      0.05      0.08      4232

    accuracy                           0.65     73411
   macro avg       0.42      0.36      0.36     73411
weighted avg       0.59      0.65      0.61     73411

              precision    recall  f1-score   support

    NoEvents       0.71      0.86      0.78     51797
      Attack       0.29      0.17      0.21     17382
     Natural       0.28      0.05      0.09      4232

    accuracy                           0.65     73411
   macro avg       0.43      0.36      0.36     73411
weighted avg       0.59      0.65      0.61     73411

              precision    recall  f1-score   support

    NoEvents       0.72      0.85      0.78     51797
      Attack       0.30      0.19      0.24     17382
     Natural       0.25      0.05      0.09      4232

    accuracy                      