# Base algorithm for features' importance classification

In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
import pickle
from lime.lime_tabular import LimeTabularExplainer

labels = ["NoEvents", "Attack", "Natural"]

X = pd.read_csv("Data/data%d.csv"%1) #read file

X = X.replace(np.inf, np.finfo(np.float32).max) #replacing 'inf' with its equivalent in float32 datatype

#preparing the label converter
le = preprocessing.LabelEncoder()
le.fit(labels)

#assigning the training data and the labels into variables
y = le.transform(X['marker'])
X = X.drop(columns='marker')

features = list(X.columns)

clf = DecisionTreeClassifier()
X=X.values

clf.fit(X,y)

explainer = LimeTabularExplainer(X, training_labels = y, feature_names = features, class_names = labels)


Xall = []
for i in range(2,16):
    Xall.append(pd.read_csv("Data/data%d.csv"%i))

Xall = pd.concat(Xall)
Xall = Xall.replace(np.inf, np.finfo(np.float32).max) #replacing 'inf' with its equivalent in float32 datatype

#assigning the training data and the labels into variables
yall = le.transform(Xall['marker'])
Xall = Xall.drop(columns='marker')

# boole = (yall != clf.predict(Xall))

# faulty = boole & (yall == 0)
# X_test = Xall[faulty]
# y_test = yall[faulty]
# lst = []
# for idx in range(0, 100):
#     exp = explainer.explain_instance(X_test[idx], clf.predict_proba, num_features=128, labels=[0, 1, 2])
#     lst.append(exp.as_list(label=0))
# lst = np.array(lst)
# clst = np.concatenate(lst, axis=0)
# dtfr = pd.DataFrame(clst, columns=['feature', 'importance'])
# dtfr["importance"] = pd.to_numeric(dtfr["importance"])
# dtfr = dtfr.groupby(['feature']).mean()
# res = dtfr.sort_values(by="importance")
  


In [73]:
res 

Unnamed: 0_level_0,importance
feature,Unnamed: 1_level_1
R3:F > 60.00,-0.016330
R2-PA7:VH <= -101.20,-0.010771
R4-PA2:VH <= -95.89,-0.008911
R2-PA6:IH > 81.51,-0.007957
R1-PA1:VH > 71.28,-0.007917
...,...
R3-PM6:I <= 318.25,0.005361
R1-PA:Z > 12.43,0.005471
R4-PA5:IH > 115.38,0.005963
R1-PA1:VH <= -97.40,0.006100


# Choosen features values modification

In [2]:
from sklearn.metrics import classification_report

print(classification_report(yall, clf.predict(Xall), labels=[0,1,2], target_names=labels))

precision    recall  f1-score   support

    NoEvents       0.72      0.76      0.74     51797
      Attack       0.27      0.26      0.26     17382
     Natural       0.19      0.08      0.12      4232

    accuracy                           0.60     73411
   macro avg       0.39      0.37      0.37     73411
weighted avg       0.58      0.60      0.59     73411



In [27]:
Xmod = Xall.copy()

def modify(feat, val):
    Xmod[feat] =Xmod[feat].apply(lambda x: x + val)


modify("R4-PA5:IH", -115.38)
modify("R3-PM2:V", 128525.29)
modify("R2-PM1:V", 2000)
modify("R1-PA12:IH", 32.04)
modify("R3-PM5:I", 330.7)

modify("R3:S", 0)
modify("R2-PA7:VH", 101.20)
modify("R2-PM1:V", -1300872.03)
modify("R3-PA7:VH", 101.22)
modify("R3-PA2:VH", 93.75)



modify("R2:F", -60)
modify("R3:F", -60)
modify("R2-PA5:IH",- 63.30)
modify("R2-PM7:V", -130857.40)
modify("R1-PA1:VH", -72.28)


In [28]:
print(classification_report(yall, clf.predict(Xmod), labels=[0,1,2], target_names=labels))

precision    recall  f1-score   support

    NoEvents       0.71      0.83      0.77     51797
      Attack       0.26      0.17      0.21     17382
     Natural       0.10      0.03      0.05      4232

    accuracy                           0.63     73411
   macro avg       0.36      0.34      0.34     73411
weighted avg       0.57      0.63      0.59     73411



In [76]:
yall[15]

2

In [78]:
exp = explainer.explain_instance(Xall.iloc[15], clf.predict_proba, num_features=128, labels=[0, 1, 2])
lst = exp.as_list()
lst = np.array(lst)
dtfr = pd.DataFrame(lst, columns=['feature', 'importance'])
dtfr["importance"] = pd.to_numeric(dtfr["importance"])
dtfr = dtfr.groupby(['feature']).mean()
res = dtfr.sort_values(by="importance")

In [79]:
res

Unnamed: 0_level_0,importance
feature,Unnamed: 1_level_1
R2-PA8:VH <= 0.00,-0.010952
relay4_log <= 0.00,-0.010254
R3-PA8:VH <= 0.00,-0.007786
R1-PA8:VH <= 0.00,-0.007464
R3-PM6:I <= 318.25,-0.007126
...,...
R2-PM1:V > 130872.03,0.004933
R2-PM2:V > 130805.80,0.005550
R4:S <= 0.00,0.005780
R4-PA2:VH <= -95.89,0.007224


In [85]:
Xall.iloc[15]["R1-PA9:VH"] 

0.0

In [114]:
Xall.at[15, "R1-PA9:VH"] = 0
Xall.at[15, "R4-PA2:VH"] = -95,89


# Distance calculation 

In [70]:
class Distance:
    def __init__(self):
        self.noevents = None
        self.attack = None
        self.natural = None

    def distance(self, X1, X2):
        return np.abs(((X1 - X2).sum()))

    def important(self, X):
        return X[["R4-PA5:IH", "R3-PM2:V", "R2-PM1:V", "R1-PA12:IH", "R3-PM5:I", "R3:S", "R2-PA7:VH", "R2-PM1:V", "R3-PA7:VH","R3-PA2:VH", "R2:F", "R3:F", "R2-PA5:IH","R2-PM7:V","R1-PA1:VH"]]

    def fit(self, X, y):
        Xnew = self.important(X)
        self.noevents = Xnew[y == 0].mean(axis=0)
        self.attack = Xnew[y == 1].mean(axis=0)
        self.natural = Xnew[y == 2].mean(axis=0)     
        return self

    def transform(self, X):
        Xnew = self.important(X)
        res = np.c_[np.apply_along_axis(lambda x: self.distance(x, self.noevents), axis=1, arr=Xnew), np.apply_along_axis(lambda x: self.distance(x, self.attack), axis=1, arr=Xnew), np.apply_along_axis(lambda x: self.distance(x, self.natural), axis=1, arr=Xnew)]
        return np.c_[Xnew, np.argmin(res, axis=1)]


In [71]:
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
import numpy as np
from sklearn import preprocessing

labels = ["NoEvents", "Attack", "Natural"]

X = pd.read_csv("Data/data%d.csv"%1) #read file

X = X.replace(np.inf, np.finfo(np.float32).max) #replacing 'inf' with its equivalent in float32 datatype

#preparing the label converter
le = preprocessing.LabelEncoder()
le.fit(labels)

#assigning the training data and the labels into variables
y = le.transform(X['marker'])
X = X.drop(columns='marker')

pipe = Pipeline([('dist', Distance()) , ('DecisionTree', DecisionTreeClassifier())])
pipe.fit(X,y)

Pipeline(steps=[('dist', <__main__.Distance object at 0x0000020B004B6348>),
                ('DecisionTree', DecisionTreeClassifier())])

In [72]:
Xall = []
for i in range(2,16):
    Xall.append(pd.read_csv("Data/data%d.csv"%i))

Xall = pd.concat(Xall)
Xall = Xall.replace(np.inf, np.finfo(np.float32).max) #replacing 'inf' with its equivalent in float32 datatype

#assigning the training data and the labels into variables
yall = le.transform(Xall['marker'])
Xall = Xall.drop(columns='marker')

from sklearn.metrics import classification_report
print(classification_report(yall, pipe.predict(Xall), labels=[0,1,2], target_names=labels))

precision    recall  f1-score   support

    NoEvents       0.72      0.80      0.75     51797
      Attack       0.28      0.22      0.25     17382
     Natural       0.19      0.08      0.11      4232

    accuracy                           0.62     73411
   macro avg       0.39      0.37      0.37     73411
weighted avg       0.58      0.62      0.60     73411



# Hidden Markov Models

In [3]:
from hmmlearn.hmm import GaussianHMM
clf2 = GaussianHMM(3)
clf2.fit(X)

GaussianHMM(n_components=3)

In [11]:
coefs = clf2.get_stationary_distribution()

In [211]:
ctest = DecisionTreeClassifier(class_weight = {0: coefs[0], 1:coefs[1], 2:coefs[2]})
ctest1 = DecisionTreeClassifier(class_weight = {0: coefs[0], 1:coefs[1], 2:coefs[2]}, criterion="entropy")
ctest2= DecisionTreeClassifier(class_weight = "balanced")
ctest3= DecisionTreeClassifier(class_weight = "balanced", criterion= "entropy")

In [214]:
ctest.fit(X,y)
ctest1.fit(X,y)
ctest2.fit(X,y)
ctest3.fit(X,y)

clf3= DecisionTreeClassifier(criterion="entropy")
clf3.fit(X,y)


from sklearn.metrics import classification_report
print(classification_report(yall, clf.predict(Xall), labels=[0,1,2], target_names=labels))
print(classification_report(yall, clf3.predict(Xall), labels=[0,1,2], target_names=labels))
print(classification_report(yall, ctest.predict(Xall), labels=[0,1,2], target_names=labels))
print(classification_report(yall, ctest1.predict(Xall), labels=[0,1,2], target_names=labels))
print(classification_report(yall, ctest2.predict(Xall), labels=[0,1,2], target_names=labels))
print(classification_report(yall, ctest3.predict(Xall), labels=[0,1,2], target_names=labels))

precision    recall  f1-score   support

    NoEvents       0.72      0.76      0.74     51797
      Attack       0.27      0.26      0.26     17382
     Natural       0.24      0.10      0.14      4232

    accuracy                           0.60     73411
   macro avg       0.41      0.37      0.38     73411
weighted avg       0.58      0.60      0.59     73411

              precision    recall  f1-score   support

    NoEvents       0.71      0.78      0.74     51797
      Attack       0.25      0.23      0.24     17382
     Natural       0.26      0.09      0.13      4232

    accuracy                           0.61     73411
   macro avg       0.41      0.36      0.37     73411
weighted avg       0.58      0.61      0.59     73411

              precision    recall  f1-score   support

    NoEvents       0.71      0.75      0.73     51797
      Attack       0.27      0.26      0.26     17382
     Natural       0.16      0.06      0.09      4232

    accuracy                      

In [7]:
from random import seed
from random import randrange
 
# Split a dataset into k folds
def cross_validation_split(dataset, n_folds):
	dataset_split = list()
	dataset_copy = list(dataset)
	fold_size = int(len(dataset) / n_folds)
	for i in range(n_folds):
		fold = list()
		while len(fold) < fold_size:
			index = randrange(len(dataset_copy))
			fold.append(dataset_copy.pop(index))
		dataset_split.append(fold)
	return dataset_split
 
 
# Split a dataset based on an attribute and an attribute value
def test_split(index, value, dataset):
	left, right = list(), list()
	for row in dataset:
		if row[index] < value:
			left.append(row)
		else:
			right.append(row)
	return left, right
 
# Calculate the Gini index for a split dataset
def gini_index(groups, classes):
	# count all samples at split point
	n_instances = float(sum([len(group) for group in groups]))
	# sum weighted Gini index for each group
	gini = 0.0
	for group in groups:
		size = float(len(group))
		# avoid divide by zero
		if size == 0:
			continue
		score = 0.0
		# score the group based on the score for each class
		for class_val in classes:
			p = [row[-1] for row in group].count(class_val) / size
			score += p * p * coefs[int(class_val)]
		# weight the group score by its relative size
		gini += (1.0 - score) * (size / n_instances)
	return gini
 
# Select the best split point for a dataset
def get_split(dataset):
	class_values = list(set(row[-1] for row in dataset))
	b_index, b_value, b_score, b_groups = 999, 999, 999, None
	for index in range(len(dataset[0])-1):
		for row in dataset:
			groups = test_split(index, row[index], dataset)
			gini = gini_index(groups, class_values)
			if gini < b_score:
				b_index, b_value, b_score, b_groups = index, row[index], gini, groups
	return {'index':b_index, 'value':b_value, 'groups':b_groups}
 
# Create a terminal node value
def to_terminal(group):
	outcomes = [row[-1] for row in group]
	return max(set(outcomes), key=outcomes.count)
 
# Create child splits for a node or make terminal
def split(node, max_depth, min_size, depth):
	left, right = node['groups']
	del(node['groups'])
	# check for a no split
	if not left or not right:
		node['left'] = node['right'] = to_terminal(left + right)
		return
	# check for max depth
	if depth >= max_depth:
		node['left'], node['right'] = to_terminal(left), to_terminal(right)
		return
	# process left child
	if len(left) <= min_size:
		node['left'] = to_terminal(left)
	else:
		node['left'] = get_split(left)
		split(node['left'], max_depth, min_size, depth+1)
	# process right child
	if len(right) <= min_size:
		node['right'] = to_terminal(right)
	else:
		node['right'] = get_split(right)
		split(node['right'], max_depth, min_size, depth+1)
 
# Build a decision tree
def build_tree(train, max_depth, min_size):
	root = get_split(train)
	split(root, max_depth, min_size, 1)
	return root
 
# Make a prediction with a decision tree
def predict(node, row):
	if row[node['index']] < node['value']:
		if isinstance(node['left'], dict):
			return predict(node['left'], row)
		else:
			return node['left']
	else:
		if isinstance(node['right'], dict):
			return predict(node['right'], row)
		else:
			return node['right']
 
# Classification and Regression Tree Algorithm
def decision_tree(train, test, max_depth, min_size):
	tree = build_tree(train, max_depth, min_size)
	predictions = list()
	for row in test:
		prediction = predict(tree, row)
		predictions.append(prediction)
	return(predictions)

In [8]:
Xbis = pd.read_csv("Data/data%d.csv"%1) #read file

Xbis = Xbis.replace(np.inf, np.finfo(np.float32).max) #replacing 'inf' with its equivalent in float32 datatype

#preparing the label converter
le = preprocessing.LabelEncoder()
le.fit(labels)

#assigning the training data and the labels into variables
Xbis["marker"] = le.transform(Xbis['marker'])
Xbis = Xbis.values

In [9]:
Xallv = Xall.values

In [15]:
ybis = decision_tree(Xbis, Xallv, 10000, 3)

In [17]:
from sklearn.metrics import classification_report


print(classification_report(yall, ybis, labels=[0,1,2], target_names=labels))

precision    recall  f1-score   support

    NoEvents       0.72      0.77      0.74     51797
      Attack       0.27      0.25      0.26     17382
     Natural       0.24      0.08      0.13      4232

    accuracy                           0.61     73411
   macro avg       0.41      0.37      0.38     73411
weighted avg       0.58      0.61      0.59     73411



In [18]:
def gini_index(groups, classes):
	# count all samples at split point
	n_instances = float(sum([len(group) for group in groups]))
	# sum weighted Gini index for each group
	gini = 0.0
	for group in groups:
		size = float(len(group))
		# avoid divide by zero
		if size == 0:
			continue
		score = 0.0
		# score the group based on the score for each class
		for class_val in classes:
			p = [row[-1] for row in group].count(class_val) / size
			score += p * p
		# weight the group score by its relative size
		gini += (1.0 - score) * (size / n_instances)
	return gini

print(classification_report(yall, decision_tree(Xbis, Xallv, 10000, 3), labels=[0,1,2], target_names=labels))

precision    recall  f1-score   support

    NoEvents       0.72      0.77      0.74     51797
      Attack       0.27      0.25      0.26     17382
     Natural       0.24      0.08      0.13      4232

    accuracy                           0.61     73411
   macro avg       0.41      0.37      0.38     73411
weighted avg       0.58      0.61      0.59     73411

