In [2]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import pickle
from lime.lime_tabular import LimeTabularExplainer

labels = ["NoEvents", "Attack", "Natural"]

X = pd.read_csv("Data/data%d.csv"%1) #read file

X = X.replace(np.inf, np.finfo(np.float32).max) #replacing 'inf' with its equivalent in float32 datatype

#preparing the label converter
le = preprocessing.LabelEncoder()
le.fit(labels)

#assigning the training data and the labels into variables
y = le.transform(X['marker'])
X = X.drop(columns='marker')

features = list(X.columns)

clf = DecisionTreeClassifier()
X=X.values

clf.fit(X,y)

explainer = LimeTabularExplainer(X, training_labels = y, feature_names = features, class_names = labels)


Xall = []
for i in range(2,16):
    Xall.append(pd.read_csv("Data/data%d.csv"%i))

Xall = pd.concat(Xall)
Xall = Xall.replace(np.inf, np.finfo(np.float32).max) #replacing 'inf' with its equivalent in float32 datatype

#assigning the training data and the labels into variables
yall = le.transform(Xall['marker'])
Xall = Xall.drop(columns='marker').values

boole = (yall != clf.predict(Xall))

faulty = boole & (yall == 0)
X_test = Xall[faulty]
y_test = yall[faulty]
lst = []
for idx in range(0, 100):
    exp = explainer.explain_instance(X_test[idx], clf.predict_proba, num_features=128, labels=[0, 1, 2])
    lst.append(exp.as_list(label=0))
lst = np.array(lst)
clst = np.concatenate(lst, axis=0)
dtfr = pd.DataFrame(clst, columns=['feature', 'importance'])
dtfr["importance"] = pd.to_numeric(dtfr["importance"])
dtfr = dtfr.groupby(['feature']).mean()
res = dtfr.sort_values(by="importance")
  


In [8]:
res 

Unnamed: 0_level_0,importance
feature,Unnamed: 1_level_1
R2-PA5:IH > 63.30,-0.202310
R4-PA1:VH <= -97.13,-0.089672
R1-PM10:I <= 318.61,-0.072388
R2:F > 60.00,-0.063643
R1-PA:Z <= 8.19,-0.055955
...,...
9.79 < R1-PA:Z <= 12.43,0.043825
390.48 < R1-PM10:I <= 465.42,0.053801
R1-PA:Z > 12.43,0.065033
R2-PA4:IH <= -64.39,0.086162


In [612]:
clf.predict(((X_test[14]-75).reshape(1, -1)))

array([0])

In [608]:
clf.predict(((Xall.iloc[14].values).reshape(1, -1)))

array([0])

In [583]:
yall[14]

2

In [192]:
exp = explainer.explain_instance(Xall[14], clf.predict_proba, num_features=128, labels=[0, 1, 2])
lst = exp.as_list()
lst = np.array(lst)
dtfr = pd.DataFrame(lst, columns=['feature', 'importance'])
dtfr["importance"] = pd.to_numeric(dtfr["importance"])
dtfr = dtfr.groupby(['feature']).mean()
res = dtfr.sort_values(by="importance")

In [193]:
res

Unnamed: 0_level_0,importance
feature,Unnamed: 1_level_1
R1-PM8:V <= 0.00,-0.148195
R3-PA8:VH <= 0.00,-0.146174
R1-PA9:VH <= 0.00,-0.099529
R4-PA8:VH <= 0.00,-0.092479
R3-PA9:VH <= 0.00,-0.089755
...,...
R4-PM5:I <= 326.49,0.063332
R4-PA9:VH <= 0.00,0.064508
R1-PM10:I <= 318.61,0.064621
R2-PM1:V > 130872.03,0.065484


In [344]:
Xall.iloc[14]["R2-PM8:V"] 

10.0

In [607]:
Xall.at[14, "R2-PM8:V"] = 1000
Xall.at[14, "R2-PM1:V"] = 110000
Xall.at[14, "R1-PM8:V"] = 1000
Xall.at[14, "R1-PM10:I"] = 10000

In [413]:
Xtrue = Xall.iloc[clf.predict(Xall) == yall]
ytrue = yall[clf.predict(Xall) == yall]
Xfalse = Xall.iloc[clf.predict(Xall) != yall]
yfalse = yall[clf.predict(Xall) != yall]
Xnoevents = Xfalse.iloc[clf.predict(Xfalse) == 0]
ynoevents = yfalse[clf.predict(Xfalse) == 0]
Xattack = Xfalse.iloc[clf.predict(Xfalse) == 1]
yattack = yfalse[clf.predict(Xfalse) == 1]
Xnatural = Xfalse.iloc[clf.predict(Xfalse) == 2]
ynatural = yfalse[clf.predict(Xfalse) == 2]

In [563]:
def distance(X1, X2):
    return np.sqrt((((X1 - X2).apply(np.square)).sum()))


In [529]:
distance(Xtrue.iloc[0], Xtrue.iloc[1])

114.23250721467586

In [445]:
ytrue[[3,4]]

array([2, 2])

In [532]:
x0 = Xtrue[ytrue == 0]
x1 = Xtrue[ytrue == 1]
x2 = Xtrue[ytrue == 2]

In [572]:
np.average([distance(x0.iloc[k], x0.iloc[k+1]) for k in np.random.randint(0, x0.shape[0]-1, 1000)])

2.7767756968533383e+37

In [573]:
np.average([distance(x1.iloc[k], x1.iloc[k+1]) for k in np.random.randint(0, x1.shape[0]-1, 1000)])

6.64400544631047e+37

In [574]:
np.average([distance(x2.iloc[k], x2.iloc[k+1]) for k in np.random.randint(0, x2.shape[0]-1, 1000)])

1855.457402041155

In [579]:
np.average([distance(Xnoevents.iloc[k], Xnoevents.iloc[k+1]) for k in np.random.randint(0, Xnoevents.shape[0]-1, 1000)])

3.3025702936441036e+37