In [28]:
import pandas as pd
from scipy.io import arff
import numpy as np
from scipy import stats

from sklearn import preprocessing

## 1.0 Data
First, we load all 15 ARFF datasets and combine them into a single DataFrame.

In [2]:
# Read data1.arff .. data15.arff and stack them row-wise into one DataFrame.
# Each frame is collected in a list and concatenated once at the end; calling
# pd.concat inside the loop would re-copy the accumulated rows every iteration.
frames = []
for i in range(1, 16):
    print('Loading dataset #{}'.format(i))
    # loadarff returns a (records, metadata) pair; only the records are used.
    dataTemp = pd.DataFrame(arff.loadarff('input/power_multiclass/data{}.arff'.format(i))[0])
    frames.append(dataTemp)

# The row index is intentionally NOT reset here: each file keeps its own
# 0..n-1 labels, exactly as the previous loop-based concat produced.
data = pd.concat(frames, axis=0)

print("Finished Loading. Final Size = {}".format(data.shape))

Loading dataset #1
Loading dataset #2
Loading dataset #3
Loading dataset #4
Loading dataset #5
Loading dataset #6
Loading dataset #7
Loading dataset #8
Loading dataset #9
Loading dataset #10
Loading dataset #11
Loading dataset #12
Loading dataset #13
Loading dataset #14
Loading dataset #15
Finished Loading. Final Size = (78377, 129)


In [3]:
# Preview the first five rows of the combined frame.
data.head(n=5)

Unnamed: 0,R1-PA1:VH,R1-PM1:V,R1-PA2:VH,R1-PM2:V,R1-PA3:VH,R1-PM3:V,R1-PA4:IH,R1-PM4:I,R1-PA5:IH,R1-PM5:I,...,control_panel_log4,relay1_log,relay2_log,relay3_log,relay4_log,snort_log1,snort_log2,snort_log3,snort_log4,marker
0,70.399324,127673.0908,-49.572308,127648.0176,-169.578319,127723.2374,65.689611,605.91099,-57.003571,626.78553,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,b'41'
1,73.688102,130280.7109,-46.300719,130255.6377,-166.278082,130355.9307,71.831719,483.59351,-50.947407,500.98896,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,b'41'
2,73.733939,130305.7842,-46.254883,130280.7109,-166.232245,130381.004,71.8088,483.59351,-50.91303,500.98896,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,b'41'
3,74.083443,130581.5902,-45.899649,130556.5169,-165.882741,130656.81,72.152575,482.86107,-50.437475,499.15786,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,b'41'
4,74.553268,131083.0556,-45.424094,131057.9823,-165.424375,131158.2754,72.118198,484.50906,-50.013486,497.69298,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,b'41'


In [27]:
# List every column label as a NumPy array.
# NOTE: the saved output below already contains 'index', which is only added by
# the cleaning cell further down, so this cell was last run after that one.
np.asarray(data.columns)

array(['index', 'R1-PA1:VH', 'R1-PM1:V', 'R1-PA2:VH', 'R1-PM2:V',
       'R1-PA3:VH', 'R1-PM3:V', 'R1-PA4:IH', 'R1-PM4:I', 'R1-PA5:IH',
       'R1-PM5:I', 'R1-PA6:IH', 'R1-PM6:I', 'R1-PA7:VH', 'R1-PM7:V',
       'R1-PA8:VH', 'R1-PM8:V', 'R1-PA9:VH', 'R1-PM9:V', 'R1-PA10:IH',
       'R1-PM10:I', 'R1-PA11:IH', 'R1-PM11:I', 'R1-PA12:IH', 'R1-PM12:I',
       'R1:F', 'R1:DF', 'R1-PA:Z', 'R1-PA:ZH', 'R1:S', 'R2-PA1:VH',
       'R2-PM1:V', 'R2-PA2:VH', 'R2-PM2:V', 'R2-PA3:VH', 'R2-PM3:V',
       'R2-PA4:IH', 'R2-PM4:I', 'R2-PA5:IH', 'R2-PM5:I', 'R2-PA6:IH',
       'R2-PM6:I', 'R2-PA7:VH', 'R2-PM7:V', 'R2-PA8:VH', 'R2-PM8:V',
       'R2-PA9:VH', 'R2-PM9:V', 'R2-PA10:IH', 'R2-PM10:I', 'R2-PA11:IH',
       'R2-PM11:I', 'R2-PA12:IH', 'R2-PM12:I', 'R2:F', 'R2:DF', 'R2-PA:Z',
       'R2-PA:ZH', 'R2:S', 'R3-PA1:VH', 'R3-PM1:V', 'R3-PA2:VH',
       'R3-PM2:V', 'R3-PA3:VH', 'R3-PM3:V', 'R3-PA4:IH', 'R3-PM4:I',
       'R3-PA5:IH', 'R3-PM5:I', 'R3-PA6:IH', 'R3-PM6:I', 'R3-PA7:VH',
       'R3-PM7:V', 'R3

In [4]:
# Number of distinct values in the `marker` column, shown as a 1-tuple shape.
data['marker'].unique().shape

(37,)

Next, we drop the log columns that are not used in this analysis (the snort, control-panel, and relay logs) and replace infinite values with 0.

In [5]:
# Log columns that are removed before the analysis.
log_columns = [
    'snort_log1', 'snort_log2', 'snort_log3', 'snort_log4',
    'control_panel_log1', 'control_panel_log2', 'control_panel_log3', 'control_panel_log4',
    'relay1_log', 'relay2_log', 'relay3_log', 'relay4_log',
]

# Cast everything to float64, drop the log columns, move the existing row
# labels into a new 'index' column, and overwrite +/- infinity with 0.
# NOTE: this cell is not idempotent - a second run fails because the log
# columns are already gone and an 'index' column already exists.
data = (
    data.astype(np.float64)
        .drop(columns=log_columns)
        .reset_index()
        .replace([-np.inf, np.inf], 0)
)

# The marker is a class label, so store it as an integer.
data['marker'] = data['marker'].astype(int)

In [122]:
def splitDF(df, train_percentage = 0.8, seed = None):
    """Randomly split a DataFrame into a train part and a test part.

    Every row draws an independent uniform number in [0, 1); rows whose draw is
    below ``train_percentage`` go to the train part, the rest to the test part.
    The split sizes are therefore only approximately ``train_percentage`` and
    ``1 - train_percentage`` of the rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split. It is not modified.
    train_percentage : float, default 0.8
        Probability for each row to land in the train part.
    seed : int or None, default None
        When given, the draws come from a private ``RandomState(seed)`` so the
        same seed always yields the same split. When ``None`` the global NumPy
        random state is used, as before.

    Returns
    -------
    (train, test) : tuple of pandas.DataFrame
        Disjoint row subsets that together contain every row of ``df``.
    """
    if seed is None:
        draws = np.random.rand(len(df))
    else:
        # Private generator: reproducible and leaves the global state untouched.
        draws = np.random.RandomState(seed).rand(len(df))

    msk = draws < train_percentage
    train = df[msk]
    test = df[~msk]

    return train, test

In [123]:
# Split the cleaned frame with the default train fraction and report both shapes.
train, test = splitDF(data)
for subset in (train, test):
    print(subset.shape)

(62497, 118)
(15880, 118)


## Attack Types

In [13]:
# Rows with marker 41 form the "normal" subset; rows with markers 7..12
# (both ends included) form the "data injection" subset.
normal_df = data[data['marker'].eq(41)]
dataInjection_df = data[data['marker'].between(7, 12)]

print("Normal Data: {}".format(normal_df.shape))
print("Data Injection: {}".format(dataInjection_df.shape))

Normal Data: (4405, 118)
Data Injection: (9582, 118)


In [104]:
def getStats(df):
    """Return per-column means and standard deviations of ``df``.

    The column named 'index' is skipped; every other column contributes one
    entry, in column order, to each returned list. A mean or standard deviation
    that is exactly 0 is replaced by 0.000001.

    Returns
    -------
    (means, stds) : tuple of list
        Two lists of equal length, aligned with the non-'index' columns.
    """
    def _avoid_zero(value):
        # Substitute a tiny constant for an exact zero.
        return 0.000001 if value == 0 else value

    feature_columns = [name for name in df if name != 'index']

    means = [_avoid_zero(df[name].mean()) for name in feature_columns]
    stds = [_avoid_zero(df[name].std()) for name in feature_columns]

    return means, stds

In [121]:
# Per-feature mean and standard deviation of the normal rows; later cells
# read these two lists by position.
normal_means, normal_stds = getStats(normal_df)

In [106]:
# Preview the first five data-injection rows.
dataInjection_df.head(n=5)

Unnamed: 0,index,R1-PA1:VH,R1-PM1:V,R1-PA2:VH,R1-PM2:V,R1-PA3:VH,R1-PM3:V,R1-PA4:IH,R1-PM4:I,R1-PA5:IH,...,R4-PA11:IH,R4-PM11:I,R4-PA12:IH,R4-PM12:I,R4:F,R4:DF,R4-PA:Z,R4-PA:ZH,R4:S,marker
3734,3734,108.501018,131258.5685,-11.482074,130631.7367,-131.465166,131333.7883,111.766877,351.20498,-12.771229,...,168.945016,12.273788,-57.922668,12.519836,59.999001,0.01,11.853362,-0.071435,0.0,12
3735,3735,108.048381,131684.814,-11.928981,131057.9823,-131.929262,131734.9606,109.389102,359.81115,-13.561911,...,167.324526,7.759094,-64.27277,8.153915,59.999001,0.0,11.189126,-0.039691,0.0,12
3736,3736,108.042651,131684.814,-11.928981,131032.909,-131.929262,131734.9606,109.343266,359.99426,-13.561911,...,166.659853,7.741928,-64.234316,8.073807,60.0,0.0,11.174615,-0.038948,0.0,12
3737,3737,107.847846,131785.1071,-12.140976,131133.2021,-132.118338,131835.2537,107.956708,364.93823,-13.974441,...,159.936229,4.892349,-60.82306,5.115509,60.0,0.0,10.751103,-0.007225,0.0,12
3738,3738,107.727525,131785.1071,-12.249838,131158.2754,-132.238659,131860.3269,107.326454,366.58622,-14.146328,...,160.414123,3.799438,-67.925719,4.228592,60.0,0.0,10.621664,-0.003238,0.0,12


In [None]:
def getPD(sample_np, feature_names=None, means=None, stds=None, verbose=True):
    '''
    Compare every measurement of one sample against a per-feature reference value.

    For position i the reference is ``abs(means[i]) + abs(stds[i])`` and the
    reported value is ``abs((sample_np[i] - reference) / reference)``.

    NOTE(review): the reference is built from absolute values, so the sign of
    the mean is discarded. A measurement that sits right at a negative mean
    therefore scores about 2 instead of about 0 - confirm this is intended.

    Parameters
    ----------
    sample_np : array-like or pandas.Series
        The measurements of one sample, ordered like ``means`` and ``stds``.
    feature_names : sequence, optional
        Labels used in the printed lines. When omitted, the index of
        ``sample_np`` is used if it is a Series; otherwise the index of the
        notebook-level ``sample`` Series is used if one exists (the previous
        behaviour); otherwise the position number is printed.
    means, stds : sequence, optional
        Reference statistics. Default to the notebook-level ``normal_means``
        and ``normal_stds``.
    verbose : bool, default True
        Print one line per feature when True.

    Returns
    -------
    list
        One value per entry of ``sample_np``, in input order.

    A reference of exactly 0 is not handled here and leads to a division by zero.
    '''
    if means is None:
        means = normal_means
    if stds is None:
        stds = normal_stds

    if feature_names is None:
        if isinstance(sample_np, pd.Series):
            feature_names = sample_np.index
        else:
            # Backward compatibility: earlier versions always took the labels
            # from the global `sample`. Use it only when it really exists.
            legacy_sample = globals().get('sample')
            feature_names = legacy_sample.index if isinstance(legacy_sample, pd.Series) else ()
    labels = list(feature_names)

    # Positional access below must not be confused with label lookup on a Series.
    values = np.asarray(sample_np)

    p_diff = []

    for i in range(len(values)):
        comparison_value = abs(means[i]) + abs(stds[i])
        percent_diff = abs((values[i] - comparison_value)/comparison_value)
        if verbose:
            label = labels[i] if i < len(labels) else i
            print("Feature : {} measurement difference = {}".format(label, percent_diff ))
        p_diff.append(percent_diff)

    return p_diff

In [119]:
# Take one data-injection row (position 10) without its 'index' column.
# The `marker` entry is still part of the sample and is compared like any
# other feature below.
sample = dataInjection_df.iloc[10].drop('index')
sample_np = sample.to_numpy()

print(sample)

# For each measurement, report its relative distance to |mean| + |std| of the
# normal data.
# NOTE(review): the absolute values drop the sign of the mean, so features
# with negative values score around 2 - confirm this is intended.
p_diff = []
for i, measurement in enumerate(sample_np):
    comparison_value = abs(normal_means[i]) + abs(normal_stds[i])
    percent_diff = abs((measurement - comparison_value) / comparison_value)
    print("Feature : {} measurement difference = {}".format(sample.index[i], percent_diff))
    p_diff.append(percent_diff)

# One row per feature, in the original feature order.
result_df = pd.DataFrame({'Feature': sample.index, 'Percent Difference': p_diff})

R1-PA1:VH       109.795902
R1-PM1:V     134192.141000
R1-PA2:VH       -10.192919
R1-PM2:V     133540.236000
R1-PA3:VH      -130.193200
                 ...      
R4:DF             0.030000
R4-PA:Z          10.970277
R4-PA:ZH         -0.037112
R4:S              0.000000
marker           12.000000
Name: 3744, Length: 117, dtype: float64
Feature : R1-PA1:VH measurement difference = 0.048817568828808604
Feature : R1-PM1:V measurement difference = 0.013624269432345966
Feature : R1-PA2:VH measurement difference = 1.08962664086345
Feature : R1-PM2:V measurement difference = 0.008956075722394994
Feature : R1-PA3:VH measurement difference = 2.1060334246335897
Feature : R1-PM3:V measurement difference = 0.013693823729076644
Feature : R1-PA4:IH measurement difference = 0.0009066114499083102
Feature : R1-PM4:I measurement difference = 0.2044551576104669
Feature : R1-PA5:IH measurement difference = 1.1359015656775104
Feature : R1-PM5:I measurement difference = 0.21108858488413143
Feature : R1-PA6:I

In [120]:
# Order the features from largest to smallest difference and show the top ten.
result_df = result_df.sort_values('Percent Difference', ascending=False)
result_df.head(n=10)

Unnamed: 0,Feature,Percent Difference
80,R3-PA12:IH,3.924647
78,R3-PA11:IH,2.913807
33,R2-PA3:VH,2.262429
62,R3-PA3:VH,2.26045
10,R1-PA6:IH,2.187947
91,R4-PA3:VH,2.107213
4,R1-PA3:VH,2.106033
97,R4-PA6:IH,2.082856
107,R4-PA11:IH,1.960826
85,R3-PA:ZH,1.791655
