This file collects all the helper functions used by the main notebook, which keeps the main notebook short and easy to read.

In [1]:
import pandas as pd 
import numpy as np

from keras.models import Sequential
from keras.layers import Dense,Activation

from sklearn.model_selection import train_test_split

Using TensorFlow backend.


## Understanding the Data

In [None]:
def read_raw_data(data_name):
    """Load the raw BSM records stored in ``dataset/<data_name>.csv``.

    The file is read without a header row. Only the twelve column positions
    listed below are kept, each under a descriptive name. Rows that contain
    any missing value are discarded and the index is renumbered from zero.

    Args:
        data_name: File name (without the ``.csv`` extension) inside the
            ``dataset/`` directory.

    Returns:
        A ``pandas.DataFrame`` holding the twelve named columns.
    """
    # CSV column position -> name given to that column.
    kept_columns = {
        1: 're_time',
        2: 're_ID',
        3: 're_x',
        4: 're_y',
        6: 'tr_time',
        7: 'tr_ID',
        9: 'tr_x',
        10: 'tr_y',
        12: 'tr_vx',
        13: 'tr_vy',
        15: 'RSSI',
        16: 'Label',
    }
    positions = list(kept_columns.keys())
    labels = list(kept_columns.values())

    csv_path = 'dataset/' + data_name + '.csv'
    frame = pd.read_csv(csv_path, usecols=positions, header=None, names=labels)

    # Drop incomplete rows, then renumber so the index is contiguous again.
    return frame.dropna(axis=0, how='any').reset_index(drop=True)

In [None]:
def distance(X):
    """Return the Euclidean distance between receiver and transmitter.

    Args:
        X: A record supporting key lookup (for example one DataFrame row)
            with the entries ``re_x``, ``re_y``, ``tr_x`` and ``tr_y``.

    Returns:
        The 2-norm of the offset between the two positions.
    """
    offset_x = X['re_x'] - X['tr_x']
    offset_y = X['re_y'] - X['tr_y']
    return np.linalg.norm([offset_x, offset_y])

def check_range(data, threshold):
    """Find the rows whose sender/receiver distance exceeds ``threshold``.

    The distance is the Euclidean distance between (``re_x``, ``re_y``) and
    (``tr_x``, ``tr_y``). It is computed for all rows at once instead of
    row by row.

    Args:
        data: DataFrame with the columns ``re_x``, ``re_y``, ``tr_x``, ``tr_y``.
        threshold: Distance limit; rows strictly above it are reported.

    Returns:
        1-D integer array with the positional indices (0-based row numbers)
        of the rows that are out of range. Empty when no row qualifies.
    """
    delta_x = np.asarray(data['re_x'], dtype=float) - np.asarray(data['tr_x'], dtype=float)
    delta_y = np.asarray(data['re_y'], dtype=float) - np.asarray(data['tr_y'], dtype=float)
    separation = np.sqrt(delta_x * delta_x + delta_y * delta_y)
    return np.flatnonzero(separation > threshold)

In [None]:
def check_session(data):
    """Print row-level and session-level label statistics of a dataset.

    A session is the set of rows that share one (``tr_ID``, ``re_ID``) pair.
    Rows with ``Label == 0`` are counted as normal and every other row as
    malicious. A session is counted as normal only when all of its labels
    are 0; otherwise it is counted as malicious.

    Args:
        data: DataFrame with at least the columns ``tr_ID``, ``re_ID`` and
            ``Label``.

    Returns:
        1-D float array holding, for every session (ordered by sender ID and
        then receiver ID), the number of distinct labels found in it.
    """
    length = data.shape[0]
    labels = data['Label']

    # Row-level statistics. Malicious means "not normal", matching the
    # session-level rule used below.
    A = int((labels == 0).sum())
    a = int((labels != 0).sum())
    B = A / length if length else 0.0
    b = a / length if length else 0.0

    # Session-level statistics.
    C = 0
    c = 0
    labels_per_session = []
    for _, session in data.groupby(['tr_ID', 're_ID'], sort=True):
        distinct_labels = np.unique(np.asarray(session['Label']))
        # A session with mixed labels contains malicious rows, so it is
        # counted as malicious.
        if np.all(distinct_labels == 0):
            C += 1
        else:
            c += 1
        labels_per_session.append(len(distinct_labels))

    count_session = len(labels_per_session)
    D = C / count_session if count_session else 0.0
    d = c / count_session if count_session else 0.0
    type_of_label = np.array(labels_per_session, dtype=float)

    print('There are ',length, 'rows in the dataset' )
    print(A, 'rows are normal, i.e., ', B, 'percent of rows in the dataset')
    print(a, 'rows are malicious, i.e., ', b, 'percent of rows in the dataset')

    print('There are ', count_session, 'sessions in the dataset')
    print(C, 'session are normal, i.e., ', D, 'percent of sessions in the dataset ')
    print(c, 'session are malicious, i.e., ', d, 'percent of sessions in the dataset ')

    # Report the "one label per session" property only when it holds for
    # every session, not just for the last one inspected.
    if type_of_label.size and np.all(type_of_label == 1):
        print('For all the session, there are only', 1, 'kind of label, in other word, a session is whether attack or normal')
    return type_of_label

## Feature vectors

In [None]:
def location_plausibility(receiver_of_sender):
    """Score how plausible the reported positions of one session are.

    For every pair of consecutive rows, the next reported position
    (``tr_x``, ``tr_y``) is compared with an interval predicted from the
    current position, the current velocity (``tr_vx``, ``tr_vy``) and the
    elapsed ``re_time``. Per axis the score is 0 inside the narrow interval,
    1 outside the narrow interval and 2 outside the wide interval. The two
    axis scores are added. The first row has no predecessor and always
    contributes a score of 2.

    Args:
        receiver_of_sender: DataFrame with the rows of one session, in the
            order in which they should be compared. Needs the columns
            ``re_time``, ``tr_x``, ``tr_y``, ``tr_vx`` and ``tr_vy``.

    Returns:
        The mean of all per-row scores as a float. A session with at most
        one row yields ``2.0``.
    """
    # Bounds used to widen the predicted interval; the names suggest 95% and
    # 99% limits -- NOTE(review): confirm their origin and the 0.1 factor.
    # Previously considered values:
    #   x_95 = [-5.6983, 5.2265],   x_99 = [-7.1795, 7.7077]
    #   y_95 = [-8.1203, 8.0501],   y_99 = [-12.1629, 12.0927]
    x_95 = [-10, 10]
    x_99 = [-18, 18]
    y_95 = [-10, 10]
    y_99 = [-18, 18]

    if receiver_of_sender.shape[0] <= 1:
        # Only the start-of-series score exists; return it as a scalar so the
        # return type does not depend on the session length.
        return np.mean([2])

    times = np.asarray(receiver_of_sender['re_time'], dtype=float)
    time_interval = np.diff(times)

    def axis_score(position, velocity, bound_95, bound_99):
        """Return the 0/1/2 score of every transition along one axis."""
        start = position[:-1]
        following = position[1:]
        speed = velocity[:-1]

        low_95 = start + time_interval * (speed + bound_95[0] * time_interval * 0.1)
        up_95 = start + time_interval * (speed + bound_95[1] * time_interval * 0.1)
        low_99 = start + time_interval * (speed + bound_99[0] * time_interval * 0.1)
        up_99 = start + time_interval * (speed + bound_99[1] * time_interval * 0.1)

        outside_95 = (following <= low_95) | (following >= up_95)
        outside_99 = (following <= low_99) | (following >= up_99)
        # Being outside the wide interval takes precedence.
        return np.where(outside_99, 2, np.where(outside_95, 1, 0))

    score_x = axis_score(
        np.asarray(receiver_of_sender['tr_x'], dtype=float),
        np.asarray(receiver_of_sender['tr_vx'], dtype=float),
        x_95, x_99)
    score_y = axis_score(
        np.asarray(receiver_of_sender['tr_y'], dtype=float),
        np.asarray(receiver_of_sender['tr_vy'], dtype=float),
        y_95, y_99)

    # The leading 2 is the fixed score of the first row.
    return np.mean(np.concatenate(([2], score_x + score_y)))

In [None]:
def movement_plausibility(receiver_of_sender):
    """Flag a session that reports motion without any change of position.

    Args:
        receiver_of_sender: DataFrame with the rows of one session; needs the
            columns ``tr_x``, ``tr_y``, ``tr_vx`` and ``tr_vy``.

    Returns:
        ``1.`` when the first and last reported positions are identical
        while the average reported velocity is non-zero on at least one
        axis, otherwise ``0.``. For a session with at most one row the
        result is a random integer, 0 or 1.
    """
    if receiver_of_sender.shape[0] <= 1:
        # Nothing to compare: the flag is drawn at random.
        return np.random.randint(2)

    first_row = receiver_of_sender.iloc[0]
    last_row = receiver_of_sender.iloc[-1]
    moved_x = last_row['tr_x'] - first_row['tr_x']
    moved_y = last_row['tr_y'] - first_row['tr_y']

    mean_vx = np.average(receiver_of_sender['tr_vx'].values)
    mean_vy = np.average(receiver_of_sender['tr_vy'].values)

    stayed_in_place = moved_x == 0 and moved_y == 0
    if stayed_in_place and (mean_vx != 0 or mean_vy != 0):
        return 1.
    return 0.

In [None]:
def quantititative_information(receiver_of_sender):
    """Quantify the mismatch between reported positions and velocities.

    Two average velocities are derived for the session: one from the
    displacement between the first and last reported position, and one from
    integrating the reported velocity over the time steps between
    consecutive rows (each step uses the velocity of its earlier row).

    Args:
        receiver_of_sender: DataFrame with the rows of one session; needs the
            columns ``re_time``, ``tr_x``, ``tr_y``, ``tr_vx`` and ``tr_vy``.

    Returns:
        A 4-tuple of scalars:
        absolute difference of the two average velocities along x,
        the same along y,
        the Euclidean norm of those two differences, and
        the absolute difference between the length of the displacement and
        the length of the integrated-velocity path.
        A session with at most one row yields ``(0.0, 0.0, 0.0, 0.0)``.
        When the first and last row share the same ``re_time`` the first
        three values are NaN or infinite (division by a zero time span).
    """
    if receiver_of_sender.shape[0] <= 1:
        # No elapsed time to compare over; return scalars so the return type
        # matches the multi-row case.
        return 0.0, 0.0, 0.0, 0.0

    times = np.asarray(receiver_of_sender['re_time'], dtype=float)
    pos_x = np.asarray(receiver_of_sender['tr_x'], dtype=float)
    pos_y = np.asarray(receiver_of_sender['tr_y'], dtype=float)
    vel_x = np.asarray(receiver_of_sender['tr_vx'], dtype=float)
    vel_y = np.asarray(receiver_of_sender['tr_vy'], dtype=float)

    time_interval = times[-1] - times[0]
    displacement_x = pos_x[-1] - pos_x[0]
    displacement_y = pos_y[-1] - pos_y[0]

    # Distance travelled according to the reported velocities.
    steps = np.diff(times)
    travelled_x = np.dot(vel_x[:-1], steps)
    travelled_y = np.dot(vel_y[:-1], steps)

    v_measure_x = np.abs(displacement_x / time_interval - travelled_x / time_interval)
    v_measure_y = np.abs(displacement_y / time_interval - travelled_y / time_interval)
    v_mag = np.linalg.norm([v_measure_x, v_measure_y])

    v_total = np.abs(np.linalg.norm([displacement_x, displacement_y])
                     - np.linalg.norm([travelled_x, travelled_y]))

    return v_measure_x, v_measure_y, v_mag, v_total


In [None]:
def distance_check(receiver_of_sender, threshold):
    """Return the share of rows whose sender is at least ``threshold`` away.

    The distance is the Euclidean distance between the transmitter position
    (``tr_x``, ``tr_y``) and the receiver position (``re_x``, ``re_y``).

    Args:
        receiver_of_sender: DataFrame with the rows of one session.
        threshold: Distance from which a row counts as out of range. It is
            now honoured; earlier the value 800 was always used.

    Returns:
        Fraction (0..1) of rows with distance >= ``threshold``. An empty
        input yields NaN, the mean of no values.
    """
    delta_x = (np.asarray(receiver_of_sender['tr_x'], dtype=float)
               - np.asarray(receiver_of_sender['re_x'], dtype=float))
    delta_y = (np.asarray(receiver_of_sender['tr_y'], dtype=float)
               - np.asarray(receiver_of_sender['re_y'], dtype=float))
    separation = np.sqrt(delta_x * delta_x + delta_y * delta_y)
    return np.mean(separation >= threshold)

In [None]:
def add_feature_vectors(data):
    """Build one feature row per (sender, receiver) session.

    For every session, i.e. all rows sharing one (``tr_ID``, ``re_ID``)
    pair, the seven features are computed from the session's rows and
    attached to a copy of the session's first row.

    Args:
        data: DataFrame of BSM rows containing the columns ``tr_ID`` and
            ``re_ID`` plus the columns required by the feature functions.

    Returns:
        DataFrame with one row per session, ordered by sender ID and then
        receiver ID, carrying the original columns of the session's first
        row plus ``feature_1`` .. ``feature_7``. The index labels of the
        first rows are kept. When ``data`` has no session, an empty frame
        with the same columns is returned.
    """
    feature_names = ['feature_%d' % k for k in range(1, 8)]
    session_rows = []

    for _, session in data.groupby(['tr_ID', 're_ID'], sort=True):
        feature_1 = location_plausibility(session)
        feature_2 = movement_plausibility(session)
        feature_3, feature_4, feature_5, feature_6 = quantititative_information(session)
        feature_7 = distance_check(session, 800)
        values = [feature_1, feature_2, feature_3, feature_4,
                  feature_5, feature_6, feature_7]

        # Work on an explicit copy so the assignments below do not write
        # into a view of ``data``.
        first_row = session.head(1).copy()
        for column, value in zip(feature_names, values):
            first_row[column] = value
        session_rows.append(first_row)

    if not session_rows:
        empty = data.head(0).copy()
        for column in feature_names:
            empty[column] = np.array([], dtype=float)
        return empty

    # Concatenate once at the end instead of growing a frame inside the loop.
    return pd.concat(session_rows)

In [None]:
import os
def put_csv(data_with_features, attack_type):
    """Write the extracted feature vectors to a CSV file.

    The file is stored as
    ``./dataset/attack<attack_type>with7FeatureVector.csv``. The ``dataset``
    directory is created when it does not exist yet.

    Args:
        data_with_features: DataFrame to store; its index is not written.
        attack_type: Value inserted into the file name via ``str()``.
    """
    outdir = './dataset'
    # exist_ok avoids failing when the directory appears between a check
    # and the creation.
    os.makedirs(outdir, exist_ok=True)
    fullname = outdir + '/attack' + str(attack_type) + 'with7FeatureVector.csv'

    data_with_features.to_csv(fullname, encoding='utf-8', index=False)

## Neural Network: Construction and Evaluation

In [None]:
def read_vector_data(data_name):
    """Load a feature-vector table from ``dataset/<data_name>.csv``.

    Rows that contain any missing value are removed and the index is
    renumbered from zero.

    Args:
        data_name: File name (without the ``.csv`` extension) inside the
            ``dataset/`` directory.

    Returns:
        The cleaned ``pandas.DataFrame``.
    """
    csv_path = 'dataset/' + data_name + '.csv'
    table = pd.read_csv(csv_path)
    complete_rows = table.dropna(axis=0, how='any')
    return complete_rows.reset_index(drop=True)

def evaluate_model(y_predict, y_test):
    """Print CCR (accuracy), precision and recall of binary predictions.

    Label 1 is the positive class. Both inputs may be flat sequences or
    column vectors of shape ``(n, 1)``; they are flattened before comparison.

    Args:
        y_predict: Predicted labels (0 or 1).
        y_test: True labels (0 or 1), same length as ``y_predict``.

    Returns:
        None. The metrics are printed. Precision is reported as 0 when
        nothing was predicted positive, and recall as 0 when the true labels
        contain no positive sample.
    """
    actual = np.asarray(y_test).ravel()
    predicted = np.asarray(y_predict).ravel()[:len(actual)]
    total = len(actual)
    if total == 0:
        print('The test set is empty, nothing to evaluate')
        return

    count_ccr = int(np.sum(predicted == actual))
    TP = int(np.sum((predicted == 1) & (actual == 1)))
    FP = int(np.sum((predicted == 1) & (actual == 0)))
    FN = int(np.sum((predicted == 0) & (actual == 1)))

    ccr = count_ccr / total
    if (TP + FP) == 0:
        print('All the prediction is normal')
        preci = 0
    else:
        preci = TP / (TP + FP)
    if (TP + FN) == 0:
        # No positive sample to find: avoid dividing by zero.
        print('There is no malicious sample in the test data')
        recall = 0
    else:
        recall = TP / (TP + FN)
    print('For this model, the CCR is', ccr, ', the Precision is', preci, 'and the Recall is', recall )
    
def stats(y, y_train, y_test, y_predict):
    """Print the share of malicious and normal labels in each label set.

    The malicious share of a set is the sum of its labels divided by its
    length; the normal share is one minus that value.

    Args:
        y: All labels; only its length is reported.
        y_train: Labels of the training split.
        y_test: Labels of the testing split.
        y_predict: Labels predicted for the testing split.
    """
    def malicious_share(labels):
        # Fraction of entries labelled 1, assuming 0/1 labels.
        return sum(labels) / len(labels)

    print('There are ', len(y), 'session in total')

    train_share = malicious_share(y_train)
    print('The training dataset has,', len(y_train), 'sessions, there are ', train_share, 'malicious data, and', 1 - train_share, 'normal data')

    test_share = malicious_share(y_test)
    print('The testing dataset has',len(y_test), 'sessions, and there are ', test_share, 'malicious data, and', 1 - test_share, 'normal data')

    predicted_share = malicious_share(y_predict)
    print('The prediction includes ', predicted_share, 'malicious data, and', 1 - predicted_share, 'normal data')

In [None]:
def NN_model(data, Hyper_parameter, NN_structure):
    """Train a three-layer dense network on the feature table and report it.

    Columns from position 12 onward are the inputs and column 11 is the
    target. 20% of the rows are held out for testing. After training, the
    rounded predictions on the held-out rows are passed to
    ``evaluate_model`` and ``stats``.

    Args:
        data: DataFrame holding the target in column 11 and the features in
            the columns after it.
        Hyper_parameter: Sequence of ``[activation, loss, optimizer, metric]``;
            the activation is used for all three layers.
        NN_structure: Sequence with the unit counts of the three layers.

    Returns:
        The fitted Keras ``Sequential`` model.
    """
    feature_table = data.iloc[:, 12:]
    n = feature_table.shape[1]
    target = data.iloc[:, 11]
    X = feature_table.values.reshape(feature_table.shape[0], n)
    y = target.values.reshape(target.shape[0], 1)  # column vector

    # The model is built before the split, as in the original statement order.
    model = Sequential([
        Dense(NN_structure[0], input_dim=n, activation=Hyper_parameter[0]),
        Dense(NN_structure[1], activation=Hyper_parameter[0]),
        Dense(NN_structure[2], activation=Hyper_parameter[0]),
    ])

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    model.compile(
        loss=Hyper_parameter[1],
        optimizer=Hyper_parameter[2],
        metrics=[Hyper_parameter[3]],
    )
    model.fit(X_train, y_train, epochs=15, batch_size=128)

    print('***********************************************')
    y_predict = np.round(model.predict(X_test))
    evaluate_model(y_predict, y_test)
    stats(y, y_train, y_test, y_predict)

    return model

In [None]:
def to_labels(malicicous_1, malicicous_2, malicicous_4, malicicous_8, malicicous_16):
    """Build one-hot labels for five datasets stacked in argument order.

    Args:
        malicicous_1, malicicous_2, malicicous_4, malicicous_8, malicicous_16:
            Tables exposing ``shape[0]`` (their row count).

    Returns:
        Float array of shape ``(total_rows, 5)``. The rows belonging to the
        k-th argument have a 1 in column k and 0 elsewhere.
    """
    groups = (malicicous_1, malicicous_2, malicicous_4, malicicous_8, malicicous_16)
    row_counts = [group.shape[0] for group in groups]

    malicious_labels = np.zeros((sum(row_counts), 5))
    first_row = 0
    for column, count in enumerate(row_counts):
        # Mark the contiguous slice of rows that belongs to this dataset.
        malicious_labels[first_row:first_row + count, column] = 1
        first_row += count
    return malicious_labels

def check_multi_classification(t,y_test):
    """Compute the row-normalised 5x5 confusion matrix of a classifier.

    The class of a sample is the position of the largest entry in its row
    (``argmax`` along axis 1), for the predictions and the targets alike.

    Args:
        t: 2-D array of predicted class scores, one row per sample.
        y_test: 2-D array of one-hot (or score) targets, one row per sample.

    Returns:
        ``(5, 5)`` float array. Entry ``[i, j]`` is the fraction of samples
        of true class ``i`` that were predicted as class ``j``. A true class
        without any sample gives a row of zeros instead of NaN. Class
        indices of 5 or more are ignored.
    """
    n_classes = 5
    predicted = np.argmax(t, axis=1)
    actual = np.argmax(y_test, axis=1)

    results_table = np.zeros((n_classes, n_classes))
    countable = (predicted < n_classes) & (actual < n_classes)
    # Unbuffered add, so repeated (actual, predicted) pairs are all counted.
    np.add.at(results_table, (actual[countable], predicted[countable]), 1)

    row_totals = results_table.sum(axis=1, keepdims=True)
    MM_per = np.divide(results_table, row_totals,
                       out=np.zeros_like(results_table),
                       where=row_totals > 0)
    return MM_per

In [None]:
def Classification_model(data, label, Hyper_parameter, NN_structure):
    """Train a three-layer dense network for attack classification.

    Columns of ``data`` from position 12 onward are the inputs and ``label``
    is the target. 20% of the rows are held out. After training, the
    predictions on the held-out rows are summarised with
    ``check_multi_classification`` and the resulting matrix is printed.

    Args:
        data: DataFrame whose columns from position 12 onward are features.
        label: Target array, one row per row of ``data``.
        Hyper_parameter: Sequence of ``[activation, loss, optimizer, metric]``;
            the activation is used for all three layers.
        NN_structure: Sequence with the unit counts of the three layers.

    Returns:
        The fitted Keras ``Sequential`` model.
    """
    feature_table = data.iloc[:, 12:]
    n = feature_table.shape[1]
    X = feature_table.values.reshape(feature_table.shape[0], n)
    y = label

    # The model is built before the split, as in the original statement order.
    model_all = Sequential([
        Dense(NN_structure[0], input_dim=n, activation=Hyper_parameter[0]),
        Dense(NN_structure[1], activation=Hyper_parameter[0]),
        Dense(NN_structure[2], activation=Hyper_parameter[0]),
    ])

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    model_all.compile(
        loss=Hyper_parameter[1],
        optimizer=Hyper_parameter[2],
        metrics=[Hyper_parameter[3]],
    )
    model_all.fit(X_train, y_train, epochs=20, batch_size=100)

    predicted_scores = model_all.predict(X_test)
    evaluation_matrix = check_multi_classification(predicted_scores, y_test)
    print(evaluation_matrix)
    return model_all

## Validation

In [None]:
def get_data(data, mode):
    """Randomly pick one multi-row transmission session with a given label.

    A sender is drawn uniformly among the senders that have rows with
    ``Label == mode``, then one of that sender's receivers is drawn
    uniformly. Sessions consisting of a single row are rejected and the draw
    is repeated.

    Args:
        data: DataFrame with the columns ``Label``, ``tr_ID`` and ``re_ID``.
        mode: Label value the session rows must carry.

    Returns:
        DataFrame with the rows of the selected session (more than one row).

    Raises:
        ValueError: If no session with more than one row exists for
            ``mode``; the draw could otherwise never succeed.
    """
    data_category = data.loc[(data['Label'] == mode)]

    # Make sure the rejection sampling below can terminate.
    session_sizes = data_category.groupby(['tr_ID', 're_ID']).size()
    if not (session_sizes > 1).any():
        raise ValueError(
            'No session with more than one row exists for Label == %r' % (mode,))

    data_sender_ID = np.unique(np.asarray(data_category['tr_ID']))
    while True:
        n = np.random.randint(len(data_sender_ID))
        data_sender = data_category.loc[data_category['tr_ID'] == data_sender_ID[n]]
        data_recevier_ID = np.unique(np.asarray(data_sender['re_ID']))
        m = np.random.randint(len(data_recevier_ID))
        data_select = data_sender.loc[data_sender['re_ID'] == data_recevier_ID[m]]
        if data_select.shape[0] != 1:
            return data_select

def test_real_data(model, data, mode):
    """Classify one session as malicious or normal and print the verdict.

    The seven features of the session are computed, passed to
    ``model.predict`` and the prediction is rounded. The verdict is compared
    with ``mode`` and the outcome is printed.

    Args:
        model: Object with a ``predict`` method accepting a ``(1, 7)`` array.
        data: DataFrame with the rows of one session.
        mode: True label of the session: 1 for malicious, 0 for normal.
    """
    loc_score = location_plausibility(data)
    move_flag = movement_plausibility(data)
    v_x, v_y, v_mag, v_total = quantititative_information(data)
    range_score = distance_check(data,800)

    data_vector = np.zeros((1, 7))
    ordered = (loc_score, move_flag, v_x, v_y, v_mag, v_total, range_score)
    for column, value in enumerate(ordered):
        data_vector[0][column] = value

    y_predict = np.round(model.predict(data_vector))

    if y_predict==1:
        print('The detection system said: The BSM is malicious!')
        if mode==1:
            print('The detection is correct!')
        else:
            print('But in fact, the BSM is normal, the detection is incorrect')
    elif y_predict==0:
        print('The detection system said: The BSM is normal!')
        if mode==0:
            print('The detection is correct!')
        else:
            print('But in fact, the BSM is malicious, the detection is incorrect')

def classification(model, data, n):
    """Predict the attack type of one session and print whether it is right.

    The seven features of the session are computed and passed to
    ``model.predict``; the predicted class is the position of the largest
    output. Class names are looked up in the global ``atk_type``.

    Args:
        model: Object with a ``predict`` method accepting a ``(1, 7)`` array.
        data: DataFrame with the rows of one session.
        n: Index of the true attack type.
    """
    loc_score = location_plausibility(data)
    move_flag = movement_plausibility(data)
    v_x, v_y, v_mag, v_total = quantititative_information(data)
    range_score = distance_check(data,800)

    data_vector = np.zeros((1, 7))
    ordered = (loc_score, move_flag, v_x, v_y, v_mag, v_total, range_score)
    for column, value in enumerate(ordered):
        data_vector[0][column] = value

    predicted_class = np.argmax(model.predict(data_vector))

    if predicted_class == n:
        print('The classification is correct!')
    else:
        print('The classification is incorrect:')
        print('The attack is ', atk_type[n], ', however, the model classifies it as ', atk_type[predicted_class])
        
from IPython.display import display, HTML

def validation(input_type, mode):
    """Run the full detection demo on one randomly chosen session.

    A session with label ``mode`` is drawn from ``raw_data[input_type]`` and
    displayed. It is then checked with the attack-specific model and with
    the general model. When ``mode`` is 1 the session is additionally passed
    to the attack classifier. Relies on the globals ``raw_data``,
    ``atk_type``, ``model_trained``, ``model_all`` and
    ``model_classification``.

    Args:
        input_type: Key/index selecting the dataset and its dedicated model.
        mode: Label of the session to draw: 1 for malicious, otherwise normal.
    """
    session = get_data(raw_data[input_type], mode)
    print('The session we randomly choose from the dataset', atk_type[input_type], ' is shown as follows')
    display(session)

    print('*************We firstly check whether it is malicious by its corresponding model*************')
    test_real_data(model_trained[input_type], session, mode)

    print('*************We then check whether it is malicious by its general model*************')
    test_real_data(model_all, session, mode)

    if mode==1:
        print('*************If it is malicious, we classify it*************')
        classification(model_classification, session, input_type)
    else:
        print('The session is normal!!!')
    