In [27]:
import json 
import matplotlib.pyplot as plt
import os
import numpy as np
import pandas as pd

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import hamming_loss

from sklearn.model_selection import train_test_split
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.metrics import classification_report

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.multiclass import OneVsRestClassifier
from xgboost import plot_tree
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer


# --- Notebook-level switches -------------------------------------------------
# Not referenced by any code visible in this notebook (kept as-is).
loadFromMerged = True
loadFromIndexes = False

# Label granularity handed to clean_data(Mapping=...): 'S', 'SE', 'DS' or 'F'.
Mapper = 'S'

# Handed to clean_data() as its `removeempty` argument for the training data.
IgnoreEmpty = True

# Not referenced by any code visible in this notebook (kept as-is).
FoldID = "1"
Epoch_count = 100
Batch_size = 5000


def vectorize_docs (sequences):
    """Return the n-gram count matrix for ``sequences``.

    A new ``CountVectorizer`` (word analyzer, 2- to 5-grams) is fitted on the
    given documents and the result of ``fit_transform`` is returned. The
    fitted vectorizer is not kept.
    """
    ngram_counter = CountVectorizer(analyzer='word', ngram_range=(2, 5))
    return ngram_counter.fit_transform(sequences)


def print_info(y_test, pred , classes , confidence=0.90 ):
    """Print per-class and aggregate metrics for thresholded multi-label predictions.

    Parameters
    ----------
    y_test : numpy array
        Ground-truth indicator values, indexed ``[record][class]``.
    pred : numpy array
        Scores in the same layout as ``y_test``. The array is binarised IN
        PLACE (``>= confidence`` becomes 1, ``< confidence`` becomes 0), so the
        caller's array holds the thresholded values after this call.
    classes : sequence of str
        Display name of each label column; columns ``0 .. len(classes)-1``
        are reported.
    confidence : float
        Decision threshold applied to ``pred``.

    Returns
    -------
    None. The report is written to stdout.

    Notes
    -----
    The "zeros" lines report percentages (count / records * 100). Per-class
    ratios with a zero denominator are printed as ``nan``; they are replaced
    by 0 only for the AVERAGES row.
    """

    def class_metrics(matrix):
        """Return (accuracy, recall, precision, F score) for one 2x2 matrix
        indexed as ``[[tn, fp], [fn, tp]]``."""
        tp = matrix[1][1]
        tn = matrix[0][0]
        fp = matrix[0][1]
        fn = matrix[1][0]

        acc = (tp + tn) * 1.0 / (tp + tn + fp + fn)
        recall = tp * 1.0 / (tp + fn)
        prec = tp * 1.0 / (tp + fp)
        # F score expressed with counts: 2*TP / (2*TP + FP + FN).
        F = 2.0 * tp / (2 * tp + fp + fn)

        return acc, recall, prec, F

    def acc_match(true, pred):
        """Return the fraction of records whose predicted row equals the true row."""
        exact = sum(1 for i in range(len(true))
                    if np.sum(np.abs(true[i] - pred[i])) == 0)
        return exact * 1.0 / len(true)

    def acc_match_wierd(true, pred):
        """Exact-match accuracy with a relaxation for three pairs of columns.

        A row that does not match exactly is compared again after a copy of
        the prediction has had the `switch`, `accel` and `contact` columns
        forced to 1 under the conditions below. The column indices are
        hard-coded. Not called at present (the call is commented out below).
        """
        level = 6
        switch = 11
        threeAxis = 13
        accel = 0
        status = 10
        contact = 5

        counter = 0
        for i in range(len(true)):
            if np.sum(np.abs(true[i] - pred[i])) == 0:
                counter += 1
            else:
                t_rec = np.array(list(pred[i]))

                if true[i][level] == 1 or true[i][switch] == 1 or t_rec[level] == 1:
                    t_rec[switch] = 1

                if true[i][threeAxis] == 1 or true[i][accel] == 1 or t_rec[threeAxis] == 1:
                    t_rec[accel] = 1

                if true[i][status] == 1 or true[i][contact] == 1 or t_rec[status] == 1:
                    t_rec[contact] = 1

                if np.sum(np.abs(true[i] - t_rec)) == 0:
                    counter += 1
        return counter * 1.0 / len(true)

    rule = "------------------------------------------------------------------------"
    row_format = "%30s  %8.3f   %8.3f  %8.3f  %8.3f  %8d %5d/%5d/%5d/%5d"
    n_records = len(y_test)

    # Number of positive ground-truth entries per class column.
    counts = np.sum(y_test.astype(int), axis=0)

    # In-place thresholding: the caller observes the 0/1 values afterwards.
    pred[pred >= confidence] = 1
    pred[pred < confidence] = 0

    #   acc_wierd  =acc_match_wierd(y_test, pred)

    conf = multilabel_confusion_matrix(y_test, pred.astype(int), labels=range(len(classes)))
    accs = [class_metrics(matrix) for matrix in conf]

    print("%30s  %8s   %8s  %8s  %8s %8s %22s" % ("Class", "Accuracy", "Recall", "Precision", "F Score", "Count", "TP/TN/FP/FN"))
    print(rule)

    for index in range(len(classes)):
        tp = conf[index][1][1]
        tn = conf[index][0][0]
        fp = conf[index][0][1]
        fn = conf[index][1][0]

        print(row_format %
              (classes[index],
               accs[index][0],
               accs[index][1],
               accs[index][2],
               accs[index][3],
               counts[index],
               tp,
               tn,
               fp,
               fn))

    # Records without any positive label in the truth / in the predictions.
    n_zeros_true = sum(1 for i in range(len(y_test)) if np.sum(np.abs(y_test[i])) == 0)
    n_zeros_pred = sum(1 for i in range(len(pred)) if np.sum(np.abs(pred[i])) == 0)

    # Undefined ratios (nan) count as 0 in the column averages.
    averages = np.average(np.nan_to_num(accs), axis=0)

    def as_percent(count):
        """Return ``count`` as a percentage of the number of records."""
        return 100.0 * count / n_records if n_records else 0.0

    print(rule)
    print(row_format %
          ("AVERAGES",
           averages[0],
           averages[1],
           averages[2],
           averages[3],
           n_records,
           0,
           0,
           0,
           0))
    print("Exact Match ACC : %.5f " % acc_match(y_test, pred))

    # print ( "Wierd Exact Match ACC : %.5f" % acc_wierd)
    print("Total Records : %d " % n_records)
    print("Total ZXeros in True : %d (%.3f)%%" % (n_zeros_true, as_percent(n_zeros_true)))
    print("Total ZXeros in Test : %d (%.3f)%%" % (n_zeros_pred, as_percent(n_zeros_pred)))
    print('=============================================================================')


# Cleans the raw segments: drops records without events and maps the remaining
# records to token / label arrays through the mapping helpers below.
def clean_data( x_data, y_data , removeempty=True, Mapping='S'):
    """Select the records that have events and map them to model inputs and labels.

    Parameters
    ----------
    x_data : dict
        Maps a record key to a list of packet dicts; each packet is read
        through its ``"frame_length"`` and ``"packet_source"`` entries.
    y_data : dict
        Maps the same keys to a list of event dicts (``"event"`` and,
        depending on ``Mapping``, ``"val"`` / ``"device"``).
    removeempty : bool
        Accepted for interface compatibility only.
        NOTE(review): this flag has never been consulted -- records whose
        event list is empty are always dropped. Honouring ``False`` would
        introduce "none" labels downstream, so confirm intent before changing.
    Mapping : str
        Label granularity: ``'S'`` event, ``'SE'`` event-value, ``'DS'``
        device & event, ``'F'`` device & event & value. Any other value leaves
        the raw event lists in the second return value.

    Returns
    -------
    (ret_x, ret_y, ret_y_s) : three lists ordered by sorted record key
        ``ret_x``   array of "<frame_length> hub|server" tokens per record
                    (first 15 packets),
        ``ret_y``   array of unique labels per record for ``Mapping``,
        ``ret_y_s`` array of unique event names per record (the ``'S'`` labels).

    Raises
    ------
    Whatever the label lookup raises (e.g. ``KeyError`` for a missing field);
    the offending event list is printed first. Previously the 'S' mapping
    swallowed the error and put ``None`` into the result.
    """

    def mapping_x(inp, includeDirection=True, TrimAt=15):
        """Map one packet list to an array of at most ``TrimAt`` tokens."""
        if includeDirection:
            tokens = [x["frame_length"] + (' hub' if x['packet_source'] == 'hub' else ' server') for x in inp]
        else:
            # Frame length rounded down to a multiple of 10.
            tokens = [int(int(x["frame_length"]) / 10) * 10 for x in inp]
        return np.array(tokens[:TrimAt])

    # One label formatter per supported Mapping value.
    label_makers = {
        'S': lambda ev: ev["event"],
        'SE': lambda ev: "%s-%s" % (ev["event"], ev["val"]),
        'DS': lambda ev: "%s & %s" % (ev["device"], ev["event"]),
        'F': lambda ev: "%s & %s & %s" % (ev["device"], ev["event"], ev['val']),
    }

    def map_labels(events, make_label):
        """Return the unique labels of one event list, or ["none"] when it is empty."""
        if len(events) == 0:
            return np.array(["none"])
        try:
            return np.array(list(set([make_label(ev) for ev in events])))
        except Exception:
            # Show the malformed record, then fail here instead of returning
            # None and breaking a later step with an unrelated error.
            print(events)
            raise

    # Keys with at least one event, in sorted order.
    cleans = sorted([key for key in y_data if len(y_data[key]) > 0])

    raw_x = [x_data[key] for key in cleans]
    raw_y = [y_data[key] for key in cleans]

    print( len(y_data), len(cleans) )

    ret_x = [mapping_x(packets) for packets in raw_x]
    ret_y_s = [map_labels(events, label_makers['S']) for events in raw_y]

    if Mapping == 'S':
        # Same labels as ret_y_s; copied so the two results stay independent.
        ret_y = [labels.copy() for labels in ret_y_s]
    elif Mapping in label_makers:
        ret_y = [map_labels(events, label_makers[Mapping]) for events in raw_y]
    else:
        # Unrecognised Mapping: the raw event lists are passed through.
        ret_y = raw_y
    return ret_x, ret_y, ret_y_s

def pre_process_raw( x_data,y_data, dim_size = 128, test = False, normalize = False ,classes=None, twoD= False, string =False ):
    """Encode label sets as multi-hot rows and bring every input sequence to ``dim_size`` items.

    Parameters
    ----------
    x_data : sequence of sequences
        One token sequence per record.
    y_data : sequence of iterables of str
        Labels of each record, aligned with ``x_data``.
    dim_size : int
        Target number of items per sequence. Longer sequences are NOT truncated.
    test : bool
        When true, records with no recognised label are kept and marked
        ``'unknown'``; otherwise they are dropped (unless the ``'known'``
        fallback below applies).
    normalize : bool
        Divide the resulting array by its maximum (numeric inputs only).
    classes : list of str or None
        Labels to encode. ``'unknown'`` is appended to this list IN PLACE when
        missing, so the caller's list stays aligned with the returned columns
        once sorted. ``None`` derives the labels from ``y_data``.
    twoD : bool
        Unused; kept for interface compatibility.
    string : bool
        Return each sequence as one space-joined string instead of an array.

    Returns
    -------
    (x, y_encoded, y_labels)
        ``x``         array of padded sequences (or joined strings),
        ``y_encoded`` float array of shape (kept records, number of classes)
                      with columns in ``sorted(classes)`` order,
        ``y_labels``  per kept record, the space-joined recognised labels; in
                      the ``'known'`` fallback branch the record's original
                      label iterable is stored instead (callers feed these
                      back in as ``y_data``).

    Notes
    -----
    * If ``'known'`` is a class, every record without a recognised label is
      kept and marked ``'known'``.
    * Padding first doubles the sequence while more than half of ``dim_size``
      would otherwise be padding, then fills the remainder with ``0``. This
      reproduces the original behaviour exactly.
    """
    if classes is None:
        # Derive the label set from the data (previously unreachable: the
        # membership test below ran first and failed on None).
        observed = np.concatenate(y_data) if len(y_data) > 0 else []
        classes = np.unique(observed).tolist()
    if 'unknown' not in classes:
        # Deliberately in place: callers reuse this list for column names.
        classes.append('unknown')
    services = classes
    classes = sorted(classes)

    y_data_categorical = []
    x_data_temp = []
    y_class_data = []
    for i, labels in enumerate(y_data):
        classFound = False
        row = np.zeros(len(classes))
        y_class = []
        for y in labels:
            # An explicit 'unknown' next to other labels carries no information.
            if y == 'unknown' and len(labels) > 1:
                continue
            if y in services:
                y_class.append(y)
                row[classes.index(y)] = 1
                classFound = True
        if 'known' in classes and not classFound:
            row[classes.index('known')] = 1
            y_data_categorical.append(row)
            x_data_temp.append(x_data[i])
            # Raw labels on purpose, see "Returns".
            y_class_data.append(labels)
        elif classFound or test:
            y_class_data.append(' '.join(y_class))
            if not classFound:
                row[classes.index('unknown')] = 1
            y_data_categorical.append(row)
            x_data_temp.append(x_data[i])

    if y_data_categorical:
        y_data_categorical = np.vstack(y_data_categorical)
    else:
        # np.vstack rejects an empty list; return a well-shaped empty matrix.
        y_data_categorical = np.zeros((0, len(classes)))

    padded_sequences = []
    for sequence in x_data_temp:
        padded = list(sequence)
        # Doubling tiling; the emptiness guard prevents an endless loop on an
        # empty sequence (extending [] by [] never grows).
        while padded and dim_size - len(padded) > len(padded):
            padded.extend(list(padded))
        padded.extend([0] * (dim_size - len(padded)))

        if not string:
            padded_sequences.append(np.array(padded))
        else:
            padded_sequences.append(' '.join(list(map(str, padded))))

    if normalize:
        x_data_temp = np.array(padded_sequences) / (np.amax(padded_sequences) + 0.000000000001)
    else:
        x_data_temp = np.array(padded_sequences)

    return  x_data_temp ,y_data_categorical , y_class_data

# Module-level containers shared between cells.
# x_data / y_data are not rebound by load_data(), which uses locals of the same name.
x_data, y_data = [], []

# One entry per loaded test home; appended to by load_data().
x_test, y_test, y_test_service = [], [], []
test_names = []

# Placeholders that load_data() rebinds through its `global` statement.
x_train, y_train = {}, {}

def load_data():
    """Load the merged training segments and every per-home test file.

    Side effects (module-level names):
      * rebinds ``x_train``, ``y_train`` and ``y_train_service`` with the
        result of ``clean_data`` on the merged training files;
      * empties and refills ``test_names``, ``x_test``, ``y_test`` and
        ``y_test_service`` with one entry per file in the use-case pcap
        directory whose name contains ``'home_os_final'``.

    The test lists are emptied first so that calling this function again does
    not append the same homes a second time.
    """
    global x_train, y_train, y_train_service

    merged_dir = '../files/train/test/test_homes/final_upload/Merged_final_with_home/'
    usecase_hub_dir = '../files/train/test/test_homes/final_upload/usecases/hub_segments_final_final/'
    usecase_pcap_dir = '../files/train/test/test_homes/final_upload/usecases/pcap_segments_final_final/'

    def read_json(path):
        """Return the parsed content of the JSON file at ``path``."""
        with open(path) as f:
            return json.load(f)

    y_data = read_json(os.path.join(merged_dir, 'merged_hub_segments_final.json'))
    x_data = read_json(os.path.join(merged_dir, 'merged_pcap_segments_final.json'))

    x_train, y_train, y_train_service = clean_data( x_data, y_data, IgnoreEmpty , Mapping=Mapper )

    print("loading from test files")

    # Start from empty lists (in place, other cells hold references to them).
    for bucket in (test_names, x_test, y_test, y_test_service):
        del bucket[:]

    test_files = sorted(os.listdir(usecase_pcap_dir))
    print( "found files : " , len(test_files) )

    for pick in test_files:
        if 'home_os_final' not in pick:
            continue

        fname = os.path.basename(pick)
        test_names.append( fname )

        # The hub (label) file and the pcap (input) file share the same name.
        y_data_test = read_json(os.path.join(usecase_hub_dir, fname))
        x_data_test = read_json(os.path.join(usecase_pcap_dir, fname))

        t_x, t_y, t_z = clean_data( x_data_test, y_data_test, False , Mapping=Mapper )
        x_test.append(t_x)
        y_test.append(t_y)
        y_test_service.append(t_z)

def predict_labels(classes,x_test_new,y_test_new,confidence):
    """Fit a one-vs-rest XGBoost model on the current training globals and
    return the test records predicted as ``'known'``.

    Reads these notebook globals, which must be set before the call:
    ``dim_size``, ``x_train_processed``, ``y_train_processed_encoded`` and
    (only in ``draw_xgb_tree``) ``y_train_processed``.

    Parameters
    ----------
    classes : list of str
        Class names, forwarded to ``pre_process_raw`` (which may append
        ``'unknown'`` in place) and used to name the prediction columns.
        NOTE(review): column ``i`` is reported as ``classes[i]`` while
        ``pre_process_raw`` orders columns by ``sorted(classes)`` -- this
        assumes the list passed in is already sorted; confirm at call sites.
    x_test_new, y_test_new :
        Raw test sequences and labels, in the form accepted by
        ``pre_process_raw``.
    confidence : float
        Threshold handed to ``print_info`` and used to pick predicted classes.

    Returns
    -------
    (known_x_data, known_y_data) : two lists
        For every test record whose prediction includes ``'known'``: its
        processed input split on single spaces, and its processed label entry.
        Both lists are empty when ``'known'`` is not among the classes.
    """
    is_string=True

    x_test_processed, y_test_processed_encoded, y_test_processed = pre_process_raw(
        x_test_new, y_test_new, dim_size,
        test=True, normalize=False, classes=classes, string=is_string)

    def draw_xgb_tree():
        """Fit a single XGBClassifier on the first label of each training
        record and plot its tree. Not called at present."""
        xgb = XGBClassifier()
        y = [x.split(' ')[0] for x in y_train_processed]
        xgb.fit(x_train_processed, y)
        # plot single tree
        plot_tree(xgb)
        plt.show()

    def classify_xgb(x_test_processed):
        """Train, print the evaluation report and collect the 'known' records."""
        xgb_classifier = OneVsRestClassifier(XGBClassifier())
        if is_string:
            # 1- to 5-gram counts; the vocabulary comes from the training set only.
            vectorizer = CountVectorizer(analyzer='word', ngram_range=(1, 5))
            train_features = vectorizer.fit_transform(x_train_processed)
            test_features = vectorizer.transform(x_test_processed)
        else:
            # Previously the test features were left undefined on this path.
            train_features = x_train_processed
            test_features = x_test_processed

        xgb_classifier.fit(train_features, y_train_processed_encoded)

        known_x_data=[]
        known_y_data=[]
        print( "==================HOME Case : Omid ============="  )
        rf_pred = xgb_classifier.predict_proba(test_features)
        # print_info thresholds rf_pred in place; the loop below sees 0/1 values.
        print_info(y_test_processed_encoded, rf_pred, classes, confidence)

        for ind in range(len(y_test_processed)):
            y_pred_class = [classes[i] for i, score in enumerate(rf_pred[ind]) if score > confidence]
            if 'known' in y_pred_class:
                known_x_data.append(x_test_processed[ind].split(' '))
                known_y_data.append(y_test_processed[ind])

        return known_x_data,known_y_data

    return classify_xgb(x_test_processed)

# all_classes = sorted(list(np.unique(  np.concatenate( y_train  ))))

# Read the merged training files and the per-home test files into the
# module-level x_train / y_train / y_train_service and x_test / y_test lists.
load_data()

58958 57867
loading from test files
found files :  4
19968 19516
9109 8941


In [29]:
# Every distinct label that occurs in the training labels, in sorted order.
train_labels_flat = np.concatenate(y_train)
all_classes = sorted(np.unique(train_labels_flat))

In [31]:
# Two-stage run on the first test home. predict_labels() reads dim_size,
# x_train_processed and y_train_processed_encoded as globals, so the
# assignments before each call must stay in this order.
dim_size= 20
is_string =True
# Stage 1: known-vs-unknown model.
services_to_keep = ['unknown','known']
classes = sorted(services_to_keep)
x_train_processed,y_train_processed_encoded, y_train_processed =  pre_process_raw( x_train, y_train , dim_size, test=False, normalize=False, classes=classes, string=is_string)
confidence=0.4
# Keeps only the test records that stage 1 predicts as 'known'.
x_test_new,y_test_new = predict_labels(classes,x_test[0],y_test[0],confidence)

print(len(x_test_new))
# print(y_test_new)
# Stage 2: re-encode the training data with every class and evaluate on the
# records kept by stage 1, with a stricter threshold.
confidence=0.75
services_to_keep =  all_classes # ['contact','lock','motion', 'switch','colorTemperature','button','ping']
classes = sorted(services_to_keep)
x_train_processed,y_train_processed_encoded, y_train_processed =pre_process_raw( x_train, y_train , dim_size, test=False, normalize=False, classes=classes, string=is_string)
# The returned pair is not stored; as the last expression it becomes the cell output.
predict_labels(classes,x_test_new,y_test_new,confidence)

                         Class  Accuracy     Recall  Precision   F Score    Count            TP/TN/FP/FN
------------------------------------------------------------------------
                         known     0.887      0.846     0.827     0.837      6685  5654/11652/ 1179/ 1031
                       unknown     0.917      0.989     0.895     0.940     12831 12688/ 5203/ 1482/  143
------------------------------------------------------------------------
                      AVERAGES     0.902      0.917     0.861     0.888     19516     0/    0/    0/    0
Exact Match ACC : 0.86365 
Total Records : 19516 
Total ZXeros in True : 0 (0.000)%
Total ZXeros in Test : 0 (0.000)%
6833




                         Class  Accuracy     Recall  Precision   F Score    Count            TP/TN/FP/FN
------------------------------------------------------------------------
                  acceleration     0.994      0.051     0.667     0.095        39     2/ 6793/    1/   37
                      activity     0.999      0.857     0.857     0.857        14    12/ 6817/    2/    2
                       battery     1.000      0.000       nan     0.000         2     0/ 6831/    0/    2
                        button     1.000      0.923     1.000     0.960        13    12/ 6820/    0/    1
              colorTemperature     1.000      0.833     1.000     0.909         6     5/ 6827/    0/    1
                       contact     0.992      0.585     0.962     0.727       130    76/ 6700/    3/   54
                         level     0.990      0.720     0.234     0.353        25    18/ 6749/   59/    7
                          lock     0.999      0.886     0.886     0.886        3

([], [])

In [24]:
# Inspection cell: shows x_train, which is the initial empty dict until load_data() has run.
x_train

{}

In [36]:
# Drop the 'threeAxis' label from the class list. The membership guard makes
# the cell safe to re-run: list.remove() raises ValueError once the label is gone.
if 'threeAxis' in all_classes:
    all_classes.remove('threeAxis')

TypeError: remove() takes exactly one argument (2 given)

In [19]:
 # Scratch cell: same expression as the all_classes assignment. The recorded
 # TypeError comes from np.concatenate receiving the initial empty dict y_train.
 sorted(list(np.unique(  np.concatenate( y_train  ))))

TypeError: The first input argument needs to be a sequence

In [20]:
# Scratch cell (repeat of the previous one); fails the same way while y_train
# is still the initial empty dict.
sorted(list(np.unique(  np.concatenate( y_train  ))))

TypeError: The first input argument needs to be a sequence

In [21]:
# Inspection cell: shows y_train, which is the initial empty dict until load_data() has run.
y_train

{}