In [1]:
# disable warning but only for sake of rendering the notebook
# delete verbosity and warning stuff, I am using some depracated code which could cause trouble in the future
import warnings
warnings.filterwarnings('ignore')
import tensorflow as tf
tf.logging.set_verbosity(tf.logging.ERROR)




import pandas as pd
import numpy as np
from pandas.io.json import json_normalize
import json
from collections import Counter
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.metrics import confusion_matrix,classification_report
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import np_utils
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder
from keras.callbacks import EarlyStopping

Using TensorFlow backend.


### Data preparation

In [2]:
# convert data files to proper json format
def dataset_to_json(old_name, new_name):
    with open(old_name, 'r') as f:
        with open(new_name,'w') as wf:
            wf.write("[")
            wf.write(next(f))
            for line in f:
                wf.write(","+line)
            wf.write("]")
            
            
# dataset_to_json("dataset/test.json","dataset/test_mod.json")
# dataset_to_json("dataset/train.json","dataset/train_mod.json")


with open("dataset/test_mod.json") as f:
    test_data = json.load(f)
test = json_normalize(test_data)

with open("dataset/train_mod.json") as f:
    train_data = json.load(f)
train = json_normalize(train_data)
split = len(train)
train = pd.concat([train,test], sort=False, ignore_index=True)

In [3]:
def ssdp_features(data, number_of_most_used = 30):
    lists = {'st':[], 'nt':[], 'server':[],'user_agent':[]}
    keys = lists.keys()
    for ssdps in data.ssdp:
        if type(ssdps) != float:
            for ssdp in ssdps:
                for key in keys:
                    if key in ssdp:
                        lists[key].append(ssdp[key])

    most_used_ssdps = { 'st': sorted(zip(Counter(lists['st']).keys(),Counter(lists['st']).values()), key=lambda x: x[1], reverse=True),
                        'nt' : sorted(zip(Counter(lists['nt']).keys(),Counter(lists['nt']).values()), key=lambda x: x[1], reverse=True),
                        'server' : sorted(zip(Counter(lists['server']).keys(),Counter(lists['server']).values()), key=lambda x: x[1], reverse=True),
                        'user_agent' : sorted(zip(Counter(lists['user_agent']).keys(),Counter(lists['user_agent']).values()), key=lambda x: x[1], reverse=True),
                      }

    def filter_ssdp(ssdp, key):
        def filter_(cell):
            if type(cell)!=float:
                for each in cell:
                    if key in each and ssdp == each[key]:
                        return True
            return False
        return filter_

    ssdps = {}
    for key, sorted_array in most_used_ssdps.items():
        ssdps[key] = [each for each, _ in sorted_array][:number_of_most_used]

    features_names = []
    for key, array in ssdps.items():
        for ssdp in array:
            data[key+'_'+ssdp] = 0
            data.loc[data.ssdp.apply(filter_ssdp(ssdp, key)), key+'_'+ssdp] = 1
            features_names.append(key+'_'+ssdp)
    
    return features_names

In [4]:
def upnp_services_features(data, number_of_most_used = 60):
    services = []
    for each in data.upnp:
        if type(each)!=float:
            for e in each:
                if 'services' in e:
                    services.extend(e['services'])

    most_used_services = sorted(zip(Counter(services).keys(),Counter(services).values()), key=lambda x:x[1], reverse=True)

    def filter_ser(ser):
        def filter_(cell):
            if type(cell)!=float:
                for each in cell:
                    if 'services' in each and ser in each['services']:
                        return True
            return False
        return filter_

    srvs = [ser for ser, _ in most_used_services][:number_of_most_used]
    features_names = []
    for ser in srvs:
        data[ser] = 0
        data.loc[data.upnp.apply(filter_ser(ser)), ser] = 1
        features_names.append(ser)

    
    return features_names

In [5]:
def upnp_description_features(data):
            
    # mining data from description
    def get_model_descriptions(cell):
        descriptions = []
        if type(cell)!=float:
            for each in cell:
                if 'model_description' in each:
                    descriptions.append('des_' + each['model_description'].lower())
        return descriptions

    def filter_des(des):
        def filter_(cell):
            if type(cell)!=float:
                for each in cell:
                    if 'model_description' in each and des in each['model_description'].lower():
                        return True
            return False
        return filter_

    descs = ['files','dial','sony','lime','device','plugin','personal','roku','wireless','dmrplus','tv', 'network', 'camera', 'video', 'audio', 'bluetooth', 'radio', 'desktop', 'render', 'media', 'dongle', 'sensor', 'ip', 'music', 'netcast', 'home', 'control', 'stream', 'nas', 'server', 'cloud', 'xbox', 'play']
    features_names = []
    for des in descs:
        data['d_'+des] = 0
        data.loc[data.upnp.apply(filter_des(des)), 'd_'+des] = 1
        features_names.append('d_'+des)
    return features_names

In [6]:
def upnp_model_name_features(data, number_of_most_used = 60):
    def get_models(cell):
        models = []
        if type(cell)!=float:
            for each in cell:
                if 'model_name' in each:
                    models.append('mod_' + each['model_name'].lower())
        return models

    def filter_model(model):
        def filter_(cell):
            if type(cell)!=float:
                for each in cell:
                    if 'model_name' in each and each['model_name'].lower() == model:
                        return True
            return False
        return filter_

    models = []
    for each in data.upnp.apply(lambda x: get_models(x)):
        models.extend(each)
    most_used_models = sorted(zip(Counter(models).keys(),Counter(models).values()), key=lambda x: x[1], reverse=True)

    models = [model for model, _ in most_used_models][:number_of_most_used]
    features_names = []
    for model in models:
        data[model] = 0
        data.loc[data.upnp.apply(filter_model(model.split('_')[1])), model] = 1
        features_names.append(model)
    return features_names

In [7]:
def upnp_manufacturer_features(data, number_of_most_used = 60):

    def get_manufacturer(cell):
        manufacturers = []
        if type(cell)!=float:
            for each in cell:
                if 'manufacturer' in each:
                    manufacturers.append('man_' + each['manufacturer'].lower())
        return manufacturers

    manufa = []
    for each in data.upnp.apply(lambda x: get_manufacturer(x)):
        manufa.extend(each)
    most_used_manufa = sorted(zip(Counter(manufa).keys(),Counter(manufa).values()), key=lambda x: x[1], reverse=True)

    def filter_manu(manu):
        def filter_(cell):
            if type(cell)!=float:
                for each in cell:
                    if 'manufacturer' in each and each['manufacturer'].lower() == manu:
                        return True
            return False
        return filter_

    manufas = [manu for manu, _ in most_used_manufa][:number_of_most_used]
    features_names = []
    for manu in manufas:
        data[manu] = 0
        data.loc[data.upnp.apply(filter_manu(manu.split('_')[1])), manu] = 1
        features_names.append(manu)
        
    return features_names

In [8]:
def upnp_device_type(data):
    import re
    def get_types(cell):
        cell_types = []
        for each in cell:
            if 'device_type' in each:
                search = re.search(":device:(.*?):",each['device_type'])
                if search is not None:
                    cell_types.append(search.group(1).lower())
        return sorted(cell_types)   
    def get_type_and_device(data):
        return zip(data[data.upnp.notna()].upnp.apply(get_types),data[data.upnp.notna()].device_class)

    all_types = []
    for device_type, _ in get_type_and_device(data):
        all_types.extend(device_type)
    all_types = sorted(set(all_types))

    features_names = []
    for each in all_types:
        data[each] = 0
        data.loc[ data.upnp.apply(lambda x: True if not type(x) == float and each in get_types(x) else False) , each] = 1
        features_names.append(each)
        
    return features_names

In [9]:
def filter_mac(filter_):
    def _filter(mac):
        if mac:
            if mac.startswith(filter_):
                return True
        return False
    return _filter

In [10]:
def mac_features(data, number_of_most_used = 100):
    macs = data.mac.apply(lambda x: x[3:8])

    most_used_macs = sorted(zip(Counter(macs).keys(),Counter(macs).values()), key=lambda x: x[1], reverse=True)
    macs = [mac for mac, _ in most_used_macs][:number_of_most_used]
    features_names = []
    for mac in macs:
        data[mac] = 0
        data.loc[data.mac.apply(filter_mac(mac)), mac] = 1
        features_names.append(mac)

    return features_names

  

In [11]:
def dhcp_text_features(data):
    def is_dhcp_smth(cell, smth):
        for each in cell:
            if 'classid' in each and smth in each['classid']:
                return True
        return False

    dhcps = ['android','dhcpcd', 'yealink', 'MSFT', 'EMLAB', 'udhcp', 'stb', 'IP-STB', 'Linux']

    features_names = []
    for dhcp in dhcps:
        data[dhcp] = 0
        data.loc[data.dhcp.apply(lambda x: is_dhcp_smth(x,dhcp) if type(x) != float else False), dhcp] = 1
        features_names.append(dhcp)

    return features_names


In [12]:
def mdns_features(data, number_of_most_used = 50):
    services = []
    for each in data[data.mdns_services.notna()].mdns_services:
        services.extend(each)

    most_used_mdns = sorted(zip(Counter(services).keys(),Counter(services).values()), key=lambda x: x[1], reverse=True)


    mdns = [ each for each, _ in most_used_mdns[:number_of_most_used]]
    features_names = []
    for ident in mdns:
        data[ident] = 0
        data.loc[ (data.mdns_services.notna()) & (data.mdns_services.apply(lambda x:True if type(x) != float and ident in x else False) ), ident] = 1
        features_names.append(ident)
    return features_names


In [13]:
def mobile_mac_features(data, number_of_most_used = 30):
    m = data[data.device_class == 'MOBILE'].mac.apply(lambda x: x[:8])

    mm = sorted(zip(Counter(m).keys(),Counter(m).values()), key=lambda x: x[1], reverse=True)

    m_macs = [mac for mac, _ in mm][:number_of_most_used]

    features_names = []
    for mac in m_macs:
        data[mac] = 0
        data.loc[data.mac.apply(filter_mac(mac)), mac] = 1
        features_names.append(mac)
    return features_names
    
    

In [14]:
def voice_assistant_mac_features(data, number_of_most_used = 30):
    m = data[data.device_class == 'VOICE_ASSISTANT'].mac.apply(lambda x: x[:8])

    mm = sorted(zip(Counter(m).keys(),Counter(m).values()), key=lambda x: x[1], reverse=True)

    m_macs = [mac for mac, _ in mm][:number_of_most_used]

    features_names = []
    for mac in m_macs:
        data[mac] = 0
        data.loc[data.mac.apply(filter_mac(mac)), mac] = 1
        features_names.append(mac)
    return features_names

In [15]:
def ip_features(data, number_of_most_used = 30):
    ips = []
    for each in data.ip:
        ips.append(each.split('.')[0])

    most_used_ips = sorted(zip(Counter(ips).keys(),Counter(ips).values()), key=lambda x: x[1], reverse=True)


    ips = [ each for each, _ in most_used_ips][:number_of_most_used]
    
    features_names = []
    for ip in ips:
        data['ip_' + ip] = 0
        data.loc[ data['ip'].str.startswith( ip ) , 'ip_' + ip] = 1
        features_names.append('ip_' + ip)
    
    return features_names

In [16]:
def port_features(data, number_of_most_used = 60):
    def is_port(cell, port, protocol):
        for service in cell:
            if service['port'] == port and service['protocol'] == protocol:
                return True
        return False
    def filter_port(data, port, protocol):
        return data['services'].apply(lambda cell: True if type(cell)!=float and is_port(cell,port, protocol) else False)

    all_ports = []
    for each in data['services']:
        if type(each) != float:
            for e in each:
                all_ports.append(str(e['port'])+'_'+e['protocol'])

    most_used = sorted(zip(Counter(all_ports).keys(),Counter(all_ports).values()), key=lambda x: x[1], reverse=True )

    ports = [ 'port_'+each for each, _ in most_used[:number_of_most_used]]

    features_names = []
    for ident in ports:
        port, protocol = ident.split('_')[1:]
        data[ident] = 0
        data.loc[filter_port(data, int(port), protocol), ident] = 1
        features_names.append(ident)

    return features_names

In [17]:
def dhcp_params_features(data, number_of_most_used = 60):

    params = []
    for each in train[train.dhcp.notna()].dhcp:
        if 'paramlist' in each[0]:
            params.append(each[0]['paramlist'])

    most_used_params = sorted(zip(Counter(params).keys(),Counter(params).values()), key=lambda x: x[1], reverse=True)

    def is_param(cell, param):
        if 'paramlist' in cell[0]:
            if cell[0]['paramlist'] == param:
                return True
        return False
    def filter_param(train, param):
        return train['dhcp'].apply(lambda cell: True if type(cell)!=float and is_param(cell,param) else False)

    params = [ each for each, _ in most_used_params[:number_of_most_used]]

    features_names = []
    for param in params:
        train['par_'+param] = 0
        train.loc[ filter_param(train, param) , 'par_' + param] = 1
        features_names.append('par_'+param)
        
    return features_names

In [18]:
train['glob_ad'] = train.mac.apply(lambda x: (int(x[1:2],16)&2)//2 )
train['zero_mac'] = train.mac.apply(lambda x: 1 if x == "00:00:00:00:00:00" else 0)
train['dhcp_nan'] = 0
train.loc[ (train['dhcp'].isna()), 'dhcp_nan'] = 1
train['mdns_services_nan'] = 0
train.loc[train['mdns_services'].isna(), 'mdns_services_nan'] = 1
train['upnp_nan'] = 0
train.loc[train['upnp'].isna(), 'upnp_nan'] = 1
train['ip_valid'] = 0
train.loc[train['ip'].str.startswith('10') | train['ip'].str.startswith('192.168') | train['ip'].str.startswith('172'), 'ip_valid'] = 1
train['ip_nan'] = 0
train.loc[train['ip'] == '', 'ip_nan'] = 1
train['ssdp_nan'] = 0
train.loc[train['ssdp'].isna(), 'ssdp_nan'] = 1
train['n_of_tcp'] = train.services.apply(lambda x: len( [] if x is np.nan else [ a['protocol'] for a in x if a['protocol'] == 'tcp'] ) )
train['n_of_udp'] = train.services.apply(lambda x: len( [] if x is np.nan else [ a['protocol'] for a in x if a['protocol'] == 'udp'] ) )


In [19]:
ssdp_features(train, number_of_most_used = 30)
upnp_services_features(train, number_of_most_used = 60)
upnp_description_features(train)
upnp_model_name_features(train, number_of_most_used = 60)
upnp_manufacturer_features(train, number_of_most_used = 60)
upnp_device_type(train)
mac_features(train, number_of_most_used = 100)
dhcp_text_features(train)
mdns_features(train, number_of_most_used = 50)  
mobile_mac_features(train, number_of_most_used = 30)
# voice_assistant_mac_features(train, number_of_most_used = 30)
ip_features(train, number_of_most_used = 30)
port_features(train, number_of_most_used = 60)
dhcp_params_features(train, number_of_most_used = 60);

### Selected data and model accuracy

In [20]:
def del_default_features(data):
    data.drop(['mac','ip','services','device_class','device_id','upnp', 'ssdp', 'mdns_services','dhcp'], axis = 1, inplace = True)
    return data

In [21]:
def get_train_test():
    x = train.copy()
    del_default_features(x)
    x = pd.get_dummies(x)
    y = train[['device_class']].copy()
    X_train = x[:split].copy()
    y_train = y[:split].copy()
    X_val = x[split:].copy()
    # use all data for final model.
    # for finding right params use test size 0.4 or smth you prefer
    return train_test_split(X_train, y_train, test_size=0.0001), X_val


(X_train, X_test, y_train, y_test), X_val = get_train_test()
# use SMOTE to generate synthetic data because original data are imbalanced
X_res, y_res = SMOTE().fit_sample(X_train, y_train)


In [22]:
encoder = LabelEncoder()
encoder.fit(y_res)
encoded_y = encoder.transform(y_res)
dummy_y = np_utils.to_categorical(encoded_y)

In [23]:
def baseline_model():
    # create model
    model = Sequential()
    model.add(Dense(20, input_dim=X_res.shape[1], activation='relu'))
    model.add(Dense(dummy_y.shape[1], activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model
nn_smote_model = baseline_model()

In [24]:
es = EarlyStopping(monitor='val_loss',
                              min_delta=0,
                              patience=10,
                              verbose=0, mode='auto')
history = nn_smote_model.fit(X_res, dummy_y,
                       epochs=15)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [25]:
from sklearn.metrics import classification_report, confusion_matrix
def print_report(model, X_train, y_train):
    y_pred_bool = np.argmax(model.predict(X_train.values), axis=1)
    print(classification_report(y_train, encoder.inverse_transform(y_pred_bool)))

In [26]:
# here you should use X_train and y_train from train_test_split function
# but since I am using whole dataset I will use train data and have biased accuracy
print_report(nn_smote_model, X_train, y_train)

                 precision    recall  f1-score   support

          AUDIO       1.00      1.00      1.00      3141
   GAME_CONSOLE       0.98      0.99      0.99      2383
    GENERIC_IOT       0.99      0.95      0.97      2959
HOME_AUTOMATION       1.00      1.00      1.00     12302
       IP_PHONE       1.00      1.00      1.00      5193
      MEDIA_BOX       0.97      0.93      0.95      6059
         MOBILE       0.86      0.98      0.91      1382
            NAS       1.00      1.00      1.00      2813
             PC       1.00      0.99      0.99      6665
        PRINTER       1.00      1.00      1.00      4342
   SURVEILLANCE       0.99      1.00      0.99      2037
             TV       1.00      0.97      0.98      5712
VOICE_ASSISTANT       0.87      0.98      0.93      2912

       accuracy                           0.98     57900
      macro avg       0.97      0.98      0.98     57900
   weighted avg       0.98      0.98      0.98     57900



In [27]:
prediction = encoder.inverse_transform(np.argmax(nn_smote_model.predict(X_val), axis=1))

In [28]:
d = {'Id': test.device_id.array, 'Predicted': prediction}
pd.DataFrame(data=d).to_csv('pred.csv', index=False)