In [57]:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
import seaborn as sns

import os

from imblearn.over_sampling import SMOTE
from collections import Counter

from pcap_info import *

%matplotlib inline

In [2]:
sns.set()

In [3]:
rfpath = './reduced_features'
red_feat_file = './reduced_feature_list.txt'

In [4]:
with open(red_feat_file) as f:
    red_feat = list(map(lambda l: l.strip(), f.readlines()))
print(red_feat)

['ratetot_num', 'ratetot_sz', 'ratet_num_0', 'ratet_num_2', 'ratet_sz_0', 'ratet_sz_2', 'rate_num_0', 'rate_num_2', 'rate_num_4', 'rate_num_6', 'rate_num_16', 'rate_num_18', 'rate_num_20', 'rate_num_34', 'rate_num_40', 'rate_num_44', 'rate_num_48', 'rate_num_52', 'rate_num_54', 'rate_sz_0', 'rate_sz_2', 'rate_sz_4', 'rate_sz_6', 'rate_sz_16', 'rate_sz_18', 'rate_sz_20', 'rate_sz_34', 'rate_sz_40', 'rate_sz_44', 'rate_sz_48', 'rate_sz_52', 'rate_sz_54', 'fract_num_0', 'fract_num_2', 'fract_sz_0', 'fract_sz_2', 'frac_num_0', 'frac_num_2', 'frac_num_4', 'frac_num_16', 'frac_num_18', 'frac_num_20', 'frac_num_34', 'frac_num_40', 'frac_num_44', 'frac_num_48', 'frac_num_52', 'frac_sz_0', 'frac_sz_2', 'frac_sz_4', 'frac_sz_16', 'frac_sz_18', 'frac_sz_20', 'frac_sz_34', 'frac_sz_40', 'frac_sz_44', 'frac_sz_48', 'frac_sz_52', 'sratt_num_0', 'sratt_num_2', 'sratt_sz_0', 'sratt_sz_2', 'srat_num_2', 'srat_num_18', 'srat_num_34', 'srat_num_44', 'srat_num_48', 'srat_num_52', 'srat_sz_2', 'srat_sz_18'

In [5]:
def shuffle_unison(a, b):
    rng_state = np.random.get_state()
    np.random.shuffle(a)
    np.random.set_state(rng_state)
    np.random.shuffle(b)

In [6]:
all_data = {}
for i, dev in enumerate(all_device_list):
    dev_data = []
    for pcap in pcaps[dev]:
        fn = os.path.join(rfpath, os.path.basename(pcap) + '.txt')
        with open(fn) as f:
            arr = map(lambda l: list(map(float, l.strip().split()[1:])),
                      f.readlines())
            dev_data.extend(arr)
    classes = [i] * len(dev_data)
    all_data[dev] = (dev_data, classes)

In [7]:
train_data = {}
test_data = {}
for dev, (feat, cls) in all_data.items():
    np.random.shuffle(feat)
    train_amt = int(np.floor(len(feat) * 0.8))
    train_data[dev] = (feat[:train_amt][:], cls[:train_amt])
    test_data[dev] = (feat[train_amt:][:], cls[train_amt:])

In [10]:
print(train_data['wemo'][0])

[[22.0, 6315.12, 0.0, 22.0, 0.0, 6315.12, 0.0, 13.36, 0.0, 0.0, 0.0, 0.0, 0.0, 8.64, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 4805.76, 0.0, 0.0, 0.0, 0.0, 0.0, 1509.36, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.607272727273, 0.0, 0.0, 0.0, 0.0, 0.392727272727, 0.0, 0.0, 0.0, 0.0, 0.0, 0.760992665223, 0.0, 0.0, 0.0, 0.0, 0.239007334777, 0.0, 0.0, 0.0, 0.0, 0.0, 0.532727272727, 0.0, 0.736714425062, 0.607784431138, 0.0, 0.416666666667, 0.0, 0.0, 0.0, 0.834415368225, 0.0, 0.42563735623, 0.0, 0.0, 0.0, 287.050909091, 0.0, 287.050909091, 0.0, 359.71257485, 0.0, 0.0, 0.0, 0.0, 0.0, 174.694444444, 0.0, 0.0, 0.0, 0.0, 0.0, 392.49221118, -0.0, 392.49221118, -0.0, 479.813284528, -0.0, -0.0, -0.0, -0.0, 126.119135876, -0.0, -0.0, -0.0, -0.0], [7.2, 5039.24, 0.0, 7.2, 0.0, 5039.24, 0.0, 6.04, 0.0, 0.0, 0.0, 0.0, 0.0, 1.16, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 4860.04, 0.0, 0.0, 0.0, 0.0, 0.0, 179.2, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.838888888889, 0.0, 0.0, 0.0, 0.0, 0.161111111111, 0.0, 0

In [26]:
clf = GridSearchCV(estimator=RandomForestClassifier(),
                   param_grid=dict(n_estimators=list(range(8, 22, 2)),
                                   max_depth=[None, 8, 4, 2]), n_jobs=-1)
tr = ([], [])
ts = ([], [])
for dev in all_device_list:
    tr[0].extend(train_data[dev][0])
    tr[1].extend(train_data[dev][1])
    ts[0].extend(test_data[dev][0])
    ts[1].extend(test_data[dev][1])
clf.fit(tr[0], tr[1])

GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'n_estimators': [8, 10, 12, 14, 16, 18, 20], 'max_depth': [None, 8, 4, 2]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [27]:
confusion = np.zeros((len(all_device_list), len(all_device_list)))
for tsv, tsc in zip(ts[0], ts[1]):
    prd = clf.predict(np.array(tsv).reshape(1, -1))[0]
    confusion[prd][tsc] += 1
print('All trained:')
print(confusion.astype(int))

All trained:
[[355   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0 364   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0 334   0   0   0   1   4   0  15   0   0   0   0]
 [  0   0   0 353   1   0   0   0   0   0   0   2   0   0]
 [  0   0   0   1 355   0   0   0   0   0   0   0   1   2]
 [  0   0   0   0   0 357   0   0   0   0   0   0   0   0]
 [  0   0   2   0   0   0 338   4   0   4   0   0   0   0]
 [  0   0   8   0   0   0  12 348   0   2   0   0   0   0]
 [  0   0   0   0   0   0   0   0 356   0   0   0   0   0]
 [  0   0  14   0   0   0   5   0   0 333   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0 375   0   0   0]
 [  0   0   0   0   1   0   0   0   0   0   0 354   0   0]
 [  0   0   0   2   0   0   0   0   0   0   0   0 349   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0 382]]


In [54]:
# Train on nine plugs, test on the remainder
grouped_train_data_exclude = {}
grouped_test_data_exclude = {}
for out in device_list:
    collected_train = ([], [])
    collected_test = ([], [])
    for dev in all_device_list:
        if dev not in [out] + non_switch + non_actuated:
            collected_train[0].extend(train_data[dev][0])
            collected_train[1].extend([0] * len(train_data[dev][1]))
        elif dev != out:
            collected_train[0].extend(train_data[dev][0])
            collected_train[1].extend(train_data[dev][1])
        
        if dev == out:
            collected_test[0].extend(test_data[dev][0])
            collected_test[1].extend([0] * len(test_data[dev][1]))
        #elif dev in non_switch + non_actuated:
            #collected_test[0].extend(test_data[dev][0])
            #collected_test[1].extend(test_data[dev][1])

    shuffle_unison(*collected_train)
    shuffle_unison(*collected_test)
    
    sm = SMOTE(random_state=42)
    collected_train = sm.fit_sample(*collected_train)

    grouped_train_data_exclude[out] = collected_train
    grouped_test_data_exclude[out] = collected_test

In [72]:
clfs = []
for out in device_list:
    clf = GridSearchCV(estimator=RandomForestClassifier(),
                       param_grid=dict(n_estimators=list(range(8, 22, 2)),
                                       max_depth=[None, 60, 50, 40, 30, 20]), n_jobs=-1)
    tr = grouped_train_data_exclude[out]
    ts = grouped_test_data_exclude[out]
    clf.fit(tr[0], tr[1])
    print('Excluding {}:'.format(out), clf.score(ts[0], ts[1]))
    clfs.append(clf)

Excluding wemo: 0.0704225352113
Excluding neo: 1.0
Excluding jinvoo: 1.0
Excluding ihome: 0.22191011236
Excluding dlink: 0.137254901961
Excluding tplink: 0.0
Excluding mjerry: 1.0
Excluding tuya: 1.0
Excluding edimax: 0.154494382022
Excluding cevitor: 1.0


In [90]:
forest = clfs[0].best_estimator_
importances = list(zip(red_feat, forest.feature_importances_))
sorted(importances, key=lambda t: t[1], reverse=True)

[('ratet_num_2', 0.10771507477960914),
 ('rate_num_2', 0.086285683146575115),
 ('rate_sz_2', 0.077911477458968786),
 ('ratetot_num', 0.069834878465822994),
 ('ltot_sz', 0.053505151886625281),
 ('fract_num_0', 0.044861888828269136),
 ('sratt_num_0', 0.043434779456159041),
 ('rate_num_34', 0.038517330412551169),
 ('ratet_sz_2', 0.033508496297027741),
 ('rate_sz_34', 0.032989014122481328),
 ('ratetot_sz', 0.03096179782383001),
 ('sd_sz2', 0.027751403468139757),
 ('frac_num_2', 0.027180366279625407),
 ('lt_sz_2', 0.025926638171271131),
 ('srat_sz_34', 0.025031806103233772),
 ('frac_num_34', 0.024872328454186229),
 ('sratt_sz_0', 0.022952942081428094),
 ('ratet_sz_0', 0.021927333223650612),
 ('lt_sz_0', 0.021366472492438511),
 ('frac_sz_2', 0.019219480280359077),
 ('srat_sz_52', 0.019085905915974793),
 ('rate_num_16', 0.018882885760963843),
 ('sratt_num_2', 0.013636998555524037),
 ('l_sz_34', 0.012800228831848889),
 ('srat_num_2', 0.012596679332784433),
 ('l_sz_2', 0.012507412225349448),
 (

In [73]:
for out, clf in zip(device_list, clfs):
    ts = grouped_test_data_exclude[out]
    confusion = np.zeros((len(all_device_list), len(all_device_list)))
    for tsv, tsc in zip(ts[0], ts[1]):
        prd = clf.predict(np.array(tsv).reshape(1, -1))[0]
        confusion[prd][tsc] += 1
    print('Excluding {}:'.format(out))
    print(confusion.astype(int))

Excluding wemo:
[[ 25   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [330   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0]]
Excluding neo:
[[364   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0   

### SVC, Linear Kernel

In [58]:
clf = GridSearchCV(estimator=SVC(),
                   param_grid=dict(kernel=['linear'],
                                   random_state=[42]), n_jobs=-1)
tr = ([], [])
ts = ([], [])
for dev in all_device_list:
    tr[0].extend(train_data[dev][0])
    tr[1].extend(train_data[dev][1])
    ts[0].extend(test_data[dev][0])
    ts[1].extend(test_data[dev][1])
clf.fit(tr[0], tr[1])

GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'random_state': [42], 'kernel': ['linear']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [59]:
confusion = np.zeros((len(all_device_list), len(all_device_list)))
for tsv, tsc in zip(ts[0], ts[1]):
    prd = clf.predict(np.array(tsv).reshape(1, -1))[0]
    confusion[prd][tsc] += 1
print('All trained:')
print(confusion.astype(int))

All trained:
[[355   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0 364   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0 252   0   0   0  14  19   0  97   0   0   0   0]
 [  0   0   0 312   0   0   0   0   0   0   0  12   0   0]
 [  0   0   0   0 347   0   0   0   0   0   0  17   1   4]
 [  0   0   0   0   0 357   0   0   0   0   0   0   0   0]
 [  0   0  18   0   0   0 222  55   0  16   0   0   0   0]
 [  0   0  55   0   0   0 104 271   0  12   0   0   0   0]
 [  0   0   0   0   0   0   0   0 356   0   0   0   1   0]
 [  0   0  33   0   0   0  16  11   0 229   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0 375   0   0   0]
 [  0   0   0  42  10   0   0   0   0   0   0 327   0   0]
 [  0   0   0   2   0   0   0   0   0   0   0   0 348   0]
 [  0   0   0   0   0   0   0   0   0   0   0   0   0 380]]


### SVC, RBF Kernel

In [62]:
clf = GridSearchCV(estimator=SVC(),
                   param_grid=dict(kernel=['rbf'], gamma=['auto', 0.05, 0.1],
                                   random_state=[42]), n_jobs=-1)
tr = ([], [])
ts = ([], [])
for dev in all_device_list:
    tr[0].extend(train_data[dev][0])
    tr[1].extend(train_data[dev][1])
    ts[0].extend(test_data[dev][0])
    ts[1].extend(test_data[dev][1])
clf.fit(tr[0], tr[1])

GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'gamma': ['auto', 0.05, 0.1], 'random_state': [42], 'kernel': ['rbf']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [63]:
confusion = np.zeros((len(all_device_list), len(all_device_list)))
for tsv, tsc in zip(ts[0], ts[1]):
    prd = clf.predict(np.array(tsv).reshape(1, -1))[0]
    confusion[prd][tsc] += 1
print('All trained:')
print(confusion.astype(int))

All trained:
[[261   0   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0 337   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0 303   0   0   0   0   5   0  12   0   0   0   0]
 [  0   0   0 316   0   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0 320   0   0   0   0   0   0   0   0   0]
 [  0   0   0   0   0 350   0   0   0   0   0   0   0   0]
 [  0   0   8   0   0   0 311  11   0   3   0   0   0   0]
 [  0   0   5   0   0   0   8 314   0   3   0   0   0   0]
 [  0   0   0   0   0   0   0   0 295   0   0   0   0   0]
 [  0   0  11   0   0   0   3   1   0 295   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0 186   0   0   0]
 [  0   0   0   1   0   0   0   0   0   0   0 341   0   0]
 [  0   0   0   2   0   0   0   0   0   0   0   0 304   0]
 [ 94  27  31  37  37   7  34  25  61  41 189  15  46 384]]


### MLP Classifier

In [67]:
mlp_params = [dict(hidden_layer_sizes=[(90,)], random_state=[42]),
              dict(hidden_layer_sizes=[(70,)], random_state=[42]),
              dict(hidden_layer_sizes=[(50,)], random_state=[42]),
              dict(hidden_layer_sizes=[(90,70)], random_state=[42]),
              dict(hidden_layer_sizes=[(90,50)], random_state=[42]),
              dict(hidden_layer_sizes=[(70,50)], random_state=[42]),
              dict(hidden_layer_sizes=[(90,70,50)], random_state=[42])]

clf = GridSearchCV(estimator=MLPClassifier(),
                   param_grid=mlp_params, n_jobs=-1)
tr = ([], [])
ts = ([], [])
for dev in all_device_list:
    tr[0].extend(train_data[dev][0])
    tr[1].extend(train_data[dev][1])
    ts[0].extend(test_data[dev][0])
    ts[1].extend(test_data[dev][1])
clf.fit(tr[0], tr[1])

GridSearchCV(cv=None, error_score='raise',
       estimator=MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid=[{'random_state': [42], 'hidden_layer_sizes': [(90,)]}, {'random_state': [42], 'hidden_layer_sizes': [(70,)]}, {'random_state': [42], 'hidden_layer_sizes': [(50,)]}, {'random_state': [42], 'hidden_layer_sizes': [(90, 70)]}, {'random_state': [42], 'hidden_layer_sizes': [(90, 50)]}, {'random_state': [42], 'hidden_layer_sizes': [(70, 50)]}, {'random_state': [42], 'hidden_layer_sizes': [(90, 70, 50)]}],
       pre_dispatch='2*n_jobs', refit=T

In [68]:
confusion = np.zeros((len(all_device_list), len(all_device_list)))
for tsv, tsc in zip(ts[0], ts[1]):
    prd = clf.predict(np.array(tsv).reshape(1, -1))[0]
    confusion[prd][tsc] += 1
print('All trained:')
print(confusion.astype(int))

All trained:
[[354   0   0   0   0   0   0   0   0   0  43   0   0   0]
 [  0 364   0   0   0   0   0   0   0   0   0   0   0   0]
 [  0   0 207   0   0   0   5   9   0  60   0   0   0   0]
 [  0   0   0 351   1   0   0   0   0   0   0   6   1   0]
 [  0   0   0   0 336   0   0   0   0   0   0  18   1   5]
 [  1   0   0   0   0 356   0   0   0   0   0   0   0   1]
 [  0   0   3   0   0   0 155   7   0   3   0   0   0   0]
 [  0   0 111   0   0   0 186 335   0  44   0   0   0   6]
 [  0   0   0   0   9   1   0   0 356   0   0   0   0   6]
 [  0   0  37   0   0   0  10   4   0 247   0   0   0   0]
 [  0   0   0   0   0   0   0   0   0   0 332   0   0   0]
 [  0   0   0   3   9   0   0   0   0   0   0 332   0   0]
 [  0   0   0   2   0   0   0   0   0   0   0   0 348   0]
 [  0   0   0   0   2   0   0   1   0   0   0   0   0 366]]


In [70]:
clf.best_estimator_

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(70, 50), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=42, shuffle=True,
       solver='adam', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)