## Usage
Using nPrint: parsing wlan pcap to npt format


In [1]:
filename = 'wlan_2020_11_05_03'
# cmd_test = f'nprint -w -P {filename}.pcap -W {filename}.npt'
cmd_test = f'../nprint -w -P {filename}.pcap -W {filename}.npt'
!{cmd_test}


## Print 
nPrint-wlan result

In [2]:
import pandas as pd

# Load the .npt file as CSV; the first column becomes the row index.
nprint_wlan = pd.read_csv(f'{filename}.npt', index_col=0)

# shape is (rows, columns), i.e. (packets, features per packet).
shape_summary = 'nPrint_wlan: Number of Packets: {0}, Features per packet: {1}'
print(shape_summary.format(*nprint_wlan.shape))
print(nprint_wlan.columns)
# Preview: first three rows, first two feature columns only.
print(nprint_wlan.iloc[:3, :2])

nPrint_wlan: Number of Packets: 4422, Features per packet: 528
Index(['radiotap_reversion_0', 'radiotap_reversion_1', 'radiotap_reversion_2',
       'radiotap_reversion_3', 'radiotap_reversion_4', 'radiotap_reversion_5',
       'radiotap_reversion_6', 'radiotap_reversion_7', 'radiotap_pad_0',
       'radiotap_pad_1',
       ...
       'wlan_rx_addr_38', 'wlan_rx_addr_39', 'wlan_rx_addr_40',
       'wlan_rx_addr_41', 'wlan_rx_addr_42', 'wlan_rx_addr_43',
       'wlan_rx_addr_44', 'wlan_rx_addr_45', 'wlan_rx_addr_46',
       'wlan_rx_addr_47'],
      dtype='object', length=528)
                   radiotap_reversion_0  radiotap_reversion_1
src_mac                                                      
f8:bc:0e:53:3d:a9                     0                     0
f8:bc:0e:53:3d:a3                     0                     0
f8:bc:0e:53:3d:a6                     0                     0


## Label

An example label

`sum(radiotap_rate_0:radiotap_rate_3) > 0` indicates whether the wlan rate is greater than or equal to 11Mb/s.

We then remove all the rate-related features. Let's see if it works.

In [3]:
import numpy as np

# The eight bit-columns of the radiotap rate field. They are removed from the
# features because the label below is derived from them.
drop_columns = [f'radiotap_rate_{i}' for i in range(8)]

# Label every packet from rate bits 0..3 (label-based slice, both ends inclusive):
# any set bit among them yields '>=11Mb/s', otherwise '<11Mb/s'.
# This is computed column-wise on the whole frame instead of looping with
# iterrows(): the loop built one Series per packet, used a conditional expression
# only for its append() side effects, and mutated each yielded row in place.
rate_bits = nprint_wlan.loc[:, 'radiotap_rate_0':'radiotap_rate_3']
labels = np.where(rate_bits.sum(axis=1) > 0, '>=11Mb/s', '<11Mb/s').tolist()

# One 1-D feature array per packet, in the original row order, without the
# rate columns.
samples = list(nprint_wlan.drop(columns=drop_columns).to_numpy())

print(f'{len(samples)}, {len(samples[0])}')

4422, 520


## Training

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score

# Split data
# random_state is fixed so the train/test partition, and therefore every metric
# printed below, is repeatable across runs. Without it each run shuffles the
# packets differently even though the classifier itself is seeded.
X_train, X_test, y_train, y_test = train_test_split(samples, labels, random_state=0)

# Initialize Classifier
clf = RandomForestClassifier(n_estimators=1000, max_depth=None, min_samples_split=2, random_state=0)

# Train
clf.fit(X_train, y_train)

# Predict
y_pred = clf.predict(X_test)

# Statistics

# First, let's get a stat report about the precision and recall:
report = classification_report(y_test, y_pred)
print(report)

# Let's also get the ROC AUC score while we're here, which requires a probability instead of just the prediction
y_pred_proba = clf.predict_proba(X_test)
# predict_proba gives one probability column per class, ordered like clf.classes_;
# roc_auc just needs the score of the "positive" class, which is column 1 here.
y_pred_proba_pos = y_pred_proba[:, 1]
roc = roc_auc_score(y_test, y_pred_proba_pos)
print('ROC AUC Score: {0}'.format(roc))

              precision    recall  f1-score   support

     <11Mb/s       0.99      0.99      0.99       845
    >=11Mb/s       0.97      0.96      0.97       261

    accuracy                           0.98      1106
   macro avg       0.98      0.97      0.98      1106
weighted avg       0.98      0.98      0.98      1106

ROC AUC Score: 0.9982180507379446


## Analysis

Understanding the model

Show the most important features

In [5]:
# Raw per-feature importances reported by the fitted classifier
feature_importances = clf.feature_importances_

# Column names of the full frame, minus the span from 'radiotap_rate_0' through
# 'radiotap_rate_7' (the columns removed before training), so that names and
# importances are paired position by position.
columns_list = list(nprint_wlan.columns)
rate_start = columns_list.index('radiotap_rate_0')
rate_stop = columns_list.index('radiotap_rate_7') + 1
columns_list_without_label = columns_list[:rate_start] + columns_list[rate_stop:]

# (name, importance) pairs
named_importances = list(zip(columns_list_without_label, feature_importances))

# Highest importance first
sorted_feature_importances = sorted(named_importances, key=lambda pair: pair[1], reverse=True)

# Print the top 20 important features (bits), one pair per line
print('\n'.join(str(pair) for pair in sorted_feature_importances[:20]))

('radiotap_antenna_signal_5', 0.0485558038500492)
('radiotap_channel_1', 0.044220049365283114)
('wlan_type_5', 0.039731837431827614)
('radiotap_channel_6', 0.03563846261795648)
('radiotap_channel_0', 0.03505149489317758)
('radiotap_antennas_21', 0.0346606331499566)
('radiotap_channel_2', 0.0333421038991275)
('radiotap_antennas_20', 0.029048346394381648)
('radiotap_antenna_signal_4', 0.028237931340483354)
('radiotap_channel_3', 0.01997868581910432)
('radiotap_channel_flags_1', 0.01589404999541859)
('radiotap_timestamp_13', 0.014899643400472493)
('radiotap_antenna_signal_3', 0.014623286583089923)
('radiotap_channel_7', 0.014150853190490652)
('radiotap_channel_4', 0.014011209839497477)
('radiotap_channel_flags_2', 0.012890521055154579)
('wlan_rx_addr_38', 0.01223887474071424)
('radiotap_antennas_5', 0.011695608265088737)
('wlan_rx_addr_1', 0.011652823759201509)
('radiotap_antennas_4', 0.01073747822317586)


## Try nPrintML

In [11]:
labels_path = 'rate-labels.txt'

# Map each row's index value to its rate label. The mapping is keyed by index
# value, so when an index value occurs on several rows the last row's label wins.
labels_map = {}
for index_value, packet in nprint_wlan.iterrows():
    has_high_rate_bit = sum(packet['radiotap_rate_0':'radiotap_rate_3']) > 0
    labels_map[index_value] = '>=11Mb/s' if has_high_rate_bit else '<11Mb/s'

# Write the mapping as CSV: an 'item,label' header, then one line per key.
label_lines = ['item,label\n']
label_lines.extend(f'{k},{v}\n' for k, v in labels_map.items())
with open(labels_path, 'w') as f:
    f.writelines(label_lines)

In [12]:
pcap_path = 'wlan_2020_11_05_03.pcap'

nml_cmd = f'nprintml --wlan --pcap-file {pcap_path} --label-file {labels_path} --aggregator index'
!{nml_cmd}

[warn] nprint expected version for nprintML (1.1.6) does not match version on PATH (1.0.4 at /usr/local/bin/nprint)
step:Net → NetResult(nprint_path=PosixPath('nprintml/run-canary-1614290696-6452/nprint'))
Loading nPrint: nprintml/run-canary-1614290696-6452/nprint/wlan_2020_11_05_03.npt
Loaded 1 nprint
  nPrint shape: (4422, 528)
Loading labels: rate-labels.txt
  number of labels: 92
Attaching labels to nPrints
  labels attached: missing labels for:
    missing labels caused samples to be dropped: 0
step:Label → LabelResult(features=                   radiotap_reversion_0  ...    label
06:7f:2c:c3:10:90                     0  ...  <11Mb/s
08:02:8e:ca:ae:73                     0  ...  <11Mb/s
08:02:8e:ca:ae:73                     0  ...  <11Mb/s
08:02:8e:ca:ae:73                     0  ...  <11Mb/s
08:ea:40:f9:0e:44                     0  ...  <11Mb/s
...                                 ...  ...      ...
ff:ff:ff:ff:60:e1                     0  ...  <11Mb/s
ff:ff:ff:ff:80:e8            