In [1]:
regressors_to_remove = ["destinationMacAddress", "sourceMacAddress", "egressInterface", "ingressInterface", "initialTCPFlags", 
                    "reverseInitialTCPFlags", "reverseTcpUrgTotalCount", "reverseUnionTCPFlags", "silkAppLabel", 
                    "tcpSequenceNumber", "tcpUrgTotalCount", "unionTCPFlags", "vlanId", "sourceIPv4Address", 
                    "destinationIPv4Address", "reverseTcpSequenceNumber", "observationDomainId", "reverseStandardDeviationInterarrivalTime",
                    "reverseStandardDeviationPayloadLength", "reverseSmallPacketCount", "reverseNonEmptyPacketCount",
                    "reverseMaxPacketSize", "reverseLargePacketCount", "reverseFirstNonEmptyPacketSize", "reverseDataByteCount",
                    "reverseBytesPerPacket", "reverseAverageInterarrivalTime", "collectorName"]

regressor_has_null = ["protocolIdentifier"]

This code will just read in the downloaded test dataset

In [2]:
import pandas as pd
df1 = pd.read_json('train_df_noGoogle.json', lines=True)
df2 = pd.read_json('train_google.json', lines=True)
train_df = pd.concat([df1,df2],ignore_index=True)


In [3]:
test_df = pd.read_json("test_df.json", lines=True)

Cleans the data for XGboost

In [4]:
columns_to_remove = ["flowStartMilliseconds", "flowEndMilliseconds",'firstEightNonEmptyPacketDirections']
for regressors in columns_to_remove:
    if regressors in train_df.columns:
        train_df = train_df.drop(columns=regressors)
        test_df = test_df.drop(columns=regressors)


In [5]:
# Encoding the response to numeric values

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
train_df['response'] = le.fit_transform(train_df['response'])
test_df['response'] = le.fit_transform(test_df['response'])

In [6]:
# Converting Variables into category
columns_categorical = ["flowAttributes", "protocolIdentifier", "ipClassOfService", "flowEndReason",
                      'reverseFlowAttributes']


for regressors in columns_categorical:
    train_df[regressors] = train_df[regressors].astype('category')
    test_df[regressors] = test_df[regressors].astype('category')


In [7]:
for col in columns_categorical:
    train_df[col] = train_df[col].cat.codes
    test_df[col] = test_df[col].cat.codes


In [8]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125000 entries, 0 to 124999
Data columns (total 24 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   flowDurationMilliseconds           125000 non-null  float64
 1   reverseFlowDeltaMilliseconds       125000 non-null  float64
 2   protocolIdentifier                 125000 non-null  int8   
 3   sourceTransportPort                125000 non-null  int64  
 4   packetTotalCount                   125000 non-null  int64  
 5   octetTotalCount                    125000 non-null  int64  
 6   flowAttributes                     125000 non-null  int8   
 7   destinationTransportPort           125000 non-null  int64  
 8   reversePacketTotalCount            125000 non-null  int64  
 9   reverseOctetTotalCount             125000 non-null  int64  
 10  reverseFlowAttributes              125000 non-null  int8   
 11  ipClassOfService                   1250

### XG Boost

In [9]:
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

response = ['response']
predictors = [x for x in list(train_df.columns) if x not in response]

X = train_df[predictors]
y = train_df[response]
X_test = test_df[predictors]
y_test = test_df[response]
#label = train_df['response']
#dtrain = xgb.DMatrix(train_df, label=label, enable_categorical=True)
# Instantiate an XGBClassifier
model = xgb.XGBClassifier(objective ='multi:softprob', 
                          num_class = 19,
                          colsample_bytree = 0.3, 
                          learning_rate = 0.1,
                          max_depth = 10, 
                          alpha = 1, 
                          n_estimators = 50)

# Train the model
model.fit(X, y)

# Predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.40736


In [10]:
## Hyper parameter tuning
model = xgb.XGBClassifier(use_label_encoder=False)
param_grid = {
    'n_estimators': [50,100],
    'learning_rate': [0.1,0.2],
    'max_depth': [5,10],
    'alpha': [1,2]
}
model = xgb.XGBClassifier(use_label_encoder=False)


In [11]:
grid_search = GridSearchCV(model, param_grid, scoring='accuracy', cv=3, verbose=1, n_jobs=-1)
grid_search.fit(X, y)

Fitting 3 folds for each of 16 candidates, totalling 48 fits


In [12]:
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
print("Best hyperparameters:", grid_search.best_params_)
print("Test accuracy:", accuracy_score(y_test, y_pred))

Best hyperparameters: {'alpha': 2, 'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 50}
Test accuracy: 0.0816


In [13]:
import numpy as np
conversion = np.arange(0,25)
decoded_category = le.inverse_transform(conversion)
for i in conversion:
    print(f'{i} = {decoded_category[i]}')

0 = amazon_echo_gen2
1 = au_network_camera
2 = au_wireless_adapter
3 = bitfinder_awair_breathe_easy
4 = candy_house_sesami_wi-fi_access_point
5 = google_home_gen1
6 = i-o_data_qwatch
7 = irobot_roomba
8 = jvc_kenwood_cu-hb1
9 = jvc_kenwood_hdtv_ip_camera
10 = line_clova_wave
11 = link_japan_eremote
12 = mouse_computer_room_hub
13 = nature_remo
14 = panasonic_doorphone
15 = philips_hue_bridge
16 = planex_camera_one_shot!
17 = planex_smacam_outdoor
18 = planex_smacam_pantilt
19 = powerelectric_wi-fi_plug
20 = qrio_hub
21 = sony_bravia
22 = sony_network_camera
23 = sony_smart_speaker
24 = xiaomi_mijia_led


In [14]:
import numpy as np
tot = 0
between = 1000
i = 0
print(f"Gradient Boosted Decision Tree Accuracy: {accuracy*100}%\n \nclass by class:")
while(between <= 25000):
    unique, counts = np.unique(y_pred[between-1000:between], return_counts=True)
    i_count = np.sum(y_pred[between-1000:between] == i)
    tot += i_count
    percentage = i_count
    print(f'{decoded_category[i]}, precison = {percentage}%')
    #print(f'{i} = {decoded_category[i]}, we are accurate {percentage}% amount of times\n{dict(zip(unique, counts))}')
    between += 1000
    i += 1

Gradient Boosted Decision Tree Accuracy: 40.736%
 
class by class:
amazon_echo_gen2, precison = 0%
au_network_camera, precison = 0%
au_wireless_adapter, precison = 0%
bitfinder_awair_breathe_easy, precison = 0%
candy_house_sesami_wi-fi_access_point, precison = 0%
google_home_gen1, precison = 1000%
i-o_data_qwatch, precison = 0%
irobot_roomba, precison = 0%
jvc_kenwood_cu-hb1, precison = 0%
jvc_kenwood_hdtv_ip_camera, precison = 5%
line_clova_wave, precison = 0%
link_japan_eremote, precison = 0%
mouse_computer_room_hub, precison = 944%
nature_remo, precison = 0%
panasonic_doorphone, precison = 0%
philips_hue_bridge, precison = 0%
planex_camera_one_shot!, precison = 0%
planex_smacam_outdoor, precison = 0%
planex_smacam_pantilt, precison = 91%
powerelectric_wi-fi_plug, precison = 0%
qrio_hub, precison = 0%
sony_bravia, precison = 0%
sony_network_camera, precison = 0%
sony_smart_speaker, precison = 0%
xiaomi_mijia_led, precison = 0%


In [15]:
tot/25000

0.0816

In [16]:
import numpy as np
y_test_arr = np.ravel(y_test)

def classByClassAccuracy(predictions, actual):
    ind = 0
    classes = {}
    for i in predictions:
        if i in classes.keys():
            if i == actual[ind]:
                classes[i]["count"] += 1
            classes[i]["total"] += 1
        else:
            if i == actual[ind]:
                classes[i] = {"count":1, "total":1}
            else:
                classes[i] = {"count":0, "total":1}
        
        ind += 1
 
    results = {}
    for k in classes.keys():
        results[k] = classes[k]["count"] / classes[k]["total"]
 
    return dict(sorted(results.items(), key=lambda x:x[1], reverse=True))
class_acc = classByClassAccuracy(y_pred, y_test_arr)
class_acc

{12: 0.9989417989417989,
 9: 0.7142857142857143,
 18: 0.5416666666666666,
 5: 0.04511006856730422,
 7: 0.0,
 13: 0.0,
 4: 0.0,
 22: 0.0,
 24: 0.0,
 19: 0.0,
 1: 0.0,
 0: 0.0,
 10: 0.0,
 23: 0.0,
 17: 0.0,
 20: 0.0}

### RANDOM FOREST

In [17]:
# Converting Variables into category
columns_categorical = ["flowAttributes", "protocolIdentifier", "ipClassOfService", "flowEndReason",
                      'reverseFlowAttributes']


for regressors in columns_categorical:
    train_df = pd.get_dummies(train_df, columns=[regressors], prefix=regressors)
    test_df = pd.get_dummies(test_df, columns=[regressors], prefix=regressors)



In [18]:
# Automatically remove inconsistent columns/predictors

for regressors in test_df.columns:
    if regressors not in train_df.columns:
         test_df = test_df.drop(columns=regressors)

In [19]:
# Manually

category = ['ipClassOfService_0xd0']
for regressors in category:
    if regressors in train_df.columns:
        train_df = train_df.drop(columns=regressors)
    if regressors in test_df.columns:
        test_df = test_df.drop(columns=regressors)


In [20]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

param_grid2 = [{"n_estimators" : [200,400]}, {"class_weight" : ["balanced_subsample", "balanced"]}]

response = ['response']
predictors = [x for x in list(train_df.columns) if x not in response]

predictors.remove('flowAttributes_5')
predictors.remove('flowAttributes_6')
predictors.remove('ipClassOfService_4')


X = train_df[predictors]
y = np.ravel(train_df[response])
X_test = test_df[predictors]
y_test = np.ravel(test_df[response])
forest1 = RandomForestClassifier(criterion='entropy', bootstrap=True)

forestSearch = GridSearchCV(forest1, param_grid=param_grid2, scoring='accuracy', cv=3, verbose=1, n_jobs=-1)
forestFit = forestSearch.fit(X,y)
best_forest = forestFit.best_estimator_


# classByClassAccuracy(res, y_test)


Fitting 3 folds for each of 4 candidates, totalling 12 fits


In [37]:
res = best_forest.predict_proba(X_test)
guesses = best_forest.predict(X_test)
# probabilities = pd.DataFrame(res)
# means = probabilities.mean(axis=0)

threshold_met = res > 0.50

predictions = np.full((res.shape[0],), -1)

for i, instance in enumerate(threshold_met):

    if any(instance):
        predictions[i] = np.argmax(res[i])

accuracies = classByClassAccuracy(predictions, y_test)
for oldmate in accuracies.items():
    print(oldmate)

(16, 1.0)
(12, 0.9979879275653923)
(6, 0.9869976359338062)
(21, 0.9840490797546012)
(14, 0.968978102189781)
(2, 0.9677852348993289)
(0, 0.9331683168316832)
(15, 0.7794871794871795)
(3, 0.6827586206896552)
(18, 0.668918918918919)
(19, 0.6688311688311688)
(20, 0.6437994722955145)
(10, 0.5865384615384616)
(9, 0.5760869565217391)
(23, 0.5357142857142857)
(22, 0.48514851485148514)
(11, 0.45454545454545453)
(17, 0.25675675675675674)
(7, 0.2182741116751269)
(4, 0.20675105485232068)
(24, 0.2033898305084746)
(8, 0.14285714285714285)
(5, 0.062275839582654056)
(1, 0.05128205128205128)
(13, 0.01744186046511628)
(-1, 0.0)


In [31]:
from sklearn.metrics import accuracy_score
accuracies = accuracy_score(y_test, guesses)

print(f"sklearn: {accuracies}")

sklearn: 0.31164
