In [2]:
regressors_to_remove = ["destinationMacAddress", "sourceMacAddress", "egressInterface", "ingressInterface", "initialTCPFlags", 
                    "reverseInitialTCPFlags", "reverseTcpUrgTotalCount", "reverseUnionTCPFlags", "silkAppLabel", 
                    "tcpSequenceNumber", "tcpUrgTotalCount", "unionTCPFlags", "vlanId", "sourceIPv4Address", 
                    "destinationIPv4Address", "reverseTcpSequenceNumber", "observationDomainId", "reverseStandardDeviationInterarrivalTime",
                    "reverseStandardDeviationPayloadLength", "reverseSmallPacketCount", "reverseNonEmptyPacketCount",
                    "reverseMaxPacketSize", "reverseLargePacketCount", "reverseFirstNonEmptyPacketSize", "reverseDataByteCount",
                    "reverseBytesPerPacket", "reverseAverageInterarrivalTime", "collectorName"]

regressor_has_null = ["protocolIdentifier"]

This code will just read in the downloaded test dataset

In [158]:
import pandas as pd
df1 = pd.read_json('train_df_noGoogle.json', lines=True)
df2 = pd.read_json('train_google.json', lines=True)
train_df = pd.concat([df1,df2],ignore_index=True)
test_df = pd.read_json('test_df.json', lines=True)

This code will individually read the raw json data and create a train response

Cleans the data for XGboost

In [166]:
columns_to_remove = ["flowStartMilliseconds", "flowEndMilliseconds",'firstEightNonEmptyPacketDirections']
for regressors in columns_to_remove:
    if regressors in train_df.columns:
        train_df = train_df.drop(columns=regressors)
        test_df = test_df.drop(columns=regressors)


In [185]:
# Encoding the response to numeric values

from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
train_df['response'] = le.fit_transform(train_df['response'])
test_df['response'] = le.fit_transform(test_df['response'])

In [4]:
# Converting Variables into category
columns_categorical = ["flowAttributes", "protocolIdentifier", "ipClassOfService", "flowEndReason",
                      'reverseFlowAttributes']


for regressors in columns_categorical:
    train_df[regressors] = train_df[regressors].astype('category')
    test_df[regressors] = test_df[regressors].astype('category')


In [5]:
for col in columns_categorical:
    train_df[col] = train_df[col].cat.codes
    test_df[col] = test_df[col].cat.codes


In [6]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 125000 entries, 0 to 124999
Data columns (total 24 columns):
 #   Column                             Non-Null Count   Dtype  
---  ------                             --------------   -----  
 0   flowDurationMilliseconds           125000 non-null  float64
 1   reverseFlowDeltaMilliseconds       125000 non-null  float64
 2   protocolIdentifier                 125000 non-null  int8   
 3   sourceTransportPort                125000 non-null  int64  
 4   packetTotalCount                   125000 non-null  int64  
 5   octetTotalCount                    125000 non-null  int64  
 6   flowAttributes                     125000 non-null  int8   
 7   destinationTransportPort           125000 non-null  int64  
 8   reversePacketTotalCount            125000 non-null  int64  
 9   reverseOctetTotalCount             125000 non-null  int64  
 10  reverseFlowAttributes              125000 non-null  int8   
 11  ipClassOfService                   1250

### XG Boost

In [7]:
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

response = ['response']
predictors = [x for x in list(train_df.columns) if x not in response]

X = train_df[predictors]
y = train_df[response]
X_test = test_df[predictors]
y_test = test_df[response]
#label = train_df['response']
#dtrain = xgb.DMatrix(train_df, label=label, enable_categorical=True)
# Instantiate an XGBClassifier
model = xgb.XGBClassifier(objective ='multi:softprob', 
                          num_class = 19,
                          colsample_bytree = 0.3, 
                          learning_rate = 0.1,
                          max_depth = 10, 
                          alpha = 1, 
                          n_estimators = 50)

# Train the model
model.fit(X, y)

# Predictions
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.4118


### Exploring prediction errors

In [82]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score

y_test_arr = np.ravel(y_test)
# Step 2: Create a DataFrame with true labels and predicted labels
results_df = pd.DataFrame({'True': y_test_arr, 'Predicted': y_pred})

# Step 3: Compare the true and predicted labels to find mismatches
results_df['Correct'] = results_df['True'] == results_df['Predicted']

# Step 4: Extract indices of incorrectly classified instances
incorrect_indices = results_df[results_df['Correct'] == False].index.tolist()

# Now `incorrect_indices` holds the indices of the incorrectly classified instances
#print("Indices of incorrectly classified instances:", incorrect_indices)




In [9]:
row = 0
prediction_int = y_test.iloc[row]
print(f'It should be {prediction_int}, I am getting {y_pred[row]}')

It should be response    0
Name: 0, dtype: int32, I am getting 0


In [10]:
probabilities = model.predict_proba(X_test)
probabilities

array([[0.52821815, 0.00714703, 0.03455031, ..., 0.00629136, 0.01075714,
        0.00702013],
       [0.7807144 , 0.00256663, 0.00435964, ..., 0.00200704, 0.00305754,
        0.00264425],
       [0.91762847, 0.00104874, 0.00098997, ..., 0.00118892, 0.00112028,
        0.00098989],
       ...,
       [0.02109229, 0.03403796, 0.01524978, ..., 0.04164699, 0.01014183,
        0.03341097],
       [0.02103613, 0.02710547, 0.01372556, ..., 0.04248632, 0.0093953 ,
        0.04124414],
       [0.01933529, 0.02646217, 0.01408833, ..., 0.04258097, 0.00934198,
        0.03754492]], dtype=float32)

In [11]:
probabilities[row,:]

array([0.52821815, 0.00714703, 0.03455031, 0.00774228, 0.00679514,
       0.20508993, 0.00740229, 0.00716077, 0.05794402, 0.00720267,
       0.009235  , 0.00659496, 0.00622814, 0.0069524 , 0.00965795,
       0.00919403, 0.00615403, 0.0070845 , 0.01066438, 0.00879017,
       0.00732048, 0.0188028 , 0.00629136, 0.01075714, 0.00702013],
      dtype=float32)

In [44]:
## Hyper parameter tuning
model = xgb.XGBClassifier(use_label_encoder=False)
param_grid = {
    'n_estimators': [50,100],
    'learning_rate': [0.1,0.2],
    'max_depth': [5,10],
    'alpha': [1,2]
}
model = xgb.XGBClassifier(use_label_encoder=False)


In [45]:
grid_search = GridSearchCV(model, param_grid, scoring='accuracy', cv=3, verbose=1, n_jobs=-1)
grid_search.fit(X, y)

Fitting 3 folds for each of 16 candidates, totalling 48 fits


In [46]:
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
print("Best hyperparameters:", grid_search.best_params_)
print("Test accuracy:", accuracy_score(y_test, y_pred))

Best hyperparameters: {'alpha': 2, 'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 50}
Test accuracy: 0.4669473684210526


In [30]:
import numpy as np
conversion = np.arange(0,25)
decoded_category = le.inverse_transform(conversion)
for i in conversion:
    print(f'{i} = {decoded_category[i]}')

0 = amazon_echo_gen2
1 = au_network_camera
2 = au_wireless_adapter
3 = bitfinder_awair_breathe_easy
4 = candy_house_sesami_wi-fi_access_point
5 = google_home_gen1
6 = i-o_data_qwatch
7 = irobot_roomba
8 = jvc_kenwood_cu-hb1
9 = jvc_kenwood_hdtv_ip_camera
10 = line_clova_wave
11 = link_japan_eremote
12 = mouse_computer_room_hub
13 = nature_remo
14 = panasonic_doorphone
15 = philips_hue_bridge
16 = planex_camera_one_shot!
17 = planex_smacam_outdoor
18 = planex_smacam_pantilt
19 = powerelectric_wi-fi_plug
20 = qrio_hub
21 = sony_bravia
22 = sony_network_camera
23 = sony_smart_speaker
24 = xiaomi_mijia_led


In [83]:
tot = 0
between = 1000
i = 0
#print(f"Gradient Boosted Decision Tree Accuracy: {accuracy*100}%\n \nclass by class:")
while(between <= 25000):
    unique, counts = np.unique(y_pred[between-1000:between], return_counts=True)
    i_count = np.sum(y_pred[between-1000:between] == i)
    tot = np.sum(y_pred[:] == i)
    #tot += i_count
    percentage = i_count
    #print(f'{decoded_category[i]}, accuracy = {percentage}%')
    #print(f'{i} = {decoded_category[i]}, we are accurate {percentage}% amount of times\n{dict(zip(unique, counts))}')
    between += 1000
    i += 1

### Threshold with standardising

In [201]:
### standardising the probabilities based on individual class means and std deviations
import pandas as pd
#probabilities = model.predict_proba(X_test)
prob = pd.DataFrame(probabilities)
mean_prob = prob.mean(axis=0)
std_deviation = prob.std(axis=0)

mean_prob = np.ravel(mean_prob)
std_deviation = np.ravel(std_deviation)
mean_reduced_prob = (probabilities - mean_prob) / std_deviation

In [211]:
z_score_50_percent = (0.9 - mean_prob) / std_deviation
threshold = mean_reduced_prob > z_score_50_percent
predictions = np.full((probabilities.shape[0],), -1)
for i, instance in enumerate(threshold):
    # Check if any probability meets the threshold
    if any(instance):
        # Get the index of the max probability above the threshold
        predictions[i] = np.argmax(mean_reduced_prob[i])


In [212]:
dropped = (predictions == -1).sum()
total = len(predictions)
print(f'dropped: {dropped}, total = {total}, percent = {dropped/total*100}%')

dropped: 14339, total = 25000, percent = 57.355999999999995%


In [213]:


y_test_arr = np.ravel(y_test)
# Step 2: Create a DataFrame with true labels and predicted labels
results_df = pd.DataFrame({'True': y_test_arr, 'Predicted': predictions})

classified = results_df['Predicted'] != -1

prediction_cert = results_df[classified]

prediction_cert['Correct'] = prediction_cert['True'] == prediction_cert['Predicted']
true_count = prediction_cert['Correct'].sum()
true_count

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prediction_cert['Correct'] = prediction_cert['True'] == prediction_cert['Predicted']


9945

In [214]:
prediction_cert.shape

(10661, 3)

In [215]:
9945/10661

0.9328393208892224

In [79]:
10058/10634

0.9458341169832613

### Class by class accuracy

In [216]:
import numpy as np
conversion = np.arange(0,25)
decoded_category = le.inverse_transform(conversion)
y_test_arr = np.ravel(y_test)

def classByClassAccuracy(predictions, actual):
    ind = 0
    classes = {}
    for j in predictions:
        if j == -1:
            i = 'failed'
        else: 
            i = decoded_category[j]
        if i in classes.keys():
            
            if i == decoded_category[actual[ind]]:
                classes[i]["count"] += 1
            classes[i]["total"] += 1
        else:
            
            if i == decoded_category[actual[ind]]:
                classes[i] = {"count":1, "total":1}
            else:
                classes[i] = {"count":0, "total":1}
        
        ind += 1
 
    results = {}
    for k in classes.keys():
        results[k] = (classes[k]["count"], classes[k]["total"])
 
    return dict(sorted(results.items(), key=lambda x:x[1], reverse=True))
class_acc = classByClassAccuracy(predictions, y_test_arr)
class_acc

{'mouse_computer_room_hub': (991, 991),
 'planex_camera_one_shot!': (984, 984),
 'i-o_data_qwatch': (908, 913),
 'sony_bravia': (876, 885),
 'panasonic_doorphone': (849, 849),
 'jvc_kenwood_cu-hb1': (809, 815),
 'au_wireless_adapter': (715, 715),
 'sony_smart_speaker': (710, 1027),
 'amazon_echo_gen2': (638, 648),
 'powerelectric_wi-fi_plug': (548, 564),
 'line_clova_wave': (332, 335),
 'jvc_kenwood_hdtv_ip_camera': (296, 483),
 'qrio_hub': (246, 253),
 'philips_hue_bridge': (223, 229),
 'sony_network_camera': (148, 162),
 'planex_smacam_pantilt': (146, 223),
 'bitfinder_awair_breathe_easy': (143, 144),
 'au_network_camera': (107, 109),
 'planex_smacam_outdoor': (102, 115),
 'candy_house_sesami_wi-fi_access_point': (48, 58),
 'link_japan_eremote': (43, 43),
 'irobot_roomba': (40, 52),
 'xiaomi_mijia_led': (38, 48),
 'nature_remo': (5, 16),
 'failed': (0, 14339)}

In [45]:
i = 0
for j in predictions:
    i += 1
i

25000

In [100]:
indexes = results_df[(results_df['True'] == 7) & (results_df['Predicted'] == 7)].index

In [101]:
for index in indexes:
    print(test_df['flowStartMilliseconds'][index])

2019-09-07 12:51:48.031
2019-09-02 06:08:59.446
2019-09-02 11:40:05.339
2019-08-31 11:15:39.779
2019-09-03 23:07:29.435
2019-09-01 00:17:20.183
2019-09-04 14:02:07.719
2019-08-29 21:13:26.322
2019-08-30 01:48:02.161
2019-09-04 04:31:08.267
2019-09-03 17:36:24.884
2019-08-27 10:30:53.749
2019-09-09 10:14:39.841
2019-09-04 14:32:17.947
2019-09-01 04:11:23.226
2019-09-07 16:22:26.917
2019-08-31 20:46:59.208
2019-09-09 08:14:17.083
2019-09-08 18:17:48.702
2019-08-28 17:57:45.094
2019-09-02 19:41:09.674
2019-08-29 07:52:55.163
2019-08-29 18:43:01.795
2019-09-06 04:53:27.422
2019-09-01 21:43:48.857
2019-08-31 12:15:45.736
2019-09-04 00:07:33.457
2019-09-02 15:10:40.380
2019-09-07 13:51:54.720
2019-09-02 18:10:55.205
2019-08-31 11:45:42.304
2019-09-04 13:01:56.016
2019-08-31 15:16:03.881
2019-09-05 22:29:33.512
2019-09-04 02:00:48.039
2019-09-04 12:01:55.224
2019-08-29 06:52:47.345
2019-09-07 20:53:15.242
2019-08-30 20:21:32.612


In [183]:
predictions

array([ 0,  0,  0, ..., 22,  0,  0])

### RANDOM FOREST

Unnamed: 0,True,Predicted
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
...,...,...
24995,24,-1
24996,24,24
24997,24,-1
24998,24,-1


In [159]:
# Converting Variables into category
columns_categorical = ["flowAttributes", "protocolIdentifier", "ipClassOfService", "flowEndReason",
                      'reverseFlowAttributes']


for regressors in columns_categorical:
    train_df = pd.get_dummies(train_df, columns=[regressors], prefix=regressors)
    test_df = pd.get_dummies(test_df, columns=[regressors], prefix=regressors)



In [160]:
# Automatically remove inconsistent columns/predictors

for regressors in test_df.columns:
    if regressors not in train_df.columns:
         test_df = test_df.drop(columns=regressors)

for regressors in train_df.columns:
    if regressors not in test_df.columns:
         train_df = train_df.drop(columns=regressors)

In [164]:
len(test_df.columns)

42

In [163]:
len(test_df.columns)

42

In [156]:
# Manually

category = ['ipClassOfService_0xd0']
for regressors in category:
    if regressors in train_df.columns:
        train_df = train_df.drop(columns=regressors)
    if regressors in test_df.columns:
        test_df = test_df.drop(columns=regressors)


In [167]:
from sklearn.ensemble import RandomForestClassifier

response = ['response']
predictors = [x for x in list(train_df.columns) if x not in response]

X = train_df[predictors]
y = train_df[response]
X_test = test_df[predictors]
y_test = test_df[response]
forest1 = RandomForestClassifier(criterion='entropy', bootstrap=True, n_estimators = 400)
forest1 = forest1.fit(X, y)
res = forest1.predict(X_test)



  return fit_method(estimator, *args, **kwargs)


In [57]:
y_test['response'][0]

'amazon_echo_gen2'

In [60]:
from sklearn.metrics import accuracy_score
accuracies = accuracy_score(y_test, res)
count = 0
total = 0
for k in res:
    if k == y_test['response'][total]:
        count +=1
    total += 1

print(f"sklearn: {accuracies}\nme:      {count/total}")

sklearn: 0.44042105263157894
me:      0.44042105263157894


In [168]:
probabilities = forest1.predict_proba(X_test)

In [169]:
probabilities.shape

(25000, 25)

In [194]:
y_test['response'] = le.fit_transform(y_test['response'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  y_test['response'] = le.fit_transform(y_test['response'])


In [195]:
y_test

Unnamed: 0,response
0,0
1,0
2,0
3,0
4,0
...,...
24995,24
24996,24
24997,24
24998,24
