#### make sure we're running the correct virtual environment

In [None]:
import sys
sys.executable

#### imports to build the Random Forest ensemble classifier

In [72]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import accuracy_score, make_scorer, f1_score, roc_auc_score

#supress warnings
import warnings
warnings.filterwarnings('ignore')

#### load the LAN datasets

In [73]:
df = pd.read_csv("data/bertoli_lan_attack_dataset.csv") # attack dataset
attack_classes = pd.read_csv("data/bertoli_lan_attack_labels_sbseg.csv") # labels
bonafide = pd.read_csv('data/bertoli_lan_bonafide_dataset_20191121.csv.gz') # bonafide dataset
print(df.shape, bonafide.shape)

(86480, 41) (103094, 41)


In [74]:
df_labeled = df.merge(attack_classes, how='inner', left_on='ip.src', right_on='ip')
df_labeled.drop(['ip'], axis=1, inplace=True)
df_labeled.head(2)

Unnamed: 0,frame_info.encap_type,frame_info.time,frame_info.time_epoch,frame_info.number,frame_info.len,frame_info.cap_len,eth.type,ip.version,ip.hdr_len,ip.tos,...,tcp.flags.reset,tcp.flags.push,tcp.flags.ack,tcp.flags.urg,tcp.flags.cwr,tcp.window_size,tcp.checksum,tcp.urgent_pointer,tcp.options.mss_val,label
0,1,"Dec 31, 1969 21:03:41.953641000 -03",221.953641,20,58,58,0x00000800,4,20,,...,0,0,0,0,0,1024,0x00005cb1,0,1460.0,nmap_tcp_syn
1,1,"Dec 31, 1969 21:03:41.953762000 -03",221.953762,21,58,58,0x00000800,4,20,,...,0,0,0,0,0,1024,0x00007d1e,0,1460.0,nmap_tcp_syn


In [75]:
bonafide['label'] = "bonafide" # create column label on bonafide dataset

In [76]:
fields = ['eth.type', 'ip.id', 'ip.flags', 'ip.checksum', 'ip.dsfield', 'tcp.flags', 'tcp.checksum']

for field in fields:
    df_labeled[field] = df_labeled[field].apply(lambda x: int(str(x), 16))
    
bonafide = bonafide.fillna(0)
for field in fields:
    bonafide[field] = bonafide[field].apply(lambda x: int(str(x), 16))

full_data = pd.concat([bonafide, df_labeled])

#### clean the data

In [77]:
# we only want TCP or IP flag 6
wrong_proto = full_data[full_data['ip.proto'] != 6]['label'].value_counts().values
full_data = full_data[full_data['ip.proto'] == 6]
print("It was found and removed", wrong_proto,"packets.")

# remove unnecessary fields
full_data.drop(columns=['frame_info.time', 'frame_info.encap_type', 'frame_info.time_epoch', 'frame_info.number', 
                        'frame_info.len', 'frame_info.cap_len', 'eth.type', 'ip.flags', 'ip.src', 'ip.dst',
                        'ip.version', 'ip.proto', 'tcp.flags'], axis=1, inplace=True)

full_data.drop(columns=['ip.hdr_len', 'ip.tos', 'ip.flags.rb', 
                        'ip.flags.mf', 'ip.frag_offset'], axis=1, inplace=True) #should combine with above

It was found and removed [11708] packets.


#### initialize the random forest classifier

In [90]:
algorithms = {
    #"NB" : (GaussianNB(), {}),
    #"LR" : (LogisticRegression(), {}),
    'RF': (RandomForestClassifier(random_state=17, n_jobs=-1), {
            "n_estimators" : [10, 50, 100, 200],
            "criterion" : ("gini", "entropy"), 
            "max_depth": [5, 10],
            "class_weight": (None, "balanced", "balanced_subsample")
    })
}

In [91]:
full_data = full_data.fillna(0)
X = full_data.drop(columns = ["label"])
y = full_data.label

print(X.shape, y.shape)

(113759, 23) (113759,)


In [95]:
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=17) # Train, Test
gskf = StratifiedKFold(n_splits=3, shuffle=True, random_state=17) # Validation
#perf = f1_score # can be used roc_auc_score for binary classification
perf = make_scorer(f1_score, average='micro') #changed this from just f1_score

score = {}
for algorithm in algorithms.keys():
    score[algorithm] = []

    for algorithm, (clf, parameters) in algorithms.items():
        print(algorithm)
        for train, test in kf.split(X, y):
            prep = StandardScaler()
            prep.fit(X.iloc[train])
            best = GridSearchCV(clf, parameters, cv=gskf, scoring=make_scorer(perf)) #perf is modified here
            best.fit(prep.transform(X.iloc[train]), y.iloc[train])
            trans = prep.transform(X.iloc[test], y.iloc[test])
            score[algorithm].append(perf(best.predict(trans)))

RF


ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [69]:
# write the full_data out so we don't have to preprocess the data files...
#full_data.to_csv('full_data.csv', index=False)

#### fit the model to the training data

In [None]:
random_forest_classifier(x, y)

#### generate predictions based on training

In [None]:
y_pred_RF = RF_clf.predict(X_test_scaled)

#### measure prediction accuracy

In [None]:
print(accuracy_score(y_test, y_pred_RF))

In [None]:
pd.DataFrame.from_dict(score)