In [1]:
import tensorflow as tf
from tensorflow import keras
import sklearn
import numpy as np
import pandas as pd
import os
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [2]:
def to_int(x):
    return int(x, 16)

In [3]:
path = "./datasets/iot-network-intrusion-dataset_190919/Packets_integrated/"

In [4]:
names_list = [f for f in os.listdir(path) if f.endswith('.txt')]

In [5]:
names_list

['benign-dec.txt',
 'dos-synflooding.txt',
 'mirai-ackflooding.txt',
 'mirai-hostbruteforce-1,3,5.txt',
 'mirai-hostbruteforce-2,4.txt',
 'mirai-httpflooding.txt',
 'mirai-udpflooding.txt',
 'mitm.txt',
 'scan-hostport_1.txt',
 'scan-hostport_2.txt',
 'scan-hostport_3.txt',
 'scan-hostport_4.txt',
 'scan-hostport_5.txt',
 'scan-hostport_6.txt',
 'scan-portos.txt']

# 파일 변환 코드

In [6]:
# for txt in names_list:
#     packet = pd.read_csv(os.path.join(path, txt), sep="\n", header = None)
#     packet[packet[0].str.contains("Frame \(")] = "\f"
#     packet[0] = packet[0].str[:53]
#     packet = packet[0].str.split("  ", expand=True)
#     packet = packet[[0, 1]]
#     packet = pd.DataFrame(packet.to_string(header=None, index=False).split("\f"))
#     print("Almost Done!")
#     packet[0]=packet[0].str.lstrip()
#     packet[0] = packet[0].str.replace("\n", " ")
#     for _ in range(10):
#         packet[0] = packet[0].str.replace("  ", " ")
#     packet[0] = packet[0].str[:222]
#     packet = packet[0].str.split(" ", expand=True)
#     packet=packet.fillna("00")
#     packet = packet.drop(columns=[0, 17, 34, 51, 68, 71])
#     packet=packet.replace("","00")
#     packet.to_csv(os.path.join(path, txt).replace(".txt","_hex")+".csv")
#     print(txt,"done")
# #     packet = packet.applymap(to_int)

# Reading files

In [14]:
hex_list = [f for f in os.listdir(path) if f.endswith('hex.csv')]

In [15]:
hex_list

['benign-dec_hex.csv',
 'dos-synflooding_hex.csv',
 'mirai-ackflooding_hex.csv',
 'mirai-hostbruteforce-1,3,5_hex.csv',
 'mirai-hostbruteforce-2,4_hex.csv',
 'mirai-httpflooding_hex.csv',
 'mirai-udpflooding_hex.csv',
 'mitm_hex.csv',
 'scan-hostport_1_hex.csv',
 'scan-hostport_2_hex.csv',
 'scan-hostport_3_hex.csv',
 'scan-hostport_4_hex.csv',
 'scan-hostport_5_hex.csv',
 'scan-hostport_6_hex.csv',
 'scan-portos_hex.csv']

In [17]:
dict_names=hex_list.copy()
csv_list=hex_list.copy()
for i in range(len(hex_list)):
    dict_names[i] = hex_list[i].replace("_hex.csv","")
    csv_list[i] = hex_list[i].replace("_hex","")

In [21]:
dict_names

['benign-dec',
 'dos-synflooding',
 'mirai-ackflooding',
 'mirai-hostbruteforce-1,3,5',
 'mirai-hostbruteforce-2,4',
 'mirai-httpflooding',
 'mirai-udpflooding',
 'mitm',
 'scan-hostport_1',
 'scan-hostport_2',
 'scan-hostport_3',
 'scan-hostport_4',
 'scan-hostport_5',
 'scan-hostport_6',
 'scan-portos']

In [22]:
csv_list

['benign-dec.csv',
 'dos-synflooding.csv',
 'mirai-ackflooding.csv',
 'mirai-hostbruteforce-1,3,5.csv',
 'mirai-hostbruteforce-2,4.csv',
 'mirai-httpflooding.csv',
 'mirai-udpflooding.csv',
 'mitm.csv',
 'scan-hostport_1.csv',
 'scan-hostport_2.csv',
 'scan-hostport_3.csv',
 'scan-hostport_4.csv',
 'scan-hostport_5.csv',
 'scan-hostport_6.csv',
 'scan-portos.csv']

In [46]:
X_dict = {}
for i, packets in enumerate(names_list):
    X_dict[dict_names[i]] = pd.read_csv(os.path.join(path, packets),index_col=0, dtype=str)
    X_dict[dict_names[i]] = X_dict[dict_names[i]].join(pd.read_csv(os.path.join(path,csv_list[i])),how="outer")

In [48]:
X_dict['mirai-hostbruteforce-1,3,5']

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,67,69,70,No.,Time,Source,Destination,Protocol,Length,Info
0,04,32,f4,45,17,b3,88,36,6c,d7,...,ff,fb,e1,1,0.000000e+00,104.74.213.186,192.168.0.24,TCP,826,[TCP segment of a reassembled PDU]
1,04,32,f4,45,17,b3,88,36,6c,d7,...,ff,fb,e1,2,2.200000e-04,104.74.213.186,192.168.0.24,TCP,1514,[TCP segment of a reassembled PDU]
2,04,32,f4,45,17,b3,88,36,6c,d7,...,ff,fb,e1,3,5.600000e-04,104.74.213.186,192.168.0.24,TCP,1514,[TCP segment of a reassembled PDU]
3,04,32,f4,45,17,b3,88,36,6c,d7,...,ff,fb,e1,4,6.370000e-04,104.74.213.186,192.168.0.24,TCP,1514,[TCP segment of a reassembled PDU]
4,04,32,f4,45,17,b3,88,36,6c,d7,...,ff,fb,e1,5,1.378000e-03,104.74.213.186,192.168.0.24,TCP,1514,[TCP segment of a reassembled PDU]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
273352,88,36,6c,d7,1c,56,04,32,f4,45,...,c9,de,65,273353,1.810277e+06,192.168.0.24,211.237.6.104,TCP,66,58485 > 443 [ACK] Seq=2510 Ack=376686 Win=52...
273353,04,32,f4,45,17,b3,88,36,6c,d7,...,00,00,00,273354,1.810277e+06,EFMNetwo_d7:1c:56,Partron_45:17:b3,ARP,42,Who has 192.168.0.24? Tell 192.168.0.1
273354,88,36,6c,d7,1c,56,04,32,f4,45,...,00,00,00,273355,1.810277e+06,Partron_45:17:b3,EFMNetwo_d7:1c:56,ARP,42,192.168.0.24 is at 04:32:f4:45:17:b3
273355,bc,1c,81,4b,ae,ba,88,36,6c,d7,...,00,00,00,273356,1.810277e+06,EFMNetwo_d7:1c:56,Sichuani_4b:ae:ba,ARP,42,Who has 192.168.0.13? Tell 192.168.0.1


In [11]:
y_dict = {}
for key, value in enumerate(X_dict):
    y_dict[value] = pd.read_csv(os.path.join(path, value)+"_attacked.csv", header=None)[0][1:].apply(int)

  interactivity=interactivity, compiler=compiler, result=result)


In [12]:
for i in dict_names:
    print(i, len(X_dict[i]),"    attacked", len(y_dict[i]))

benign-dec 137396     attacked 0
dos-synflooding 141709     attacked 64646
mirai-ackflooding 313462     attacked 75632
mirai-hostbruteforce-1,3,5 273357     attacked 1636
mirai-hostbruteforce-2,4 179998     attacked 961
mirai-httpflooding 248294     attacked 10464
mirai-udpflooding 1187114     attacked 949284
mitm 194184     attacked 101885
scan-hostport_1 29352     attacked 1490
scan-hostport_2 37106     attacked 2701
scan-hostport_3 13555     attacked 1730
scan-hostport_4 7801     attacked 1707
scan-hostport_5 4760     attacked 2211
scan-hostport_6 6828     attacked 2401
scan-portos 211078     attacked 12970


In [13]:
label_dict={}
for i in dict_names:
    label = np.zeros(len(X_dict[i]))
    label[y_dict[i]-1] = 1
    ###위에 -1 붙여야됨
    label_dict[i] = label

In [14]:
X=pd.DataFrame([])
for i in dict_names:
    X = X.append(X_dict[i])

In [15]:
y=[]
for i in dict_names:
    y.extend(label_dict[i])
y = np.array(y)

In [16]:
print(len(X), len(y))

2985994 2985994


# 중간에 all.csv가져오는 코드 넣을것

In [17]:
X

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,60,61,62,63,64,65,66,67,69,70
0,48,4b,aa,2c,d8,f9,bc,1c,81,4b,...,08,0a,00,07,20,ee,15,d3,9b,8a
1,48,4b,aa,2c,d8,f9,bc,1c,81,4b,...,08,0a,00,07,20,ee,15,d3,9b,8a
2,48,4b,aa,2c,d8,f9,bc,1c,81,4b,...,08,0a,00,07,20,ee,15,d3,9b,8a
3,48,4b,aa,2c,d8,f9,bc,1c,81,4b,...,08,0a,00,07,20,ee,15,d3,9b,8a
4,48,4b,aa,2c,d8,f9,bc,1c,81,4b,...,08,0a,00,07,20,ee,15,d3,9b,8a
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
211073,88,36,6c,d7,1c,56,04,32,f4,45,...,08,0a,00,03,86,b0,f5,eb,3e,fa
211074,88,36,6c,d7,1c,56,04,32,f4,45,...,08,0a,00,03,86,b0,f5,eb,3e,fa
211075,88,36,6c,d7,1c,56,04,32,f4,45,...,08,0a,00,03,86,b1,f5,eb,3e,fa
211076,88,36,6c,d7,1c,56,04,32,f4,45,...,08,0a,00,03,86,b1,f5,eb,3e,fa


In [18]:
df = pd.read_csv("./datasets/iot-network-intrusion-dataset_190919/all.csv")

In [19]:
X["Source"]=X["37"]+X["38"]
X["Destination"]=X["37"]+X["38"]
# X["No."] = df["No."]
# X["Time"] = df["Time"]
# X["Protocol"] = df["Protocol"]
# X["Length"] = df["Length"]

In [20]:
X_reduced = X[["Source", "Destination"]]

In [21]:
X_reduced["Source"] = X_reduced["Source"].apply(to_int)
X_reduced["Destination"] = X_reduced["Destination"].apply(to_int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [22]:
X_reduced

Unnamed: 0,Source,Destination
0,9020,9020
1,9020,9020
2,9020,9020
3,9020,9020
4,9020,9020
...,...,...
211073,41467,41467
211074,41467,41467
211075,41467,41467
211076,41467,41467


In [57]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
df["Protocol"] = encoder.fit_transform(df["Protocol"])
df["Source"] = encoder.fit_transform(df["Source"])
df["Destination"] = encoder.fit_transform(df["Destination"])
df["Info"] = encoder.fit_transform(df["Info"])

In [58]:
df

Unnamed: 0,No.,Time,Source,Destination,Protocol,Length,Info
0,1,0.000000e+00,26910,26991,579,1498,463463
1,2,1.440000e-04,26910,26991,579,140,468154
2,3,2.270000e-04,26910,26991,579,1502,463464
3,4,7.650000e-04,26910,26991,579,1498,463465
4,5,8.400000e-04,26910,26991,579,1502,463466
...,...,...,...,...,...,...,...
2985989,2985990,9.752289e+06,26940,27406,579,66,363881
2985990,2985991,9.752289e+06,61260,60946,529,42,593218
2985991,2985992,9.752289e+06,61272,60941,529,42,19991
2985992,2985993,9.752289e+06,61260,60956,529,42,593034


In [23]:
X = X.drop(["Source", "Destination"], axis=1)

In [24]:
X

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,60,61,62,63,64,65,66,67,69,70
0,48,4b,aa,2c,d8,f9,bc,1c,81,4b,...,08,0a,00,07,20,ee,15,d3,9b,8a
1,48,4b,aa,2c,d8,f9,bc,1c,81,4b,...,08,0a,00,07,20,ee,15,d3,9b,8a
2,48,4b,aa,2c,d8,f9,bc,1c,81,4b,...,08,0a,00,07,20,ee,15,d3,9b,8a
3,48,4b,aa,2c,d8,f9,bc,1c,81,4b,...,08,0a,00,07,20,ee,15,d3,9b,8a
4,48,4b,aa,2c,d8,f9,bc,1c,81,4b,...,08,0a,00,07,20,ee,15,d3,9b,8a
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
211073,88,36,6c,d7,1c,56,04,32,f4,45,...,08,0a,00,03,86,b0,f5,eb,3e,fa
211074,88,36,6c,d7,1c,56,04,32,f4,45,...,08,0a,00,03,86,b0,f5,eb,3e,fa
211075,88,36,6c,d7,1c,56,04,32,f4,45,...,08,0a,00,03,86,b1,f5,eb,3e,fa
211076,88,36,6c,d7,1c,56,04,32,f4,45,...,08,0a,00,03,86,b1,f5,eb,3e,fa


In [23]:
print(len(y), len(y[y==0]), len(y[y==1]))

2985994 1756276 1229718


In [25]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

In [26]:
X_train, X_valid, y_train, y_valid = train_test_split(X_reduced,y, test_size=0.25)

In [28]:
len(y_train)

2239495

In [26]:
X_train

Unnamed: 0,Source,Destination
329435,60075,60075
132118,443,443
463479,64783,64783
1010452,51252,51252
133101,7100,7100
...,...,...
26859,9020,9020
154715,64783,64783
95433,56361,56361
200648,49153,49153


In [27]:
X_train

Unnamed: 0,Source,Destination
154611,64775,64775
126456,52739,52739
71871,443,443
116774,9020,9020
845077,56361,56361
...,...,...
664602,60189,60189
163377,51998,51998
529164,64776,64776
465668,60120,60120


In [28]:
print(y_train, len(y_train))

[1. 0. 0. ... 1. 1. 1.] 2239495


In [32]:
model_knn = KNeighborsClassifier(n_neighbors=10)
model_knn.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=10)

In [30]:
model_rf = RandomForestClassifier()
model_rf.fit(X_train, y_train)

RandomForestClassifier()

In [33]:
model_knn.score(X_valid, y_valid)

0.9652698797989012

In [31]:
model_rf.score(X_valid, y_valid)

0.9671734322484022

In [45]:
X_train, X_valid, y_train, y_valid = train_test_split(X, np.array(y), test_size=0.25)

In [None]:
model = keras.models.Sequential([
    keras.layers.Dense(10, activation="relu", input_shape=[X_train.shape[1]]),
    keras.layers.Dense(1, activation="sigmoid")
])

model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
history = model.fit(X_train, y_train, epochs=10, validation_data = (X_valid, y_valid))

Train on 2239495 samples, validate on 746499 samples
Epoch 1/10


In [12]:
model = keras.models.Sequential([
    keras.layers.Dense(100, activation="relu", input_shape=[X_train.shape[1]]),
    keras.layers.Dense(1, activation="sigmoid")
])

model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
history = model.fit(X_train, y_train, epochs=20, validation_data = (X_valid, y_valid))

Train on 949691 samples, validate on 237423 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [34]:
flood_pred = model.predict_classes(X_valid)
result=["benign", "attacked"]
print(classification_report(y_valid,flood_pred , target_names=result))

NameError: name 'model' is not defined

In [102]:
flood_pred = model.predict_classes(X_valid)
result=["benign", "attacked"]
print(classification_report(y_valid,flood_pred , target_names=result))

              precision    recall  f1-score   support

      benign       1.00      1.00      1.00     35814
    attacked       1.00      1.00      1.00       186

    accuracy                           1.00     36000
   macro avg       1.00      1.00      1.00     36000
weighted avg       1.00      1.00      1.00     36000



In [30]:
model.trainable_variables

[<tf.Variable 'dense/kernel:0' shape=(66, 100) dtype=float32, numpy=
 array([[-0.21548988, -0.06131569, -0.05593153, ...,  0.009556  ,
          1.7233193 , -0.2308436 ],
        [-0.18978657, -0.25972167,  0.10120087, ..., -0.0030753 ,
         -2.4265988 , -0.25424433],
        [-0.02546336, -0.03507623, -0.1180388 , ...,  0.00665484,
         -1.0056187 , -0.19892313],
        ...,
        [-0.08734512, -0.02374357, -0.04575067, ..., -0.10786477,
          1.9088019 ,  0.13243084],
        [-0.08642702,  0.0873314 , -0.0180158 , ...,  0.05652766,
          0.4187118 ,  0.0130941 ],
        [-0.27137038,  0.19135252, -0.09057982, ...,  0.07734038,
          0.3805156 ,  0.05727315]], dtype=float32)>,
 <tf.Variable 'dense/bias:0' shape=(100,) dtype=float32, numpy=
 array([-0.05285334, -0.12155698, -0.01935806, -0.6765026 , -0.07473051,
        -0.0804761 , -0.05523553, -0.02441452, -0.08331953, -0.05623414,
        -0.05350327, -0.17753392, -0.11969976, -0.0678602 , -0.05730273,
     