In [1]:
import pandas as pd
from glob import glob

from imblearn.under_sampling import RandomUnderSampler

In [11]:
from sklearn import preprocessing

In [2]:
def get_csv(uri):
    """Load one CSV file, or every file matching a glob pattern, into a DataFrame.

    Parameters
    ----------
    uri : str
        Path to a single CSV file, or a glob pattern. Any value containing
        ``*`` is treated as a pattern.

    Returns
    -------
    pandas.DataFrame
        The parsed file, or the row-wise concatenation of all matching files.
        Each file keeps its own row index, so index labels repeat across files.

    Raises
    ------
    ValueError
        If ``uri`` is a pattern that matches no files.
    """
    if "*" not in uri:
        return pd.read_csv(uri)

    # glob() yields matches in arbitrary, filesystem-dependent order; sorting
    # makes the row order of the combined frame reproducible between runs.
    paths = sorted(glob(uri))
    if not paths:
        # Same exception type pd.concat([]) would raise, but naming the pattern.
        raise ValueError("No CSV files matched pattern: " + repr(uri))
    return pd.concat([pd.read_csv(path) for path in paths])

In [3]:
# Columns removed from every loaded frame by remove_string().
dropped_features = [
    "Flow ID",
    "Src IP",
    "Src Port",
    "Dst IP",
    "Dst Port",
    "Timestamp",
]

def label_dataset(ds, label):
    """Return a copy of ``ds`` whose "Label" column is set to ``label``.

    Parameters
    ----------
    ds : pandas.DataFrame
        Source frame; it is not modified.
    label : scalar or array-like
        Value written to the "Label" column. A scalar is broadcast to every
        row; an array-like must have one entry per row.

    Returns
    -------
    pandas.DataFrame
        A copy of ``ds`` with the "Label" column created or overwritten.
    """
    dataset = ds.copy()
    # Plain single-column assignment creates the column when it is missing and
    # replaces it when it exists, unlike the list-indexer form ``[["Label"]]``.
    dataset["Label"] = label
    return dataset

def remove_string(ds):
    """Drop the columns listed in ``dropped_features`` from ``ds`` in place.

    Prints the column count before and after the drop. The frame passed in is
    mutated and nothing is returned.
    """
    # Drop the features that contain strings.
    columns_before = ds.shape[1]
    print("Jumlah fitur sebelum didrop: " + str(columns_before))
    ds.drop(dropped_features, axis=1, inplace=True)
    columns_after = ds.shape[1]
    print("Jumlah fitur setelah didrop: " + str(columns_after))

### Malware IoT-23

In [4]:
# Load every file matched by the capture's glob, tag all rows "Malicious",
# then drop the columns in dropped_features (in place).
malware = label_dataset(
    get_csv('/media/kmdr7/Seagate/DATASETS/IOT-23/CTU-IoT-Malware-Capture-48-1/out2/*'),
    "Malicious",
)
remove_string(malware)

Jumlah fitur sebelum didrop: 84
Jumlah fitur setelah didrop: 78


### Benign IoT-23

In [5]:
# Load every file matched by the glob, tag all rows "Benign",
# then drop the columns in dropped_features (in place).
benign_iot23 = label_dataset(
    get_csv('/media/kmdr7/Seagate/DATASETS/IoT-Traffic-Traces/out/*'),
    "Benign",
)
remove_string(benign_iot23)

Jumlah fitur sebelum didrop: 84
Jumlah fitur setelah didrop: 78


### Honeypot IoT-23

In [None]:
# Combine the three honeypot captures into one frame, tag all rows "Benign",
# then drop the columns in dropped_features (in place).
honeypot = label_dataset(
    pd.concat([
        get_csv(pattern)
        for pattern in (
            '/media/kmdr7/Seagate/DATASETS/IOT-23/CTU-Honeypot-Capture-7-1/out/*.csv',
            '/media/kmdr7/Seagate/DATASETS/IOT-23/CTU-Honeypot-Capture-4-1/out/*.csv',
            '/media/kmdr7/Seagate/DATASETS/IOT-23/CTU-Honeypot-Capture-5-1/out/*.csv',
        )
    ]),
    "Benign",
)
remove_string(honeypot)

### Benign IoT-Benign-Traffic

In [None]:
# benign_iottr = get_csv('/media/kmdr7/Seagate/DATASETS/IoT-Benign-Traffic/out/18-10-17.pcap_Flow.csv')
# honeypot = pd.concat([
#     get_csv('/media/kmdr7/Seagate/DATASETS/IoT-Benign-Traffic/out/CTU-Honeypot-Capture-7-1/out/*.csv'),
#     get_csv('/media/kmdr7/Seagate/DATASETS/IoT-Benign-Traffic/out/CTU-Honeypot-Capture-4-1/out/*.csv'),
#     get_csv('/media/kmdr7/Seagate/DATASETS/IoT-Benign-Traffic/out/CTU-Honeypot-Capture-5-1/out/*.csv')
# ])
# benign_iottr = label_dataset(benign_iottr, "Benign")
# remove_string(benign_iottr)

In [6]:
# Report the shape of each class frame, then the per-class and total row counts.
print("Malware Shape " + str(malware.shape))
print("Benign IoT-23 Shape " + str(benign_iot23.shape))
# print("Honeypot IoT-23 Shape " + str(honeypot.shape))
# print("Benign IoT-Benign-Trafic Shape " + str(benign_iottr.shape))
print("==========================================")
print("Total Malware " + str(len(malware)))
print("Total Benign " + str(len(benign_iot23)))
print("Total Dataset " + str(len(malware) + len(benign_iot23)))

Malware Shape (1692552, 78)
Benign IoT-23 Shape (1427672, 78)
Total Malware 1692552
Total Benign 1427672
Total Dataset 3120224


In [7]:
# Stack the benign rows on top of the malware rows. Each input keeps its own
# index labels, so the combined index is not unique.
# benign = pd.concat([benign_iot23, honeypot])
dataset = pd.concat([benign_iot23, malware])

In [12]:
# Encode the values of the "Label" column as integer class codes.
le = preprocessing.LabelEncoder()
le.fit(dataset["Label"])
label = le.transform(dataset["Label"])

In [28]:
# Overwrite the "Label" column of each per-class frame with an integer code:
# 1 for malware rows, 0 for benign rows. Both frames are modified in place.
malware["Label"] = 1
benign_iot23["Label"] = 0

In [30]:
# Display the "Label" column of the benign frame.
benign_iot23["Label"]

0        0
1        0
2        0
3        0
4        0
        ..
43926    0
43927    0
43928    0
43929    0
43930    0
Name: Label, Length: 1427672, dtype: int64

In [16]:
# Replace the "Label" column of the combined frame with the encoded values in `label`.
dataset["Label"] = label

In [17]:
# Display the "Label" column of the combined frame.
dataset["Label"]

0         0
1         0
2         0
3         0
4         0
         ..
134220    1
134221    1
134222    1
134223    1
134224    1
Name: Label, Length: 3120224, dtype: int64

In [None]:
# Separate the target column from the features, then randomly undersample
# the majority class with a fixed seed.
y = dataset[["Label"]]
x = dataset.drop(columns=["Label"])
smp = RandomUnderSampler(sampling_strategy='majority', random_state=0)
x_smp, y_smp = smp.fit_resample(x, y)

In [None]:
# Shape of the undersampled feature frame.
x_smp.shape

In [None]:
# Shape of the undersampled target frame.
y_smp.shape

In [None]:
# Shape of the malware frame.
malware.shape

In [None]:
# Shape of the combined frame.
dataset.shape

In [None]:
# Separate the "Label" target column from the feature columns.
y = dataset[["Label"]]
x = dataset.drop(columns=["Label"])

# sampler = TomekLinks(sampling_strategy="all")
# Random undersampling with the sampler's default strategy and a fixed seed.
sampler = RandomUnderSampler(random_state=0)

x_resampled, y_resampled = sampler.fit_resample(x, y)

In [None]:
# Shape of the resampled feature frame.
x_resampled.shape

In [None]:
# Shape of the resampled target frame.
y_resampled.shape

In [None]:
# Draw a seeded 0.05% random sample of the rows of each class frame.
# The benign sample is taken from `benign_iot23`: a frame named `benign` is
# never created (its only assignment is commented out), so referencing it
# raised NameError on a fresh kernel.
malsample = malware.sample(frac=0.0005, random_state=0)
bensample = benign_iot23.sample(frac=0.0005, random_state=0)

In [None]:
# Show the shape of each class sample, then the combined number of sampled rows.
print(malsample.shape)
print(bensample.shape)
print("Total Dataset " + str(len(malsample) + len(bensample)))

In [None]:
# Stack the malware sample on top of the benign sample; original index labels are kept.
dataset_sampled = pd.concat([malsample, bensample])

## Gabungkan Malware Dataset

In [31]:
# Write the malware frame to CSV; index=False leaves the row index out of the file.
malware.to_csv("/media/kmdr7/Seagate/TA/DATASETS/NewMalware.csv", index=False)

## Gabungkan Benign Dataset

In [32]:
# Write the benign frame to CSV; index=False leaves the row index out of the file.
benign_iot23.to_csv("/media/kmdr7/Seagate/TA/DATASETS/NewBenign.csv", index=False)

## Gabung Dataset

In [18]:
# Write the combined frame to CSV; index=False leaves the (non-unique) row index out of the file.
dataset.to_csv("/media/kmdr7/Seagate/TA/DATASETS/new/Dataset.csv", index=False)

In [None]:
# Write the small sampled frame to CSV; index=False leaves the row index out of the file.
dataset_sampled.to_csv("/media/kmdr7/Seagate/TA/DATASETS/DatasetSampledSmall.csv", index=False)