## Início

In [1]:
import os
import re
import pickle

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns

In [3]:
import matplotlib.pyplot as plt

In [4]:
%matplotlib inline
%xmode verbose

Exception reporting mode: Verbose


In [5]:
%config InlineBackend.figure_format='retina'

In [6]:
pd.set_option("styler.format.thousands", ",")
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "last_expr_or_assign"

In [7]:
import warnings
warnings.filterwarnings('ignore')

In [8]:
import session_info
session_info.show()

## Tamanhos das Classes

In [9]:
if os.path.isfile('pickles/class_names.pkl'):
    with open('pickles/class_names.pkl', 'rb') as f:
        class_names = pickle.load(f)

if os.path.isfile('pickles/class_sizes.pkl'):
    with open('pickles/class_sizes.pkl', 'rb') as f:
        class_sizes = pickle.load(f)

In [10]:
classes_sizes = dict(zip(class_names, class_sizes))
classes_sizes = dict(sorted(classes_sizes.items(), key=lambda item: item[1]));

In [11]:
classes_denial = {class_name: class_size for class_name, class_size in classes_sizes.items() if r'DoS' in class_name};

In [12]:
classes_denial_total_size = sum(classes_denial.values())
f"Total de linhas de Negação de Serviço: {classes_denial_total_size:,}"

'Total de linhas de Negação de Serviço: 41,790,136'

In [13]:
classes_no_denial = {class_name: class_size for class_name, class_size in classes_sizes.items() if r'DoS' not in class_name}
classes_no_denial_total_size = sum(classes_no_denial.values())
f"Total de linhas do Resto das Classes: {classes_no_denial_total_size:,}"

'Total de linhas do Resto das Classes: 4,947,130'

## Ataques Inclusos (14)

- [x] Uploading_Attack
- [x] Recon-PingSweep
- [x] Backdoor_Malware
- [x] XSS
- [x] SqlInjection
- [x] CommandInjection
- [x] BrowserHijacking
- [x] DictionaryBruteForce
- [x] Recon-PortScan
- [x] Recon-OSScan
- [x] Recon-HostDiscovery
- [x] DNS_Spoofing
- [x] MITM-ArpSpoofing
- [x] VulnerabilityScan

## Ataques Não Inclusos (19)
Ataques de Negação de Serviço, de natureza volumétrica

- [ ] Mirai-greip_flood
- [ ] Mirai-greeth_flood
- [ ] Mirai-udpplain
- [ ] DDoS-SYN_Flood
- [ ] DDoS-RSTFINFLOOD
- [ ] DDoS-PSHACK_FLOOD
- [ ] DDoS-TCP_Flood
- [ ] DDoS-UDP_Flood
- [ ] DDoS-ICMP_Flood
- [ ] DDoS-HTTP_Flood
- [ ] DDoS-SynonymousIP_Flood
- [ ] DDoS-UDP_Fragmentation
- [ ] DDoS-ACK_Fragmentation
- [ ] DDoS-ICMP_Fragmentation
- [ ] DDoS-SlowLoris
- [ ] DoS-HTTP_Flood
- [ ] DoS-SYN_Flood
- [ ] DoS-TCP_Flood
- [ ] DoS-UDP_Flood

---

Caminho dos CSVs

In [14]:
from pathlib import Path
PATH = Path("/data/CICIoT2023/CSV")

PosixPath('/data/CICIoT2023/CSV')

## Números Presentemente

In [15]:
attacks = {'Uploading_Attack': 1252,
           'Recon-PingSweep': 2262,
           'Backdoor_Malware': 3218,
           'XSS': 3846,
           'SqlInjection': 5245,
           'CommandInjection': 5409,
           'BrowserHijacking': 5859,
           'DictionaryBruteForce': 13064,
           'Recon-PortScan': 82284,
           'Recon-OSScan': 98259,
           'Recon-HostDiscovery': 134378,
           'DNS_Spoofing': 178898,
           'VulnerabilityScan': 373351,
           'MITM-ArpSpoofing': 307560}
print(f"Quantidade de registros de ataque: {sum(attacks.values()):,}")
print(f"Quantidade de registros benignos:  {classes_sizes['Benign_Final']:,}")

Quantidade de registros de ataque: 1,214,885
Quantidade de registros benignos:  1,098,191


---

## Leitura/Otimização dos Dados

Classes Escolhidas:     `benign` X `attacks`

Proporção Escolhidas: `1,098,191` X `1,214,885`

### Path

In [16]:
import pathlib
PATH = pathlib.Path('/data/CICIoT2023/CSV/');

In [17]:
def read_data(folder):
    csv_files = [f for f in os.listdir(folder) if f.endswith('.csv')]
    dfs = []
    for file in csv_files:
        file_path = os.path.join(folder, file)
        df = pd.read_csv(file_path, engine='pyarrow')
        dfs.append(df)
    return dfs

### Dados Benignos

In [18]:
benign_folder_path =  PATH / 'Benign_Final'
dfs_benign = read_data(benign_folder_path)
f"Quantidade de arquivos benign: {len(dfs_benign)}"

'Quantidade de arquivos benign: 4'

In [19]:
df_benign = pd.concat(dfs_benign, ignore_index=True)
df_benign.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1098191 entries, 0 to 1098190
Data columns (total 39 columns):
 #   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
 0   Header_Length    1098191 non-null  float64
 1   Protocol Type    1098191 non-null  int64  
 2   Time_To_Live     1098191 non-null  float64
 3   Rate             1098191 non-null  float64
 4   fin_flag_number  1098191 non-null  float64
 5   syn_flag_number  1098191 non-null  float64
 6   rst_flag_number  1098191 non-null  float64
 7   psh_flag_number  1098191 non-null  float64
 8   ack_flag_number  1098191 non-null  float64
 9   ece_flag_number  1098191 non-null  float64
 10  cwr_flag_number  1098191 non-null  float64
 11  ack_count        1098191 non-null  int64  
 12  syn_count        1098191 non-null  int64  
 13  fin_count        1098191 non-null  int64  
 14  rst_count        1098191 non-null  int64  
 15  HTTP             1098191 non-null  float64
 16  HTTPS            1

Os tipos estão muito pesados, `float64` e `int64`. O dataset ocupa muito espaço também (`320MB+`).

### Otimizando a Memória

Convertendo os atributos para `snake_case`, além de criar a label `benign`

In [20]:
def to_snake_case(name):
    return re.sub(r'\W+', '_', name).lower()

In [21]:
df_benign.columns = df_benign.columns.to_series().apply(to_snake_case)
df_benign['label'] = 'benign'

Encontrar os `dtypes` que minimizam o memory footprint

In [22]:
def suggest_optimized_dtypes(df):
    optimized_dtypes = {}
    for column in df.columns:
        col_type = df[column].dtype
        if pd.api.types.is_integer_dtype(col_type):
            min_val, max_val = df[column].min(), df[column].max()
            if pd.api.types.is_bool_dtype(df[column]):
                optimized_dtypes[column] = np.bool_
            else:
                optimized_dtypes[column] = {True: np.uint8, False: np.uint16, 
                                             2: np.uint32, 3: np.uint64}.get(True if min_val >= 0 else False, 
                                             np.int64 if max_val > np.iinfo(np.uint64).max else np.int32)
        elif pd.api.types.is_float_dtype(col_type):
            if df[column].between(np.finfo(np.float16).min, np.finfo(np.float16).max).all():
                optimized_dtypes[column] = np.float16
            elif df[column].between(np.finfo(np.float32).min, np.finfo(np.float32).max).all():
                optimized_dtypes[column] = np.float32
            else:
                optimized_dtypes[column] = np.float64
        elif pd.api.types.is_object_dtype(col_type):
            optimized_dtypes[column] = 'category'
    return optimized_dtypes

In [23]:
optimized_dtypes = suggest_optimized_dtypes(df_benign)
list(optimized_dtypes.items())[:5] 

[('header_length', numpy.float16),
 ('protocol_type', numpy.uint8),
 ('time_to_live', numpy.float16),
 ('rate', numpy.float64),
 ('fin_flag_number', numpy.float16)]

Aplicar os tipos otimizados

In [24]:
df_benign_optimized = df_benign.astype(optimized_dtypes)
df_benign_optimized.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1098191 entries, 0 to 1098190
Data columns (total 40 columns):
 #   Column           Non-Null Count    Dtype   
---  ------           --------------    -----   
 0   header_length    1098191 non-null  float16 
 1   protocol_type    1098191 non-null  uint8   
 2   time_to_live     1098191 non-null  float16 
 3   rate             1098191 non-null  float64 
 4   fin_flag_number  1098191 non-null  float16 
 5   syn_flag_number  1098191 non-null  float16 
 6   rst_flag_number  1098191 non-null  float16 
 7   psh_flag_number  1098191 non-null  float16 
 8   ack_flag_number  1098191 non-null  float16 
 9   ece_flag_number  1098191 non-null  float16 
 10  cwr_flag_number  1098191 non-null  float16 
 11  ack_count        1098191 non-null  uint8   
 12  syn_count        1098191 non-null  uint8   
 13  fin_count        1098191 non-null  uint8   
 14  rst_count        1098191 non-null  uint8   
 15  http             1098191 non-null  float16 
 16  

Demonstrar a melhora na quantidade de memória utilizada

In [25]:
def calcular_economia_memoria(df):
    original_memory = df.memory_usage(deep=True).sum()
    optimized_dtypes = suggest_optimized_dtypes(df)
    df_optimized = df.astype(optimized_dtypes)
    optimized_memory = df_optimized.memory_usage(deep=True).sum()
    memory_saved = original_memory - optimized_memory
    percentage_saved = (memory_saved / original_memory) * 100
    print(f"Memória original: {original_memory / (1024**2):.2f} MB")
    print(f"Memória otimizada: {optimized_memory / (1024**2):.2f} MB")
    print(f"Memória economizada: {memory_saved / (1024**2):.2f} MB ({percentage_saved:.1f}%)")
    return df_optimized, optimized_dtypes

In [26]:
df_benign_optimized, optimized_dtypes = calcular_economia_memoria(df_benign)
df_benign_optimized

Memória original: 384.37 MB
Memória otimizada: 92.16 MB
Memória economizada: 292.20 MB (76.0%)


Unnamed: 0,header_length,protocol_type,time_to_live,rate,fin_flag_number,syn_flag_number,rst_flag_number,psh_flag_number,ack_flag_number,ece_flag_number,...,tot_sum,min,max,avg,std,tot_size,iat,number,variance,label
0,29.593750,6,114.125,68.738911,0.0,0.0,0.0,0.099976,0.899902,0.0,...,0,66,234,947.000,706.044506,947.000,0.014549,10,4.984988e+05,benign
1,24.796875,6,149.125,47.628610,0.0,0.0,0.0,0.199951,0.700195,0.0,...,181,66,89,146.125,114.666134,146.125,0.021088,10,1.314832e+04,benign
2,17.593750,6,156.125,35.455709,0.0,0.0,0.0,0.300049,0.600098,0.0,...,163,60,234,323.500,434.414868,323.500,0.028641,10,1.887163e+05,benign
3,24.796875,6,91.125,68.694442,0.0,0.0,0.0,0.099976,0.700195,0.0,...,192,66,58,1222.000,1347.199581,1222.000,0.014748,10,1.814947e+06,benign
4,24.000000,6,180.875,97.070372,0.0,0.0,0.0,0.000000,0.700195,0.0,...,60,66,134,134.000,135.292276,134.000,0.010300,10,1.830400e+04,benign
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1098186,29.593750,6,87.625,7986.108149,0.0,0.0,0.0,0.000000,0.899902,0.0,...,22,60,42,1461.000,1472.146747,1461.000,0.000125,10,2.167216e+06,benign
1098187,32.000000,6,99.375,6911.029824,0.0,0.0,0.0,0.000000,1.000000,0.0,...,28,66,18,1462.000,1974.242133,1462.000,0.000145,10,3.897632e+06,benign
1098188,29.593750,6,93.500,9852.722575,0.0,0.0,0.0,0.000000,0.899902,0.0,...,46,60,42,1182.000,1442.300154,1182.000,0.000102,10,2.080230e+06,benign
1098189,32.000000,6,99.375,8305.552475,0.0,0.0,0.0,0.000000,1.000000,0.0,...,28,66,18,1462.000,1974.242133,1462.000,0.000135,10,3.897632e+06,benign


---

### Dados Malignos

In [27]:
attacks_paths = {}
for attack in attacks.keys():
    attacks_paths[attack] = PATH / attack

In [28]:
dataframes = []
for attack_name, dir_path in attacks_paths.items():
    if dir_path.exists() and dir_path.is_dir():
        for file_path in dir_path.glob("*.csv"):
            df = pd.read_csv(file_path, engine='pyarrow')
            df['label'] = attack_name
            dataframes.append(df)
    else:
        print(f"Diretório {dir_path} não existe.")

In [29]:
df_attack = pd.concat(dataframes, ignore_index=True)

Unnamed: 0,Header_Length,Protocol Type,Time_To_Live,Rate,fin_flag_number,syn_flag_number,rst_flag_number,psh_flag_number,ack_flag_number,ece_flag_number,...,Tot sum,Min,Max,AVG,Std,Tot size,IAT,Number,Variance,label
0,14.8,6,78.6,20.155562,0.0,0.0,0.1,0.200,0.5,0.0,...,1121,60,218,112.1,63.715426,112.1,0.049614,10,4.059656e+03,Uploading_Attack
1,21.6,6,119.1,27.499197,0.0,0.0,0.0,0.100,0.6,0.0,...,1566,60,650,156.6,183.739066,156.6,0.041712,10,3.376004e+04,Uploading_Attack
2,19.2,6,111.2,25.816773,0.0,0.0,0.0,0.400,0.5,0.0,...,2897,60,1514,289.7,436.771247,289.7,0.039537,10,1.907691e+05,Uploading_Attack
3,15.6,6,108.3,28.561883,0.0,0.0,0.0,0.200,0.5,0.0,...,1682,60,480,168.2,126.905739,168.2,0.036588,10,1.610507e+04,Uploading_Attack
4,18.8,6,102.3,567.020048,0.0,0.0,0.0,0.500,0.9,0.0,...,1337,60,492,133.7,141.198088,133.7,0.001778,10,1.993690e+04,Uploading_Attack
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1214880,32.0,6,56.0,4262.504065,0.0,0.0,0.0,0.400,1.0,0.0,...,32516,1514,4410,3251.6,1142.197803,3251.6,0.000269,10,1.304616e+06,MITM-ArpSpoofing
1214881,32.0,6,56.0,4752.752408,0.0,0.0,0.0,0.400,1.0,0.0,...,28172,1514,4410,2817.2,1068.428212,2817.2,0.000246,10,1.141539e+06,MITM-ArpSpoofing
1214882,32.0,6,57.6,1165.764474,0.0,0.0,0.0,0.400,1.0,0.0,...,23828,66,4410,2382.8,1556.553229,2382.8,0.000881,10,2.422858e+06,MITM-ArpSpoofing
1214883,32.0,6,64.0,346636.694215,0.0,0.0,0.0,0.000,1.0,0.0,...,660,66,66,66.0,0.000000,66.0,0.000003,10,0.000000e+00,MITM-ArpSpoofing


In [30]:
df_attack['label'].value_counts()

label
VulnerabilityScan       373351
MITM-ArpSpoofing        307560
DNS_Spoofing            178898
Recon-HostDiscovery     134378
Recon-OSScan             98259
Recon-PortScan           82284
DictionaryBruteForce     13064
BrowserHijacking          5859
CommandInjection          5409
SqlInjection              5245
XSS                       3846
Backdoor_Malware          3218
Recon-PingSweep           2262
Uploading_Attack          1252
Name: count, dtype: int64

In [31]:
df_attack.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1214885 entries, 0 to 1214884
Data columns (total 40 columns):
 #   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
 0   Header_Length    1214885 non-null  float64
 1   Protocol Type    1214885 non-null  int64  
 2   Time_To_Live     1214885 non-null  float64
 3   Rate             1214885 non-null  float64
 4   fin_flag_number  1214885 non-null  float64
 5   syn_flag_number  1214885 non-null  float64
 6   rst_flag_number  1214885 non-null  float64
 7   psh_flag_number  1214885 non-null  float64
 8   ack_flag_number  1214885 non-null  float64
 9   ece_flag_number  1214885 non-null  float64
 10  cwr_flag_number  1214885 non-null  float64
 11  ack_count        1214885 non-null  int64  
 12  syn_count        1214885 non-null  int64  
 13  fin_count        1214885 non-null  int64  
 14  rst_count        1214885 non-null  int64  
 15  HTTP             1214885 non-null  float64
 16  HTTPS            1

---

### Otimizando a Memória 2

In [32]:
df_attack_optimized, optimized_dtypes_attack = calcular_economia_memoria(df_attack)
df_attack_optimized

Memória original: 436.30 MB
Memória otimizada: 101.96 MB
Memória economizada: 334.34 MB (76.6%)


Unnamed: 0,Header_Length,Protocol Type,Time_To_Live,Rate,fin_flag_number,syn_flag_number,rst_flag_number,psh_flag_number,ack_flag_number,ece_flag_number,...,Tot sum,Min,Max,AVG,Std,Tot size,IAT,Number,Variance,label
0,14.796875,6,78.62500,20.155562,0.0,0.0,0.099976,0.199951,0.500000,0.0,...,97,60,218,112.125,63.715426,112.125,0.049622,10,4.059656e+03,Uploading_Attack
1,21.593750,6,119.12500,27.499197,0.0,0.0,0.000000,0.099976,0.600098,0.0,...,30,60,138,156.625,183.739066,156.625,0.041718,10,3.376004e+04,Uploading_Attack
2,19.203125,6,111.18750,25.816773,0.0,0.0,0.000000,0.399902,0.500000,0.0,...,81,60,234,289.750,436.771247,289.750,0.039551,10,1.907691e+05,Uploading_Attack
3,15.601562,6,108.31250,28.561883,0.0,0.0,0.000000,0.199951,0.500000,0.0,...,146,60,224,168.250,126.905739,168.250,0.036591,10,1.610507e+04,Uploading_Attack
4,18.796875,6,102.31250,567.020048,0.0,0.0,0.000000,0.500000,0.899902,0.0,...,57,60,236,133.750,141.198088,133.750,0.001778,10,1.993690e+04,Uploading_Attack
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1214880,32.000000,6,56.00000,4262.504065,0.0,0.0,0.000000,0.399902,1.000000,0.0,...,4,234,58,3252.000,1142.197803,3252.000,0.000269,10,1.304616e+06,MITM-ArpSpoofing
1214881,32.000000,6,56.00000,4752.752408,0.0,0.0,0.000000,0.399902,1.000000,0.0,...,12,234,58,2818.000,1068.428212,2818.000,0.000246,10,1.141539e+06,MITM-ArpSpoofing
1214882,32.000000,6,57.59375,1165.764474,0.0,0.0,0.000000,0.399902,1.000000,0.0,...,20,66,58,2382.000,1556.553229,2382.000,0.000881,10,2.422858e+06,MITM-ArpSpoofing
1214883,32.000000,6,64.00000,346636.694215,0.0,0.0,0.000000,0.000000,1.000000,0.0,...,148,66,66,66.000,0.000000,66.000,0.000003,10,0.000000e+00,MITM-ArpSpoofing


---

## Salvando no disco

Para evitar retrabalho, daqui pra frente usaremos apenas os DataFrames já preparados

In [33]:
df_benign_optimized.to_pickle('pickles/df_benign.pkl')
df_attack_optimized.to_pickle('pickles/df_attack.pkl')

## Fim