# Feature Engineering - CIC IoT 2023 Dataset for Cybersecurity Research

[University of New Brunswick - Canadian Institute for Cybersecurity](https://www.unb.ca/cic/datasets/index.html)

# Imports

In [10]:
import os
import sys

import pandas as pd

from IPython.display import HTML

sys.path.append('../') 

## Definitions

In [48]:
from utils import get_constants

# Shared project settings, fetched once and unpacked into notebook-level names.
constants = get_constants()

# Where the dataset is read from and where the refined copy is written to.
parquet_path, refined_parquet_path = (
    constants[key] for key in ('parquet_path', 'refined_parquet_path')
)

# Remaining entries are exposed under the same names as their keys.
(
    features,
    protocol_layer,
    protocol_layer_map,
    attack_category,
    attack_category_map,
) = (
    constants[key]
    for key in (
        'features',
        'protocol_layer',
        'protocol_layer_map',
        'attack_category',
        'attack_category_map',
    )
)

# Feature Engineering

In [29]:
# Load the full dataset from the configured parquet location.
df = pd.read_parquet(parquet_path)

# Bookkeeping for the final report: columns queued for removal, and a
# snapshot of the columns present at load time.
drop_columns_set = set()
original_columns_set = {*df.columns}

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46686579 entries, 0 to 46686578
Data columns (total 48 columns):
 #   Column           Dtype   
---  ------           -----   
 0   flow_duration    float32 
 1   Header_Length    float32 
 2   Protocol Type    float32 
 3   Duration         float32 
 4   Rate             float32 
 5   Srate            float32 
 6   Drate            float32 
 7   fin_flag_number  bool    
 8   syn_flag_number  bool    
 9   rst_flag_number  bool    
 10  psh_flag_number  bool    
 11  ack_flag_number  bool    
 12  ece_flag_number  bool    
 13  cwr_flag_number  bool    
 14  ack_count        float32 
 15  syn_count        float32 
 16  fin_count        float32 
 17  urg_count        float32 
 18  rst_count        float32 
 19  HTTP             bool    
 20  HTTPS            bool    
 21  DNS              bool    
 22  Telnet           bool    
 23  SMTP             bool    
 24  SSH              bool    
 25  IRC              bool    
 26  TCP         

## Drop Features

As we saw in the EDA, a few protocols are present in fewer than 100 of the 46.7M records we have, so we drop them here to reduce the dimensionality of our data.

In [39]:
# Protocol indicator columns queued for removal.
drop_protocols = {'DHCP', 'IRC', 'SMTP', 'Telnet'}

# In-place union into the running set of columns to drop.
drop_columns_set.update(drop_protocols)

Also, as shown in the EDA, two TCP flags meet the same criterion.

In [40]:
# TCP flag columns queued for removal.
drop_tcp_flags = {'cwr_flag_number', 'ece_flag_number'}

# In-place union into the running set of columns to drop.
drop_columns_set.update(drop_tcp_flags)

# Save

In [46]:
# Columns present now versus at load time. `added_features` is not shown in
# the report below; it is kept as a notebook-level name.
current_columns_set = set(df.columns)
added_features = current_columns_set - original_columns_set

# One <li> per column queued for removal, sorted so the report is stable
# across runs (set iteration order is not).
dropped_items_html = ''.join(
    f"<li>{feature_name}</li>" for feature_name in sorted(drop_columns_set)
)

# The list is a sibling of the paragraph rather than a child: <p> may only
# contain phrasing content, so a nested <ul> makes the browser close the
# paragraph early and leaves a stray empty one behind.
HTML(f"""
<p>
    In this process, we have <strong>dropped {len(drop_columns_set)} features</strong>:
</p>
<ul>
    {dropped_items_html}
</ul>
""")

In [50]:
# Build the refined frame as a new object with the queued columns removed;
# `df` itself is left untouched.
columns_to_drop = list(drop_columns_set)
refined_df = df.drop(columns=columns_to_drop)

refined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46686579 entries, 0 to 46686578
Data columns (total 42 columns):
 #   Column           Dtype   
---  ------           -----   
 0   flow_duration    float32 
 1   Header_Length    float32 
 2   Protocol Type    float32 
 3   Duration         float32 
 4   Rate             float32 
 5   Srate            float32 
 6   Drate            float32 
 7   fin_flag_number  bool    
 8   syn_flag_number  bool    
 9   rst_flag_number  bool    
 10  psh_flag_number  bool    
 11  ack_flag_number  bool    
 12  ack_count        float32 
 13  syn_count        float32 
 14  fin_count        float32 
 15  urg_count        float32 
 16  rst_count        float32 
 17  HTTP             bool    
 18  HTTPS            bool    
 19  DNS              bool    
 20  SSH              bool    
 21  TCP              bool    
 22  UDP              bool    
 23  ARP              bool    
 24  ICMP             bool    
 25  IPv              bool    
 26  LLC         

In [49]:
# Persist the refined frame to the configured output location.
# NOTE(review): relies on `refined_df` built in the preceding cell — run cells in order.
refined_df.to_parquet(refined_parquet_path)

# IPython shell escape: print the on-disk size of the path just written
# ({...} interpolates the Python variable into the command line).
!du -sh {refined_parquet_path}

958M	/var/fasttmp/bruno_dsn/unb_cic_ds_refined.parquet
