# Feature Engineering - CIC IoT 2023 Dataset for Cybersecurity Research

[University of New Brunswick - Canadian Institute for Cybersecurity](https://www.unb.ca/cic/datasets/index.html)

# Imports

In [1]:
import os
import sys

import pandas as pd

from IPython.display import HTML

# Add the parent directory to the import search path.
# NOTE(review): presumably this is where the local `utils` module lives — confirm.
sys.path.append('../') 

## Definitions

In [2]:
from utils import get_constants, get_balanced_weights, get_features_list

# Pull every entry this notebook uses out of the shared constants mapping.
constants = get_constants()

# Source and destination parquet locations.
parquet_path, refined_parquet_path = (
    constants[key] for key in ('parquet_path', 'refined_parquet_path')
)

# Feature and label metadata, looked up by key.
(
    features,
    protocol_layer,
    protocol_layer_map,
    attack_category,
    attack_category_map,
) = (
    constants[key]
    for key in (
        'features',
        'protocol_layer',
        'protocol_layer_map',
        'attack_category',
        'attack_category_map',
    )
)

# Feature Engineering

In [35]:
# Load the full dataset from the parquet file named in the constants.
df = pd.read_parquet(path=parquet_path)

# Feature columns as resolved by the project helper.
features_list = get_features_list(df, constants)

# Snapshot of the starting feature names, plus an (initially empty) accumulator
# of columns that later steps decide to drop.
original_columns_set = {*features_list}
drop_columns_set = set()

In [36]:
# Size and seed of the working sample; the fixed seed keeps the draw repeatable.
SAMPLE_SIZE = 1_500_000
SAMPLE_RANDOM_STATE = 6958

# Per-row sampling weights derived from each of the two label columns.
general_label_weights = get_balanced_weights(df, 'general_label')
label_weights = get_balanced_weights(df, 'label')

# Weighted sample of the full frame, using the `general_label` weights.
df_sample = df.sample(
    n=SAMPLE_SIZE,
    weights=general_label_weights,
    random_state=SAMPLE_RANDOM_STATE,
)

In [37]:
# Print a per-column summary (name and dtype) of the full, unsampled frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46686579 entries, 0 to 46686578
Data columns (total 48 columns):
 #   Column           Dtype   
---  ------           -----   
 0   flow_duration    float32 
 1   Header_Length    float32 
 2   Protocol Type    float32 
 3   Duration         float32 
 4   Rate             float32 
 5   Srate            float32 
 6   Drate            float32 
 7   fin_flag_number  bool    
 8   syn_flag_number  bool    
 9   rst_flag_number  bool    
 10  psh_flag_number  bool    
 11  ack_flag_number  bool    
 12  ece_flag_number  bool    
 13  cwr_flag_number  bool    
 14  ack_count        float32 
 15  syn_count        float32 
 16  fin_count        float32 
 17  urg_count        float32 
 18  rst_count        float32 
 19  HTTP             bool    
 20  HTTPS            bool    
 21  DNS              bool    
 22  Telnet           bool    
 23  SMTP             bool    
 24  SSH              bool    
 25  IRC              bool    
 26  TCP         

## Drop Features with low Variance

Here we try to identify features with very low variance (features that are present in almost all records or in almost none of them) and drop them, aiming to reduce the dimensionality of our data.

In [11]:
from sklearn.feature_selection import VarianceThreshold

In [38]:
%%time
# Fit a variance filter on the weighted sample, restricted to the feature columns.
sel = VarianceThreshold(threshold=0.001)
sel.fit(df_sample[features_list])

# `get_support()` yields one boolean per input column, in the same order as
# `features_list`: True for features the filter keeps. Collect the ones it rejects.
# The set must be bound to the same name that is read below, otherwise a fresh
# kernel raises NameError.
drop_features_variance = {
    feature
    for feature, is_relevant in zip(features_list, sel.get_support())
    if not is_relevant
}

# Accumulate into the notebook-wide set of columns to drop before saving.
drop_columns_set |= drop_features_variance

# Display the features flagged by this step.
drop_features_variance

CPU times: user 840 ms, sys: 477 ms, total: 1.32 s
Wall time: 1.31 s


{'ARP',
 'DHCP',
 'Drate',
 'IRC',
 'SMTP',
 'Telnet',
 'cwr_flag_number',
 'ece_flag_number'}

As we saw in the EDA, a couple of protocols are present in only a very small number of the 46.7M records that we have, so it's natural that even a simple feature selection algorithm would detect them here.

# Save

In [26]:
# Columns present in the frame now that were not in the original feature list.
current_columns_set = set(df.columns)
added_features = current_columns_set.difference(original_columns_set)

# One <li> element per dropped feature, in alphabetical order.
dropped_items_html = ''.join(
    f"<li>{feature_name}</li>" for feature_name in sorted(drop_columns_set)
)

# Render the summary as rich HTML; this must stay the cell's last expression.
HTML(f"""
<p>
    In this process, we have <strong>dropped {len(drop_columns_set)} features</strong>:
    <ul>
        {dropped_items_html}
    </ul>
</p>
""")

In [29]:
# Build the reduced frame as a new object; `df` itself is left untouched.
columns_to_drop = list(drop_columns_set)
refined_df = df.drop(columns=columns_to_drop)

# Show the remaining columns and their dtypes.
refined_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46686579 entries, 0 to 46686578
Data columns (total 40 columns):
 #   Column           Dtype   
---  ------           -----   
 0   flow_duration    float32 
 1   Header_Length    float32 
 2   Protocol Type    float32 
 3   Duration         float32 
 4   Rate             float32 
 5   Srate            float32 
 6   fin_flag_number  bool    
 7   syn_flag_number  bool    
 8   rst_flag_number  bool    
 9   psh_flag_number  bool    
 10  ack_flag_number  bool    
 11  ack_count        float32 
 12  syn_count        float32 
 13  fin_count        float32 
 14  urg_count        float32 
 15  rst_count        float32 
 16  HTTP             bool    
 17  HTTPS            bool    
 18  DNS              bool    
 19  SSH              bool    
 20  TCP              bool    
 21  UDP              bool    
 22  ICMP             bool    
 23  IPv              bool    
 24  LLC              bool    
 25  Tot sum          float32 
 26  Min         

In [30]:
# Write the reduced frame to the refined parquet location taken from the constants.
refined_df.to_parquet(refined_parquet_path)

# Shell out to report the on-disk size of what was just written.
!du -sh {refined_parquet_path}

958M	/var/fasttmp/bruno_dsn/unb_cic_ds_refined.parquet
