# Feature Selection

In [1]:
# IMPORT DEPENDENCIES 
import pandas as pd
import numpy as np
import plotly.express as px
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [2]:
# READ DATA 
internet_firewall = pd.read_csv("C:/Users/kayan/UCD/STA221/Final_Project/Data/log2.csv") 
# REMOVE ROWS WITH NULL VALUES 
internet_firewall = internet_firewall.dropna()
internet_firewall.head()

Unnamed: 0,Source Port,Destination Port,NAT Source Port,NAT Destination Port,Action,Bytes,Bytes Sent,Bytes Received,Packets,Elapsed Time (sec),pkts_sent,pkts_received
0,57222,53,54587,53,allow,177,94,83,2,30,1,1
1,56258,3389,56258,3389,allow,4768,1600,3168,19,17,10,9
2,6881,50321,43265,50321,allow,238,118,120,2,1199,1,1
3,50553,3389,50553,3389,allow,3327,1438,1889,15,17,8,7
4,50002,443,45848,443,allow,25358,6778,18580,31,16,13,18


In [3]:
# CHANGE CLASS TO CATEGORICAL VARIABLE 
internet_firewall['Action'] = internet_firewall.Action.astype('category')

## I. Variance Threshold 

In [4]:
# IMPORT DEPENDENCIES 
from sklearn.feature_selection import VarianceThreshold

In [5]:
# CHOOSE QUANTITATIVE VARIABLES 
X = internet_firewall[['Source Port', 'Destination Port', 'NAT Source Port', 
                'Bytes', 'Bytes Sent', 'Bytes Received', 'Packets', 
                'Elapsed Time (sec)', 'pkts_sent', 'pkts_received']]

In [6]:
# SELECT FEATURES USING 0 VARIANCE THRESHOLD 
var_thresh = VarianceThreshold(threshold =0.0) #set threshold to 0.0 
var_thresh.fit(X) 
print(var_thresh.get_support())
print(var_thresh.fit_transform(X).var(axis=0)) #print variances 

[ True  True  True  True  True  True  True  True  True  True]
[2.32733214e+08 3.40988951e+08 4.82703838e+08 3.15663741e+13
 1.46544219e+13 6.06729965e+12 2.63473110e+07 9.14817216e+04
 1.03609743e+07 4.94313096e+06]


## II. Filter-based methods

### 1. Chi-square Test

In [7]:
# IMPORT DEPENDENCIES 
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler

In [8]:
feature_names = ['Source Port', 'Destination Port', 'NAT Source Port', 
                'Bytes', 'Bytes Sent', 'Bytes Received', 'Packets', 
                'Elapsed Time (sec)', 'pkts_sent', 'pkts_received'] #list of features 

In [9]:
# GET VARIABLES 
X = internet_firewall[feature_names] #independent variables 
y = internet_firewall['Action'] #target variable
X_norm = MinMaxScaler().fit_transform(X) #scale features (models can be sensitive to scale of input)

In [10]:
# GET CHI-SQUARE VALUES FOR ALL FEATURES 
chi_scores = chi2(X_norm,y) #compute chi-sq values 
scores_df = pd.DataFrame({'feature_names': feature_names, 
                          'chi-square_values': chi_scores[0], 
                          'p_values': chi_scores[1]})
print(scores_df.sort_values('chi-square_values', ascending= False))

        feature_names  chi-square_values       p_values
1    Destination Port       15541.300414   0.000000e+00
2     NAT Source Port       14245.855522   0.000000e+00
0         Source Port         819.265928  2.869762e-177
7  Elapsed Time (sec)         295.351296   1.008912e-63
5      Bytes Received          11.310435   1.016039e-02
9       pkts_received           9.121870   2.771368e-02
6             Packets           4.773962   1.891170e-01
3               Bytes           3.711999   2.942893e-01
8           pkts_sent           2.624180   4.532662e-01
4          Bytes Sent           1.141353   7.671017e-01


Note: Choose top 6 since the p-values of last 4 features are not statistically significant 

In [11]:
# GET TOP 5 FEATURES 
chi_selector = SelectKBest(chi2, k=6) #get top 6 features 
chi_selector.fit(X_norm, y)
chi_support = chi_selector.get_support()
chi_feature = X.loc[:,chi_support].columns.tolist()
print(f"{len(chi_feature)} selected features: {chi_feature}")

6 selected features: ['Source Port', 'Destination Port', 'NAT Source Port', 'Bytes Received', 'Elapsed Time (sec)', 'pkts_received']


## III. Wrapper-based methods

### 1. Recursive Feature Elimination

#### RFE - Logistic Regression

In [12]:
# IMPORT DEPENDENCIES 
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [13]:
# PERFORM RFE AND SELECT TOP 6 FEATURES 
estimator = LogisticRegression(max_iter=1000) #define estimator 
rfe_lr = RFE(estimator, n_features_to_select=6, step=1)
rfe_lr.fit(X_norm, y)
rfe_lr_support = rfe_lr.get_support()
rfe_lr_feature = X.loc[:,rfe_lr_support].columns.tolist()
#print(f"Initial features: {X.columns.tolist()}")
#print(f"Feature Ranking: {rfe_lr.ranking_}")

In [14]:
# CONVERT TO DATAFRAME 
df = pd.DataFrame({'feature_names': X.columns.tolist(), 
                          'feature_ranking': rfe_lr.ranking_, 'support': rfe_lr_support})
print(df.sort_values('feature_ranking'))
print(f"{len(rfe_lr_feature)} selected features: {rfe_lr_feature} ")

        feature_names  feature_ranking  support
0         Source Port                1     True
1    Destination Port                1     True
2     NAT Source Port                1     True
5      Bytes Received                1     True
7  Elapsed Time (sec)                1     True
9       pkts_received                1     True
6             Packets                2    False
3               Bytes                3    False
8           pkts_sent                4    False
4          Bytes Sent                5    False
6 selected features: ['Source Port', 'Destination Port', 'NAT Source Port', 'Bytes Received', 'Elapsed Time (sec)', 'pkts_received'] 


#### RFECV - Logistic Regression

In [15]:
# IMPORT DEPENDENCIES 
from sklearn.feature_selection import RFECV
from sklearn.tree import DecisionTreeRegressor

In [16]:
# PERFORM RFECV
estimator = LogisticRegression(max_iter=1000) #define estimator 
rfecv_dtr = RFECV(estimator=estimator,step=1, cv=5)
rfecv_dtr.fit(X_norm, y)
#print(f"Feature Ranking: {rfecv_dtr.ranking_}")
#print(f"Initial features: {X.columns.tolist()}")
#print(rfecv_dtr.support_)

In [17]:
# CONVERT TO DATAFRAME 
df = pd.DataFrame({'feature_names': X.columns.tolist(), 
                          'feature_ranking': rfecv_dtr.ranking_, 'support': rfecv_dtr.support_})
print(df.sort_values('feature_ranking'))
print(f"Number of features selected with CV: {rfecv_dtr.n_features_}")

        feature_names  feature_ranking  support
1    Destination Port                1     True
2     NAT Source Port                1     True
7  Elapsed Time (sec)                1     True
0         Source Port                2    False
5      Bytes Received                3    False
9       pkts_received                4    False
6             Packets                5    False
3               Bytes                6    False
8           pkts_sent                7    False
4          Bytes Sent                8    False
Number of features selected with CV: 3


### 2. Forward Feature Selection

In [18]:
# IMPORT DEPENDENCIES
from sklearn.feature_selection import SequentialFeatureSelector

In [19]:
# PERFORM FORWARD SELECTION AND AUTO SELECT NUMBER OF FEATURES 
sfs = SequentialFeatureSelector(estimator, n_features_to_select='auto', direction='forward')
sfs.fit(X_norm, y)
sfs_support = sfs.get_support()

# CONVERT TO DATAFRAME 
df = pd.DataFrame({'feature_names': X.columns.tolist(), 'support': sfs_support})
print(df)
sfs_feature = X.loc[:,sfs_support].columns.tolist()
print(f"Optimal subset size using forward feature selection: {len(sfs_feature)}")

        feature_names  support
0         Source Port    False
1    Destination Port     True
2     NAT Source Port     True
3               Bytes     True
4          Bytes Sent     True
5      Bytes Received    False
6             Packets    False
7  Elapsed Time (sec)     True
8           pkts_sent    False
9       pkts_received    False
Optimal subset size using forward feature selection: 5


### 3. Backward Elimination

In [20]:
# PERFORM BACKWARD SELECTION AND AUTO SELECT NUMBER OF FEATURES 
sfs2 = SequentialFeatureSelector(estimator, n_features_to_select='auto', direction='backward')
sfs2.fit(X_norm, y)
sfs2_support = sfs2.get_support()

# CONVERT TO DATAFRAME 
df = pd.DataFrame({'feature_names': X.columns.tolist(), 'support': sfs2_support})
print(df)
sfs2_feature = X.loc[:,sfs2_support].columns.tolist()
print(f"Optimal subset size using forward feature selection: {len(sfs2_feature)}")

        feature_names  support
0         Source Port    False
1    Destination Port     True
2     NAT Source Port     True
3               Bytes    False
4          Bytes Sent    False
5      Bytes Received    False
6             Packets    False
7  Elapsed Time (sec)     True
8           pkts_sent     True
9       pkts_received     True
Optimal subset size using forward feature selection: 5


## IV. Embedded methods

### 1. SelectFromModel

In [21]:
# IMPORT DEPENDENCIES
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV

In [22]:
#set hyperparam n_estimators=50 
embedded_rf = SelectFromModel(estimator, max_features=6)
embedded_rf.fit(X_norm, y)
embedded_rf_support = embedded_rf.get_support()
embedded_rf_feature = X.loc[:,embedded_rf_support].columns.tolist()

# CONVERT TO DATAFRAME 
df = pd.DataFrame({'feature_names': X.columns.tolist(), 'support': embedded_rf_support})
print(df)
print(f"{len(embedded_rf_feature)} selected features: {embedded_rf_feature} ")

        feature_names  support
0         Source Port    False
1    Destination Port     True
2     NAT Source Port     True
3               Bytes    False
4          Bytes Sent    False
5      Bytes Received    False
6             Packets    False
7  Elapsed Time (sec)     True
8           pkts_sent    False
9       pkts_received    False
3 selected features: ['Destination Port', 'NAT Source Port', 'Elapsed Time (sec)'] 


## Combined results

In [23]:
featureselection_df = pd.DataFrame({'Feature':feature_names,
                                    'Chi-2':chi_support, 
                                    'RFE_LR':rfe_lr_support , 
                                    'RFECV_LR': rfecv_dtr.support_ , 
                                    'FFS_LR':sfs_support, 
                                    'BE_LR':sfs2_support,
                                    'SelectFromModel_LR': embedded_rf_support})
featureselection_df['Total'] = np.sum(featureselection_df.iloc[:, 1:], axis=1) #sum Trues 
featureselection_df.sort_values('Total', ascending=False)

Unnamed: 0,Feature,Chi-2,RFE_LR,RFECV_LR,FFS_LR,BE_LR,SelectFromModel_LR,Total
1,Destination Port,True,True,True,True,True,True,6
2,NAT Source Port,True,True,True,True,True,True,6
7,Elapsed Time (sec),True,True,True,True,True,True,6
9,pkts_received,True,True,False,False,True,False,3
0,Source Port,True,True,False,False,False,False,2
5,Bytes Received,True,True,False,False,False,False,2
3,Bytes,False,False,False,True,False,False,1
4,Bytes Sent,False,False,False,True,False,False,1
8,pkts_sent,False,False,False,False,True,False,1
6,Packets,False,False,False,False,False,False,0


# Random Forest

#### Top 3 features 

In [115]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

In [116]:
features = ['Destination Port', 'NAT Source Port', 'Elapsed Time (sec)'] #top 3 features 
X = internet_firewall[features] #independent variables 
X_norm = MinMaxScaler().fit_transform(X) #scale features (models can be sensitive to scale of input)

In [117]:
# SPLIT DATA INTO TRAINING AND TEST SET
X_train, X_test, y_train, y_test = train_test_split(X_norm, y, test_size=0.2, random_state=42)

In [118]:
# FIT RANDOM FOREST 
rf_clf = RandomForestClassifier() 
rf_clf.fit(X_train, y_train)

In [119]:
#PREDICTION AND ACCURACY
y_pred = rf_clf.predict(X_test)
acc1 = metrics.accuracy_score(y_test, y_pred)
print("Accuracy: ", acc1)

Accuracy:  0.9981689173723964


In [120]:
# F1-SCORE
scorea1 = metrics.f1_score(y_test, y_pred,average='micro')
print("F1-score: ", scorea1)
scorea2 = metrics.f1_score(y_test, y_pred,average='macro')
print("F1-score: ", scorea2)
scorea3 = metrics.f1_score(y_test, y_pred,average='weighted')
print("F1-score: ", scorea3)

F1-score:  0.9981689173723964
F1-score:  0.9791621419684439
F1-score:  0.9981727897816209


In [121]:
# FEATURE IMPORTANCE
feature_imp = pd.Series(rf_clf.feature_importances_, index = features).sort_values(ascending=False)
feature_imp

Destination Port      0.508699
Elapsed Time (sec)    0.248088
NAT Source Port       0.243213
dtype: float64

#### All features 

In [122]:
features = ['Source Port', 'Destination Port', 'NAT Source Port', 
                'Bytes', 'Bytes Sent', 'Bytes Received', 'Packets', 
                'Elapsed Time (sec)', 'pkts_sent', 'pkts_received'] #all features 
X = internet_firewall[features] #independent variables 
X_norm = MinMaxScaler().fit_transform(X) #scale features (models can be sensitive to scale of input)

In [123]:
# SPLIT DATA INTO TRAINING AND TEST SET
X_train, X_test, y_train, y_test = train_test_split(X_norm, y, test_size=0.2, random_state=42)

In [124]:
# FIT RANDOM FOREST 
rf_clf1 = RandomForestClassifier() 
rf_clf1.fit(X_train, y_train)

In [125]:
#PREDICTION AND ACCURACY
y_pred = rf_clf1.predict(X_test)
acc2 = metrics.accuracy_score(y_test, y_pred)
print("Accuracy: ", acc2)

Accuracy:  0.9981689173723964


In [126]:
# F1-SCORE
scoreb1 = metrics.f1_score(y_test, y_pred,average='micro')
print("F1-score: ", scoreb1)
scoreb2 = metrics.f1_score(y_test, y_pred,average='macro')
print("F1-score: ", scoreb2)
scoreb3 = metrics.f1_score(y_test, y_pred,average='weighted')
print("F1-score: ", scoreb3)

F1-score:  0.9981689173723964
F1-score:  0.9148329708333357
F1-score:  0.9981304432004069


In [127]:
# FEATURE IMPORTANCE
feature_imp = pd.Series(rf_clf1.feature_importances_, index = features).sort_values(ascending=False)
feature_imp

Destination Port      0.272707
NAT Source Port       0.215877
Elapsed Time (sec)    0.197022
Packets               0.117844
Source Port           0.066619
Bytes Received        0.062754
pkts_received         0.058732
pkts_sent             0.007910
Bytes Sent            0.000321
Bytes                 0.000213
dtype: float64

#### Top 6 features 

In [128]:
features = ['Destination Port', 'NAT Source Port', 'Elapsed Time (sec)', 'pkts_received', 'Source Port', 'Bytes Received'] #all features 
X = internet_firewall[features] #independent variables 
X_norm = MinMaxScaler().fit_transform(X) #scale features (models can be sensitive to scale of input)

In [129]:
# SPLIT DATA INTO TRAINING AND TEST SET
X_train, X_test, y_train, y_test = train_test_split(X_norm, y, test_size=0.2, random_state=42)

In [130]:
# FIT RANDOM FOREST 
rf_clf2 = RandomForestClassifier() 
rf_clf2.fit(X_train, y_train)

In [131]:
#PREDICTION AND ACCURACY
y_pred = rf_clf2.predict(X_test)
acc3 = metrics.accuracy_score(y_test, y_pred)
print("Accuracy: ", acc3)

Accuracy:  0.9980163271534295


In [132]:
# F1-SCORE
scorec1 = metrics.f1_score(y_test, y_pred,average='micro')
print("F1-score: ", scorec1)
scorec2 = metrics.f1_score(y_test, y_pred,average='macro')
print("F1-score: ", scorec2)
scorec3 = metrics.f1_score(y_test, y_pred,average='weighted')
print("F1-score: ", scorec3)

F1-score:  0.9980163271534295
F1-score:  0.956336085417724
F1-score:  0.9980175805582516


In [133]:
# FEATURE IMPORTANCE
feature_imp = pd.Series(rf_clf2.feature_importances_, index = features).sort_values(ascending=False)
feature_imp

Destination Port      0.351629
Elapsed Time (sec)    0.255207
NAT Source Port       0.145146
pkts_received         0.124356
Bytes Received        0.069429
Source Port           0.054234
dtype: float64

#### Top 4 features 

In [134]:
features = ['Destination Port', 'NAT Source Port', 'Elapsed Time (sec)', 'pkts_received'] #all features 
X = internet_firewall[features] #independent variables 
X_norm = MinMaxScaler().fit_transform(X) #scale features (models can be sensitive to scale of input)

In [135]:
# SPLIT DATA INTO TRAINING AND TEST SET
X_train, X_test, y_train, y_test = train_test_split(X_norm, y, test_size=0.2, random_state=42)

In [136]:
# FIT RANDOM FOREST 
rf_clf2 = RandomForestClassifier() 
rf_clf2.fit(X_train, y_train)

In [137]:
#PREDICTION AND ACCURACY
y_pred = rf_clf2.predict(X_test)
acc4 = metrics.accuracy_score(y_test, y_pred)
print("Accuracy: ", acc4)

Accuracy:  0.9987792782482643


In [138]:
# F1-SCORE
scored1 = metrics.f1_score(y_test, y_pred,average='micro')
print("F1-score: ", scored1)
scored2 = metrics.f1_score(y_test, y_pred,average='macro')
print("F1-score: ", scored2)
scored3 = metrics.f1_score(y_test, y_pred,average='weighted')
print("F1-score: ", scored3)

F1-score:  0.9987792782482643
F1-score:  0.9796275555697924
F1-score:  0.998782415098926


#### Top 5 features 

In [139]:
features = ['Destination Port', 'NAT Source Port', 'Elapsed Time (sec)', 'pkts_received', 'Source Port'] #all features 
X = internet_firewall[features] #independent variables 
X_norm = MinMaxScaler().fit_transform(X) #scale features (models can be sensitive to scale of input)

In [140]:
# SPLIT DATA INTO TRAINING AND TEST SET
X_train, X_test, y_train, y_test = train_test_split(X_norm, y, test_size=0.2, random_state=42)

In [141]:
# FIT RANDOM FOREST 
rf_clf2 = RandomForestClassifier() 
rf_clf2.fit(X_train, y_train)

In [142]:
#PREDICTION AND ACCURACY
y_pred = rf_clf2.predict(X_test)
acc5 = metrics.accuracy_score(y_test, y_pred)
print("Accuracy: ", acc5)

Accuracy:  0.997940032043946


In [143]:
# F1-SCORE
scoree1 = metrics.f1_score(y_test, y_pred,average='micro')
print("F1-score: ", scoree1)
scoree2 = metrics.f1_score(y_test, y_pred,average='macro')
print("F1-score: ", scoree2)
scoree3 = metrics.f1_score(y_test, y_pred,average='weighted')
print("F1-score: ", scoree3)

F1-score:  0.997940032043946
F1-score:  0.929779440936172
F1-score:  0.9979311165770223


## Scores  

In [145]:
num_features = ['Top 3', 'All', 'Top 6', 
                'Top 4', 'Top 5']
scores_df = pd.DataFrame({'Number of features':num_features,
                                    'f1_score (micro)': [scorea1, scoreb1, scorec1, scored1,scoree1],
                                    'f1_score (macro)': [scorea2, scoreb2, scorec2, scored2,scoree2],
                                    'f1_score (weight)': [scorea3, scoreb3, scorec3, scored3,scoree3]})
scores_df['Average'] = np.mean(scores_df, axis=1)
scores_df.sort_values('Average', ascending=False)

  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


Unnamed: 0,Number of features,f1_score (micro),f1_score (macro),f1_score (weight),Average
3,Top 4,0.998779,0.979628,0.998782,0.992396
0,Top 3,0.998169,0.979162,0.998173,0.991835
2,Top 6,0.998016,0.956336,0.998018,0.984123
4,Top 5,0.99794,0.929779,0.997931,0.975217
1,All,0.998169,0.914833,0.99813,0.970377
