<a href="https://colab.research.google.com/github/karunaprakash062/Major_Project/blob/main/featureselection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np

In [3]:
data = pd.read_csv('/content/drive/MyDrive/NSL-KDD/NSL_KDD_Data.csv')

In [4]:
unique_train_classes = set(data['class'])

In [5]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
data['service'] = encoder.fit_transform(data['service'])
data['protocol_type'] = encoder.fit_transform(data['protocol_type'])
data['flag'] = encoder.fit_transform(data['flag'])

In [6]:
removed_feature = data.pop('num_outbound_cmds')

In [7]:
target = data.pop('class')

In [8]:
target = encoder.fit_transform(target)

In [9]:
target = pd.Series(target)

Filter Methods for features selection

In [10]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2, f_classif, mutual_info_classif


X, y = data, target
feature_names = data.columns

# Chi-square test
chi2_selector = SelectKBest(chi2, k=10)
X_chi2_selected = chi2_selector.fit_transform(X, y)
selected_chi2_indices = chi2_selector.get_support(indices=True)
selected_chi2_features = [feature_names[i] for i in selected_chi2_indices]

# ANOVA F-value (f_classif)
fvalue_selector = SelectKBest(f_classif, k=10)
X_fvalue_selected = fvalue_selector.fit_transform(X, y)
selected_fvalue_indices = fvalue_selector.get_support(indices=True)
selected_fvalue_features = [feature_names[i] for i in selected_fvalue_indices]

# Mutual Information
mi_selector = SelectKBest(mutual_info_classif, k=10)
X_mi_selected = mi_selector.fit_transform(X, y)
selected_mi_indices = mi_selector.get_support(indices=True)
selected_mi_features = [feature_names[i] for i in selected_mi_indices]

print("Chi-square selected features:", selected_chi2_features)
print("ANOVA F-value selected features:", selected_fvalue_features)
print("Mutual Information selected features:", selected_mi_features)

Chi-square selected features: ['duration', 'service', 'src_bytes', 'dst_bytes', 'wrong_fragment', 'hot', 'count', 'srv_count', 'dst_host_count', 'dst_host_srv_count']
ANOVA F-value selected features: ['flag', 'land', 'wrong_fragment', 'serror_rate', 'srv_serror_rate', 'same_srv_rate', 'dst_host_same_srv_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate']
Mutual Information selected features: ['service', 'flag', 'src_bytes', 'count', 'same_srv_rate', 'diff_srv_rate', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_serror_rate']


In [11]:
# Convert the selected features lists to sets
set_chi2 = set(selected_chi2_features)
set_fvalue = set(selected_fvalue_features)
set_mi = set(selected_mi_features)

# Find common features between Chi-square and ANOVA F-value
common_chi2_fvalue = set_chi2.intersection(set_fvalue)

# Find common features between Chi-square and Mutual Information
common_chi2_mi = set_chi2.intersection(set_mi)

# Find common features between ANOVA F-value and Mutual Information
common_fvalue_mi = set_fvalue.intersection(set_mi)

# Find common features present in any two sets among the three
common_any_two = common_chi2_fvalue.union(common_chi2_mi, common_fvalue_mi)

print("Common features present in any two sets among the three:", common_any_two)


Common features present in any two sets among the three: {'dst_host_srv_count', 'wrong_fragment', 'flag', 'src_bytes', 'service', 'same_srv_rate', 'dst_host_same_srv_rate', 'dst_host_serror_rate', 'count'}


Wrapper Methods for features selection

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE


# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize Random Forest classifier
clf = RandomForestClassifier(random_state=42)

# Initialize RFE selector
rfe_selector = RFE(estimator=clf, n_features_to_select=10, step=1)

# Fit RFE selector to training data
rfe_selector.fit(X_train, y_train)

# Get selected feature indices
selected_indices = rfe_selector.support_

# Get selected feature names
selected_features = [feature_names[i] for i, selected in enumerate(selected_indices) if selected]

print("Selected features using RFE:", selected_features)

Selected features using RFE: ['protocol_type', 'flag', 'src_bytes', 'dst_bytes', 'count', 'same_srv_rate', 'diff_srv_rate', 'dst_host_diff_srv_rate', 'dst_host_serror_rate', ' d_level']


Dimensionality Reduction

In [13]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.manifold import TSNE

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# PCA
pca = PCA(n_components=10)
X_pca = pca.fit_transform(X_scaled)

# LDA
lda = LDA(n_components=10)
X_lda = lda.fit_transform(X_scaled, y)


print("PCA Reduced Dimension Data:")
print(X_pca[:5])
print("\nLDA Reduced Dimension Data:")
print(X_lda[:5])

PCA Reduced Dimension Data:
[[-0.98733665 -0.11234776 -0.03161786 -0.1554983   0.21487399  0.23593905
  -0.43658733  0.09837595  0.0689059   0.03954749]
 [ 0.64880745  0.88215687  0.1941114  -0.97405637  1.31600367  2.22649351
  -3.39679643  0.85448652 -0.86926176  0.26141247]
 [ 3.62482172 -2.00677961  0.06426656 -0.2994913  -0.05018042 -0.47213191
  -0.19064406 -0.03954534  0.25837693  0.09134273]
 [-2.65138475 -0.66838408 -0.10273376  0.52054588 -0.36706718 -1.04948625
   0.1359334  -0.02593796  0.08346168 -0.14063829]
 [-2.44103577 -0.59066356 -0.17549714  1.36715049 -0.19769866 -0.21213396
   0.15432648 -0.05113339 -0.04340128 -0.17092763]]

LDA Reduced Dimension Data:
[[ 1.27113407  2.94265533 -0.31996107 -0.58502933 -1.21649681 -0.44434829
  -0.13085822  1.00659148  0.06396474  0.2585262 ]
 [ 1.25830168 -0.01387044  1.37340558  1.63144876  3.53858     1.72794255
   2.21244954 -1.01030419 -0.88142249 -0.07404923]
 [ 1.47777194 -7.27495958  0.40337506  0.11007694  0.26049845 -0.48

Embedded Methods

In [14]:
from sklearn.linear_model import Lasso, ElasticNet

# 1. LASSO
lasso = Lasso(alpha=0.1)
lasso.fit(X_train, y_train)
selected_features_lasso = [feature_names[i] for i in range(len(lasso.coef_)) if abs(lasso.coef_[i]) > 0]
print("Selected features using LASSO:", selected_features_lasso)

# 2. Elastic Net
elastic_net = ElasticNet(alpha=0.1, l1_ratio=0.5)
elastic_net.fit(X_train, y_train)
selected_features_elastic_net = [feature_names[i] for i in range(len(elastic_net.coef_)) if abs(elastic_net.coef_[i]) > 0]
print("Selected features using Elastic Net:", selected_features_elastic_net)

Selected features using LASSO: ['duration', 'wrong_fragment', 'logged_in', 'is_guest_login', 'srv_count', 'srv_serror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_srv_serror_rate']
Selected features using Elastic Net: ['duration', 'protocol_type', 'service', 'wrong_fragment', 'hot', 'num_failed_logins', 'logged_in', 'is_guest_login', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 'dst_host_srv_serror_rate']


In [15]:
common_features = set(selected_features_lasso).intersection(selected_features_elastic_net)

print("Common features selected by LASSO and Elastic Net:", common_features)

Common features selected by LASSO and Elastic Net: {'dst_host_diff_srv_rate', 'wrong_fragment', 'dst_host_srv_serror_rate', 'dst_host_srv_diff_host_rate', 'srv_count', 'duration', 'same_srv_rate', 'dst_host_same_src_port_rate', 'is_guest_login', 'srv_diff_host_rate', 'diff_srv_rate', 'srv_serror_rate', 'logged_in'}


In [16]:
filter_method_features = data[common_any_two]

  filter_method_features = data[common_any_two]


In [17]:
test_data = pd.read_csv('/content/drive/MyDrive/NSL-KDD/KDDTest+.csv')

In [18]:
test_data = test_data[test_data['class'].isin(unique_train_classes)]

In [21]:
test_data['service'] = encoder.fit_transform(test_data['service'])
test_data['protocol_type'] = encoder.fit_transform(test_data['protocol_type'])
test_data['flag'] = encoder.fit_transform(test_data['flag'])
test_data['class'] = encoder.fit_transform(test_data['class'])

In [22]:
test_data

Unnamed: 0,duration,protocol_type,service,flag,src_bytes,dst_bytes,land,wrong_fragment,urgent,hot,...,dst_host_same_srv_rate,dst_host_diff_srv_rate,dst_host_same_src_port_rate,dst_host_srv_diff_host_rate,dst_host_serror_rate,dst_host_srv_serror_rate,dst_host_rerror_rate,dst_host_srv_rerror_rate,class,d_level
0,0,1,44,1,0,0,0,0,0,0,...,0.04,0.06,0.00,0.00,0.00,0.0,1.00,1.00,9,21
1,0,1,44,1,0,0,0,0,0,0,...,0.00,0.06,0.00,0.00,0.00,0.0,1.00,1.00,9,21
2,2,1,19,9,12983,0,0,0,0,0,...,0.61,0.04,0.61,0.02,0.00,0.0,0.00,0.00,11,21
5,0,1,22,9,267,14515,0,0,0,0,...,1.00,0.00,0.01,0.03,0.01,0.0,0.00,0.00,11,21
6,0,1,48,9,1022,387,0,0,0,0,...,0.11,0.72,0.00,0.00,0.00,0.0,0.72,0.04,11,21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22538,0,0,14,9,1032,0,0,0,0,0,...,1.00,0.00,1.00,0.00,0.00,0.0,0.00,0.00,18,20
22539,0,1,48,9,794,333,0,0,0,0,...,0.72,0.06,0.01,0.01,0.01,0.0,0.00,0.00,11,21
22540,0,1,22,9,317,938,0,0,0,0,...,1.00,0.00,0.01,0.01,0.01,0.0,0.00,0.00,11,21
22541,0,1,22,9,54540,8314,0,0,0,2,...,1.00,0.00,0.00,0.00,0.00,0.0,0.07,0.07,0,15
