In [44]:
import pandas as pd

In [45]:
df = pd.read_csv("balancedDarknet.csv")

In [46]:
df.shape

(213317, 29)

In [47]:
df.columns.to_list()

['Src IP',
 'Dst IP',
 'Dst Port',
 'Protocol',
 'Fwd Packet Length Min',
 'Bwd Packet Length Min',
 'Bwd Packet Length Mean',
 'Flow IAT Max',
 'Fwd IAT Std',
 'Bwd IAT Std',
 'Bwd IAT Max',
 'Bwd Packets/s',
 'Packet Length Min',
 'Packet Length Mean',
 'FIN Flag Count',
 'SYN Flag Count',
 'PSH Flag Count',
 'Average Packet Size',
 'Bwd Segment Size Avg',
 'Subflow Bwd Bytes',
 'Bwd Init Win Bytes',
 'Fwd Act Data Pkts',
 'Fwd Seg Size Min',
 'Idle Mean',
 'Idle Std',
 'Idle Max',
 'Idle Min',
 'application type',
 'traffic nature']

In [48]:
X = df.drop("traffic nature",axis=1)
y = df.loc[:,["traffic nature"]]

# recursif feature eliminator:

In [49]:
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier()

rfe = RFE(estimator=clf, n_features_to_select=25, step=1)

rfe.fit(X, y)

RFE_selected_features = X.columns[rfe.support_]

  estimator.fit(X[:, features], y, **fit_params)
  estimator.fit(X[:, features], y, **fit_params)
  estimator.fit(X[:, features], y, **fit_params)
  self.estimator_.fit(X[:, features], y, **fit_params)


In [50]:
RFE_selected_features= RFE_selected_features.to_list()
RFE_selected_features

['Src IP',
 'Dst IP',
 'Dst Port',
 'Protocol',
 'Fwd Packet Length Min',
 'Bwd Packet Length Min',
 'Bwd Packet Length Mean',
 'Flow IAT Max',
 'Fwd IAT Std',
 'Bwd Packets/s',
 'Packet Length Min',
 'Packet Length Mean',
 'FIN Flag Count',
 'PSH Flag Count',
 'Average Packet Size',
 'Bwd Segment Size Avg',
 'Subflow Bwd Bytes',
 'Bwd Init Win Bytes',
 'Fwd Act Data Pkts',
 'Fwd Seg Size Min',
 'Idle Mean',
 'Idle Std',
 'Idle Max',
 'Idle Min',
 'application type']

# mutual information gain

In [51]:
from sklearn.feature_selection import SelectKBest, mutual_info_classif

kbest = SelectKBest(score_func=mutual_info_classif, k=25)

kbest.fit(X, y)

MIC_selected_features = X.columns[kbest.get_support()]

  y = column_or_1d(y, warn=True)


In [52]:
MIC_selected_features=MIC_selected_features.to_list()
MIC_selected_features

['Src IP',
 'Dst IP',
 'Dst Port',
 'Protocol',
 'Fwd Packet Length Min',
 'Bwd Packet Length Min',
 'Bwd Packet Length Mean',
 'Flow IAT Max',
 'Bwd IAT Std',
 'Bwd IAT Max',
 'Bwd Packets/s',
 'Packet Length Min',
 'Packet Length Mean',
 'PSH Flag Count',
 'Average Packet Size',
 'Bwd Segment Size Avg',
 'Subflow Bwd Bytes',
 'Bwd Init Win Bytes',
 'Fwd Act Data Pkts',
 'Fwd Seg Size Min',
 'Idle Mean',
 'Idle Std',
 'Idle Max',
 'Idle Min',
 'application type']

# CHI2 features dependency

In [53]:
from sklearn.feature_selection import chi2

kbest = SelectKBest(score_func=chi2, k=25)

kbest.fit(X, y)

CHI2_selected_features = X.columns[kbest.get_support()]

In [54]:
CHI2_selected_features= CHI2_selected_features.to_list()
CHI2_selected_features

['Src IP',
 'Dst IP',
 'Dst Port',
 'Protocol',
 'Fwd Packet Length Min',
 'Bwd Packet Length Min',
 'Bwd Packet Length Mean',
 'Flow IAT Max',
 'Fwd IAT Std',
 'Bwd IAT Std',
 'Bwd IAT Max',
 'Bwd Packets/s',
 'Packet Length Min',
 'Packet Length Mean',
 'SYN Flag Count',
 'PSH Flag Count',
 'Average Packet Size',
 'Bwd Segment Size Avg',
 'Subflow Bwd Bytes',
 'Bwd Init Win Bytes',
 'Fwd Act Data Pkts',
 'Idle Mean',
 'Idle Std',
 'Idle Max',
 'Idle Min']

# ANNOVA features dependency

In [55]:
from sklearn.feature_selection import f_classif

kbest = SelectKBest(score_func=f_classif, k=25)


kbest.fit(X, y)


ANNOVA_selected_features = X.columns[kbest.get_support()]

  y = column_or_1d(y, warn=True)


In [56]:
ANNOVA_selected_features= ANNOVA_selected_features.to_list()
ANNOVA_selected_features

['Src IP',
 'Dst IP',
 'Dst Port',
 'Protocol',
 'Fwd Packet Length Min',
 'Bwd Packet Length Min',
 'Bwd Packet Length Mean',
 'Bwd IAT Max',
 'Bwd Packets/s',
 'Packet Length Min',
 'Packet Length Mean',
 'FIN Flag Count',
 'SYN Flag Count',
 'PSH Flag Count',
 'Average Packet Size',
 'Bwd Segment Size Avg',
 'Subflow Bwd Bytes',
 'Bwd Init Win Bytes',
 'Fwd Act Data Pkts',
 'Fwd Seg Size Min',
 'Idle Mean',
 'Idle Std',
 'Idle Max',
 'Idle Min',
 'application type']

# overall best features

In [57]:
commun_features = set(RFE_selected_features).intersection(MIC_selected_features, CHI2_selected_features, ANNOVA_selected_features)

print(commun_features)  

{'Average Packet Size', 'Packet Length Min', 'Dst IP', 'Bwd Init Win Bytes', 'Subflow Bwd Bytes', 'Dst Port', 'Idle Min', 'Fwd Packet Length Min', 'Idle Max', 'Protocol', 'Idle Mean', 'PSH Flag Count', 'Bwd Packets/s', 'Fwd Act Data Pkts', 'Src IP', 'Bwd Packet Length Min', 'Packet Length Mean', 'Bwd Packet Length Mean', 'Bwd Segment Size Avg', 'Idle Std'}


In [58]:
best_features = list(commun_features)

In [59]:
len(best_features)

20

In [60]:
newDF = df.loc[:,best_features]

In [61]:
newDF=pd.concat([newDF,y],axis=1)

In [62]:
newDF.shape

(213317, 21)

In [63]:
newDF

Unnamed: 0,Average Packet Size,Packet Length Min,Dst IP,Bwd Init Win Bytes,Subflow Bwd Bytes,Dst Port,Idle Min,Fwd Packet Length Min,Idle Max,Protocol,...,PSH Flag Count,Bwd Packets/s,Fwd Act Data Pkts,Src IP,Bwd Packet Length Min,Packet Length Mean,Bwd Packet Length Mean,Bwd Segment Size Avg,Idle Std,traffic nature
0,0.000000,0,1249740220,1181,0,5228,1.437770e+15,0,1.437770e+15,6,...,0,0.033292,0,177772555,0,0.000000,0.000000,0.000000,3.678736e+07,0
1,99.000000,42,3322363789,0,57,53,0.000000e+00,42,0.000000e+00,17,...,0,8.642594,0,168298718,114,66.000000,114.000000,114.000000,0.000000e+00,2
2,0.000000,0,177772555,980,0,48213,0.000000e+00,0,0.000000e+00,6,...,0,47619.047620,0,1796602589,0,0.000000,0.000000,0.000000,0.000000e+00,0
3,223.500000,37,2211055107,0,186,53,0.000000e+00,37,0.000000e+00,17,...,0,532.197978,0,168296462,373,149.000000,373.000000,373.000000,0.000000e+00,2
4,0.000000,0,177772555,0,0,42857,0.000000e+00,0,0.000000e+00,6,...,0,0.000000,0,3645881404,0,0.000000,0.000000,0.000000,0.000000e+00,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
213312,0.000000,0,2915224844,379,0,4852,0.000000e+00,0,0.000000e+00,6,...,0,25.928187,0,2211115158,0,0.000000,0.000000,0.000000,0.000000e+00,3
213313,124.385412,32,2211055107,0,92,53,0.000000e+00,32,0.000000e+00,17,...,0,860.461083,0,2211115158,184,82.923608,184.529941,184.529941,0.000000e+00,3
213314,679.935601,0,3281674540,65535,564,443,1.456330e+15,0,1.456330e+15,6,...,550,10.388149,95,167772687,0,663.955415,775.478086,775.478086,2.087574e+07,3
213315,650.371199,0,3281677079,65535,451,443,1.456400e+15,0,1.456400e+15,6,...,1869,21.970592,1093,167772687,0,650.226750,797.999022,797.999022,3.114654e+07,3


In [64]:
newDF["traffic nature"].value_counts()

0    55167
2    53969
1    52846
3    51335
Name: traffic nature, dtype: int64

In [65]:
newDF.corr()["Idle Mean"]

Average Packet Size       0.471617
Packet Length Min        -0.238852
Dst IP                   -0.011654
Bwd Init Win Bytes        0.472792
Subflow Bwd Bytes         0.337894
Dst Port                 -0.075682
Idle Min                  0.827853
Fwd Packet Length Min    -0.203956
Idle Max                  0.990212
Protocol                 -0.401364
Idle Mean                 1.000000
PSH Flag Count            0.235571
Bwd Packets/s            -0.159213
Fwd Act Data Pkts         0.168092
Src IP                   -0.118370
Bwd Packet Length Min    -0.355201
Packet Length Mean        0.496559
Bwd Packet Length Mean    0.337787
Bwd Segment Size Avg      0.337787
Idle Std                  0.180037
traffic nature            0.077312
Name: Idle Mean, dtype: float64

# idle mean , idle max, idle min are mutally highly correlated ,thus, one of them can represent all of them


In [66]:
newDF = newDF.drop(["Idle Min","Idle Max"],axis=1)

In [67]:
newDF.shape

(213317, 19)

### samething for the rest of features ,dropping one of all mutually highly correlated features to avoid issues with multicollinearity :

In [68]:
corr_matrix = newDF.corr().abs()

# Create a boolean mask for highly correlated features
high_corr_mask = corr_matrix > 0.8

# Filter the highly correlated features
high_corr_features = []
for i in range(len(high_corr_mask.columns)):
    for j in range(i):
        if high_corr_mask.iloc[i, j]:
            colname1 = high_corr_mask.columns[i]
            colname2 = high_corr_mask.columns[j]
            high_corr_features.append((colname1, colname2))

# Print the highly correlated feature pairs
print(high_corr_features)

[('Subflow Bwd Bytes', 'Average Packet Size'), ('Fwd Act Data Pkts', 'PSH Flag Count'), ('Packet Length Mean', 'Average Packet Size'), ('Packet Length Mean', 'Subflow Bwd Bytes'), ('Bwd Packet Length Mean', 'Average Packet Size'), ('Bwd Packet Length Mean', 'Subflow Bwd Bytes'), ('Bwd Packet Length Mean', 'Packet Length Mean'), ('Bwd Segment Size Avg', 'Average Packet Size'), ('Bwd Segment Size Avg', 'Subflow Bwd Bytes'), ('Bwd Segment Size Avg', 'Packet Length Mean'), ('Bwd Segment Size Avg', 'Bwd Packet Length Mean')]


In [69]:
len(high_corr_features)

11

In [70]:
X = newDF.drop("traffic nature",axis=1)
X.shape
y.shape

(213317, 1)

### to choose what column to drop , we will use feature importance provided by a random forest classifier , the one assigned to the lowest importance by the classifier will be dropped :

In [71]:
from sklearn.ensemble import RandomForestClassifier

# Assume X and y are the feature matrix and target vector, respectively
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X, y)

# Compute feature importance scores
feature_importance = dict(zip(X.columns, rf.feature_importances_))

  rf.fit(X, y)


In [72]:
feature_importance

{'Average Packet Size': 0.05020300374935943,
 'Packet Length Min': 0.017270598284792125,
 'Dst IP': 0.0885799354437332,
 'Bwd Init Win Bytes': 0.1340181379648132,
 'Subflow Bwd Bytes': 0.01456159026246243,
 'Dst Port': 0.036866528085073985,
 'Fwd Packet Length Min': 0.014458002316509498,
 'Protocol': 0.013444047895274558,
 'Idle Mean': 0.09716178251705594,
 'PSH Flag Count': 0.030198021570437987,
 'Bwd Packets/s': 0.04472827861380822,
 'Fwd Act Data Pkts': 0.023094195518449826,
 'Src IP': 0.31737621699087815,
 'Bwd Packet Length Min': 0.018035704573141517,
 'Packet Length Mean': 0.04067417630852918,
 'Bwd Packet Length Mean': 0.019817690591186723,
 'Bwd Segment Size Avg': 0.022477953287675885,
 'Idle Std': 0.017034136026818237}

In [73]:
for corr_feature in high_corr_features:
    # Choose one of the correlated features to drop
    feature1, feature2 = corr_feature
    if feature_importance[feature1] > feature_importance[feature2]:
        drop_feature = feature2
    else:
        drop_feature = feature1
        
    # Drop the feature from the dataset
    if drop_feature in newDF.columns.to_list():
        newDF.drop(columns=[drop_feature], inplace=True)

In [74]:
last_columns = newDF.columns.to_list()

In [75]:
last_columns

['Average Packet Size',
 'Packet Length Min',
 'Dst IP',
 'Bwd Init Win Bytes',
 'Dst Port',
 'Fwd Packet Length Min',
 'Protocol',
 'Idle Mean',
 'PSH Flag Count',
 'Bwd Packets/s',
 'Src IP',
 'Bwd Packet Length Min',
 'Idle Std',
 'traffic nature']

In [76]:
len(last_columns)

14

In [77]:
newDF

Unnamed: 0,Average Packet Size,Packet Length Min,Dst IP,Bwd Init Win Bytes,Dst Port,Fwd Packet Length Min,Protocol,Idle Mean,PSH Flag Count,Bwd Packets/s,Src IP,Bwd Packet Length Min,Idle Std,traffic nature
0,0.000000,0,1249740220,1181,5228,0,6,1.437770e+15,0,0.033292,177772555,0,3.678736e+07,0
1,99.000000,42,3322363789,0,53,42,17,0.000000e+00,0,8.642594,168298718,114,0.000000e+00,2
2,0.000000,0,177772555,980,48213,0,6,0.000000e+00,0,47619.047620,1796602589,0,0.000000e+00,0
3,223.500000,37,2211055107,0,53,37,17,0.000000e+00,0,532.197978,168296462,373,0.000000e+00,2
4,0.000000,0,177772555,0,42857,0,6,0.000000e+00,0,0.000000,3645881404,0,0.000000e+00,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
213312,0.000000,0,2915224844,379,4852,0,6,0.000000e+00,0,25.928187,2211115158,0,0.000000e+00,3
213313,124.385412,32,2211055107,0,53,32,17,0.000000e+00,0,860.461083,2211115158,184,0.000000e+00,3
213314,679.935601,0,3281674540,65535,443,0,6,1.456330e+15,550,10.388149,167772687,0,2.087574e+07,3
213315,650.371199,0,3281677079,65535,443,0,6,1.456400e+15,1869,21.970592,167772687,0,3.114654e+07,3


In [78]:
newDF.to_csv("selectedDarknet.csv",index=False)