In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib


In [2]:
# Load the dataset from the Data folder next to the notebook.
# Forward slashes are used on purpose: the earlier ".\Data\Cypher-..." literal
# contained the invalid escape sequences "\D" and "\C" (a warning on current
# Python, an error in future versions) and only resolved on Windows.
data = pd.read_csv("./Data/Cypher-CICIDS2017-balanced-dataset-2024-12-16.csv")


In [3]:
# Columns fed to the models, and the name of the label column.
features = ['DestinationPort', 'FlowDuration', 'FlowPacketsPerSec',
            'TotalFwdPackets', 'PacketLengthMean', 'SYNFlagCount']
target = 'Label'

# Text label -> integer class; any label not listed here becomes NaN.
label_codes = {'BENIGN': 0, 'ATTACK': 1}

# Build the working frame as one non-mutating chain. The previous version
# called dropna(inplace=True) and assigned a column on a column-slice of
# `data`, which is the pattern pandas reports as SettingWithCopyWarning.
# NOTE: `data` is rebound here, so re-running this cell without reloading the
# CSV would map the already-encoded 0/1 labels to NaN.
data = (
    data[features + [target]]
    .dropna()
    .assign(**{target: lambda frame: frame[target].map(label_codes)})
)


In [4]:
# Report whether any cell of the frame is infinite, then whether any is NaN.
has_infinite = np.isinf(data).values.any()
print("Is any Infinity values in data:", has_infinite)
has_missing = np.isnan(data).values.any()
print("Is any NaN values in data:", has_missing)


Is any Infinity values in data: True
Is any NaN values in data: False


In [5]:
# Keep only rows in which every value is finite. np.isfinite is False for both
# +/-inf and NaN, so one mask removes the infinite rows and the NaN rows.
finite_rows = np.isfinite(data).all(axis=1)
data = data.loc[finite_rows]


In [6]:
# Feature matrix (the selected columns) and label vector from the cleaned frame.
X, y = data[features], data['Label']

In [7]:
# Hold out 30% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [8]:
# Candidate scalers for the pipeline's 'scaler' step (currently a single option),
# shared by both grids below.
scalers = [StandardScaler()]
scaler_names = ['StandardScaler']

# Logistic Regression search space: one value per hyper-parameter.
lr_param_grid = dict(
    scaler=scalers,
    model__C=[1],
    model__solver=['lbfgs'],
)

# Random Forest search space: one value per hyper-parameter.
rf_param_grid = dict(
    scaler=scalers,
    model__n_estimators=[100],
    model__max_depth=[20],
    model__min_samples_split=[5],
)

In [9]:
# Scaler followed by Logistic Regression; the step names ('scaler', 'model')
# are the prefixes used by the keys of lr_param_grid.
lr_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('model', LogisticRegression()),
])

# 5-fold cross-validated search over lr_param_grid, scored by accuracy.
lr_grid = GridSearchCV(
    estimator=lr_pipeline,
    param_grid=lr_param_grid,
    scoring='accuracy',
    cv=5,
)
lr_grid.fit(X_train, y_train)

In [10]:
# Scaler followed by a seeded Random Forest; the step names ('scaler', 'model')
# are the prefixes used by the keys of rf_param_grid.
rf_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('model', RandomForestClassifier(random_state=42)),
])

# 5-fold cross-validated search over rf_param_grid, scored by accuracy.
rf_grid = GridSearchCV(
    estimator=rf_pipeline,
    param_grid=rf_param_grid,
    scoring='accuracy',
    cv=5,
)
rf_grid.fit(X_train, y_train)

In [11]:
# Logistic Regression: keep the best pipeline found by the search and the
# 'scaler' entry of its winning parameter set, then report both.
best_lr_model, best_lr_scaler = lr_grid.best_estimator_, lr_grid.best_params_['scaler']
print("Best Logistic Regression Parameters:", lr_grid.best_params_)
print(f"Best Logistic Regression Scaler: {best_lr_scaler.__class__.__name__}")

# Random Forest: same bookkeeping and report.
best_rf_model, best_rf_scaler = rf_grid.best_estimator_, rf_grid.best_params_['scaler']
print("Best Random Forest Parameters:", rf_grid.best_params_)
print(f"Best Random Forest Scaler: {best_rf_scaler.__class__.__name__}")

Best Logistic Regression Parameters: {'model__C': 1, 'model__solver': 'lbfgs', 'scaler': StandardScaler()}
Best Logistic Regression Scaler: StandardScaler
Best Random Forest Parameters: {'model__max_depth': 20, 'model__min_samples_split': 5, 'model__n_estimators': 100, 'scaler': StandardScaler()}
Best Random Forest Scaler: StandardScaler


In [12]:
# Predict the held-out rows with the best Logistic Regression pipeline, then
# print the per-class report and the confusion matrix under their headings.
lr_pred = best_lr_model.predict(X_test)
print("Logistic Regression Classification Report:",
      classification_report(y_test, lr_pred),
      sep="\n")
print("Confusion Matrix:", confusion_matrix(y_test, lr_pred), sep="\n")


Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.93      0.78    167184
           1       0.88      0.53      0.66    166950

    accuracy                           0.73    334134
   macro avg       0.78      0.73      0.72    334134
weighted avg       0.78      0.73      0.72    334134

Confusion Matrix:
[[155626  11558]
 [ 78234  88716]]


In [13]:
# Predict the held-out rows with the best Random Forest pipeline, then print
# the per-class report and the confusion matrix under their headings.
rf_pred = best_rf_model.predict(X_test)
print("Random Forest Classification Report:",
      classification_report(y_test, rf_pred),
      sep="\n")
print("Confusion Matrix:", confusion_matrix(y_test, rf_pred), sep="\n")

Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99    167184
           1       1.00      0.99      0.99    166950

    accuracy                           0.99    334134
   macro avg       0.99      0.99      0.99    334134
weighted avg       0.99      0.99      0.99    334134

Confusion Matrix:
[[166553    631]
 [  1051 165899]]


In [14]:
# Save the Logistic Regression model
best_lr_model = lr_grid.best_estimator_
joblib.dump(value=best_lr_model, filename='best_logistic_regression_model.pkl')
print("Đã lưu Logistic Regression model thành công!")

# Save the Random Forest model
best_rf_model = rf_grid.best_estimator_
joblib.dump(value=best_rf_model, filename='best_random_forest_model.pkl')
print("Đã lưu Random Forest model thành công!")

Đã lưu Logistic Regression model thành công!
Đã lưu Random Forest model thành công!


In [15]:
# Compare test-set accuracy of the two pipelines. Logistic Regression wins only
# when it is strictly more accurate; a tie goes to the Random Forest.
lr_accuracy = accuracy_score(y_test, lr_pred)
rf_accuracy = accuracy_score(y_test, rf_pred)

if lr_accuracy > rf_accuracy:
    best_model, best_scaler, best_model_name = (
        best_lr_model, best_lr_scaler, "Logistic Regression")
else:
    best_model, best_scaler, best_model_name = (
        best_rf_model, best_rf_scaler, "Random Forest")

print(f"Selected Best Model: {best_model_name}")

Selected Best Model: Random Forest


In [8]:
# Reload the Random Forest pipeline from the file written by the save cell.
rf_model = joblib.load(filename='best_random_forest_model.pkl')

In [9]:
import random as rd


In [17]:
# Pick one random position in the test set. randrange excludes its upper
# bound; the earlier rd.randint(0, len(X_test)) included it, so it could return
# len(X_test) and make the .iloc lookups below raise IndexError.
index = rd.randrange(len(X_test))

# Keep the sample as a one-row DataFrame for prediction so the loaded pipeline
# receives the same column names it was fitted with. The (1, n_features) array
# is kept only for display.
sample_frame = X_test.iloc[[index]]
random_sample = sample_frame.values
y_true = y_test.iloc[index]
y_pred = rf_model.predict(sample_frame)

print("Dữ liệu đầu vào (X_sample):\n", random_sample)
print("Nhãn thật (y_true):", y_true)
print("Kết quả dự đoán (y_pred):", y_pred[0])

# Map the positional index back to the row label in the full X / y, and show
# that row and its label as a cross-check against the sample above.
original_index = X_test.index[index]
print(f"Chỉ số gốc trong tập dữ liệu ban đầu: {original_index}")
print("Dữ liệu gốc trong X:\n", X.loc[original_index])
print("Nhãn gốc trong y:", y.loc[original_index])

Dữ liệu đầu vào (X_sample):
 [[8.00000000e+01 3.51848000e+05 2.55792274e+01 3.00000000e+00
  1.16330000e+03 0.00000000e+00]]
Nhãn thật (y_true): 1
Kết quả dự đoán (y_pred): 1
Chỉ số gốc trong tập dữ liệu ban đầu: 606065
Dữ liệu gốc trong X:
 DestinationPort          80.000000
FlowDuration         351848.000000
FlowPacketsPerSec        25.579227
TotalFwdPackets           3.000000
PacketLengthMean       1163.300000
SYNFlagCount              0.000000
Name: 606065, dtype: float64
Nhãn gốc trong y: 1


