In [None]:
# Filesystem locations consumed by the cells below.
# NOTE: despite its name, data_dir holds the path of a CSV *file*; it is
# joined onto root_dir when the data is loaded.
root_dir = "D:/vhproj/intrusion-network"
data_dir = "data/processed/demo_data.csv"


In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (accuracy_score, precision_recall_fscore_support, confusion_matrix, classification_report)
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import joblib

# Load data: read the processed CSV located at <root_dir>/<data_dir>.
path = os.path.join(root_dir, data_dir)
df = pd.read_csv(path)
print('Loaded df shape:', df.shape)
print('Columns:', df.columns.tolist())

# Prepare target and features
target = 'label'
if target not in df.columns:
    raise KeyError(f"Target column '{target}' not found in dataframe")

# Encode the class labels as consecutive integers. The fitted encoder stays
# available as `le` so encoded predictions can be mapped back to label values.
le = LabelEncoder()
y = le.fit_transform(df[target])

# Only numeric columns are used as features. Report whatever is left out so
# the drop is visible instead of silent.
X = df.drop(columns=[target])
X_num = X.select_dtypes(include=[np.number])
dropped_cols = [col for col in X.columns if col not in X_num.columns]
if dropped_cols:
    print('Ignoring non-numeric columns:', dropped_cols)
if X_num.shape[1] == 0:
    raise ValueError('No numeric feature columns found in dataframe')

# scikit-learn estimators reject +/-inf as well as NaN, so infinities are
# first turned into NaN and then every missing value is filled with 0.
# replace() returns a new frame, so `X` and `df` are left untouched.
X_num = X_num.replace([np.inf, -np.inf], np.nan).fillna(0)
print('Numeric features shape:', X_num.shape)

# Hold out 20% of the rows for evaluation; stratify=y keeps the class
# proportions the same in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X_num,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
print('Train/test sizes:', len(X_train), len(X_test))

# Fit a 200-tree random forest (fixed seed for reproducibility, n_jobs=-1
# to parallelise across processors). fit() returns the estimator itself.
model = RandomForestClassifier(
    n_estimators=200,
    n_jobs=-1,
    random_state=42,
).fit(X_train, y_train)

# Score the held-out rows: overall accuracy plus support-weighted averages
# of precision / recall / F1 (the fourth returned item is not needed).
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1, _ = precision_recall_fscore_support(
    y_test, y_pred, average='weighted'
)
print(f'Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}')

# Per-class evaluation, labelled with the original class names instead of the
# integers produced by the label encoder.
# Passing the full list of encoded ids explicitly keeps rows/columns aligned
# with `class_names` even if some class never shows up in y_test / y_pred.
class_ids = np.arange(len(le.classes_))
class_names = [str(c) for c in le.classes_]

# Classification report
# zero_division=0 yields the same numbers as the default but without a
# warning for classes that received no predictions.
print(classification_report(
    y_test,
    y_pred,
    labels=class_ids,
    target_names=class_names,
    zero_division=0,
))

# Confusion matrix (rows = true class, columns = predicted class)
plt.figure(figsize=(12,10))
cm = confusion_matrix(y_test, y_pred, labels=class_ids)
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()

# Feature importances: pair every numeric feature with the importance the
# fitted forest assigned to it, then show the 30 highest-ranked ones.
importances = model.feature_importances_
importance_table = pd.DataFrame({'feature': X_num.columns, 'importance': importances})
importance_table = importance_table.sort_values('importance', ascending=False)
feat_df = importance_table.head(30)
print(feat_df.to_string(index=False))
# Persist the fitted model and the label encoder side by side under
# <root_dir>/pretrained, creating the directory if it does not exist yet.
save_dir = os.path.join(root_dir, 'pretrained')
os.makedirs(save_dir, exist_ok=True)

# File name -> object to serialise; dict order keeps the model written first.
artifacts = {
    'random_forest_model.joblib': model,
    'label_encoder.joblib': le,
}
for artifact_name, artifact in artifacts.items():
    joblib.dump(artifact, os.path.join(save_dir, artifact_name))
print('Saved RandomForest model and label encoder to', save_dir)
