多元線性回歸分析模板 (CRISP-DM 流程)

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the raw attack records from the CSV export.
df = pd.read_csv('cybersecurity_attacks.csv')

# --- Data Preparation ---

# 1. Missing-value handling: these columns are removed outright (no imputation).
columns_to_drop_missing = [
    'Malware Indicators',
    'Alerts/Warnings',
    'Proxy Information',
    'Firewall Logs',
    'IDS/IPS Alerts',
]

# 2. Columns treated as irrelevant to the regression and excluded from the features.
columns_to_drop_irrelevant = [
    'Timestamp',
    'Source IP Address',
    'Destination IP Address',
    'User Information',
    'Device Information',
    'Geo-location Data',
    'Payload Data',
]

# Apply both drops in the same order as the two numbered steps above.
df = (
    df
    .drop(columns=columns_to_drop_missing)
    .drop(columns=columns_to_drop_irrelevant)
)

# 3. One-hot encode every remaining object-dtype column; drop_first=True drops
#    the first level of each column so the dummies are not perfectly collinear.
object_columns = df.select_dtypes(include=['object']).columns
df_encoded = pd.get_dummies(df, columns=object_columns, drop_first=True)

# Target is 'Anomaly Scores'; every other encoded column is a feature.
y = df_encoded['Anomaly Scores']
X = df_encoded.drop(columns='Anomaly Scores')

# 4. Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# --- Feature Scaling ---
# The scaler's statistics come from the training split only; the same fitted
# scaler is then applied to both splits so no test information leaks in.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --- Feature Selection with Lasso ---
print("\nPerforming Feature Selection with Lasso...")
# alpha is the L1 regularization strength; larger values zero out more coefficients.
lasso = Lasso(alpha=0.1)
lasso.fit(X_train_scaled, y_train)

# A feature counts as "selected" when Lasso left its coefficient non-zero.
# The mask is positional, so it lines up with the column order of X.
selected_features_mask = np.not_equal(lasso.coef_, 0)
selected_features = X.columns[selected_features_mask]
print(f"\nFeatures selected by Lasso: {selected_features.tolist()}")

# Refit an ordinary linear regression on the Lasso-selected columns, report
# test-set metrics, and save an actual-vs-predicted plot with approximate
# prediction intervals. Nothing is trained or plotted when Lasso kept no features.
if len(selected_features) == 0:
    print("\nLasso selected 0 features. The model cannot be improved with this method.")
else:
    # --- Modeling with Selected Features ---
    print("\nTraining a new Linear Regression model with selected features...")
    # The refit uses the unscaled columns; only the Lasso step saw scaled data.
    X_train_selected = X_train[selected_features]
    X_test_selected = X_test[selected_features]

    model_selected = LinearRegression()
    model_selected.fit(X_train_selected, y_train)

    # --- Evaluation of the new model ---
    y_pred_selected = model_selected.predict(X_test_selected)

    r2_selected = r2_score(y_test, y_pred_selected)
    rmse_selected = np.sqrt(mean_squared_error(y_test, y_pred_selected))
    mae_selected = mean_absolute_error(y_test, y_pred_selected)

    print(f"\nModel Evaluation with Selected Features:")
    print(f"R-squared (R²): {r2_selected:.4f}")
    print(f"Root Mean Squared Error (RMSE): {rmse_selected:.4f}")
    print(f"Mean Absolute Error (MAE): {mae_selected:.4f}")

    # --- Visualization with Prediction Intervals ---
    print("\nGenerating prediction plot with prediction intervals...")
    # Approximate 95% prediction interval: +/- 1.96 standard deviations of the
    # training residuals, applied as a constant half-width around every
    # prediction. This ignores parameter uncertainty, so it is only an approximation.
    residuals = y_train - model_selected.predict(X_train_selected)
    pred_interval = 1.96 * np.std(residuals)

    lower_bound = y_pred_selected - pred_interval
    upper_bound = y_pred_selected + pred_interval

    fig, ax = plt.subplots(figsize=(12, 7))
    sns.scatterplot(x=y_test, y=y_pred_selected, alpha=0.5, label='Predicted Values', ax=ax)
    ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2, label='Ideal Fit')

    # Draw interval bars for a subset of points only, to keep the plot readable.
    # The sample size is capped at the test-set size because sampling without
    # replacement raises ValueError when asked for more items than exist.
    # A seeded generator makes the saved figure identical on every run.
    n_interval_samples = min(100, len(y_test))
    interval_rng = np.random.default_rng(42)
    sample_indices = interval_rng.choice(len(y_test), size=n_interval_samples, replace=False)
    # One vectorized call draws all bars and gives them a single legend entry.
    ax.vlines(
        y_test.iloc[sample_indices],
        lower_bound[sample_indices],
        upper_bound[sample_indices],
        colors='gray',
        linestyles='-',
        linewidth=0.5,
        label='95% Prediction Interval (sample)',
    )

    ax.set_xlabel("Actual Anomaly Scores")
    ax.set_ylabel("Predicted Anomaly Scores")
    ax.set_title("Actual vs. Predicted Anomaly Scores with Prediction Intervals")
    ax.legend()
    ax.grid(True)
    fig.savefig('prediction_plot_with_intervals.png')
    # Close the figure explicitly so it is not also rendered inline or kept in memory.
    plt.close(fig)

    print("\nPrediction plot with intervals saved as 'prediction_plot_with_intervals.png'")


Performing Feature Selection with Lasso...

Features selected by Lasso: ['Destination Port', 'Protocol_TCP', 'Protocol_UDP', 'Packet Type_Data', 'Traffic Type_FTP', 'Attack Type_Intrusion', 'Attack Signature_Known Pattern B', 'Action Taken_Ignored', 'Severity Level_Low', 'Severity Level_Medium', 'Network Segment_Segment C']

Training a new Linear Regression model with selected features...

Model Evaluation with Selected Features:
R-squared (R²): -0.0009
Root Mean Squared Error (RMSE): 28.9136
Mean Absolute Error (MAE): 25.0148

Generating prediction plot with prediction intervals...

Prediction plot with intervals saved as 'prediction_plot_with_intervals.png'
