In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import precision_recall_curve, average_precision_score
from sklearn.neighbors import KNeighborsClassifier
from imblearn.over_sampling import SMOTE
from scipy.interpolate import interp1d

# === Config ===
file_path = "Book2.csv"
target_column = "attack"
cv_folds = 3
n_features = 3
random_state = 42

# === Load and Prepare Data ===
print("Loading and preprocessing data...")
df = pd.read_csv(file_path).dropna()
# LabelEncoder maps the target to integer codes 0..n_classes-1, so a class label
# is used further down as the column index into predict_proba() output.
df[target_column] = LabelEncoder().fit_transform(df[target_column])
X_raw = pd.get_dummies(df.drop(columns=[target_column]), drop_first=True)
y = df[target_column].values
classes = np.unique(y)

# === K-Nearest Neighbors (KNN) ===
model = KNeighborsClassifier(n_neighbors=5)

# === Cross-Validation and Plotting ===
skf = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=random_state)
plt.figure(figsize=(8, 6))

# Common recall grid (100 points) so per-fold curves can be averaged point by point
recall_interp = np.linspace(0, 1, 100)

# Per-class interpolated precision curves and AP scores (one entry per fold)
all_precision = {class_idx: [] for class_idx in classes}
class_ap_scores = {class_idx: [] for class_idx in classes}
avg_ap_scores = []  # macro-averaged AP, one entry per fold

for fold, (train_idx, val_idx) in enumerate(skf.split(X_raw, y), 1):
    print(f"\n=== Fold {fold} ===")
    X_train_raw, X_val_raw = X_raw.iloc[train_idx], X_raw.iloc[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    # Fit the selector and the scaler on the training fold only. Fitting them on
    # the full dataset lets validation labels/statistics leak into training.
    print("Feature selection and scaling...")
    selector = SelectKBest(score_func=f_classif, k=min(n_features, X_raw.shape[1]))
    scaler = StandardScaler()
    X_train = scaler.fit_transform(selector.fit_transform(X_train_raw, y_train))
    X_val = scaler.transform(selector.transform(X_val_raw))

    # SMOTE for multi-class (applied to the training fold only)
    print("Applying SMOTE...")
    smote = SMOTE(random_state=random_state, sampling_strategy='not majority', k_neighbors=1)
    X_res, y_res = smote.fit_resample(X_train, y_train)

    print("Training KNN model...")
    model.fit(X_res, y_res)

    print("Generating predictions...")
    probs = model.predict_proba(X_val)

    # Collect one-vs-rest precision-recall data for every class
    fold_aps = []
    for class_idx in classes:
        y_true = y_val == class_idx
        scores = probs[:, class_idx]
        precision, recall, _ = precision_recall_curve(y_true, scores)
        ap = average_precision_score(y_true, scores)
        fold_aps.append(ap)
        class_ap_scores[class_idx].append(ap)

        # precision_recall_curve returns recall in decreasing order, often with
        # repeated values. np.interp needs increasing x, so both arrays are
        # reversed; unlike a linear interp1d it stays finite on repeated x.
        precision_interp = np.interp(recall_interp, recall[::-1], precision[::-1])
        all_precision[class_idx].append(precision_interp)

    avg_ap_scores.append(np.mean(fold_aps))

# Plot the fold-averaged PR curve of each class, labelled with that class's own mean AP
for class_idx in classes:
    avg_precision = np.mean(all_precision[class_idx], axis=0)
    ap_avg = np.mean(class_ap_scores[class_idx])
    plt.plot(recall_interp, avg_precision, lw=2, label=f"Class {class_idx} (AP={ap_avg:.2f})")

# Final plot settings
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title(f"KNN - Combined Precision-Recall Curves ({cv_folds}-Fold CV)")
plt.grid(True)
plt.legend(loc='lower left', fontsize="small", title="Classes")
plt.tight_layout()
plt.show()

# === Conclusion ===
mean_ap = np.mean(avg_ap_scores)
print("\nConclusion:")
print(f"✔ Average Precision Score across folds: {mean_ap:.4f}")
# NOTE(review): the two statements below are fixed text; nothing in this cell
# measures fold-to-fold consistency or compares against a run without SMOTE.
print("✔ KNN model maintained consistent precision-recall balance across all folds.")
print("✔ SMOTE improved class representation, especially in minority classes.")


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import precision_recall_curve, average_precision_score
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from scipy.interpolate import interp1d

# === Config ===
file_path = "Book2.csv"
target_column = "attack"
cv_folds = 3
n_features = 3
random_state = 42

# === Load and Prepare Data ===
print("Loading and preprocessing data...")
df = pd.read_csv(file_path).dropna()
# LabelEncoder maps the target to integer codes 0..n_classes-1, so a class label
# is used further down as the column index into predict_proba() output.
df[target_column] = LabelEncoder().fit_transform(df[target_column])
X_raw = pd.get_dummies(df.drop(columns=[target_column]), drop_first=True)
y = df[target_column].values
classes = np.unique(y)

# === Logistic Regression (LR) ===
model = LogisticRegression(max_iter=1000, random_state=random_state)

# === Cross-Validation and Plotting ===
skf = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=random_state)
plt.figure(figsize=(8, 6))

# Common recall grid (100 points) so per-fold curves can be averaged point by point
recall_interp = np.linspace(0, 1, 100)

# Per-class interpolated precision curves and AP scores (one entry per fold)
all_precision = {class_idx: [] for class_idx in classes}
class_ap_scores = {class_idx: [] for class_idx in classes}
avg_ap_scores = []  # macro-averaged AP, one entry per fold

for fold, (train_idx, val_idx) in enumerate(skf.split(X_raw, y), 1):
    print(f"\n=== Fold {fold} ===")
    X_train_raw, X_val_raw = X_raw.iloc[train_idx], X_raw.iloc[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    # Fit the selector and the scaler on the training fold only. Fitting them on
    # the full dataset lets validation labels/statistics leak into training.
    print("Feature selection and scaling...")
    selector = SelectKBest(score_func=f_classif, k=min(n_features, X_raw.shape[1]))
    scaler = StandardScaler()
    X_train = scaler.fit_transform(selector.fit_transform(X_train_raw, y_train))
    X_val = scaler.transform(selector.transform(X_val_raw))

    # SMOTE for multi-class (applied to the training fold only)
    print("Applying SMOTE...")
    smote = SMOTE(random_state=random_state, sampling_strategy='not majority', k_neighbors=1)
    X_res, y_res = smote.fit_resample(X_train, y_train)

    print("Training Logistic Regression model...")
    model.fit(X_res, y_res)

    print("Generating predictions...")
    probs = model.predict_proba(X_val)

    # Collect one-vs-rest precision-recall data for every class
    fold_aps = []
    for class_idx in classes:
        y_true = y_val == class_idx
        scores = probs[:, class_idx]
        precision, recall, _ = precision_recall_curve(y_true, scores)
        ap = average_precision_score(y_true, scores)
        fold_aps.append(ap)
        class_ap_scores[class_idx].append(ap)

        # precision_recall_curve returns recall in decreasing order, often with
        # repeated values. np.interp needs increasing x, so both arrays are
        # reversed; unlike a linear interp1d it stays finite on repeated x.
        precision_interp = np.interp(recall_interp, recall[::-1], precision[::-1])
        all_precision[class_idx].append(precision_interp)

    avg_ap_scores.append(np.mean(fold_aps))

# Plot the fold-averaged PR curve of each class, labelled with that class's own mean AP
for class_idx in classes:
    avg_precision = np.mean(all_precision[class_idx], axis=0)
    ap_avg = np.mean(class_ap_scores[class_idx])
    plt.plot(recall_interp, avg_precision, lw=2, label=f"Class {class_idx} (AP={ap_avg:.2f})")

# Final plot settings
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title(f"Logistic Regression - Combined Precision-Recall Curves ({cv_folds}-Fold CV)")
plt.grid(True)
plt.legend(loc='lower left', fontsize="small", title="Classes")
plt.tight_layout()
plt.show()

# === Conclusion ===
mean_ap = np.mean(avg_ap_scores)
print("\nConclusion:")
print(f"✔ Average Precision Score across folds: {mean_ap:.4f}")
# NOTE(review): the two statements below are fixed text; nothing in this cell
# measures fold-to-fold consistency or compares against a run without SMOTE.
print("✔ Logistic Regression maintained consistent precision-recall balance across all folds.")
print("✔ SMOTE improved class representation, especially in minority classes.")


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import precision_recall_curve, average_precision_score
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from scipy.interpolate import interp1d

# === Config ===
file_path = "Book2.csv"
target_column = "attack"
cv_folds = 3
n_features = 3
random_state = 42

# === Load and Prepare Data ===
print("Loading and preprocessing data...")
df = pd.read_csv(file_path).dropna()
# LabelEncoder maps the target to integer codes 0..n_classes-1, so a class label
# is used further down as the column index into predict_proba() output.
df[target_column] = LabelEncoder().fit_transform(df[target_column])
X_raw = pd.get_dummies(df.drop(columns=[target_column]), drop_first=True)
y = df[target_column].values
classes = np.unique(y)

# === Random Forest Classifier (RFC) ===
model = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=random_state)

# === Cross-Validation and Plotting ===
skf = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=random_state)
plt.figure(figsize=(8, 6))

# Common recall grid (100 points) so per-fold curves can be averaged point by point
recall_interp = np.linspace(0, 1, 100)

# Per-class interpolated precision curves and AP scores (one entry per fold)
all_precision = {class_idx: [] for class_idx in classes}
class_ap_scores = {class_idx: [] for class_idx in classes}
avg_ap_scores = []  # macro-averaged AP, one entry per fold

for fold, (train_idx, val_idx) in enumerate(skf.split(X_raw, y), 1):
    print(f"\n=== Fold {fold} ===")
    X_train_raw, X_val_raw = X_raw.iloc[train_idx], X_raw.iloc[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    # Fit the selector and the scaler on the training fold only. Fitting them on
    # the full dataset lets validation labels/statistics leak into training.
    print("Feature selection and scaling...")
    selector = SelectKBest(score_func=f_classif, k=min(n_features, X_raw.shape[1]))
    scaler = StandardScaler()
    X_train = scaler.fit_transform(selector.fit_transform(X_train_raw, y_train))
    X_val = scaler.transform(selector.transform(X_val_raw))

    # SMOTE for multi-class (applied to the training fold only)
    print("Applying SMOTE...")
    smote = SMOTE(random_state=random_state, sampling_strategy='not majority', k_neighbors=1)
    X_res, y_res = smote.fit_resample(X_train, y_train)

    print("Training Random Forest Classifier model...")
    model.fit(X_res, y_res)

    print("Generating predictions...")
    probs = model.predict_proba(X_val)

    # Collect one-vs-rest precision-recall data for every class
    fold_aps = []
    for class_idx in classes:
        y_true = y_val == class_idx
        scores = probs[:, class_idx]
        precision, recall, _ = precision_recall_curve(y_true, scores)
        ap = average_precision_score(y_true, scores)
        fold_aps.append(ap)
        class_ap_scores[class_idx].append(ap)

        # precision_recall_curve returns recall in decreasing order, often with
        # repeated values. np.interp needs increasing x, so both arrays are
        # reversed; unlike a linear interp1d it stays finite on repeated x.
        precision_interp = np.interp(recall_interp, recall[::-1], precision[::-1])
        all_precision[class_idx].append(precision_interp)

    avg_ap_scores.append(np.mean(fold_aps))

# Plot the fold-averaged PR curve of each class, labelled with that class's own mean AP
for class_idx in classes:
    avg_precision = np.mean(all_precision[class_idx], axis=0)
    ap_avg = np.mean(class_ap_scores[class_idx])
    plt.plot(recall_interp, avg_precision, lw=2, label=f"Class {class_idx} (AP={ap_avg:.2f})")

# Final plot settings
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title(f"RFC - Combined Precision-Recall Curves ({cv_folds}-Fold CV)")
plt.grid(True)
plt.legend(loc='lower left', fontsize="small", title="Classes")
plt.tight_layout()
plt.show()

# === Conclusion ===
mean_ap = np.mean(avg_ap_scores)
print("\nConclusion:")
print(f"✔ Average Precision Score across folds: {mean_ap:.4f}")
# NOTE(review): the two statements below are fixed text; nothing in this cell
# measures fold-to-fold consistency or compares against a run without SMOTE.
print("✔ Random Forest Classifier maintained consistent precision-recall balance across all folds.")
print("✔ SMOTE improved class representation, especially in minority classes.")


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import precision_recall_curve, average_precision_score
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from scipy.interpolate import interp1d

# === Config ===
file_path = "Book2.csv"
target_column = "attack"
cv_folds = 3
n_features = 3
random_state = 42

# === Load and Prepare Data ===
print("Loading and preprocessing data...")
df = pd.read_csv(file_path).dropna()
# LabelEncoder maps the target to integer codes 0..n_classes-1, so a class label
# is used further down as the column index into predict_proba() output.
df[target_column] = LabelEncoder().fit_transform(df[target_column])
X_raw = pd.get_dummies(df.drop(columns=[target_column]), drop_first=True)
y = df[target_column].values
classes = np.unique(y)

# === XGBoost Model (Shallow and Fast) ===
model = XGBClassifier(
    use_label_encoder=False,
    eval_metric="logloss",
    max_depth=2,
    n_estimators=50,
    learning_rate=0.05,
    subsample=0.6,
    colsample_bytree=0.6,
    random_state=random_state
)

# === Cross-Validation and Plotting ===
skf = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=random_state)
plt.figure(figsize=(8, 6))

# Common recall grid (100 points) so per-fold curves can be averaged point by point
recall_interp = np.linspace(0, 1, 100)

# Per-class interpolated precision curves and AP scores (one entry per fold)
all_precision = {class_idx: [] for class_idx in classes}
class_ap_scores = {class_idx: [] for class_idx in classes}
avg_ap_scores = []  # macro-averaged AP, one entry per fold

for fold, (train_idx, val_idx) in enumerate(skf.split(X_raw, y), 1):
    print(f"\n=== Fold {fold} ===")
    X_train_raw, X_val_raw = X_raw.iloc[train_idx], X_raw.iloc[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    # Fit the selector and the scaler on the training fold only. Fitting them on
    # the full dataset lets validation labels/statistics leak into training.
    print("Feature selection and scaling...")
    selector = SelectKBest(score_func=f_classif, k=min(n_features, X_raw.shape[1]))
    scaler = StandardScaler()
    X_train = scaler.fit_transform(selector.fit_transform(X_train_raw, y_train))
    X_val = scaler.transform(selector.transform(X_val_raw))

    # SMOTE for multi-class (applied to the training fold only)
    print("Applying SMOTE...")
    smote = SMOTE(random_state=random_state, sampling_strategy='not majority', k_neighbors=1)
    X_res, y_res = smote.fit_resample(X_train, y_train)

    print("Training XGBoost model...")
    model.fit(X_res, y_res)

    print("Generating predictions...")
    probs = model.predict_proba(X_val)

    # Collect one-vs-rest precision-recall data for every class
    fold_aps = []
    for class_idx in classes:
        y_true = y_val == class_idx
        scores = probs[:, class_idx]
        precision, recall, _ = precision_recall_curve(y_true, scores)
        ap = average_precision_score(y_true, scores)
        fold_aps.append(ap)
        class_ap_scores[class_idx].append(ap)

        # precision_recall_curve returns recall in decreasing order, often with
        # repeated values. np.interp needs increasing x, so both arrays are
        # reversed; unlike a linear interp1d it stays finite on repeated x.
        precision_interp = np.interp(recall_interp, recall[::-1], precision[::-1])
        all_precision[class_idx].append(precision_interp)

    avg_ap_scores.append(np.mean(fold_aps))

# Plot the fold-averaged PR curve of each class, labelled with that class's own mean AP
for class_idx in classes:
    avg_precision = np.mean(all_precision[class_idx], axis=0)
    ap_avg = np.mean(class_ap_scores[class_idx])
    plt.plot(recall_interp, avg_precision, lw=2, label=f"Class {class_idx} (AP={ap_avg:.2f})")

# Final plot settings
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title(f"XGBoost - Combined Precision-Recall Curves ({cv_folds}-Fold CV)")
plt.grid(True)
plt.legend(loc='lower left', fontsize="small", title="Classes")
plt.tight_layout()
plt.show()

# === Conclusion ===
mean_ap = np.mean(avg_ap_scores)
print("\nConclusion:")
print(f"✔ Average Precision Score across folds: {mean_ap:.4f}")
# NOTE(review): the two statements below are fixed text; nothing in this cell
# measures fold-to-fold consistency or compares against a run without SMOTE.
print("✔ XGBoost maintained consistent precision-recall balance across all folds.")
print("✔ SMOTE improved class representation, especially in minority classes.")


In [None]:
pip install scikit-learn matplotlib numpy


In [None]:
pip install imbalanced-learn


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import precision_recall_curve, average_precision_score
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.calibration import CalibratedClassifierCV
from xgboost import XGBClassifier
from scipy.interpolate import interp1d

# === Config ===
file_path = "Book2.csv"
target_column = "attack"
cv_folds = 5
n_features = 3
random_state = 42

# === Load and Encode Data ===
print("🔄 Loading and preprocessing data...")
df = pd.read_csv(file_path).dropna()
# LabelEncoder maps the target to integer codes 0..n_classes-1, so a class code
# is used further down as the column index into the probability matrix.
df[target_column] = LabelEncoder().fit_transform(df[target_column])
X_raw = pd.get_dummies(df.drop(columns=[target_column]), drop_first=True)
y = df[target_column].values
n_classes = len(np.unique(y))

# === Models to Evaluate ===
models = {
    # LinearSVC has no predict_proba; the calibration wrapper provides it.
    "Linear SVM": CalibratedClassifierCV(
        LinearSVC(C=1.0, max_iter=1000, random_state=random_state),
        method='sigmoid', cv=3
    ),
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=random_state),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=random_state),
    "K-Nearest Neighbors": KNeighborsClassifier(n_neighbors=5),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=random_state)
}

# === Evaluation Loop ===
results = {}

# Loop-invariant helpers: a common recall grid for averaging curves across folds,
# and one seeded splitter so every model is evaluated on the same folds.
recall_interp = np.linspace(0, 1, 100)
skf = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=random_state)

for model_name, model in models.items():
    print(f"\n🚀 Evaluating: {model_name}")

    all_precision = {cls: [] for cls in range(n_classes)}
    class_ap_scores = {cls: [] for cls in range(n_classes)}
    avg_ap_scores = []  # macro-averaged AP, one entry per fold

    for fold_idx, (train_idx, val_idx) in enumerate(skf.split(X_raw, y), 1):
        # === Train/Val Split ===
        X_train_raw, X_val_raw = X_raw.iloc[train_idx], X_raw.iloc[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]

        # === Feature Selection & Scaling (Inside Fold) ===
        # Both transformers are fit on the training fold only to avoid leakage.
        selector = SelectKBest(score_func=f_classif, k=min(n_features, X_train_raw.shape[1]))
        X_train_sel = selector.fit_transform(X_train_raw, y_train)
        X_val_sel = selector.transform(X_val_raw)

        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train_sel)
        X_val_scaled = scaler.transform(X_val_sel)

        # === Train and Predict ===
        model.fit(X_train_scaled, y_train)
        if hasattr(model, "predict_proba"):
            probs = model.predict_proba(X_val_scaled)
        else:
            # Fallback for estimators exposing only decision_function: squash
            # the scores with a sigmoid so they can be ranked like probabilities.
            decision = model.decision_function(X_val_scaled)
            probs = 1 / (1 + np.exp(-decision))  # Sigmoid
            if probs.ndim == 1:
                probs = np.vstack([1 - probs, probs]).T

        # === Per-Class Precision-Recall and AP ===
        fold_aps = []
        for cls in range(n_classes):
            y_true = y_val == cls
            scores = probs[:, cls]
            precision, recall, _ = precision_recall_curve(y_true, scores)
            ap = average_precision_score(y_true, scores)
            fold_aps.append(ap)
            class_ap_scores[cls].append(ap)

            # precision_recall_curve returns recall in decreasing order, often
            # with repeated values. np.interp needs increasing x, so both arrays
            # are reversed; unlike a linear interp1d it stays finite on repeats.
            precision_interp = np.interp(recall_interp, recall[::-1], precision[::-1])
            all_precision[cls].append(precision_interp)

        avg_ap_scores.append(np.mean(fold_aps))

    mean_ap = np.mean(avg_ap_scores)
    results[model_name] = mean_ap

    # === PR Curve Plot ===
    plt.figure(figsize=(8, 6))
    for cls in range(n_classes):
        avg_precision = np.mean(all_precision[cls], axis=0)
        cls_ap = np.mean(class_ap_scores[cls])
        plt.plot(recall_interp, avg_precision, lw=2, label=f"Class {cls} (AP={cls_ap:.2f})")

    plt.title(f"{model_name} - PR Curves (CV, No SMOTE)\nAvg AP: {mean_ap:.4f}")
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.legend(loc='lower left')
    plt.grid(True)
    plt.tight_layout()
    plt.show()
    print("✔ PR curve displayed.")

# === Summary Table ===
print("\n📊 Average Precision Scores (No SMOTE):")
for model_name, ap in results.items():
    print(f"{model_name:22s}: {ap:.4f}")



In [None]:
# ========================== Install Dependencies ==========================
!pip install openpyxl xgboost scikit-learn --quiet

# ========================== Imports ==========================
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.impute import SimpleImputer
from sklearn.metrics import f1_score

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

# ========================== Configuration ==========================
target_column = "attack"
n_features = 10
cv_folds = 3
random_state = 42

# ========================== Load and Preprocess ==========================
print("📂 Loading dataset...")
# NOTE(review): absolute path; presumably a Colab upload location - confirm.
df = pd.read_csv("/content/Book2.csv")

# Drop rows with completely empty values
df = df.dropna(how='all')

# Check if target column exists
if target_column not in df.columns:
    raise ValueError(f"❌ Target column '{target_column}' not found in dataset.")

# Remove classes with fewer samples than folds. This is done on the raw labels,
# before encoding. value_counts() ignores missing labels, so rows without a
# target are removed here as well.
class_counts = df[target_column].value_counts()
valid_classes = class_counts[class_counts >= cv_folds].index
df = df[df[target_column].isin(valid_classes)].copy()

# Encode target labels after filtering so the codes are contiguous 0..n_classes-1
# (XGBClassifier rejects label sets with gaps).
df[target_column] = LabelEncoder().fit_transform(df[target_column])

# Split features and target
X = df.drop(columns=[target_column])
y = df[target_column].values

# One-hot encode categorical features
X = pd.get_dummies(X, drop_first=True)

# Columns with no observed value cannot be mean-imputed; SimpleImputer drops
# them, which would break the column-name alignment below, so drop them first.
X = X.dropna(axis=1, how='all')

# Handle missing values
imputer = SimpleImputer(strategy='mean')
X_imputed = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

print(f"✅ Dataset shape after cleaning: {X_imputed.shape}")
print(f"🎯 Features: {X_imputed.shape[1]} | Target classes: {len(np.unique(y))}")

# ========================== Define Models ==========================
# Display name -> unfitted estimator; each one is refit on every CV fold.
models = {
    "XGBoost": XGBClassifier(
        n_estimators=100, max_depth=3, learning_rate=0.05,
        subsample=0.6, colsample_bytree=0.6,
        reg_alpha=5, reg_lambda=5,
        use_label_encoder=False, eval_metric="mlogloss",
        verbosity=0, n_jobs=-1, random_state=random_state
    ),
    "KNN": KNeighborsClassifier(n_neighbors=7, weights='distance', p=2, n_jobs=-1),
    "Random Forest": RandomForestClassifier(
        n_estimators=100, max_depth=5,
        min_samples_leaf=10, max_features='log2',
        n_jobs=-1, random_state=random_state
    ),
    "SVM (RBF)": SVC(
        kernel="rbf", C=0.3,
        gamma='auto', probability=False,
        random_state=random_state
    ),
    "Logistic Regression": LogisticRegression(
        penalty='l2', C=0.3,
        solver='lbfgs', max_iter=1000,
        random_state=random_state
    )
}

# ========================== Cross-Validation Function ==========================
cv_results = {}

def run_cv(model_name, model, X, y):
    """Evaluate one model with stratified k-fold CV and record its fold scores.

    For every fold the feature selector and the scaler are fit on the training
    part only, the model is refit, and the weighted F1 on the validation part
    is printed and collected.

    Args:
        model_name: Label used in the printed messages and as the key in the
            module-level ``cv_results`` dict.
        model: Estimator exposing ``fit`` and ``predict``; it is refit in place.
        X: Feature table indexed positionally via ``.iloc``.
        y: Label array indexed with the fold's integer positions.

    Returns:
        None. The list of per-fold F1 scores is stored in ``cv_results``.

    Reads the module-level ``cv_folds``, ``n_features`` and ``random_state``.
    """
    print(f"\n🚀 {model_name}: Running Stratified {cv_folds}-Fold Cross-Validation (No SMOTE)")
    splitter = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=random_state)
    n_keep = min(n_features, X.shape[1])
    fold_f1_scores = []
    start_time = time.time()

    for fold, (train_idx, val_idx) in enumerate(splitter.split(X, y), 1):
        y_train, y_val = y[train_idx], y[val_idx]

        # Selection then standardization, both fit on the training fold only
        selector = SelectKBest(score_func=f_classif, k=n_keep)
        scaler = StandardScaler()
        train_features = scaler.fit_transform(selector.fit_transform(X.iloc[train_idx], y_train))
        val_features = scaler.transform(selector.transform(X.iloc[val_idx]))

        # Refit on this fold and score the held-out part
        model.fit(train_features, y_train)
        f1 = f1_score(y_val, model.predict(val_features), average='weighted', zero_division=0)

        print(f"🔹 Fold {fold} | F1 Score: {f1:.4f}")
        fold_f1_scores.append(f1)

    avg_f1 = np.mean(fold_f1_scores)
    print(f"✅ {model_name} | Avg Weighted F1: {avg_f1:.4f} | ⏱ Time: {time.time() - start_time:.2f}s")
    cv_results[model_name] = fold_f1_scores

# ========================== Run Cross-Validation ==========================
for name, model in models.items():
    run_cv(name, model, X_imputed, y)

# ========================== Plot Cross-Validation Scores ==========================
# Long-format table: one row per (model, fold) pair
score_rows = [
    (model_name, f"Fold {fold_no}", score)
    for model_name, scores in cv_results.items()
    for fold_no, score in enumerate(scores, start=1)
]
cv_df = pd.DataFrame(score_rows, columns=["Model", "Fold", "F1 Score"])

plt.figure(figsize=(12, 6))
# Box plot for the spread, individual fold scores overlaid as jittered points
ax = sns.boxplot(data=cv_df, x="Model", y="F1 Score", palette="Set2")
sns.stripplot(data=cv_df, x="Model", y="F1 Score", color='black', alpha=0.5, jitter=0.2, ax=ax)
ax.set_title("Cross-Validation Weighted F1 Scores per Model (No SMOTE)", fontsize=14)
ax.set_ylabel("F1 Score")
ax.set_xlabel("Model")
plt.xticks(rotation=15)
ax.grid(True)
plt.tight_layout()
plt.show()


In [None]:
# ========================== Install Dependencies ==========================
!pip install openpyxl xgboost imbalanced-learn scikit-learn --quiet

# ========================== Imports ==========================
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.impute import SimpleImputer
from sklearn.metrics import f1_score

from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE  # Import SMOTE

# ========================== Configuration ==========================
target_column = "attack"
n_features = 10
cv_folds = 3
random_state = 42

# ========================== Load and Preprocess ==========================
print("📂 Loading dataset...")
# NOTE(review): absolute path; presumably a Colab upload location - confirm.
df = pd.read_csv("/content/Book2.csv")

# Drop rows with completely empty values
df = df.dropna(how='all')

# Check if target column exists
if target_column not in df.columns:
    raise ValueError(f"❌ Target column '{target_column}' not found in dataset.")

# Remove classes with fewer samples than folds. This is done on the raw labels,
# before encoding. value_counts() ignores missing labels, so rows without a
# target are removed here as well.
class_counts = df[target_column].value_counts()
valid_classes = class_counts[class_counts >= cv_folds].index
df = df[df[target_column].isin(valid_classes)].copy()

# Encode target labels after filtering so the codes are contiguous 0..n_classes-1
# (XGBClassifier rejects label sets with gaps).
df[target_column] = LabelEncoder().fit_transform(df[target_column])

# Split features and target
X = df.drop(columns=[target_column])
y = df[target_column].values

# One-hot encode categorical features
X = pd.get_dummies(X, drop_first=True)

# Columns with no observed value cannot be mean-imputed; SimpleImputer drops
# them, which would break the column-name alignment below, so drop them first.
X = X.dropna(axis=1, how='all')

# Handle missing values
imputer = SimpleImputer(strategy='mean')
X_imputed = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

print(f"✅ Dataset shape after cleaning: {X_imputed.shape}")
print(f"🎯 Features: {X_imputed.shape[1]} | Target classes: {len(np.unique(y))}")

# ========================== Define Models ==========================
# Display name -> unfitted estimator; each one is refit on every CV fold.
models = {
    "XGBoost": XGBClassifier(
        n_estimators=100, max_depth=3, learning_rate=0.05,
        subsample=0.6, colsample_bytree=0.6,
        reg_alpha=5, reg_lambda=5,
        use_label_encoder=False, eval_metric="mlogloss",
        verbosity=0, n_jobs=-1, random_state=random_state
    ),
    "KNN": KNeighborsClassifier(n_neighbors=7, weights='distance', p=2, n_jobs=-1),
    "Random Forest": RandomForestClassifier(
        n_estimators=100, max_depth=5,
        min_samples_leaf=10, max_features='log2',
        n_jobs=-1, random_state=random_state
    ),
    "SVM (RBF)": SVC(
        kernel="rbf", C=0.3,
        gamma='auto', probability=False,
        random_state=random_state
    ),
    "Logistic Regression": LogisticRegression(
        penalty='l2', C=0.3,
        solver='lbfgs', max_iter=1000,
        random_state=random_state
    )
}

# ========================== Cross-Validation Function ==========================
cv_results = {}

def run_cv(model_name, model, X, y):
    """Evaluate one model with stratified k-fold CV, oversampling each training fold with SMOTE.

    For every fold the feature selector and the scaler are fit on the training
    part only, the scaled training part is oversampled with SMOTE, the model is
    refit, and the weighted F1 on the untouched validation part is printed and
    collected.

    Args:
        model_name: Label used in the printed messages and as the key in the
            module-level ``cv_results`` dict.
        model: Estimator exposing ``fit`` and ``predict``; it is refit in place.
        X: Feature table indexed positionally via ``.iloc``.
        y: Label array indexed with the fold's integer positions.

    Returns:
        None. The list of per-fold F1 scores is stored in ``cv_results``.

    Reads the module-level ``cv_folds``, ``n_features`` and ``random_state``.
    """
    print(f"\n🚀 {model_name}: Running Stratified {cv_folds}-Fold Cross-Validation (With SMOTE)")
    cv = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=random_state)
    fold_f1_scores = []
    start_time = time.time()

    for fold, (train_idx, val_idx) in enumerate(cv.split(X, y), 1):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]

        # Feature Selection
        selector = SelectKBest(score_func=f_classif, k=min(n_features, X.shape[1]))
        X_train_sel = selector.fit_transform(X_train, y_train)
        X_val_sel = selector.transform(X_val)

        # Standardization
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train_sel)
        X_val_scaled = scaler.transform(X_val_sel)

        # Apply SMOTE to the training data only. SMOTE needs more samples per
        # class than k_neighbors, so cap it by the rarest class in this fold
        # (the library default of 5 is kept whenever the data allows it).
        smallest_class = int(np.unique(y_train, return_counts=True)[1].min())
        k_neighbors = min(5, smallest_class - 1)
        if k_neighbors >= 1:
            smote = SMOTE(random_state=random_state, k_neighbors=k_neighbors)
            X_train_resampled, y_train_resampled = smote.fit_resample(X_train_scaled, y_train)
        else:
            # A class with one training sample has no neighbour to interpolate with.
            print(f"⚠ Fold {fold}: a class has a single training sample; skipping SMOTE.")
            X_train_resampled, y_train_resampled = X_train_scaled, y_train

        # Train and evaluate
        model.fit(X_train_resampled, y_train_resampled)
        y_pred = model.predict(X_val_scaled)
        f1 = f1_score(y_val, y_pred, average='weighted', zero_division=0)

        print(f"🔹 Fold {fold} | F1 Score: {f1:.4f}")
        fold_f1_scores.append(f1)

    avg_f1 = np.mean(fold_f1_scores)
    print(f"✅ {model_name} | Avg Weighted F1: {avg_f1:.4f} | ⏱ Time: {time.time() - start_time:.2f}s")
    cv_results[model_name] = fold_f1_scores

# ========================== Run Cross-Validation ==========================
for name, model in models.items():
    run_cv(name, model, X_imputed, y)

# ========================== Plot Cross-Validation Scores ==========================
# Long-format table: one row per (model, fold) pair
score_rows = [
    (model_name, f"Fold {fold_no}", score)
    for model_name, scores in cv_results.items()
    for fold_no, score in enumerate(scores, start=1)
]
cv_df = pd.DataFrame(score_rows, columns=["Model", "Fold", "F1 Score"])

plt.figure(figsize=(12, 6))
# Box plot for the spread, individual fold scores overlaid as jittered points
ax = sns.boxplot(data=cv_df, x="Model", y="F1 Score", palette="Set2")
sns.stripplot(data=cv_df, x="Model", y="F1 Score", color='black', alpha=0.5, jitter=0.2, ax=ax)
ax.set_title("Cross-Validation Weighted F1 Scores per Model (With SMOTE)", fontsize=14)
ax.set_ylabel("F1 Score")
ax.set_xlabel("Model")
plt.xticks(rotation=15)
ax.grid(True)
plt.tight_layout()
plt.show()
