Importing the dependencies

In [None]:
import lightgbm as lgb
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder

## Load Data

In [None]:
# Read both CSVs and immediately discard the columns the drift check should
# not use: `id` from both frames and `attack_cat` from the training frame.
train = pd.read_csv('train.csv').drop(columns=['id', 'attack_cat'])
test = pd.read_csv('test.csv').drop(columns=['id'])

#### Adversarial Validation (Detect Train–Test Drift)
 Purpose: find drifting features to drop later.

In [None]:
# Tag every row with its origin: 0 for training rows, 1 for test rows.
train = train.assign(is_test=0)
test = test.assign(is_test=1)

# Stack the two frames with a fresh 0..n-1 index; the origin flag becomes the
# target of the adversarial classifier and everything else its features.
adv_data = pd.concat([train, test], axis=0, ignore_index=True)
X_adv = adv_data.drop(columns=['is_test'])
y_adv = adv_data['is_test']

#### Preprocessing for handling categoricals

In [None]:
# Integer-encode every object-dtype column of the adversarial feature frame.
# Values are cast to str first, so missing entries are encoded like any other
# string value.
cat_cols = X_adv.select_dtypes(include=['object']).columns
for col in cat_cols:
    col_as_text = X_adv[col].astype(str)
    le = LabelEncoder().fit(col_as_text)
    X_adv[col] = le.transform(col_as_text)

# Any remaining missing values are replaced by 0.
X_adv = X_adv.fillna(0)

Training the adversarial model

In [None]:
print("Running Adversarial Validation...")

# Settings for the train-vs-test classifier, collected in one place and then
# unpacked into the estimator.
adv_params = dict(
    n_estimators=100,
    max_depth=4,
    learning_rate=0.1,
    random_state=42,
    eval_metric='auc',
)
model_adv = xgb.XGBClassifier(**adv_params)

Applying 5-fold cross-validation

In [None]:
# Stratified 5-fold evaluation of the adversarial classifier. The same
# `model_adv` object is re-fitted in every fold, so after the loop it holds
# the fit from the final fold.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
aucs = []

for fold, (fit_rows, held_rows) in enumerate(skf.split(X_adv, y_adv)):
    model_adv.fit(X_adv.iloc[fit_rows], y_adv.iloc[fit_rows])

    # Probability of the positive class (row comes from the test set).
    held_proba = model_adv.predict_proba(X_adv.iloc[held_rows])
    score = roc_auc_score(y_adv.iloc[held_rows], held_proba[:, 1])

    aucs.append(score)
    print(f"Fold {fold+1} AUC: {score:.4f}")

print(f"\nAverage Adversarial AUC: {np.mean(aucs):.4f}")

 CHECK FEATURE IMPORTANCE (The "Culprits")

In [None]:
# Rank the adversarial features by the importance reported by `model_adv`.
# NOTE: the model holds only its most recent fit, so when the cells are run in
# order these importances describe the last cross-validation fold, not an
# average over folds.
importances = (
    pd.Series(
        model_adv.feature_importances_,
        index=X_adv.columns,
        name='importance',
    )
    .rename_axis('feature')
    .reset_index()
    .sort_values('importance', ascending=False)
)

print("\n--- TOP DRIFTING FEATURES ---")
print(importances.head(10))

In [None]:
# Settings for the modelling stage.
N_FOLDS, SEED = 5, 42
TARGET_COL = 'attack_cat'

# Columns removed from both train and test before modelling
# (presumably the drifting features surfaced by the adversarial step, plus
# the row id; confirm against the importance table).
cols_to_drop = [
    'id',
    'mean_pkt_flow',
    'stcpb',
    'dtcpb',
    'dwin',
    'swin',
    'is_sm_ips_ports',
]

In [None]:
# Reload the raw data for the modelling stage; the frames used for adversarial
# validation had the target removed and an `is_test` flag added.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Drop the configured columns, skipping any that a given frame does not have.
train = train.drop(columns=[c for c in cols_to_drop if c in train.columns])
test = test.drop(columns=[c for c in cols_to_drop if c in test.columns])

# Keep only the target classes that occur at least `min_rows_per_class` times
# in the training data; rows of rarer classes are discarded.
min_rows_per_class = 5
class_counts = train['attack_cat'].value_counts()
valid_classes = class_counts[class_counts >= min_rows_per_class].index
train = train[train['attack_cat'].isin(valid_classes)].copy()

In [None]:
def create_features(df):
    """Add four ratio columns to ``df`` in place and return the same frame.

    New columns (each denominator is offset by a small epsilon so a zero
    denominator does not divide by zero):

    - ``s_bytes_per_pkt``: ``sbytes / spkts``
    - ``d_bytes_per_pkt``: ``dbytes / dpkts``
    - ``byte_ratio``: ``sbytes / dbytes``
    - ``pkt_ratio``: ``spkts / dpkts``

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``sbytes``, ``spkts``, ``dbytes``, ``dpkts``.

    Returns
    -------
    pandas.DataFrame
        The input object itself, with the four columns appended.
    """
    eps = 1e-6
    # (new column, numerator column, denominator column), in insertion order.
    ratio_specs = (
        ('s_bytes_per_pkt', 'sbytes', 'spkts'),
        ('d_bytes_per_pkt', 'dbytes', 'dpkts'),
        ('byte_ratio', 'sbytes', 'dbytes'),
        ('pkt_ratio', 'spkts', 'dpkts'),
    )
    for new_col, numer, denom in ratio_specs:
        df[new_col] = df[numer] / (df[denom] + eps)
    return df

# Ratio features are computed first, so they are built from the raw
# (not yet log-transformed) counts.
train, test = create_features(train), create_features(test)

# log(1 + x) transform of the listed columns; a column is transformed in both
# frames only when it is present in the training frame.
log_cols = ['dur','spkts','dpkts','sbytes','dbytes','sload','dload','sloss','dloss']
for c in log_cols:
    if c not in train.columns:
        continue
    train[c] = np.log1p(train[c])
    test[c] = np.log1p(test[c])

# Replace every remaining missing value with 0 in both frames.
train, test = train.fillna(0), test.fillna(0)

In [None]:
# Separate the target (as strings) from the feature matrix.
y = train['attack_cat'].astype(str)
X = train.drop(columns=['attack_cat'])

# Integer-encode the target; `target_le` is kept to map predictions back to
# the original class names later.
target_le = LabelEncoder().fit(y)
y_encoded = target_le.transform(y)
num_classes = len(target_le.classes_)

# Categorical feature columns handled specially by the fold-level encoders.
cat_cols = ['proto', 'service', 'state']

# Give the test frame exactly the training feature columns, in the same order.
test = test.loc[:, X.columns]


In [None]:
def get_oof_predictions(model_type, params, X, y, X_test, cat_cols,
                        n_splits=5, seed=42, n_classes=None):
    """Train one base model with stratified K-fold CV and collect its
    out-of-fold and test-set class probabilities.

    For every fold a fresh model is fitted on the training part, its
    probabilities for the held-out part are written into the out-of-fold
    matrix, and its probabilities for ``X_test`` are added to a running
    average over folds.

    Categorical handling depends on ``model_type``:

    - ``'xgb'``: each column in ``cat_cols`` is replaced by the relative
      frequency of its value in the fold's training part; values not seen
      there map to 0.
    - ``'lgb'``: each column in ``cat_cols`` is cast to ``str`` and then to
      pandas ``category`` and passed to LightGBM as a categorical feature.

    Parameters
    ----------
    model_type : str
        ``'xgb'`` or ``'lgb'``.
    params : dict
        Keyword arguments for ``xgb.XGBClassifier`` or the parameter dict for
        ``lgb.train``.
    X : pandas.DataFrame
        Training features.
    y : array-like
        Integer-encoded labels aligned positionally with ``X``.
    X_test : pandas.DataFrame
        Test features with the same columns as ``X``.
    cat_cols : list of str
        Names of the categorical columns.
    n_splits : int, default 5
        Number of stratified folds; also the divisor for test averaging.
    seed : int, default 42
        Random state for the fold shuffling.
    n_classes : int, optional
        Number of target classes (width of the returned matrices). When
        omitted, the module-level ``num_classes`` is used.

    Returns
    -------
    oof : numpy.ndarray, shape (len(X), n_classes)
        Out-of-fold class probabilities.
    test_preds : numpy.ndarray, shape (len(X_test), n_classes)
        Test-set class probabilities averaged over the folds.

    Raises
    ------
    ValueError
        If ``model_type`` is neither ``'xgb'`` nor ``'lgb'``.
    """
    # Fail fast: previously an unknown model type fell through both branches
    # and the function returned all-zero matrices without any warning.
    if model_type not in ('xgb', 'lgb'):
        raise ValueError(
            f"Unknown model_type {model_type!r}; expected 'xgb' or 'lgb'"
        )

    if n_classes is None:
        n_classes = num_classes  # module-level value set in the target-encoding cell

    # Positional indexing below (y[tr]) must not be label-based, so coerce a
    # possible pandas Series to a plain array.
    y = np.asarray(y)

    oof = np.zeros((X.shape[0], n_classes))
    test_preds = np.zeros((X_test.shape[0], n_classes))

    skf = StratifiedKFold(n_splits, shuffle=True, random_state=seed)

    for tr, va in skf.split(X, y):
        # Copies, because the categorical columns are overwritten per fold.
        X_tr, X_va = X.iloc[tr].copy(), X.iloc[va].copy()
        y_tr, y_va = y[tr], y[va]
        X_te = X_test.copy()

        if model_type == 'xgb':
            for c in cat_cols:
                # Frequencies come from the training part only, so the
                # held-out part and the test set never influence the encoding.
                freq = X_tr[c].value_counts(normalize=True)
                X_tr[c] = X_tr[c].map(freq).fillna(0)
                X_va[c] = X_va[c].map(freq).fillna(0)
                X_te[c] = X_te[c].map(freq).fillna(0)

            model = xgb.XGBClassifier(**params)
            # The held-out part is supplied as the evaluation set for any
            # eval metric / early stopping configured through `params`.
            model.fit(X_tr, y_tr, eval_set=[(X_va, y_va)], verbose=False)

            oof[va] = model.predict_proba(X_va)
            test_preds += model.predict_proba(X_te) / n_splits

        else:  # 'lgb'
            for c in cat_cols:
                X_tr[c] = X_tr[c].astype(str).astype('category')
                X_va[c] = X_va[c].astype(str).astype('category')
                X_te[c] = X_te[c].astype(str).astype('category')

            dtrain = lgb.Dataset(X_tr, label=y_tr, categorical_feature=cat_cols)
            dval = lgb.Dataset(X_va, label=y_va, categorical_feature=cat_cols)

            # Up to 1000 boosting rounds, stopping after 50 rounds without
            # improvement on the held-out part; per-round logging is disabled.
            model = lgb.train(
                params, dtrain, 1000,
                valid_sets=[dval],
                callbacks=[lgb.early_stopping(50), lgb.log_evaluation(0)]
            )

            oof[va] = model.predict(X_va)
            test_preds += model.predict(X_te) / n_splits

    return oof, test_preds


In [None]:
# XGBoost settings for the multi-class base model (defined once; the notebook
# previously repeated this identical cell twice).
xgb_params = {
    'objective': 'multi:softprob',   # per-class probabilities
    'num_class': num_classes,
    'n_estimators': 1000,            # upper bound; early stopping may end sooner
    'learning_rate': 0.03,
    'max_depth': 6,
    'subsample': 0.8,
    'colsample_bytree': 0.6,
    'reg_alpha': 2,
    'eval_metric': 'mlogloss',
    'early_stopping_rounds': 50,
    'random_state': 42,
    'n_jobs': -1
}



In [None]:
# LightGBM settings for the multi-class base model.
lgb_params = dict(
    objective='multiclass',
    num_class=num_classes,
    metric='multi_logloss',
    learning_rate=0.03,
    num_leaves=40,
    feature_fraction=0.6,
    bagging_fraction=0.8,
    bagging_freq=5,
    verbosity=-1,
    seed=42,
)


In [None]:
# Out-of-fold and fold-averaged test probabilities from each base learner.
oof_xgb, test_xgb = get_oof_predictions(
    model_type='xgb',
    params=xgb_params,
    X=X,
    y=y_encoded,
    X_test=test,
    cat_cols=cat_cols,
)

oof_lgb, test_lgb = get_oof_predictions(
    model_type='lgb',
    params=lgb_params,
    X=X,
    y=y_encoded,
    X_test=test,
    cat_cols=cat_cols,
)


In [None]:
# Level-2 inputs: the class-probability blocks of both base models placed
# side by side (XGBoost columns first, then LightGBM columns).
X_meta_train = np.hstack([oof_xgb, oof_lgb])
X_meta_test = np.hstack([test_xgb, test_lgb])

# Sanity checks before fitting: one meta-row per training label, and the same
# feature layout for the train and test meta-matrices.
assert X_meta_train.shape[0] == len(y_encoded), "OOF rows do not match the number of labels"
assert X_meta_train.shape[1] == X_meta_test.shape[1], "train/test meta-features differ in width"

# Logistic-regression stacker fitted on the out-of-fold predictions.
# Requires `from sklearn.linear_model import LogisticRegression` in the
# import cell; without it this cell raises NameError on a fresh kernel.
meta_model = LogisticRegression(
    max_iter=1000,
    solver='lbfgs',
    random_state=42
)

meta_model.fit(X_meta_train, y_encoded)


In [None]:
# Stacked class probabilities for the test set, reduced to the most likely
# class per row and mapped back to the original class names.
final_probs = meta_model.predict_proba(X_meta_test)
best_class_idx = np.argmax(final_probs, axis=1)
final_preds = target_le.inverse_transform(best_class_idx)

# `id` was dropped from the working test frame, so it is re-read from disk.
test_ids = pd.read_csv('test.csv')['id']
submission = pd.DataFrame({'id': test_ids, 'attack_cat': final_preds})

submission.to_csv('submission_stacking.csv', index=False)
print("Saved submission_stacking.csv")
