In [5]:
!pip install pandas numpy matplotlib seaborn scikit-learn xgboost lightgbm catboost imbalanced-learn

Collecting xgboost
  Downloading xgboost-2.1.3-py3-none-manylinux_2_28_x86_64.whl (153.9 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 153.9/153.9 MB 3.4 MB/s eta 0:00:00
Collecting lightgbm
  Downloading lightgbm-4.5.0-py3-none-manylinux_2_28_x86_64.whl (3.6 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3.6/3.6 MB 58.0 MB/s eta 0:00:00
Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 98.7/98.7 MB 5.2 MB/s eta 0:00:00
Collecting imbalanced-learn
  Downloading imbalanced_learn-0.13.0-py3-none-any.whl (238 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 238.4/238.4 kB 18.0 MB/s eta 0:00:00
Collecting numpy
  Downloading numpy-1.26.4-cp310-cp310-manylinux_2_17_x86_64.ma

In [6]:
import warnings

import joblib
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from lightgbm import LGBMClassifier
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, StackingClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from xgboost import XGBClassifier

In [7]:
# Suppress every warning category for the rest of the session.
warnings.filterwarnings("ignore")

# === Load Data ===
# Read the DLL-import table from the Kaggle input directory.
df = pd.read_csv("/kaggle/input/crest-dataset/dataset/DLLs_Imported.csv")

# === Preprocessing ===
# The SHA256 column is not used as a model input, so it is removed up front.
df = df.drop(columns=["SHA256"])

In [31]:
# Turn the class labels in "Type" into integer codes, written back into df.
# NOTE: because df["Type"] is overwritten, re-running only this cell refits the
# encoder on the integer codes, so label_encoder.classes_ no longer holds the
# original labels. Re-run the load cell first if the original labels are needed.
label_encoder = LabelEncoder()
df["Type"] = label_encoder.fit_transform(df["Type"])

# Target vector and feature matrix (every column except the target).
y = df["Type"]
X = df.drop(columns=["Type"])

# === Train-Test Split ===
# 80/20 hold-out split, stratified on the target, with a fixed seed.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Column names are captured now; later steps replace X_train with bare arrays.
feature_names = X_train.columns

In [9]:
# NOTE: this cell overwrites X_train / X_valid with reduced arrays, so it must be
# run exactly once after the train/validation split cell.

# === Feature Selection: Extra Trees ===
# Fit an Extra Trees model and keep the features whose importance reaches the median.
et_model = ExtraTreesClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
selector = SelectFromModel(et_model, threshold="median", prefit=True)
X_train, X_valid = selector.transform(X_train), selector.transform(X_valid)

# === Remove Highly Correlated Features ===
# Look only at the strict upper triangle of the absolute correlation matrix so each
# pair is inspected once; a column is dropped when it correlates above 0.9 with any
# column that precedes it.
corr_matrix = pd.DataFrame(X_train).corr().abs()
upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1))
to_drop = [column for column in upper_tri.columns if (upper_tri[column] > 0.9).any()]
X_train, X_valid = (
    np.delete(X_train, to_drop, axis=1),
    np.delete(X_valid, to_drop, axis=1),
)

# === Apply SMOTE for Class Balancing ===
# Only the training split is oversampled; the validation split is left untouched.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# === Standard Scaling ===
# The scaler is fitted on the resampled training data and then applied to both splits.
scaler = StandardScaler().fit(X_train_smote)
X_train_smote = scaler.transform(X_train_smote)
X_valid_scaled = scaler.transform(X_valid)

In [38]:
# === Model Training & Hyperparameter Tuning ===
# Each base learner is tuned with a 3-fold grid search (accuracy) on the
# SMOTE-resampled, standardized training matrix.
# NOTE(review): the folds are cut after oversampling, so synthetic samples can sit in
# a different fold than the real samples they were interpolated from; the CV scores
# used to pick hyperparameters are probably optimistic. Confirm before relying on them.
rf_param_grid = {
    'n_estimators': [400, 500],
    'max_depth': [20, 15],
    'min_samples_split': [2, 5]
}
rf = RandomForestClassifier(random_state=42)
rf_grid = GridSearchCV(rf, rf_param_grid, cv=3, scoring='accuracy')
rf_grid.fit(X_train_smote, y_train_smote)
best_rf = rf_grid.best_estimator_

lgbm_param_grid = {
    'n_estimators': [400, 500],
    'learning_rate': [0.05, 0.1],
    'max_depth': [20, 15]
}
lgbm = LGBMClassifier(random_state=42, class_weight='balanced', verbose=-1)
lgbm_grid = GridSearchCV(lgbm, lgbm_param_grid, cv=3, scoring='accuracy')
lgbm_grid.fit(X_train_smote, y_train_smote)
best_lgbm = lgbm_grid.best_estimator_


# === Stacked Ensemble Learning ===
# The tuned base learners feed their class probabilities into a logistic-regression
# meta-learner. `multi_class` is intentionally not passed: the argument is deprecated
# since scikit-learn 1.5, and with the lbfgs solver on a target with more than two
# classes the default already fits a multinomial model.
stack_model = StackingClassifier(
    estimators=[("rf", best_rf), ("lgbm", best_lgbm)],
    final_estimator=LogisticRegression(solver="lbfgs", max_iter=1000),
    stack_method="predict_proba"
)
stack_model.fit(X_train_smote, y_train_smote)

# Evaluate on the untouched (non-oversampled) validation split.
y_pred_ensemble = stack_model.predict(X_valid_scaled)

print("\nClassification Report for Ensemble Model:")
print(classification_report(y_valid, y_pred_ensemble))



Classification Report for Ensemble Model:
              precision    recall  f1-score   support

           0       0.86      0.40      0.54       359
           1       0.86      0.75      0.80       957
           2       0.94      0.93      0.93       882
           3       0.90      0.41      0.56       936
           4       0.69      0.69      0.69       965
           5       1.00      0.12      0.21       801
           6       0.31      0.98      0.47       704

    accuracy                           0.63      5604
   macro avg       0.79      0.61      0.60      5604
weighted avg       0.80      0.63      0.62      5604



In [36]:
# Save the trained stacked ensemble model to the working directory.
# (joblib is imported in the notebook's import cell.)
# NOTE: only `stack_model` is serialized here. The fitted `selector`, the `to_drop`
# column positions, the `scaler` and the `label_encoder` are not part of this file,
# so they must be persisted separately to reproduce the model's input at inference.
joblib.dump(stack_model, "stack_model.pkl")

print("Stacked ensemble model saved successfully as 'stack_model.pkl'.")


Stacked ensemble model saved successfully as 'stack_model.pkl'.


In [35]:
# Get the names of the columns the model was actually trained on.
# Two reductions were applied to the original columns: SelectFromModel kept a subset,
# and the correlation filter then removed the positions listed in `to_drop` from that
# subset. Both must be replayed here, otherwise the exported list can contain columns
# the model never saw.
kept_by_selector = feature_names[selector.get_support()].tolist()
dropped_positions = set(to_drop)  # positions are relative to the selector's output
selected_features = [
    name for position, name in enumerate(kept_by_selector)
    if position not in dropped_positions
]

# Save feature names to a CSV file
pd.DataFrame({"Feature": selected_features}).to_csv("selected_features_dll.csv", index=False)

# Print selected feature names
print("Selected Features:", selected_features)


Selected Features: ['advapi32.dll', 'kernel32.dll', 'vspmsg.dll', 'ole32.dll', 'oleaut32.dll', 'psapi.dll', 'setupapi.dll', 'shlwapi.dll', 'pdh.dll', 'xmllite.dll', 'msvcr110.dll', 'user32.dll', 'msvcrt.dll', 'shell32.dll', 'ntdll.dll', 'api-ms-win-core-winrt-l1-1-0.dll', 'msvcr100.dll', 'atl100.dll', 'msvcp100.dll', 'version.dll', 'mspdbcore.dll', 'rpcrt4.dll', 'secur32.dll', 'userenv.dll', 'mpclient.dll', 'cabinet.dll', 'comctl32.dll', 'gdi32.dll', 'api-ms-win-core-com-l1-1-1.dll', 'api-ms-win-core-synch-l1-2-0.dll', 'api-ms-win-core-processthreads-l1-1-2.dll', 'api-ms-win-core-errorhandling-l1-1-1.dll', 'api-ms-win-core-libraryloader-l1-2-0.dll', 'api-ms-win-core-profile-l1-1-0.dll', 'api-ms-win-core-sysinfo-l1-2-1.dll', 'api-ms-win-core-string-l1-1-0.dll', 'api-ms-win-core-registry-l1-1-0.dll', 'api-ms-win-core-io-l1-1-1.dll', 'api-ms-win-core-file-l1-2-1.dll', 'netapi32.dll', 'api-ms-win-core-rtlsupport-l1-2-0.dll', 'api-ms-win-core-heap-l2-1-0.dll', 'api-ms-win-core-heap-l1-2-0.d