In [3]:
!pip install pandas numpy matplotlib seaborn scikit-learn xgboost lightgbm catboost imbalanced-learn



In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA
from sklearn.ensemble import ExtraTreesClassifier, VotingClassifier, RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, classification_report
from imblearn.combine import SMOTETomek
from sklearn.utils.class_weight import compute_class_weight

In [17]:
# Load data and split first
train_path = "/kaggle/input/crest-dataset/dataset/portable_executable.csv"
df = pd.read_csv(train_path)

# Drop SHA256 column
if 'SHA256' in df.columns:
    df.drop(columns=['SHA256'], inplace=True)

# Prepare data
X = df.drop(columns=['Type'])
y = df['Type']

# Split into train and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
feature_names = X_train.columns
# Impute missing values using training median
train_median = X_train.median()
X_train = X_train.fillna(train_median)
X_valid = X_valid.fillna(train_median)

In [6]:
from sklearn.feature_selection import RFECV

# Feature selection using RFECV with Extra Trees
extra_trees = ExtraTreesClassifier(n_estimators=100, random_state=42)
selector = RFECV(extra_trees, step=10, cv=3, scoring='accuracy')
selector.fit(X_train, y_train)

# Keep selected features
X_train = X_train.loc[:, selector.support_]
X_valid = X_valid.loc[:, selector.support_]

In [7]:

X_train = np.log1p(X_train)
X_valid = np.log1p(X_valid)


X_train = np.nan_to_num(X_train)
X_valid = np.nan_to_num(X_valid)


scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_valid_scaled = scaler.transform(X_valid)

In [8]:

smote_tomek = SMOTETomek(random_state=42)
X_train_smote, y_train_smote = smote_tomek.fit_resample(X_train_scaled, y_train)


class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weight_dict = {cls: weight for cls, weight in zip(np.unique(y_train), class_weights)}

In [9]:
from sklearn.model_selection import GridSearchCV


xgb_param_grid = {
    'max_depth': [5, 7],
    'learning_rate': [0.05, 0.1],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}
xgb = XGBClassifier(random_state=42)
xgb_grid = GridSearchCV(xgb, xgb_param_grid, cv=3, scoring='accuracy')
xgb_grid.fit(X_train_smote, y_train_smote)
best_xgb = xgb_grid.best_estimator_


lgbm_param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.05, 0.1],
    'max_depth': [10, 15]
}
lgbm = LGBMClassifier(random_state=42, class_weight='balanced', verbose=-1)
lgbm_grid = GridSearchCV(lgbm, lgbm_param_grid, cv=3, scoring='accuracy')
lgbm_grid.fit(X_train_smote, y_train_smote)
best_lgbm = lgbm_grid.best_estimator_

from sklearn.linear_model import LogisticRegression

# Generate base model predictions
base_predictions = []
for model in [best_xgb, best_lgbm]:
    base_predictions.append(model.predict_proba(X_train_smote))

# Stack predictions for meta-model
X_meta = np.hstack(base_predictions)

# Train meta-model
meta_model = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)
meta_model.fit(X_meta, y_train_smote)


val_predictions = [model.predict_proba(X_valid_scaled) for model in [best_xgb, best_lgbm]]
X_val_meta = np.hstack(val_predictions)
y_pred_ensemble = meta_model.predict(X_val_meta)

print("\nClassification Report for Ensemble Model:")
print(classification_report(y_valid, y_pred_ensemble))


Classification Report for Ensemble Model:
              precision    recall  f1-score   support

           0       0.93      0.93      0.93       359
           1       0.95      0.94      0.95       957
           2       0.99      0.99      0.99       882
           3       0.88      0.85      0.86       936
           4       0.94      0.89      0.91       965
           5       0.76      0.76      0.76       800
           6       0.72      0.81      0.77       704

    accuracy                           0.88      5603
   macro avg       0.88      0.88      0.88      5603
weighted avg       0.89      0.88      0.88      5603



In [12]:
import joblib
import numpy as np
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV

ensemble_models = {
    'best_xgb': best_xgb,
    'best_lgbm': best_lgbm,
    'meta_model': meta_model
}

joblib.dump(ensemble_models, 'ensemble_model_pe.pkl')

print("Ensemble model saved as 'ensemble_model.pkl'")


Ensemble model saved as 'ensemble_model.pkl'


In [19]:
selected_features = feature_names[selector.support_].tolist()

# Save feature names to a CSV file
pd.DataFrame({"Feature": selected_features}).to_csv("selected_features_pe.csv", index=False)

# Print selected feature names
print("Selected Features:", selected_features)

Selected Features: ['TimeDateStamp', 'MajorLinkerVersion', 'SectionAlignment', 'SizeOfImage', 'DllCharacteristics', 'text_Misc_VirtualSize', 'text_SizeOfRawData', 'data_Characteristics', 'rdata_Characteristics', 'rsrc_Misc_VirtualSize', 'rsrc_SizeOfRawData', 'rsrc_PointerToRawData']
