In [1]:
import pickle

# Restore the train/test split that an earlier step serialised to disk.
# NOTE(review): pickle.load executes arbitrary code embedded in the file, so
# only point this at a pickle produced by a trusted pipeline.
with open('preprocessed_data.pkl', 'rb') as pkl_file:
    splits = pickle.load(pkl_file)

# The pickle is expected to hold exactly four objects, in this order.
X_train, X_test, y_train, y_test = splits

# Quick sanity check on the feature matrices.
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)


X_train shape: (8000, 8)
X_test shape: (2000, 8)


In [4]:
# Clean column names by removing or replacing invalid characters
def clean_column_names(df):
    """Return a copy of ``df`` with sanitised column labels.

    Every space in a label becomes an underscore, and the characters
    ``[``, ``]`` and ``<`` are removed. The frame passed in is not modified;
    the renamed copy is returned.
    """
    # A single translation table performs all four substitutions in one pass.
    substitutions = str.maketrans({' ': '_', '[': None, ']': None, '<': None})
    cleaned = df.copy()
    cleaned.columns = [label.translate(substitutions) for label in cleaned.columns]
    return cleaned

# Sanitise the feature names of both splits the same way so their columns
# stay aligned; re-running this cell is harmless because already-clean names
# are left unchanged.
X_train, X_test = map(clean_column_names, (X_train, X_test))


In [7]:
# Hyper-parameter search space for the pipeline step named 'xgb'.
# The 'xgb__' prefix routes each entry to that step's estimator.
# 3 * 3 * 3 * 2 * 2 * 3 * 3 = 972 combinations in total.
param_grid_xgb_extended = dict(
    # Boosting schedule and tree size
    xgb__n_estimators=[100, 200, 300],
    xgb__max_depth=[4, 6, 8],
    xgb__learning_rate=[0.01, 0.1, 0.2],
    # Row / column subsampling per tree
    xgb__subsample=[0.8, 1.0],
    xgb__colsample_bytree=[0.8, 1.0],
    # L1 / L2 regularisation strengths
    xgb__reg_alpha=[0, 0.1, 0.5],
    xgb__reg_lambda=[1, 1.5, 2.0],
)


In [8]:
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# Define the pipeline with SMOTE and XGBoost.
# The imblearn Pipeline (not sklearn's) is used so the resampling step runs
# only while fitting; prediction on held-out data is never resampled.
pipeline_xgb_extended = Pipeline([
    ('smote', SMOTE(random_state=42)),
    # `use_label_encoder` is deliberately not passed: the installed XGBoost
    # ignores it and prints 'Parameters: { "use_label_encoder" } are not used.'
    # whenever the model is fitted.
    ('xgb', XGBClassifier(random_state=42, eval_metric='logloss'))
])

# Set up GridSearchCV with the expanded parameter grid.
# scoring='f1' ranks candidates by the F1 of the positive class (label 1),
# which matters here because that class is rare.
grid_search_xgb_ext = GridSearchCV(
    estimator=pipeline_xgb_extended,
    param_grid=param_grid_xgb_extended,
    cv=5,
    scoring='f1',
    n_jobs=-1,   # run the candidate fits in parallel on all cores
    verbose=2
)

# Fit the grid search on training data.
# This is the expensive step: every grid combination is fitted once per fold.
grid_search_xgb_ext.fit(X_train, y_train)

# Output the best parameters and best score
print("Best parameters (extended):", grid_search_xgb_ext.best_params_)
print("Best cross-validation F1 score (extended):", grid_search_xgb_ext.best_score_)

# Evaluate on the test set.
# best_estimator_ is the winning pipeline as refitted by GridSearchCV
# (refit is left at its default).
best_xgb_ext = grid_search_xgb_ext.best_estimator_
y_pred_xgb_ext = best_xgb_ext.predict(X_test)
print("Classification Report for Tuned XGBoost (with SMOTE):")
print(classification_report(y_test, y_pred_xgb_ext))


Fitting 5 folds for each of 972 candidates, totalling 4860 fits


Parameters: { "use_label_encoder" } are not used.



Best parameters (extended): {'xgb__colsample_bytree': 1.0, 'xgb__learning_rate': 0.2, 'xgb__max_depth': 6, 'xgb__n_estimators': 300, 'xgb__reg_alpha': 0, 'xgb__reg_lambda': 2.0, 'xgb__subsample': 1.0}
Best cross-validation F1 score (extended): 0.70575134061581
Classification Report for Tuned XGBoost (with SMOTE):
              precision    recall  f1-score   support

           0       0.99      0.98      0.99      1939
           1       0.56      0.79      0.66        61

    accuracy                           0.97      2000
   macro avg       0.78      0.88      0.82      2000
weighted avg       0.98      0.97      0.98      2000



In [9]:
from imblearn.combine import SMOTETomek
from imblearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# Define the pipeline with SMOTETomek and XGBoost.
# The imblearn Pipeline (not sklearn's) is used so the resampling step runs
# only while fitting; prediction on held-out data is never resampled.
pipeline_xgb_stm = Pipeline([
    ('smotetomek', SMOTETomek(random_state=42)),
    # `use_label_encoder` is deliberately not passed: the installed XGBoost
    # ignores it and prints 'Parameters: { "use_label_encoder" } are not used.'
    # whenever the model is fitted.
    ('xgb', XGBClassifier(random_state=42, eval_metric='logloss'))
])

# Define an example parameter grid (you can expand it as needed).
# The 'xgb__' prefix routes each entry to the pipeline step named 'xgb';
# 2 * 3 * 3 = 18 combinations.
param_grid_xgb_stm = {
    'xgb__n_estimators': [100, 200],
    'xgb__max_depth': [4, 6, 8],
    'xgb__learning_rate': [0.01, 0.1, 0.2]
}

# Set up GridSearchCV.
# scoring='f1' ranks candidates by the F1 of the positive class (label 1),
# which matters here because that class is rare.
grid_search_xgb_stm = GridSearchCV(
    estimator=pipeline_xgb_stm,
    param_grid=param_grid_xgb_stm,
    cv=5,
    scoring='f1',
    n_jobs=-1,   # run the candidate fits in parallel on all cores
    verbose=2
)

# Fit grid search on training data (assuming X_train and y_train are already loaded)
grid_search_xgb_stm.fit(X_train, y_train)

# Output the best parameters and cross-validation score
print("Best parameters with SMOTETomek:", grid_search_xgb_stm.best_params_)
print("Best cross-validation F1 score:", grid_search_xgb_stm.best_score_)

# Evaluate on the test set.
# best_estimator_ is the winning pipeline as refitted by GridSearchCV
# (refit is left at its default).
best_xgb_stm = grid_search_xgb_stm.best_estimator_
y_pred_xgb_stm = best_xgb_stm.predict(X_test)
print("Classification Report for XGBoost with SMOTETomek:")
print(classification_report(y_test, y_pred_xgb_stm))


Fitting 5 folds for each of 18 candidates, totalling 90 fits


Parameters: { "use_label_encoder" } are not used.



Best parameters with SMOTETomek: {'xgb__learning_rate': 0.2, 'xgb__max_depth': 6, 'xgb__n_estimators': 200}
Best cross-validation F1 score: 0.7000092647740057
Classification Report for XGBoost with SMOTETomek:
              precision    recall  f1-score   support

           0       0.99      0.98      0.98      1939
           1       0.52      0.75      0.61        61

    accuracy                           0.97      2000
   macro avg       0.75      0.87      0.80      2000
weighted avg       0.98      0.97      0.97      2000

