<a href="https://colab.research.google.com/github/khorzhengyu/project518/blob/main/Machine_Learning_Algorithm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
## Logistic Regression - Imbalanced data ##
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# Define the hyperparameter grid
log_reg_params = { 'C': [0.001, 0.01, 0.1, 1.0], 'max_iter' :[2000], 'random_state':[13]}

# Create a cross-validation strategy (Stratified K-Fold)
cv = StratifiedKFold(n_splits=5, shuffle=False)

# Initialize GridSearchCV with Logistic Regression, hyperparameter grid, and cross-validation
grid_log_reg = GridSearchCV(LogisticRegression(random_state = 13), log_reg_params, cv=cv, scoring = 'recall')

# Fit the grid search to your training data
grid_log_reg.fit(X_train, y_train)

# Print the best hyperparameters
print("Best hyperparameters for Logistic Regression:", grid_log_reg.best_params_['C'])

# Get the best estimator
best_log_reg = grid_log_reg.best_estimator_

In [None]:
predictionLR = grid_log_reg.predict(X_test)
print(confusion_matrix(y_test,predictionLR))
print(classification_report(y_test,predictionLR))

In [None]:
%matplotlib inline

cnf_matrix = confusion_matrix(y_test,predictionLR)

class_names=['Non-Fraud', 'Fraud'] # name  of classes
fig, ax = plt.subplots(figsize=(7, 6))
sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, cmap="YlGnBu", fmt='g')
ax.xaxis.set_label_position("bottom")
plt.tight_layout()
plt.title('Confusion matrix')
plt.ylabel('True label')
plt.xlabel('Predicted label')
tick_marks = [0.5, 1.5]
plt.xticks(tick_marks, class_names)
plt.yticks(tick_marks, class_names)

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score

# Get predicted probabilities from the best Logistic Regression model
y_proba = best_log_reg.predict_proba(X_test)[:, 1]

# Calculate the ROC curve
fpr, tpr, _ = roc_curve(y_test, y_proba)

# Calculate the AUC
auc = roc_auc_score(y_test, y_proba)

# Plot the ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label="Logistic Regression, AUC={:.3f}".format(auc))
plt.plot([0, 1], [0, 1], color='orange', linestyle='--')

plt.xticks(np.arange(0.0, 1.1, step=0.1))
plt.xlabel("False Positive Rate", fontsize=15)

plt.yticks(np.arange(0.0, 1.1, step=0.1))
plt.ylabel("True Positive Rate", fontsize=15)

# Add the title and show the AUC in a legend at the lower right
plt.title('ROC Curve Analysis', fontweight='bold', fontsize=15)
plt.legend(prop={'size': 13}, loc='lower right')

plt.show()


In [None]:
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import PrecisionRecallDisplay

# Precision-recall curve for the tuned Logistic Regression trained on the
# original (imbalanced) data. `name` labels the curve in the legend, which
# already appends "(AP = ...)", so it carries the model name instead.
display = PrecisionRecallDisplay.from_estimator(
    best_log_reg, X_test, y_test, name="Logistic Regression")
_ = display.ax_.set_title("Precision-Recall curve: Logistic Regression (imbalanced data)")

In [None]:
## Logistic Regression - Undersampling ##
rus = RandomUnderSampler(random_state = 42)
X_rus, y_rus = rus.fit_resample(X_train, y_train)

In [None]:
cv = StratifiedKFold(n_splits=5, shuffle=False)

In [None]:
from imblearn.pipeline import Pipeline, make_pipeline
from sklearn.linear_model import LogisticRegression

logistic_under_pipeline = make_pipeline(RandomUnderSampler(random_state=42),
                                        LogisticRegression(random_state=13, max_iter=1500))

In [None]:
from sklearn.model_selection import GridSearchCV
from imblearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

# Define the hyperparameter grid, specifying 'logisticregression__' for the C parameter
new_params = {
    'logisticregression__C': [0.001, 0.01, 0.1,1.0]
}

# Create a GridSearchCV instance, specifying the pipeline, parameter grid, cv, scoring, etc.
grid_under_logistic = GridSearchCV(
    logistic_under_pipeline,  # Your pipeline with Logistic Regression
    param_grid=new_params,  # The hyperparameter grid
    cv= cv ,  # Your cross-validation strategy (StratifiedKFold)
    scoring='recall',  # The scoring metric you want to optimize for
    return_train_score=True
)

# Fit the grid search to your data
grid_under_logistic.fit(X_train, y_train)


In [None]:
print('Best parameters:', grid_under_logistic.best_params_)
print('Best score:', grid_under_logistic.best_score_)

In [None]:
# Predict through the fitted search object (it delegates to the refit best
# pipeline); the sampler step is only applied while fitting, so the
# predictions match those of the final logistic-regression step.
y_pred = grid_under_logistic.predict(X_test)

In [None]:
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

In [None]:
%matplotlib inline

matrix_under = confusion_matrix(y_test,y_pred)

class_names=['Non-Fraud', 'Fraud'] # name  of classes
fig, ax = plt.subplots(figsize=(7, 6))
sns.heatmap(pd.DataFrame(matrix_under), annot=True, cmap="YlGnBu", fmt='g')
ax.xaxis.set_label_position("bottom")
plt.tight_layout()
plt.title('Confusion matrix')
plt.ylabel('True label')
plt.xlabel('Predicted label')
tick_marks = [0.5, 1.5]
plt.xticks(tick_marks, class_names)
plt.yticks(tick_marks, class_names)

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score

# Get predicted probabilities from the best Logistic Regression model
y_proba = grid_under_logistic.predict_proba(X_test)[:, 1]

# Calculate the ROC curve
fpr, tpr, _ = roc_curve(y_test, y_proba)

# Calculate the AUC
auc = roc_auc_score(y_test, y_proba)

# Plot the ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label="Logistic Regression, AUC={:.3f}".format(auc))
plt.plot([0, 1], [0, 1], color='orange', linestyle='--')

plt.xticks(np.arange(0.0, 1.1, step=0.1))
plt.xlabel("False Positive Rate", fontsize=15)

plt.yticks(np.arange(0.0, 1.1, step=0.1))
plt.ylabel("True Positive Rate", fontsize=15)

# Add the title and show the AUC in a legend at the lower right
plt.title('ROC Curve Analysis', fontweight='bold', fontsize=15)
plt.legend(prop={'size': 13}, loc='lower right')

plt.show()


In [None]:
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import PrecisionRecallDisplay

# Precision-recall curve for the Logistic Regression pipeline tuned with
# random undersampling. `name` labels the curve in the legend, which already
# appends "(AP = ...)", so it carries the model name instead.
display = PrecisionRecallDisplay.from_estimator(
    grid_under_logistic, X_test, y_test, name="Logistic Regression")
_ = display.ax_.set_title("Precision-Recall curve: Logistic Regression (random undersampling)")

In [None]:
## Logistic Regression - Oversampling ##
ros = RandomOverSampler(random_state = 42)
X_ros, y_ros = ros.fit_resample(X_train, y_train)

In [None]:
import numpy as np

# Calculate the unique values and their counts
unique_values, counts = np.unique(y_ros, return_counts=True)

# Print the results
for value, count in zip(unique_values, counts):
    print(f'Class {value}: {count} samples')


In [None]:
from imblearn.pipeline import Pipeline, make_pipeline
from sklearn.linear_model import LogisticRegression

logistic_over_pipeline = make_pipeline(RandomOverSampler(random_state=42),
                                        LogisticRegression(random_state=13, max_iter=1500))

score2 = cross_val_score(logistic_over_pipeline, X_train, y_train, scoring='recall', cv= cv)
print("Cross Validation Recall Scores are: {}".format(score2))
print("Average Cross Validation Recall score: {}".format(score2.mean()))

In [None]:
from sklearn.model_selection import GridSearchCV
from imblearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

# Define the hyperparameter grid, specifying 'logisticregression__' for the C parameter
new_params = {
    'logisticregression__C': [0.001, 0.01, 0.1,1.0]
}

# Create a GridSearchCV instance, specifying the pipeline, parameter grid, cv, scoring, etc.
grid_over_logistic = GridSearchCV(
    logistic_over_pipeline,  # Your pipeline with Logistic Regression
    param_grid=new_params,  # The hyperparameter grid
    cv= cv ,  # Your cross-validation strategy (StratifiedKFold)
    scoring='recall',  # The scoring metric you want to optimize for
    return_train_score=True
)

# Fit the grid search to your data
grid_over_logistic.fit(X_train, y_train)


In [None]:
print('Best parameters:', grid_over_logistic.best_params_)
print('Best score:', grid_over_logistic.best_score_)

In [None]:
# Predict through the fitted search object (it delegates to the refit best
# pipeline); the sampler step is only applied while fitting, so the
# predictions match those of the final logistic-regression step.
y_pred = grid_over_logistic.predict(X_test)

In [None]:
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

In [None]:
%matplotlib inline

matrix_over = confusion_matrix(y_test,y_pred)

class_names=['Non-Fraud', 'Fraud'] # name  of classes
fig, ax = plt.subplots(figsize=(7, 6))
sns.heatmap(pd.DataFrame(matrix_over), annot=True, cmap="YlGnBu", fmt='g')
ax.xaxis.set_label_position("bottom")
plt.tight_layout()
plt.title('Confusion matrix')
plt.ylabel('True label')
plt.xlabel('Predicted label')
tick_marks = [0.5, 1.5]
plt.xticks(tick_marks, class_names)
plt.yticks(tick_marks, class_names)

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score

# Get predicted probabilities from the best Logistic Regression model
y_proba = grid_over_logistic.predict_proba(X_test)[:, 1]

# Calculate the ROC curve
fpr, tpr, _ = roc_curve(y_test, y_proba)

# Calculate the AUC
auc = roc_auc_score(y_test, y_proba)

# Plot the ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label="Logistic Regression, AUC={:.3f}".format(auc))
plt.plot([0, 1], [0, 1], color='orange', linestyle='--')

plt.xticks(np.arange(0.0, 1.1, step=0.1))
plt.xlabel("False Positive Rate", fontsize=15)

plt.yticks(np.arange(0.0, 1.1, step=0.1))
plt.ylabel("True Positive Rate", fontsize=15)

# Add the title and show the AUC in a legend at the lower right
plt.title('ROC Curve Analysis', fontweight='bold', fontsize=15)
plt.legend(prop={'size': 13}, loc='lower right')

plt.show()


In [None]:
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import PrecisionRecallDisplay

# Precision-recall curve for the Logistic Regression pipeline tuned with
# random oversampling. `name` labels the curve in the legend, which already
# appends "(AP = ...)", so it carries the model name instead.
display = PrecisionRecallDisplay.from_estimator(
    grid_over_logistic, X_test, y_test, name="Logistic Regression")
_ = display.ax_.set_title("Precision-Recall curve: Logistic Regression (random oversampling)")

In [None]:
## Logistic Regression - SMOTE ##
from imblearn.over_sampling import SMOTE

# SMOTE lives inside the pipeline so that, during cross-validation, synthetic
# samples are generated from each training fold only.
smote_pipeline = make_pipeline(SMOTE(random_state=42),
                              LogisticRegression(max_iter = 2000, random_state=13))

# scoring='recall' is required here: without it cross_val_score falls back to
# the estimator's default score (accuracy), which would contradict the
# "Recall" labels printed below and the other sections of this notebook.
score2 = cross_val_score(smote_pipeline, X_train, y_train, scoring='recall', cv= cv)
print("Cross Validation Recall Scores are: {}".format(score2))
print("Average Cross Validation Recall score: {}".format(score2.mean()))

In [None]:
from sklearn.model_selection import GridSearchCV
from imblearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

# Define the hyperparameter grid, specifying 'logisticregression__' for the C parameter
new_params = {
    'logisticregression__C': [0.001, 0.01, 0.1,1.0]
}

# Create a GridSearchCV instance, specifying the pipeline, parameter grid, cv, scoring, etc.
# Grid search over C for the SMOTE + Logistic Regression pipeline.
# scoring='recall' matches the other grid searches in this notebook; leaving
# it out makes GridSearchCV select the model by accuracy instead.
grid_smote_logistic = GridSearchCV(
    smote_pipeline,  # Your pipeline with Logistic Regression
    param_grid=new_params,  # The hyperparameter grid
    cv= cv ,  # Your cross-validation strategy (StratifiedKFold)
    scoring='recall',  # The scoring metric you want to optimize for
    return_train_score=True
)

# Fit the grid search to your data
grid_smote_logistic.fit(X_train, y_train)


In [None]:
print('Best parameters:', grid_smote_logistic.best_params_)
print('Best score:', grid_smote_logistic.best_score_)

In [None]:
# Predict through the fitted search object (it delegates to the refit best
# pipeline); the SMOTE step is only applied while fitting, so the
# predictions match those of the final logistic-regression step.
y_pred = grid_smote_logistic.predict(X_test)

In [None]:
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

In [None]:
%matplotlib inline

matrix_smote = confusion_matrix(y_test,y_pred)

class_names=['Non-Fraud', 'Fraud'] # name  of classes
fig, ax = plt.subplots(figsize=(7, 6))
sns.heatmap(pd.DataFrame(matrix_smote), annot=True, cmap="YlGnBu", fmt='g')
ax.xaxis.set_label_position("bottom")
plt.tight_layout()
plt.title('Confusion matrix')
plt.ylabel('True label')
plt.xlabel('Predicted label')
tick_marks = [0.5, 1.5]
plt.xticks(tick_marks, class_names)
plt.yticks(tick_marks, class_names)

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score

# Get predicted probabilities from the best Logistic Regression model
y_proba = grid_smote_logistic.predict_proba(X_test)[:, 1]

# Calculate the ROC curve
fpr, tpr, _ = roc_curve(y_test, y_proba)

# Calculate the AUC
auc = roc_auc_score(y_test, y_proba)

# Plot the ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label="Logistic Regression, AUC={:.3f}".format(auc))
plt.plot([0, 1], [0, 1], color='orange', linestyle='--')

plt.xticks(np.arange(0.0, 1.1, step=0.1))
plt.xlabel("False Positive Rate", fontsize=15)

plt.yticks(np.arange(0.0, 1.1, step=0.1))
plt.ylabel("True Positive Rate", fontsize=15)

# Add the title and show the AUC in a legend at the lower right
plt.title('ROC Curve Analysis', fontweight='bold', fontsize=15)
plt.legend(prop={'size': 13}, loc='lower right')

plt.show()


In [None]:
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import PrecisionRecallDisplay

# Precision-recall curve for the Logistic Regression pipeline tuned with
# SMOTE. `name` labels the curve in the legend, which already appends
# "(AP = ...)", so it carries the model name instead.
display = PrecisionRecallDisplay.from_estimator(
    grid_smote_logistic, X_test, y_test, name="Logistic Regression")
_ = display.ax_.set_title("Precision-Recall curve: Logistic Regression (SMOTE)")

In [None]:
## Random Forest - Imbalanced data ##
kf = StratifiedKFold(n_splits=5, shuffle=False)

In [None]:
rf = RandomForestClassifier(n_estimators=100, random_state=13)

In [None]:
# scoring='recall' is required here: without it cross_val_score reports the
# classifier's default score (accuracy), which would contradict the "Recall"
# labels printed below.
score = cross_val_score(rf, X_train, y_train, scoring='recall', cv=kf)
print("Cross Validation Recall scores are: {}".format(score))
print("Average Cross Validation Recall score: {}".format(score.mean()))

In [None]:
from sklearn.model_selection import GridSearchCV

params = {
    'n_estimators': [50, 100, 200],
    'random_state': [13]
}

grid_rf = GridSearchCV(rf, param_grid=params, cv=kf,
                          scoring='recall').fit(X_train, y_train)

In [None]:
print('Best parameters:', grid_rf.best_params_)
print('Best score:', grid_rf.best_score_)

In [None]:
y_pred = grid_rf.predict(X_test)

In [None]:
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

In [None]:
%matplotlib inline

rf_matrix = confusion_matrix(y_test,y_pred)

class_names=['Non-Fraud', 'Fraud'] # name  of classes
fig, ax = plt.subplots(figsize=(7, 6))
sns.heatmap(pd.DataFrame(rf_matrix), annot=True, cmap="YlGnBu", fmt='g')
ax.xaxis.set_label_position("bottom")
plt.tight_layout()
plt.title('Confusion matrix')
plt.ylabel('True label')
plt.xlabel('Predicted label')
tick_marks = [0.5, 1.5]
plt.xticks(tick_marks, class_names)
plt.yticks(tick_marks, class_names)

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score

# Get predicted positive-class probabilities from the best Random Forest model found by the grid search
y_proba = grid_rf.predict_proba(X_test)[:, 1]

# Calculate the ROC curve
fpr, tpr, _ = roc_curve(y_test, y_proba)

# Calculate the AUC
auc = roc_auc_score(y_test, y_proba)

# Plot the ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label="Random Forest, AUC={:.3f}".format(auc))
plt.plot([0, 1], [0, 1], color='orange', linestyle='--')

plt.xticks(np.arange(0.0, 1.1, step=0.1))
plt.xlabel("False Positive Rate", fontsize=15)

plt.yticks(np.arange(0.0, 1.1, step=0.1))
plt.ylabel("True Positive Rate", fontsize=15)

# Add the title and show the AUC in a legend at the lower right
plt.title('ROC Curve Analysis', fontweight='bold', fontsize=15)
plt.legend(prop={'size': 13}, loc='lower right')

plt.show()


In [None]:
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import PrecisionRecallDisplay

# Precision-recall curve for the tuned Random Forest trained on the original
# (imbalanced) data. `name` labels the curve in the legend, which already
# appends "(AP = ...)", so it carries the model name instead.
display = PrecisionRecallDisplay.from_estimator(
    grid_rf, X_test, y_test, name="Random Forest")
_ = display.ax_.set_title("Precision-Recall curve: Random Forest (imbalanced data)")

In [None]:
## Random Forest - Oversampling ##
from imblearn.over_sampling import RandomOverSampler

rf_over_pipeline = make_pipeline(RandomOverSampler(random_state=42),
                              RandomForestClassifier(n_estimators = 100, random_state=13))

In [None]:
from sklearn.model_selection import GridSearchCV
from imblearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

# Define the hyperparameter grid for Random Forest
params = {
    'randomforestclassifier__n_estimators': [50, 100, 200],
    'randomforestclassifier__random_state': [13]
}

# Create a GridSearchCV instance, specifying the pipeline, parameter grid, cv, scoring, etc.
grid_over_rf = GridSearchCV(
    rf_over_pipeline,  # Your pipeline with Random Forest
    param_grid=params,  # The hyperparameter grid
    cv=cv,  # Your cross-validation strategy (StratifiedKFold)
    scoring='recall',  # The scoring metric you want to optimize for
    return_train_score=True
)

# Fit the grid search to your data
grid_over_rf.fit(X_train, y_train)


In [None]:
print('Best parameters:', grid_over_rf.best_params_)
print('Best score:', grid_over_rf.best_score_)

In [None]:
# Predict through the fitted search object (it delegates to the refit best
# pipeline); the sampler step is only applied while fitting, so the
# predictions match those of the final random-forest step.
y_pred = grid_over_rf.predict(X_test)

In [None]:
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

In [None]:
%matplotlib inline

rf_over = confusion_matrix(y_test,y_pred)

class_names=['Non-Fraud', 'Fraud'] # name  of classes
fig, ax = plt.subplots(figsize=(7, 6))
sns.heatmap(pd.DataFrame(rf_over), annot=True, cmap="YlGnBu", fmt='g')
ax.xaxis.set_label_position("bottom")
plt.tight_layout()
plt.title('Confusion matrix')
plt.ylabel('True label')
plt.xlabel('Predicted label')
tick_marks = [0.5, 1.5]
plt.xticks(tick_marks, class_names)
plt.yticks(tick_marks, class_names)

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score

# Get predicted positive-class probabilities from the best Random Forest pipeline found by the grid search
y_proba = grid_over_rf.predict_proba(X_test)[:, 1]

# Calculate the ROC curve
fpr, tpr, _ = roc_curve(y_test, y_proba)

# Calculate the AUC
auc = roc_auc_score(y_test, y_proba)

# Plot the ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label="Random Forest, AUC={:.3f}".format(auc))
plt.plot([0, 1], [0, 1], color='orange', linestyle='--')

plt.xticks(np.arange(0.0, 1.1, step=0.1))
plt.xlabel("False Positive Rate", fontsize=15)

plt.yticks(np.arange(0.0, 1.1, step=0.1))
plt.ylabel("True Positive Rate", fontsize=15)

# Add the title and show the AUC in a legend at the lower right
plt.title('ROC Curve Analysis', fontweight='bold', fontsize=15)
plt.legend(prop={'size': 13}, loc='lower right')

plt.show()


In [None]:
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import PrecisionRecallDisplay

# Precision-recall curve for the Random Forest pipeline tuned with random
# oversampling. `name` labels the curve in the legend, which already appends
# "(AP = ...)", so it carries the model name instead.
display = PrecisionRecallDisplay.from_estimator(
    grid_over_rf, X_test, y_test, name="Random Forest")
_ = display.ax_.set_title("Precision-Recall curve: Random Forest (random oversampling)")

In [None]:
## Random Forest - Undersampling ##
from imblearn.pipeline import Pipeline, make_pipeline

rf_under_pipeline = make_pipeline(RandomUnderSampler(random_state=42),
                              RandomForestClassifier(n_estimators=100, random_state=13))


In [None]:
from sklearn.model_selection import GridSearchCV
from imblearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

# Define the hyperparameter grid for Random Forest
params = {
    'randomforestclassifier__n_estimators': [50, 100, 200],
    'randomforestclassifier__random_state': [13]
}

# Create a GridSearchCV instance, specifying the pipeline, parameter grid, cv, scoring, etc.
grid_under_rf = GridSearchCV(
    rf_under_pipeline,  # Your pipeline with Random Forest
    param_grid=params,  # The hyperparameter grid
    cv=cv,  # Your cross-validation strategy (StratifiedKFold)
    scoring='recall',  # The scoring metric you want to optimize for
    return_train_score=True
)

# Fit the grid search to your data
grid_under_rf.fit(X_train, y_train)


In [None]:
print('Best parameters:', grid_under_rf.best_params_)
print('Best score:', grid_under_rf.best_score_)

In [None]:
# Predict through the fitted search object (it delegates to the refit best
# pipeline); the sampler step is only applied while fitting, so the
# predictions match those of the final random-forest step.
y_pred = grid_under_rf.predict(X_test)

In [None]:
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

In [None]:
%matplotlib inline

rf_under = confusion_matrix(y_test,y_pred)

class_names=['Non-Fraud', 'Fraud'] # name  of classes
fig, ax = plt.subplots(figsize=(7, 6))
sns.heatmap(pd.DataFrame(rf_under), annot=True, cmap="YlGnBu", fmt='g')
ax.xaxis.set_label_position("bottom")
plt.tight_layout()
plt.title('Confusion matrix')
plt.ylabel('True label')
plt.xlabel('Predicted label')
tick_marks = [0.5, 1.5]
plt.xticks(tick_marks, class_names)
plt.yticks(tick_marks, class_names)

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score

# Get predicted positive-class probabilities from the best Random Forest pipeline found by the grid search
y_proba = grid_under_rf.predict_proba(X_test)[:, 1]

# Calculate the ROC curve
fpr, tpr, _ = roc_curve(y_test, y_proba)

# Calculate the AUC
auc = roc_auc_score(y_test, y_proba)

# Plot the ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label="Random Forest, AUC={:.3f}".format(auc))
plt.plot([0, 1], [0, 1], color='orange', linestyle='--')

plt.xticks(np.arange(0.0, 1.1, step=0.1))
plt.xlabel("False Positive Rate", fontsize=15)

plt.yticks(np.arange(0.0, 1.1, step=0.1))
plt.ylabel("True Positive Rate", fontsize=15)

# Add the title and show the AUC in a legend at the lower right
plt.title('ROC Curve Analysis', fontweight='bold', fontsize=15)
plt.legend(prop={'size': 13}, loc='lower right')

plt.show()


In [None]:
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import PrecisionRecallDisplay

# Precision-recall curve for the Random Forest pipeline tuned with random
# undersampling. `name` labels the curve in the legend, which already appends
# "(AP = ...)", so it carries the model name instead.
display = PrecisionRecallDisplay.from_estimator(
    grid_under_rf, X_test, y_test, name="Random Forest")
_ = display.ax_.set_title("Precision-Recall curve: Random Forest (random undersampling)")

In [None]:
## Random Forest SMOTE ##
from imblearn.over_sampling import SMOTE

rf_smote_pipeline = make_pipeline(SMOTE(random_state=42),
                              RandomForestClassifier(n_estimators=100, random_state=13))


In [None]:
# Define the hyperparameter grid for Random Forest
params = {
    'randomforestclassifier__n_estimators': [50, 100, 200],
    'randomforestclassifier__random_state': [13]
}

# Create a GridSearchCV instance, specifying the pipeline, parameter grid, cv, scoring, etc.
grid_smote_rf = GridSearchCV(
    rf_smote_pipeline,  # Your pipeline with Random Forest
    param_grid=params,  # The hyperparameter grid
    cv=cv,  # Your cross-validation strategy (StratifiedKFold)
    scoring='recall',  # The scoring metric you want to optimize for
    return_train_score=True
)

# Fit the grid search to your data
grid_smote_rf.fit(X_train, y_train)

In [None]:
print('Best parameters:', grid_smote_rf.best_params_)
print('Best score:', grid_smote_rf.best_score_)

In [None]:
# Predict through the fitted search object (it delegates to the refit best
# pipeline); the SMOTE step is only applied while fitting, so the
# predictions match those of the final random-forest step.
y_pred = grid_smote_rf.predict(X_test)

In [None]:
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

In [None]:
%matplotlib inline

rf_smote = confusion_matrix(y_test,y_pred)

class_names=['Non-Fraud', 'Fraud'] # name  of classes
fig, ax = plt.subplots(figsize=(7, 6))
sns.heatmap(pd.DataFrame(rf_smote), annot=True, cmap="YlGnBu", fmt='g')
ax.xaxis.set_label_position("bottom")
plt.tight_layout()
plt.title('Confusion matrix')
plt.ylabel('True label')
plt.xlabel('Predicted label')
tick_marks = [0.5, 1.5]
plt.xticks(tick_marks, class_names)
plt.yticks(tick_marks, class_names)

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score

# Get predicted positive-class probabilities from the best Random Forest pipeline found by the grid search
y_proba = grid_smote_rf.predict_proba(X_test)[:, 1]

# Calculate the ROC curve
fpr, tpr, _ = roc_curve(y_test, y_proba)

# Calculate the AUC
auc = roc_auc_score(y_test, y_proba)

# Plot the ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label="Random Forest, AUC={:.3f}".format(auc))
plt.plot([0, 1], [0, 1], color='orange', linestyle='--')

plt.xticks(np.arange(0.0, 1.1, step=0.1))
plt.xlabel("False Positive Rate", fontsize=15)

plt.yticks(np.arange(0.0, 1.1, step=0.1))
plt.ylabel("True Positive Rate", fontsize=15)

# Add the title and show the AUC in a legend at the lower right
plt.title('ROC Curve Analysis', fontweight='bold', fontsize=15)
plt.legend(prop={'size': 13}, loc='lower right')

plt.show()


In [None]:
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import PrecisionRecallDisplay

# Precision-recall curve for the Random Forest pipeline tuned with SMOTE.
# `name` labels the curve in the legend, which already appends "(AP = ...)",
# so it carries the model name instead.
display = PrecisionRecallDisplay.from_estimator(
    grid_smote_rf, X_test, y_test, name="Random Forest")
_ = display.ax_.set_title("Precision-Recall curve: Random Forest (SMOTE)")

In [None]:
# Install Optuna - hyperparameter tuning for XGBoost
!pip3 install optuna

In [None]:
import warnings
warnings.filterwarnings('ignore')
import optuna

In [None]:
## XGBoost - Imbalanced ##
classifier = XGBClassifier()

In [None]:
# Using Optuna for hyperparameter tuning
def objective(trial, n_splits=5):
    """Optuna objective: cross-validated recall of an XGBoost classifier.

    Samples one hyperparameter configuration from ``trial`` and scores it with
    stratified k-fold cross-validation on the notebook-level training data
    (``X_train``, ``y_train``). The test set is deliberately not used here so
    that it remains an unbiased hold-out for the final evaluation.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        n_splits: Number of stratified folds used for scoring.

    Returns:
        Mean recall over the validation folds (the study maximises it).
    """

    params = {
        'max_depth': trial.suggest_int('max_depth', 6, 12),
        'min_child_weight': trial.suggest_int('min_child_weight', 5, 15),
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform
        'subsample': trial.suggest_float('subsample', 0.1, 1.0, log=True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0, log=True),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0, log=True),
        # step is passed by keyword; passing it positionally is deprecated
        'n_estimators': trial.suggest_int('n_estimators', 600, 1000, step=100),
        'eval_metric': 'aucpr'
    }

    # Score on held-out folds of the training data only
    folds = StratifiedKFold(n_splits=n_splits, shuffle=False)
    fold_recalls = cross_val_score(
        XGBClassifier(**params), X_train, y_train, scoring='recall', cv=folds
    )

    return float(fold_recalls.mean())

In [None]:
study = optuna.create_study(direction='maximize')

In [None]:
study.optimize(objective, n_trials=20)

In [None]:
trial = study.best_trial

In [None]:
print('Number of finished trials: {}'.format(len(study.trials)))
print('Best trial:')

print('  Value: {:.2f}'.format(trial.value))
print('  Params: ')

for key, value in trial.params.items():
    print('    {}: {:.3f}'.format(key, value))

In [None]:
params = trial.params

In [None]:
model = XGBClassifier(**params)
model.fit(X_train, y_train)

In [None]:
y_pred = model.predict(X_test)

In [None]:
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

In [None]:
%matplotlib inline

xgb_matrix = confusion_matrix(y_test,y_pred)

class_names=['Non-Fraud', 'Fraud'] # name  of classes
fig, ax = plt.subplots(figsize=(7, 6))
sns.heatmap(pd.DataFrame(xgb_matrix), annot=True, cmap="YlGnBu", fmt='g')
ax.xaxis.set_label_position("bottom")
plt.tight_layout()
plt.title('Confusion matrix')
plt.ylabel('True label')
plt.xlabel('Predicted label')
tick_marks = [0.5, 1.5]
plt.xticks(tick_marks, class_names)
plt.yticks(tick_marks, class_names)

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score

# Get predicted positive-class probabilities from the tuned XGBoost model
y_proba = model.predict_proba(X_test)[:, 1]

# Calculate the ROC curve
fpr, tpr, _ = roc_curve(y_test, y_proba)

# Calculate the AUC
auc = roc_auc_score(y_test, y_proba)

# Plot the ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label="XGBoost, AUC={:.3f}".format(auc))
plt.plot([0, 1], [0, 1], color='orange', linestyle='--')

plt.xticks(np.arange(0.0, 1.1, step=0.1))
plt.xlabel("False Positive Rate", fontsize=15)

plt.yticks(np.arange(0.0, 1.1, step=0.1))
plt.ylabel("True Positive Rate", fontsize=15)

# Add the title and show the AUC in a legend at the lower right
plt.title('ROC Curve Analysis', fontweight='bold', fontsize=15)
plt.legend(prop={'size': 13}, loc='lower right')

plt.show()


In [None]:
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import PrecisionRecallDisplay

# Precision-recall curve for the tuned XGBoost model trained on the original
# (imbalanced) data. `name` labels the curve in the legend, which already
# appends "(AP = ...)", so it carries the model name instead.
display = PrecisionRecallDisplay.from_estimator(
    model, X_test, y_test, name="XGBoost")
_ = display.ax_.set_title("Precision-Recall curve: XGBoost (imbalanced data)")

In [None]:
## XGBoost - Random Undersampling ##
def objective(trial, n_splits=5):
    """Optuna objective: cross-validated recall of XGBoost with random undersampling.

    Samples one hyperparameter configuration from ``trial`` and scores a
    ``RandomUnderSampler -> XGBClassifier`` pipeline with stratified k-fold
    cross-validation on the notebook-level training data (``X_train``,
    ``y_train``). Keeping the sampler inside the pipeline means only the
    training part of each fold is resampled; the validation folds and the
    test set keep their original class distribution.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        n_splits: Number of stratified folds used for scoring.

    Returns:
        Mean recall over the validation folds (the study maximises it).
    """

    params = {
        'max_depth': trial.suggest_int('max_depth', 6, 12),
        'min_child_weight': trial.suggest_int('min_child_weight', 5, 15),
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform
        'subsample': trial.suggest_float('subsample', 0.1, 1.0, log=True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0, log=True),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0, log=True),
        # step is passed by keyword; passing it positionally is deprecated
        'n_estimators': trial.suggest_int('n_estimators', 600, 1000, step=100),
        'eval_metric': 'aucpr'
    }

    # Random undersampling is applied to the training folds only
    undersampled_xgb = make_pipeline(
        RandomUnderSampler(sampling_strategy='auto', random_state=42),
        XGBClassifier(**params),
    )

    # Score on held-out folds of the training data, never on the test set
    folds = StratifiedKFold(n_splits=n_splits, shuffle=False)
    fold_recalls = cross_val_score(
        undersampled_xgb, X_train, y_train, scoring='recall', cv=folds
    )

    return float(fold_recalls.mean())

In [None]:
# Create an Optuna study
study = optuna.create_study(direction='maximize')

# Optimize hyperparameters
study.optimize(objective, n_trials=20)

# Get the best hyperparameters
best_params = study.best_params
print("Best Hyperparameters:", best_params)

In [None]:
trial = study.best_trial

In [None]:
params_rus = trial.params

In [None]:
# The hyperparameters in params_rus were tuned with random undersampling, so
# the final model must be trained on undersampled data as well; fitting on the
# raw X_train would silently evaluate an imbalanced-data model in this section.
X_train_rus, y_train_rus = RandomUnderSampler(
    sampling_strategy='auto', random_state=42
).fit_resample(X_train, y_train)

XGB_under = XGBClassifier(**params_rus)
XGB_under.fit(X_train_rus, y_train_rus)

In [None]:
y_pred = XGB_under.predict(X_test)

In [None]:
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))

In [None]:
%matplotlib inline

matrix_xgb_under = confusion_matrix(y_test,y_pred)

class_names=['Non-Fraud', 'Fraud'] # name  of classes
fig, ax = plt.subplots(figsize=(7, 6))
sns.heatmap(pd.DataFrame(matrix_xgb_under), annot=True, cmap="YlGnBu", fmt='g')
ax.xaxis.set_label_position("bottom")
plt.tight_layout()
plt.title('Confusion matrix')
plt.ylabel('True label')
plt.xlabel('Predicted label')
tick_marks = [0.5, 1.5]
plt.xticks(tick_marks, class_names)
plt.yticks(tick_marks, class_names)

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score

# Get predicted positive-class probabilities from the tuned XGBoost model
y_proba = XGB_under.predict_proba(X_test)[:, 1]

# Calculate the ROC curve
fpr, tpr, _ = roc_curve(y_test, y_proba)

# Calculate the AUC
auc = roc_auc_score(y_test, y_proba)

# Plot the ROC curve
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label="XGBoost, AUC={:.3f}".format(auc))
plt.plot([0, 1], [0, 1], color='orange', linestyle='--')

plt.xticks(np.arange(0.0, 1.1, step=0.1))
plt.xlabel("False Positive Rate", fontsize=15)

plt.yticks(np.arange(0.0, 1.1, step=0.1))
plt.ylabel("True Positive Rate", fontsize=15)

# Add the title and show the AUC in a legend at the lower right
plt.title('ROC Curve Analysis', fontweight='bold', fontsize=15)
plt.legend(prop={'size': 13}, loc='lower right')

plt.show()


In [None]:
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import PrecisionRecallDisplay

# Precision-recall curve for the XGBoost model of the random-undersampling
# section. `name` labels the curve in the legend, which already appends
# "(AP = ...)", so it carries the model name instead.
display = PrecisionRecallDisplay.from_estimator(
    XGB_under, X_test, y_test, name="XGBoost")
_ = display.ax_.set_title("Precision-Recall curve: XGBoost (random undersampling)")

In [None]:
## XGBoost - Random Oversampling ##
def objective(trial):
    """Define the objective function"""

    params = {
        'max_depth': trial.suggest_int('max_depth', 6, 12),
        'min_child_weight': trial.suggest_int('min_child_weight', 5, 15),
        'subsample': trial.suggest_loguniform('subsample', 0.1, 1.0),
        'colsample_bytree': trial.suggest_loguniform('colsample_bytree', 0.6, 1.0),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.01,1.0),
        'n_estimators': trial.suggest_int('n_estimators', 600,1000,100),
        'eval_metric': 'aucpr'
    }

    # Apply Random Oversampling to the training data
    sampler = RandomOverSampler(sampling_strategy='auto', random_state=42)
    X_train_ros, y_train_ros = sampler.fit_resample(X_train, y_train)

    # Fit the model on the resampled data
    optuna_model = XGBClassifier(**params)
    optuna_model.fit(X_train_ros, y_train_ros)

    # Make predictions
    y_pred = optuna_model.predict(X_test)

    # Evaluate predictions
    recall = recall_score(y_test, y_pred)
    return recall

In [None]:
# Create an Optuna study that maximizes the objective; seeding the (default) TPE
# sampler makes the sequence of sampled hyperparameters repeatable across runs
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)

# Optimize hyperparameters
study.optimize(objective, n_trials=20)

best_params = study.best_params
# Floats are shown with three decimals; integer parameters are shown unchanged
# (formatting everything with .3f would print e.g. max_depth as "8.000")
formatted_params = {
    key: f"{value:.3f}" if isinstance(value, float) else str(value)
    for key, value in best_params.items()
}

print("Best Hyperparameters:")
for key, value in formatted_params.items():
    print(f"  {key}: {value}")



In [None]:
# Best trial recorded by the Optuna study currently bound to `study`.
trial = study.best_trial

In [None]:
# Keep the best trial's sampled hyperparameters; they configure XGB_over below.
params_ros = trial.params

In [None]:
# Oversample the training data with the same sampler settings the tuning objective
# uses, so the final model is trained under the conditions its hyperparameters were
# selected for (fitting on the raw X_train would ignore the oversampling entirely).
ros_sampler = RandomOverSampler(sampling_strategy='auto', random_state=42)
X_train_ros, y_train_ros = ros_sampler.fit_resample(X_train, y_train)

# Final model: tuned hyperparameters, fitted on the oversampled training data
XGB_over = XGBClassifier(**params_ros)
XGB_over.fit(X_train_ros, y_train_ros)

In [None]:
# Hard class predictions of XGB_over on the test features, followed by a text
# summary: confusion matrix first, per-class report second.
predict_xgb_over = XGB_over.predict(X_test)
print(
    confusion_matrix(y_test, predict_xgb_over),
    classification_report(y_test, predict_xgb_over),
    sep="\n",
)

In [None]:
%matplotlib inline

cnf_matrix = confusion_matrix(y_test,predict_xgb_over)

class_names=['Non-Fraud', 'Fraud'] # name  of classes
fig, ax = plt.subplots(figsize=(7, 6))
sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, cmap="YlGnBu", fmt='g')
ax.xaxis.set_label_position("bottom")
plt.tight_layout()
plt.title('Confusion matrix')
plt.ylabel('True label')
plt.xlabel('Predicted label')
tick_marks = [0.5, 1.5]
plt.xticks(tick_marks, class_names)
plt.yticks(tick_marks, class_names)

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score

# Get predicted probabilities of the positive class from the fitted XGB_over model
y_proba = XGB_over.predict_proba(X_test)[:, 1]

# Calculate the ROC curve
fpr, tpr, _ = roc_curve(y_test, y_proba)

# Calculate the AUC
auc = roc_auc_score(y_test, y_proba)

# Plot the ROC curve; the legend entry names the model actually plotted (XGBoost)
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label="XGBoost, AUC={:.3f}".format(auc))
# Dashed diagonal reference line
plt.plot([0, 1], [0, 1], color='orange', linestyle='--')

plt.xticks(np.arange(0.0, 1.1, step=0.1))
plt.xlabel("False Positive Rate", fontsize=15)

plt.yticks(np.arange(0.0, 1.1, step=0.1))
plt.ylabel("True Positive Rate", fontsize=15)

# Title, plus a legend in the lower-right corner whose entry carries the AUC value
plt.title('ROC Curve Analysis', fontweight='bold', fontsize=15)
plt.legend(prop={'size': 13}, loc='lower right')

plt.show()


In [None]:
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import PrecisionRecallDisplay

# Precision-recall curve of the XGB_over model on the test set.
# The result is bound to `pr_display` (not `display`) so that IPython's built-in
# display() helper is not shadowed for the rest of the notebook.
pr_display = PrecisionRecallDisplay.from_estimator(
    XGB_over, X_test, y_test, name="Average precision")
# Title names the resampling technique of this section (no class weights are used here)
_ = pr_display.ax_.set_title("Precision-Recall curve with Random Oversampling")

In [None]:
## XGboost SMOTE ##
def objective(trial):
    """Optuna objective: validation recall of XGBoost trained on SMOTE-resampled data.

    A stratified hold-out split is carved out of the training data and used for
    scoring, so the test set is never consulted while choosing hyperparameters.
    SMOTE is applied to the fitting portion only; the validation portion keeps
    its original samples and class distribution.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        Recall on the hold-out validation split.
    """
    # Local import keeps this cell runnable on its own.
    from sklearn.model_selection import train_test_split

    params = {
        'max_depth': trial.suggest_int('max_depth', 6, 12),
        'min_child_weight': trial.suggest_int('min_child_weight', 5, 15),
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform
        'subsample': trial.suggest_float('subsample', 0.1, 1.0, log=True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.6, 1.0, log=True),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 1.0, log=True),
        # step is passed by keyword; positional use is deprecated in recent Optuna
        'n_estimators': trial.suggest_int('n_estimators', 600, 1000, step=100),
        'eval_metric': 'aucpr'
    }

    # Hold out part of the training data for scoring (fixed seed: same split every trial)
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_train, y_train, test_size=0.2, stratify=y_train, random_state=42)

    # Apply SMOTE to the fitting portion only
    sampler = SMOTE(sampling_strategy='auto', random_state=42)
    X_fit_smote, y_fit_smote = sampler.fit_resample(X_fit, y_fit)

    # Fit the model on the resampled data
    optuna_model = XGBClassifier(**params)
    optuna_model.fit(X_fit_smote, y_fit_smote)

    # Score on the untouched validation portion
    y_val_pred = optuna_model.predict(X_val)
    return recall_score(y_val, y_val_pred)

In [None]:
# Create an Optuna study that maximizes the objective; seeding the (default) TPE
# sampler makes the sequence of sampled hyperparameters repeatable across runs
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)

# Optimize hyperparameters
study.optimize(objective, n_trials=10)

best_params = study.best_params
# Floats are shown with three decimals; integer parameters are shown unchanged
# (formatting everything with .3f would print e.g. max_depth as "8.000")
formatted_params = {
    key: f"{value:.3f}" if isinstance(value, float) else str(value)
    for key, value in best_params.items()
}

print("Best Hyperparameters:")
for key, value in formatted_params.items():
    print(f"  {key}: {value}")

In [None]:
# Best trial recorded by the Optuna study currently bound to `study`.
trial = study.best_trial

In [None]:
# Keep the best trial's sampled hyperparameters; they configure XGB_smote below.
params_smote = trial.params

In [None]:
# Resample the training data with the same SMOTE settings the tuning objective
# uses, so the final model is trained under the conditions its hyperparameters were
# selected for (fitting on the raw X_train would ignore SMOTE entirely).
smote_sampler = SMOTE(sampling_strategy='auto', random_state=42)
X_train_smote, y_train_smote = smote_sampler.fit_resample(X_train, y_train)

# Final model: tuned hyperparameters, fitted on the SMOTE-resampled training data
XGB_smote = XGBClassifier(**params_smote)
XGB_smote.fit(X_train_smote, y_train_smote)

In [None]:
# Hard class predictions of the fitted XGB_smote model for the test features.
# Note: this rebinds the notebook-wide name y_pred.
y_pred = XGB_smote.predict(X_test)

In [None]:
# Text summary of the test predictions: confusion matrix first, per-class report second.
print(
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
    sep="\n",
)

In [None]:
%matplotlib inline

matrix_xgb_under = confusion_matrix(y_test,y_pred)

class_names=['Non-Fraud', 'Fraud'] # name  of classes
fig, ax = plt.subplots(figsize=(7, 6))
sns.heatmap(pd.DataFrame(matrix_xgb_under), annot=True, cmap="YlGnBu", fmt='g')
ax.xaxis.set_label_position("bottom")
plt.tight_layout()
plt.title('Confusion matrix')
plt.ylabel('True label')
plt.xlabel('Predicted label')
tick_marks = [0.5, 1.5]
plt.xticks(tick_marks, class_names)
plt.yticks(tick_marks, class_names)

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score

# Predicted probability of the positive class (column 1) from the fitted XGB_smote model
y_proba = XGB_smote.predict_proba(X_test)[:, 1]

# ROC curve points and the area under that curve
fpr, tpr, _ = roc_curve(y_test, y_proba)
auc = roc_auc_score(y_test, y_proba)

# Draw the model's ROC curve and the dashed diagonal reference line
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label="XGBoost, AUC={:.3f}".format(auc))
plt.plot([0, 1], [0, 1], color='orange', linestyle='--')

# Axis captions
plt.xlabel("False Positive Rate", fontsize=15)
plt.ylabel("True Positive Rate", fontsize=15)

# Ticks every 0.1 on both axes
plt.xticks(np.arange(0.0, 1.1, step=0.1))
plt.yticks(np.arange(0.0, 1.1, step=0.1))

# Title, plus a legend in the lower-right corner whose entry carries the AUC value
plt.title('ROC Curve Analysis', fontweight='bold', fontsize=15)
plt.legend(prop={'size': 13}, loc='lower right')

plt.show()


In [None]:
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import PrecisionRecallDisplay

# Precision-recall curve of the XGB_smote model on the test set.
# The result is bound to `pr_display` (not `display`) so that IPython's built-in
# display() helper is not shadowed for the rest of the notebook.
pr_display = PrecisionRecallDisplay.from_estimator(
    XGB_smote, X_test, y_test, name="Average precision")
# Title names the resampling technique of this section (no class weights are used here)
_ = pr_display.ax_.set_title("Precision-Recall curve with SMOTE")