<a href="https://colab.research.google.com/github/inbalv/tictactoe/blob/master/train_eval_and_exp_help_functions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#hyperopt
import numpy as np
import xgboost as xgb
from hyperopt import STATUS_OK, Trials, fmin, hp, space_eval, tpe
from hyperopt.pyll.base import scope
from hyperopt.pyll.stochastic import sample
from sklearn.metrics import (
    accuracy_score,
    average_precision_score,
    precision_score,
    recall_score,
    roc_auc_score,
)

# Demonstration only: draws a single value from a quantized-uniform
# expression (step 1 over 0..10) wrapped in scope.int, i.e. an integer from
# 0 to 10, inclusive. The result is not assigned to anything, so this line
# does not feed into the search space or the tuning run below.
sample(scope.int(hp.quniform('example', 0, 10, 1)))

# Objective function for XGBoost
def objective_xgb(params):
    """Train one candidate XGBoost classifier and return its hyperopt loss.

    Reads the module-level ``X_train``, ``y_train``, ``X_val`` and ``y_val``;
    they must be defined before the search is started.

    Args:
        params: Mapping of ``XGBClassifier`` keyword arguments for this
            trial. The mapping is copied and left unmodified.

    Returns:
        A dict with ``'loss'`` set to the negative ROC AUC on the validation
        set (negated so that minimizing the loss maximizes AUC) and
        ``'status'`` set to ``STATUS_OK``.
    """
    # Build the trial configuration on a copy so the caller's dict is not
    # mutated; the number of boosting rounds is always fixed at 1000 and
    # early stopping decides how many are actually used.
    trial_params = dict(params)
    trial_params['n_estimators'] = 1000

    print("Training with params: {}".format(trial_params))
    clf = xgb.XGBClassifier(
        objective='binary:logistic',
        use_label_encoder=False,
        eval_metric='logloss',
        **trial_params
    )

    clf.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        early_stopping_rounds=25,
        verbose=True  # Set to True to see evaluation messages
    )

    # Score on the probability of the positive class (column 1).
    pred_proba = clf.predict_proba(X_val)[:, 1]
    auc = roc_auc_score(y_val, pred_proba)
    return {'loss': -auc, 'status': STATUS_OK}

# Hyperparameter search space (note: 'min_data_in_leaf' has been removed)
space = {
    'max_depth': scope.int(hp.quniform('max_depth', 4, 10, 1)),
    'learning_rate': hp.quniform('learning_rate', 0.1, 0.4, 0.05),
    'reg_alpha': hp.quniform('reg_alpha', 0, 5, 0.1),
    'reg_lambda': hp.quniform('reg_lambda', 0, 5, 0.1),
    'gamma': hp.quniform('gamma', 0, 5, 0.1),
    'subsample': hp.uniform('subsample', 0.5, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.5, 1),
    'min_child_weight': hp.quniform('min_child_weight', 1, 10, 0.5)
}

# Run 50 TPE trials; `trials` keeps the per-trial history.
trials = Trials()
best = fmin(fn=objective_xgb, space=space, algo=tpe.suggest, max_evals=50, trials=trials)

print("Best hyperparameters:", best)

# fmin returns the raw sampled value for each label, so the scope.int cast
# declared in `space` has not been applied and max_depth comes back as a
# float. space_eval re-evaluates the space with those values, giving the same
# parameter dict that objective_xgb received for the best trial.
best_params = space_eval(space, best)

# Train the final model using the best hyperparameters
best_clf = xgb.XGBClassifier(
    objective='binary:logistic',
    use_label_encoder=False,
    eval_metric='logloss',
    **best_params,
    n_estimators=1000,
    random_state=42,
    verbosity=0  # Set to 0 to suppress detailed logs during final training
)

# NOTE(review): newer XGBoost releases expect early_stopping_rounds on the
# constructor rather than on fit() -- confirm against the installed version.
best_clf.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    early_stopping_rounds=25,
    verbose=True
)

# Positive-class probabilities on the held-out test set, thresholded at 0.5
# to obtain hard labels for the label-based metrics.
y_test_pred_proba = best_clf.predict_proba(X_test)[:, 1]
y_test_pred = (y_test_pred_proba >= 0.5).astype(int)

accuracy_xgb = accuracy_score(y_test, y_test_pred)
precision_xgb = precision_score(y_test, y_test_pred)
recall_xgb = recall_score(y_test, y_test_pred)
# The ranking metrics are computed from the probabilities, not the hard labels.
roc_auc_xgb = roc_auc_score(y_test, y_test_pred_proba)
pr_auc = average_precision_score(y_test, y_test_pred_proba)

print("XGBoost (fit) metrics:")
print("Accuracy:", accuracy_xgb)
print("Precision:", precision_xgb)
print("Recall:", recall_xgb)
print("ROC AUC:", roc_auc_xgb)
print("PR AUC:", pr_auc)




Training with params: {'colsample_bytree': 0.9502779280110689, 'gamma': 2.0, 'learning_rate': 0.35000000000000003, 'max_depth': 9, 'min_child_weight': 7.0, 'reg_alpha': 4.2, 'reg_lambda': 1.6, 'subsample': 0.5256585540467105, 'n_estimators': 1000}
  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?]

ERROR:hyperopt.fmin:job exception: name 'X_train' is not defined


  0%|          | 0/50 [00:00<?, ?trial/s, best loss=?]


NameError: name 'X_train' is not defined

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, average_precision_score, precision_recall_curve, auc
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
from tabulate import tabulate

def print_formatted_metrics(y_test, y_pred):
    """
    Print a nicely formatted confusion matrix and classification report.

    For the usual binary case (a 2x2 confusion matrix) the rows and columns
    are labelled Negative/Positive. For any other shape (only one class
    present, or more than two classes) the labels are derived from the
    classes observed in the inputs, so the table can still be built.

    Args:
        y_test: Array-like of true labels.
        y_pred: Array-like of predicted labels.
    """
    cm = confusion_matrix(y_test, y_pred)

    if cm.shape == (2, 2):
        row_labels = ["Actual Negative", "Actual Positive"]
        col_labels = ["Predicted Negative", "Predicted Positive"]
    else:
        # confusion_matrix orders rows/columns by the sorted union of the
        # labels seen in y_test and y_pred; rebuild that ordering for naming.
        classes = np.unique(
            np.concatenate([np.asarray(y_test).ravel(), np.asarray(y_pred).ravel()])
        )
        row_labels = [f"Actual {label}" for label in classes]
        col_labels = [f"Predicted {label}" for label in classes]

    cm_df = pd.DataFrame(cm, index=row_labels, columns=col_labels)

    # Print the confusion matrix using tabulate for prettier formatting.
    print("\nConfusion Matrix:")
    print(tabulate(cm_df, headers="keys", tablefmt="psql"))

    # Build the classification report as a DataFrame (one row per class plus
    # the aggregate rows) so it can be rendered by tabulate as well.
    report = classification_report(y_test, y_pred, output_dict=True)
    report_df = pd.DataFrame(report).transpose()

    print("\nClassification Report:")
    print(tabulate(report_df, headers="keys", tablefmt="psql"))



def evaluate_model(model, X_test, y_test):
    """
    Report test-set performance of a fitted classifier, with AUC-PR as the
    headline metric.

    Prints the ROC AUC and average precision, then the formatted confusion
    matrix and classification report, and finally shows the precision-recall
    curve with its trapezoidal area in the legend.

    Args:
        model: Fitted estimator exposing predict and predict_proba.
        X_test: Test features.
        y_test: True test labels.
    """
    hard_labels = model.predict(X_test)
    # Column 1 of predict_proba holds the positive-class probability.
    positive_scores = model.predict_proba(X_test)[:, 1]

    roc_value = roc_auc_score(y_test, positive_scores)
    ap_value = average_precision_score(y_test, positive_scores)
    print("ROC AUC Score: {:.4f}".format(roc_value))
    print("Average Precision (AUC-PR): {:.4f}".format(ap_value))

    print_formatted_metrics(y_test, hard_labels)

    # Precision-recall curve; the thresholds array is not needed for the plot.
    pr_precision, pr_recall, _ = precision_recall_curve(y_test, positive_scores)
    curve_area = auc(pr_recall, pr_precision)
    legend_text = 'Precision-Recall curve (AUC = {:.4f})'.format(curve_area)

    plt.figure(figsize=(8, 6))
    plt.plot(pr_recall, pr_precision, marker='.', label=legend_text)
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision-Recall Curve')
    plt.legend()
    plt.show()


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

def plot_feature_importance(model, importance_type='gain', top_n=40, figsize=(20,10)):
    """
    Plots the top N features based on the specified importance type from an XGBoost model.

    If the booster reports no scores at all, a message is printed and nothing
    is plotted. If fewer than ``top_n`` features have a score, all of them are
    plotted and the title states the number actually shown.

    Parameters:
        model: The trained XGBoost model.
        importance_type (str): The type of feature importance to use ('gain', 'weight', etc.). Default is 'gain'.
        top_n (int): Maximum number of top features to plot. Default is 40.
        figsize (tuple): Size of the figure for the plot. Default is (20, 10).
    """
    # Retrieve feature importance scores from the booster (feature name -> score).
    feature_importance = model.get_booster().get_score(importance_type=importance_type)

    # Nothing to rank: an empty frame cannot be plotted, so bail out early.
    if not feature_importance:
        print(f"No '{importance_type}' importance scores available; nothing to plot.")
        return

    # One row per scored feature, highest score first.
    data = pd.DataFrame(
        data=list(feature_importance.values()),
        index=list(feature_importance.keys()),
        columns=["score"],
    ).sort_values(by="score", ascending=False)

    top_features = data.nlargest(top_n, columns="score")
    top_features.plot(kind='barh', figsize=figsize)

    # Use the real row count so the title is accurate when fewer than top_n
    # features carry a score.
    plt.title(f'Top {len(top_features)} Features by {importance_type.capitalize()} Importance')
    plt.xlabel("Importance Score")
    plt.ylabel("Features")
    plt.show()


In [None]:
import shap
import numpy as np
import matplotlib.pyplot as plt

def plot_shap_summary(model, X_test):
    """
    Compute SHAP values for a tree-based model and show the summary plot.

    Parameters:
        model: Trained tree-based model (e.g., XGBoost) whose booster exposes feature names.
        X_test: Dataset to explain (pandas DataFrame or similar) with the same features as used in training.

    When X_test has a ``columns`` attribute that does not equal the booster's
    feature names, a warning is printed; the plot is still produced. SHAP
    values come from shap.TreeExplainer and are drawn with shap.summary_plot.
    """
    booster_names = model.get_booster().feature_names

    # Sanity check only applies to inputs that carry column names.
    names_mismatch = hasattr(X_test, 'columns') and not np.array_equal(booster_names, X_test.columns)
    if names_mismatch:
        print("Warning: The model's feature names do not match X_test columns.")

    contributions = shap.TreeExplainer(model).shap_values(X_test)

    # show=False defers rendering so the layout can be tightened first.
    shap.summary_plot(contributions, X_test, feature_names=booster_names, show=False)
    plt.tight_layout()
    plt.show()
