In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve, StratifiedKFold

def plot_learning_curve(model, X, y, title, scoring="roc_auc",
                        cv=None, train_sizes=None, n_jobs=-1, random_state=42):
    """
    Plot a learning curve (training vs. cross-validation score) for a classifier.

    The estimator is re-fitted by ``sklearn.model_selection.learning_curve`` on
    growing subsets of the data. For each subset size the mean score across
    the CV folds is drawn as a line and +/- one standard deviation as a
    shaded band, once for the training folds and once for the validation folds.

    Parameters
    ----------
    model : an unfitted sklearn estimator (or one you want to re-fit)
    X, y : training data (re-fitted on subsets)
    title : plot title
    scoring : metric, e.g. 'roc_auc', 'accuracy', 'neg_log_loss'; a scorer
        callable is also accepted (its name is then used as the y-axis label)
    cv : cross-validation splitter passed to ``learning_curve``.
        Defaults to ``StratifiedKFold(n_splits=5, shuffle=True,
        random_state=random_state)``.
    train_sizes : relative or absolute training-set sizes to evaluate.
        Defaults to ``np.linspace(0.1, 1.0, 5)``.
    n_jobs : number of parallel jobs for ``learning_curve`` (-1 = all cores)
    random_state : seed used for the default splitter and for the shuffling
        done by ``learning_curve``

    Returns
    -------
    None. The figure is displayed with ``plt.show()``.
    """
    # Fall back to the historical defaults when the caller does not override them.
    if cv is None:
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)
    if train_sizes is None:
        train_sizes = np.linspace(0.1, 1.0, 5)

    # learning_curve returns the absolute sizes actually used plus one score
    # per (size, fold) pair, hence the axis=1 reductions below.
    sizes, train_scores, val_scores = learning_curve(
        estimator=model,
        X=X,
        y=y,
        cv=cv,
        scoring=scoring,
        train_sizes=train_sizes,
        n_jobs=n_jobs,
        shuffle=True,
        random_state=random_state,
    )

    # A scorer callable has no useful default text for an axis label; prefer
    # its __name__ when it has one.
    if scoring is None or isinstance(scoring, str):
        y_label = scoring
    else:
        y_label = getattr(scoring, "__name__", str(scoring))

    fig, ax = plt.subplots(figsize=(8, 6))
    curves = (
        (train_scores, 'Training score'),
        (val_scores, 'Validation score'),
    )
    for scores, label in curves:
        mean = np.mean(scores, axis=1)
        std = np.std(scores, axis=1)
        ax.plot(sizes, mean, 'o-', label=label)
        # Shaded band: one standard deviation across folds around the mean.
        ax.fill_between(sizes, mean - std, mean + std, alpha=0.2)

    ax.set_xlabel('Number of training examples')
    ax.set_ylabel(y_label)
    ax.set_title(title)
    ax.legend(loc='best')
    ax.grid(alpha=0.3)
    plt.show()

In [None]:
# XGBoost learning curve

# from xgboost import XGBClassifier
# from sklearn.metrics import classification_report, log_loss

# import numpy as np
# import matplotlib.pyplot as plt
# from sklearn.model_selection import learning_curve, StratifiedKFold

# # 1) Re-instantiate your model (or use the one you already have)
# xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

# plot_learning_curve(
#     model    = xgb,
#     X        = X_resampled,
#     y        = y_resampled,
#     title    = "Learning Curve — XGBoost",
#     scoring  = "roc_auc"
# )

In [None]:
# XGBoost classification report + log loss

# Imported in this cell so it does not rely on an import made in another
# cell (the only other imports of these names are commented out or appear
# further down the notebook).
from sklearn.metrics import classification_report, log_loss

# Re-fit the model `xgb` (created outside this cell) on X_resampled / y_resampled.
xgb.fit(X_resampled, y_resampled)

y_pred_xgb = xgb.predict(X_test)        # predicted class labels
proba_xgb = xgb.predict_proba(X_test)   # per-class probabilities, columns ordered as xgb.classes_

print(classification_report(y_test, y_pred_xgb))
# Pass the model's class order explicitly: without `labels`, log_loss infers
# the classes from y_test and raises if y_test happens to contain one class only.
print("Log Loss:", log_loss(y_test, proba_xgb, labels=xgb.classes_))

In [None]:
# XGBoost confusion matrix

from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Build the count matrix first, then draw it as an annotated heatmap
# (integer cell labels via fmt='d').
cm_xgb = confusion_matrix(y_test, y_pred_xgb)
heatmap_ax = sns.heatmap(cm_xgb, annot=True, fmt='d')

# Label the axes the heatmap was drawn on.
heatmap_ax.set_xlabel("Predicted")
heatmap_ax.set_ylabel("Actual")
heatmap_ax.set_title("Confusion Matrix")
plt.show()

In [None]:
# # XGBoost test results (not very helpful in determining the quality of the model)

# results_df = X_test.copy()
# results_df['Actual'] = y_test.values
# results_df['Predicted'] = y_pred_xgb
# results_df['Champion_Prob'] = proba_xgb[:, 1]
# results_df['Index'] = X_test_indices  # Save index for merging

# # Get metadata for the test indices
# test_metadata = playoff_teams.loc[X_test_indices, ['TEAM_NAME', 'SEASON']].copy()

# # You can manually define which teams are East/West
# east_teams = ['Boston Celtics', 'Milwaukee Bucks', 'Miami Heat', 'Philadelphia 76ers', 'Atlanta Hawks', 'Cleveland Cavaliers', 'Brooklyn Nets', 'Toronto Raptors', 'Chicago Bulls', 'Indiana Pacers', 'New York Knicks', 'Orlando Magic', 'Washington Wizards', 'Detroit Pistons', 'Charlotte Hornets']
# test_metadata['CONFERENCE'] = test_metadata['TEAM_NAME'].apply(lambda x: 'East' if x in east_teams else 'West')

# results_df = pd.DataFrame(X_test, index=X_test_indices)  # Match index
# results_df['Champion_Prob'] = proba_xgb[:, 1]
# results_df['Predicted'] = y_pred_xgb
# results_df['Actual'] = y_test.values

# # Merge with metadata
# results_df = results_df.merge(test_metadata, left_index=True, right_index=True)

# top_by_conference = (
#     results_df
#     .groupby(['SEASON', 'CONFERENCE'])
#     .apply(lambda df: df.nlargest(1, 'Champion_Prob'))
#     .reset_index(drop=True)
# )

# top_by_conference = top_by_conference.sort_values(by='SEASON')

# print(top_by_conference[['SEASON', 'CONFERENCE', 'TEAM_NAME', 'Champion_Prob', 'Actual']])

In [None]:
# # XGBoost feature importance

# from xgboost import plot_importance
# import matplotlib.pyplot as plt

# plt.figure(figsize=(10, 8))
# plot_importance(xgb, max_num_features=20, importance_type='gain')
# plt.title("XGBoost Feature Importance (Top 20)")
# plt.show()

In [None]:
# # XGBoost ROC curve

# import matplotlib.pyplot as plt
# from sklearn.metrics import roc_curve, auc

# fpr, tpr, thresholds = roc_curve(y_test, proba_xgb[:, 1])
# roc_auc = auc(fpr, tpr)

# # Plot the ROC curve against the random-guess diagonal
# plt.figure(figsize=(6, 6))
# plt.plot(fpr, tpr, label=f"XGBoost (AUC = {roc_auc:.3f})")
# plt.plot([0, 1], [0, 1], linestyle="--", color="gray", label="Random Guess")
# plt.xlabel("False Positive Rate")
# plt.ylabel("True Positive Rate")
# plt.title("ROC Curve — XGBoost Champion Model")
# plt.legend(loc="lower right")
# plt.grid(alpha=0.3)
# plt.show()

In [None]:
# # LR learning curve

# plot_learning_curve(
#     model    = lr,
#     X        = X_resampled,
#     y        = y_resampled,
#     title    = "Learning Curve — Logistic Regression",
#     scoring  = "roc_auc"
# )

In [None]:
# # LR classification report

# from sklearn.metrics import classification_report, accuracy_score

# y_pred_lr = lr.predict(X_test)
# print(classification_report(y_test, y_pred_lr))

In [None]:
# # LR confusion matrix

# from sklearn.metrics import confusion_matrix
# import seaborn as sns
# import matplotlib.pyplot as plt

# sns.heatmap(confusion_matrix(y_test, y_pred_lr), annot=True, fmt='d')
# plt.xlabel("Predicted")
# plt.ylabel("Actual")
# plt.title("Confusion Matrix")
# plt.show()

In [None]:
# LR log loss

from sklearn.metrics import log_loss

# Per-class probabilities from the model `lr` (fitted outside this cell);
# columns are ordered as lr.classes_.
proba_lr = lr.predict_proba(X_test)

# Pass the model's class order explicitly: without `labels`, log_loss infers
# the classes from y_test and raises if y_test happens to contain one class only.
print("Log Loss:", log_loss(y_test, proba_lr, labels=lr.classes_))

In [None]:
# # LR test results (not helpful)

# results_df = X_test.copy()
# results_df['Actual'] = y_test.values
# results_df['Predicted'] = y_pred_lr
# results_df['Champion_Prob'] = proba_lr[:, 1]
# results_df['Index'] = X_test_indices  # Save index for merging

# # Get metadata for the test indices
# test_metadata = playoff_teams.loc[X_test_indices, ['TEAM_NAME', 'SEASON']].copy()

# # You can manually define which teams are East/West
# east_teams = ['Boston Celtics', 'Milwaukee Bucks', 'Miami Heat', 'Philadelphia 76ers', 'Atlanta Hawks', 'Cleveland Cavaliers', 'Brooklyn Nets', 'Toronto Raptors', 'Chicago Bulls', 'Indiana Pacers', 'New York Knicks', 'Orlando Magic', 'Washington Wizards', 'Detroit Pistons', 'Charlotte Hornets']
# test_metadata['CONFERENCE'] = test_metadata['TEAM_NAME'].apply(lambda x: 'East' if x in east_teams else 'West')

# results_df = pd.DataFrame(X_test, index=X_test_indices)  # Match index
# results_df['Champion_Prob'] = proba_lr[:, 1]
# results_df['Predicted'] = y_pred_lr
# results_df['Actual'] = y_test.values

# # Merge with metadata
# results_df = results_df.merge(test_metadata, left_index=True, right_index=True)

# top_by_conference = (
#     results_df
#     .groupby(['SEASON', 'CONFERENCE'])
#     .apply(lambda df: df.nlargest(1, 'Champion_Prob'))
#     .reset_index(drop=True)
# )

# top_by_conference = top_by_conference.sort_values(by='SEASON')

# print(top_by_conference[['SEASON', 'CONFERENCE', 'TEAM_NAME', 'Champion_Prob', 'Actual']])

In [None]:
# # LR ROC curve

# import matplotlib.pyplot as plt
# from sklearn.metrics import roc_curve, auc

# fpr, tpr, thresholds = roc_curve(y_test, proba_lr[:, 1])
# roc_auc = auc(fpr, tpr)

# # Plot the ROC curve against the random-guess diagonal
# plt.figure(figsize=(6, 6))
# plt.plot(fpr, tpr, label=f"LR (AUC = {roc_auc:.3f})")
# plt.plot([0, 1], [0, 1], linestyle="--", color="gray", label="Random Guess")
# plt.xlabel("False Positive Rate")
# plt.ylabel("True Positive Rate")
# plt.title("ROC Curve — LR Champion Model")
# plt.legend(loc="lower right")
# plt.grid(alpha=0.3)
# plt.show()

In [None]:
# # LR feature importance

# import pandas as pd
# import matplotlib.pyplot as plt

# # 1) Extract absolute coefficient values and pair with feature names
# coef = lr.coef_[0]
# imp = pd.Series(data=abs(coef), index=X_test.columns)

# # 2) Sort descending and (optionally) take top 10
# imp = imp.sort_values(ascending=False).head(10)

# # 3) Plot bar chart
# plt.figure(figsize=(8, 6))
# imp.plot.barh()
# plt.gca().invert_yaxis()               # largest at top
# plt.xlabel("Absolute Coefficient Value")
# plt.title("Top 10 Feature Importances — Logistic Regression")
# plt.tight_layout()
# plt.show()

In [None]:
# # RF classification report

# print(classification_report(y_test, y_pred_rf))

In [None]:
# # RF confusion matrix

# from sklearn.metrics import confusion_matrix
# import seaborn as sns
# import matplotlib.pyplot as plt

# sns.heatmap(confusion_matrix(y_test, y_pred_rf), annot=True, fmt='d')
# plt.xlabel("Predicted")
# plt.ylabel("Actual")
# plt.title("Confusion Matrix")
# plt.show()

In [None]:
# RF learning curve

# plot_learning_curve(
#     model    = rf,
#     X        = X_resampled,
#     y        = y_resampled,
#     title    = "Learning Curve — Random Forest",
#     scoring  = "roc_auc"
# )

In [None]:
# # RF log loss

# proba_rf = rf.predict_proba(X_test)

# print("Log Loss:", log_loss(y_test, proba_rf))

In [None]:
# # RF test results (not helpful)

# results_df = X_test.copy()
# results_df['Actual'] = y_test.values
# results_df['Predicted'] = y_pred_rf
# results_df['Champion_Prob'] = proba_rf[:, 1]
# results_df['Index'] = X_test_indices  # Save index for merging

# # Get metadata for the test indices
# test_metadata = playoff_teams.loc[X_test_indices, ['TEAM_NAME', 'SEASON']].copy()

# # You can manually define which teams are East/West
# east_teams = ['Boston Celtics', 'Milwaukee Bucks', 'Miami Heat', 'Philadelphia 76ers', 'Atlanta Hawks', 'Cleveland Cavaliers', 'Brooklyn Nets', 'Toronto Raptors', 'Chicago Bulls', 'Indiana Pacers', 'New York Knicks', 'Orlando Magic', 'Washington Wizards', 'Detroit Pistons', 'Charlotte Hornets']
# test_metadata['CONFERENCE'] = test_metadata['TEAM_NAME'].apply(lambda x: 'East' if x in east_teams else 'West')

# results_df = pd.DataFrame(X_test, index=X_test_indices)  # Match index
# results_df['Champion_Prob'] = proba_rf[:, 1]
# results_df['Predicted'] = y_pred_rf
# results_df['Actual'] = y_test.values

# # Merge with metadata
# results_df = results_df.merge(test_metadata, left_index=True, right_index=True)

# top_by_conference = (
#     results_df
#     .groupby(['SEASON', 'CONFERENCE'])
#     .apply(lambda df: df.nlargest(1, 'Champion_Prob'))
#     .reset_index(drop=True)
# )

# top_by_conference = top_by_conference.sort_values(by='SEASON')

# print(top_by_conference[['SEASON', 'CONFERENCE', 'TEAM_NAME', 'Champion_Prob', 'Actual']])

In [None]:
# RF ROC curve

# import matplotlib.pyplot as plt
# from sklearn.metrics import roc_curve, auc

# fpr, tpr, thresholds = roc_curve(y_test, proba_rf[:, 1])
# roc_auc = auc(fpr, tpr)

# # Plot the ROC curve against the random-guess diagonal
# plt.figure(figsize=(6, 6))
# plt.plot(fpr, tpr, label=f"RF (AUC = {roc_auc:.3f})")
# plt.plot([0, 1], [0, 1], linestyle="--", color="gray", label="Random Guess")
# plt.xlabel("False Positive Rate")
# plt.ylabel("True Positive Rate")
# plt.title("ROC Curve — RF Champion Model")
# plt.legend(loc="lower right")
# plt.grid(alpha=0.3)
# plt.show()

In [None]:
# # RF feature importance

# import pandas as pd
# import matplotlib.pyplot as plt

# # 1) Extract importances and pair with feature names
# importances = rf.feature_importances_
# feat_names  = X_train.columns   # or X_test.columns

# imp_series = pd.Series(importances, index=feat_names)

# # 2) Sort descending and pick top N (e.g. 10)
# top_n = imp_series.sort_values(ascending=False).head(10)

# # 3) Plot as a horizontal bar chart
# plt.figure(figsize=(8, 6))
# top_n.plot.barh()
# plt.gca().invert_yaxis()               # largest at top
# plt.xlabel("Feature Importance")
# plt.title("Top 10 Feature Importances — Random Forest")
# plt.tight_layout()
# plt.show()
