In [None]:
# Model testing Code (All code below this comment)

In [None]:
# Kevin Code
# Load the scraped March Madness data: one CSV per season.

import pandas as pd

# read_csv is applied to each path in order, so the first frame is 2022-23
# and the second is 2023-24.
df_combined_22_23, df_combined_23_24 = map(
    pd.read_csv,
    ("MarchMadness2022-23.csv", "MarchMadness2023-24.csv"),
)

In [None]:
# Mack Code
# Modelling imports shared by the cells below, plus the candidate scalers
# and KNN distance metrics that the searches iterate over.
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import (
    MaxAbsScaler,
    MinMaxScaler,
    Normalizer,
    PowerTransformer,
    QuantileTransformer,
    RobustScaler,
    StandardScaler,
)
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

# Every scaler is tried as the first pipeline step during model selection.
scalers = [
    MaxAbsScaler(),
    MinMaxScaler(),
    Normalizer(),
    PowerTransformer(),
    QuantileTransformer(n_quantiles=60, output_distribution='normal'),
    RobustScaler(),
    StandardScaler(),
]

# Candidate distance metrics for the nearest-neighbour model.
dist_metrics = [
    "euclidean",
    "manhattan",
    "chebyshev",
    "minkowski",
]

In [None]:
# Mack code
# model 1 -- logistic regression --> hyperparameters then features

# opinion of best features
features = [
    "Seed_diff", "W-L%_diff", "SRS_diff", "SOS_diff", "Tm._diff", "Opp._diff",
    "FG%_diff", "3P%_diff", "TRB_diff", "AST_diff", "STL_diff", "BLK_diff",
    "TOV_diff", "PF_diff",
]

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Tune on the 2022-23 season using every candidate feature.
X_train = df_combined_22_23[features]
Y_train = df_combined_22_23["winner"]

# Maps ("Scaler: ...", "<param>: <value>", ...) -> best mean CV f1_macro.
models = {}

for scaler in scalers:
  pipeline = make_pipeline(scaler, LogisticRegression())

  grid_cv = GridSearchCV(
      pipeline,
      param_grid={
          "logisticregression__C": [0.01, 0.1, 1, 10],
          "logisticregression__solver": ["lbfgs", "liblinear"],
          "logisticregression__penalty": ["l2"],
      },
      scoring="f1_macro",
      cv=10,
  )
  grid_cv.fit(X_train, Y_train)

  # A tuple is hashable, so the scaler + winning settings can be a dict key.
  potential_model = tuple(
      [f"Scaler: {scaler}"]
      + [f"{name}: {setting}" for name, setting in grid_cv.best_params_.items()]
  )
  models[potential_model] = grid_cv.best_score_
  print(scaler)  # progress marker: one line per finished scaler

models

In [None]:
# Mack code
# Finding the best hyperparameters

# Scan `models` for the highest score. The strict '>' keeps the first entry
# on ties, and nothing is selected unless some score is above 0.
best_hypes = ()
score = 0
for candidate, candidate_score in models.items():
  if candidate_score > score:
    best_hypes, score = candidate, candidate_score

best_hypes

In [None]:
# Mack code
# Hyperparameters fixed for the feature search below.
best_scaler = RobustScaler()
best_c = 0.01
best_penalty = "l2"
best_solver = "lbfgs"

# Choosing best input features

from itertools import combinations

best_pipeline = make_pipeline(
    best_scaler,
    LogisticRegression(C=best_c, solver=best_solver, penalty=best_penalty),
)

# Exhaustive search: every non-empty subset of `features` is scored with
# 10-fold CV (f1_macro); the first subset to reach the top score is kept.
best_score = 0
best_features = []
for subset_size in range(1, len(features) + 1):
  for combo in combinations(features, subset_size):
    features_combo = list(combo)
    score = cross_val_score(
        best_pipeline,
        X=df_combined_22_23[features_combo],
        y=df_combined_22_23["winner"],
        scoring="f1_macro",
        cv=10,
    ).mean()
    if score > best_score:
      best_score, best_features = score, features_combo


print(f"{best_features}: {best_score}")

In [None]:
# Mack code
# Fit the logistic model on 2022-23 and predict the 2023-24 games.
best_features_logistic = ["SOS_diff", "Tm._diff", "Opp._diff", "FG%_diff", "3P%_diff", "TRB_diff"]

best_pipeline_logistic = make_pipeline(
    RobustScaler(),
    LogisticRegression(C=0.01, solver="lbfgs", penalty="l2"),
)

# Train on one season, test on the next.
x_train, y_train = df_combined_22_23[best_features_logistic], df_combined_22_23["winner"]
x_test, y_test = df_combined_23_24[best_features_logistic], df_combined_23_24["winner"]

best_pipeline_logistic.fit(x_train, y_train)

# Last expression of the cell: the predicted labels are displayed.
best_pipeline_logistic.predict(x_test)

In [None]:
# Mack code
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation of the logistic pipeline on the 23-24 data.
# NOTE(review): cross_val_score refits the pipeline on folds of x_test/y_test,
# so this scores the model configuration on 23-24 data alone. It is not the
# hold-out score of the pipeline that was fitted on 22-23.
best_logistic_YTest_f1macro = cross_val_score(
    best_pipeline_logistic,
    X=x_test,
    y=y_test,
    scoring="f1_macro",
    cv=10,
).mean()

print(f"Best Logistic Regression f1_score: {best_logistic_YTest_f1macro}")



In [None]:
# Mack code
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report

# Hold-out evaluation: train on 2022-23, score on 2023-24.
best_features_logistic = ["SOS_diff", "Tm._diff", "Opp._diff", "FG%_diff", "3P%_diff", "TRB_diff"]

best_pipeline_logistic = make_pipeline(
    RobustScaler(),
    LogisticRegression(C=0.01, solver="lbfgs", penalty="l2", random_state=42),
)

# Training data: 2022-23
X_train, y_train = df_combined_22_23[best_features_logistic], df_combined_22_23["winner"]

# Test data: 2023-24
X_test, y_test = df_combined_23_24[best_features_logistic], df_combined_23_24["winner"]

# fit() returns the pipeline itself, so fitting on 22-23 and predicting 23-24
# can be chained.
y_pred = best_pipeline_logistic.fit(X_train, y_train).predict(X_test)

# Evaluate
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="macro")
cm = confusion_matrix(y_test, y_pred)

print(f"Accuracy on 2023-24 data: {accuracy:.3f}")
print(f"F1-macro on 2023-24 data: {f1:.3f}")
print("Confusion Matrix:\n", cm)
print("\nClassification Report:\n", classification_report(y_test, y_pred))

.83

In [None]:
# Mack code
from sklearn.metrics import precision_recall_curve
import matplotlib.pyplot as plt

# 1. Predicted probability for the second class column of predict_proba
#    (presumably winner == 1, matching pos_label below -- confirm the labels are 0/1)
y_proba = best_pipeline_logistic.predict_proba(X_test)[:, 1]

# 2. Compute precision and recall at various thresholds
precision, recall, thresholds = precision_recall_curve(y_test, y_proba, pos_label=1)

# 3. Plot the precision-recall curve
plt.figure(figsize=(6, 4))
plt.plot(recall, precision, label="Precision-Recall Curve")
plt.xlabel("Recall", fontsize=14)
plt.ylabel("Precision", fontsize=14)
plt.title("Figure 1 - LogisticRegression Precision-Recall Curve", fontsize=14)
# The legend must be added before saving, otherwise the PNG is written without it.
plt.legend()
plt.savefig("logistic_precision_recall.png")
plt.show()

In [None]:
# Mack code
import pandas as pd
import matplotlib.pyplot as plt

# 1. Get the trained LogisticRegression model from your pipeline
logistic_model = best_pipeline_logistic.named_steps["logisticregression"]

# 2. Extract the coefficients (coef_ is a 2D array, with shape [n_classes, n_features])
#    For a binary classification with one set of coefficients, use logistic_model.coef_[0]
coefs = logistic_model.coef_[0]

# 3. Create a DataFrame for easy sorting and plotting
#    Label with the feature list the pipeline was fitted on. The lowercase
#    `x_train` is rebound by other cells (e.g. the random forest ones), so
#    reading its columns here could mislabel the bars after out-of-order runs.
feature_names = best_features_logistic
feat_imp_df = pd.DataFrame({"Feature": feature_names, "Coefficient": coefs})

# 4. Sort by absolute coefficient value (largest magnitude is most influential)
feat_imp_df["abs_coeff"] = feat_imp_df["Coefficient"].abs()
feat_imp_df = feat_imp_df.sort_values("abs_coeff", ascending=False)

# 5. Plot
plt.barh(feat_imp_df["Feature"], feat_imp_df["Coefficient"])
plt.xlabel("Coefficient Value")
plt.title("Figure 2 - Logistic Regression Coefficients")
plt.gca().invert_yaxis()  # Top features on top
plt.savefig("logistic_coefficients.png")
plt.show()

In [None]:
# Mack code
# model 1 -- logisticregression (input features --> Hyperparameters)

# DO NOT RUN THIS CELL
# (it cross-validates every non-empty subset of `features`, 10 folds each)

from itertools import combinations
from tqdm.auto import tqdm

# Default-hyperparameter baseline used while only the feature subset varies.
start_pipeline = make_pipeline(StandardScaler(), LogisticRegression())

best_score = 0
best_features = []
for subset_size in tqdm(range(1, len(features) + 1)):
  for combo in tqdm(combinations(features, subset_size)):
    features_combo = list(combo)
    score = cross_val_score(
        start_pipeline,
        X=df_combined_22_23[features_combo],
        y=df_combined_22_23["winner"],
        scoring="f1_macro",
        cv=10,
    ).mean()
    # Strict '>' keeps the earliest subset among ties.
    if score > best_score:
      best_score, best_features = score, features_combo
print(f"{best_features}: {best_score}")

0.8063492063492064

In [None]:
# Mack code
# DO NOT RUN THIS CELL

# Compare scalers on the feature subset chosen by the previous search.
X_train = df_combined_22_23[best_features]
Y_train = df_combined_22_23["winner"]

# Maps ("Scaler: ...", ...) -> mean CV f1_macro.
models = {}

for scaler in tqdm(scalers):
  pipeline = make_pipeline(scaler, LogisticRegression())

  # The parameter grid is empty, so only the scaler differs between runs.
  grid_cv = GridSearchCV(
      pipeline,
      param_grid={},
      scoring="f1_macro",
      cv=10,
  )
  grid_cv.fit(X_train, Y_train)

  potential_model = tuple(
      [f"Scaler: {scaler}"]
      + [f"{name}: {setting}" for name, setting in grid_cv.best_params_.items()]
  )
  models[potential_model] = grid_cv.best_score_

In [None]:
# Mack code
# DO NOT RUN THIS CELL

# Pick the highest-scoring entry of `models` (first one wins on ties).
best_hypes = ()
best_score = 0
for candidate, candidate_score in models.items():
  if candidate_score > best_score:
    best_hypes, best_score = candidate, candidate_score

print(f"{best_hypes}: {best_score}")

In [None]:
# Mack code
# RUN THIS ONE!!

# Hard-coded settings for the "input features --> hyperparameters" logistic model.
logistic_HypInputs_Scaler = RobustScaler()
logistic_HypInputs_c = 0.01
logistic_HypInputs_penalty = "l2"
logistic_HypInputs_solver = "lbfgs"


logistic_HypInputs_features = ['SRS_diff', 'Opp._diff', 'PF_diff']
logistic_HypInputs_Pipeline = make_pipeline(
    logistic_HypInputs_Scaler,
    LogisticRegression(
        C=logistic_HypInputs_c,
        penalty=logistic_HypInputs_penalty,
        solver=logistic_HypInputs_solver,
    ),
)

In [None]:
# Mack code
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report

# Settings for the features-first logistic model (this time with a fixed seed)
logistic_HypInputs_Scaler = RobustScaler()
logistic_HypInputs_c = 0.01
logistic_HypInputs_penalty = "l2"
logistic_HypInputs_solver = "lbfgs"

logistic_HypInputs_features = ['SRS_diff', 'Opp._diff', 'PF_diff']

logistic_HypInputs_Pipeline = make_pipeline(
    logistic_HypInputs_Scaler,
    LogisticRegression(
        C=logistic_HypInputs_c,
        penalty=logistic_HypInputs_penalty,
        solver=logistic_HypInputs_solver,
        random_state=42,
    ),
)

# 1. Separate training (22–23) and testing (23–24) sets
X_train, y_train = df_combined_22_23[logistic_HypInputs_features], df_combined_22_23["winner"]
X_test, y_test = df_combined_23_24[logistic_HypInputs_features], df_combined_23_24["winner"]

# 2./3. Fit on 22–23 data, then predict 23–24 (fit() returns the pipeline)
y_pred = logistic_HypInputs_Pipeline.fit(X_train, y_train).predict(X_test)

# 4. Evaluate
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="macro")
cm = confusion_matrix(y_test, y_pred)

print(f"Accuracy on 2023–24 data: {accuracy:.3f}")
print(f"F1-macro on 2023–24 data: {f1:.3f}")
print("Confusion Matrix:\n", cm)
print("\nClassification Report:\n", classification_report(y_test, y_pred))



In [None]:
# Mack code
# Uses `precision_recall_curve` and `plt` imported by an earlier cell.

# 1. Predicted probability for the second class column of predict_proba
y_proba = logistic_HypInputs_Pipeline.predict_proba(X_test)[:, 1]

# 2. Compute precision and recall at various thresholds
precision, recall, thresholds = precision_recall_curve(y_test, y_proba, pos_label=1)

# 3. Plot the precision-recall curve
fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(recall, precision, label="Precision-Recall Curve")
ax.set_xlabel("Recall")
ax.set_ylabel("Precision")
ax.set_title("Precision-Recall Curve (2023-24 Data)")
ax.legend()
plt.show()

In [None]:
# Mack code
import pandas as pd
import matplotlib.pyplot as plt

# 1. Ensure the pipeline is fit BEFORE touching the estimator, so the
#    coefficients read below always come from a model trained on 22-23.
logistic_HypInputs_Pipeline.fit(df_combined_22_23[logistic_HypInputs_features], df_combined_22_23["winner"])

# 2. Get the trained LogisticRegression model from the pipeline
logistic_model = logistic_HypInputs_Pipeline.named_steps["logisticregression"]

# 3. Extract the coefficients (coef_ is a 2D array, with shape [n_classes, n_features])
#    For a binary classification with one set of coefficients, use logistic_model.coef_[0]
coefs = logistic_model.coef_[0]

# 4. Create a DataFrame for easy sorting and plotting
feat_imp_df = pd.DataFrame({"Feature": logistic_HypInputs_features, "Coefficient": coefs})

# 5. Sort by absolute coefficient value (largest magnitude is most influential)
feat_imp_df["abs_coeff"] = feat_imp_df["Coefficient"].abs()
feat_imp_df = feat_imp_df.sort_values("abs_coeff", ascending=False)

# 6. Plot
plt.barh(feat_imp_df["Feature"], feat_imp_df["Coefficient"])
plt.xlabel("Coefficient Value\n(Positive => Increases Probability, Negative => Decreases Probability)")
plt.title("Logistic Regression Coefficients")
plt.gca().invert_yaxis()  # Top features on top
plt.show()

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.model_selection import cross_val_score
import numpy as np
import random

In [None]:
# Mack code
# ensemble of the two logistic regression models

# The two logistic models share the same scaler and hyperparameters; they
# differ only in their tuned feature subset. Each member therefore selects its
# own columns from a shared input frame. Without that step both members would
# be trained on identical inputs and the soft vote would equal a single model.
pipeline1 = make_pipeline(
    make_column_transformer(("passthrough", best_features_logistic)),
    RobustScaler(),
    LogisticRegression(C=0.01, solver="lbfgs", penalty="l2"),
)

pipeline2 = make_pipeline(
    make_column_transformer(("passthrough", logistic_HypInputs_features)),
    logistic_HypInputs_Scaler,
    LogisticRegression(
        C=logistic_HypInputs_c,
        penalty=logistic_HypInputs_penalty,
        solver=logistic_HypInputs_solver,
    ),
)

# Create a VotingClassifier that combines the pipelines (using soft voting,
# i.e. averaging the predicted class probabilities)
ensemble_model = VotingClassifier(
    estimators=[
        ("logistic_hypes_then_features", pipeline1),
        ("logistic_features_then_hypes", pipeline2),
    ],
    voting='soft'
)

# Build the training frame explicitly instead of relying on whichever
# `x_train` / `y_train` an earlier cell happened to leave behind.
# dict.fromkeys de-duplicates the combined feature list while keeping order.
ensemble_features = list(dict.fromkeys(best_features_logistic + logistic_HypInputs_features))

# Evaluate using cross-validation on the 2022-23 season
scores = cross_val_score(
    ensemble_model,
    df_combined_22_23[ensemble_features],
    df_combined_22_23["winner"],
    cv=10,
    scoring="f1_macro",
    n_jobs=-1,
)
print("Cross-validated F1-macro scores:", scores)
print("Mean F1-macro score:", np.mean(scores))



.7463888

In [None]:
# Mack code
# model 2 -- randomforest --> hyperparameters then input features

# DONT NEED TO RUN CELL --> CORRECT OUTPUT BELOW

features = [
    "Seed_diff", "W-L%_diff", "SRS_diff", "SOS_diff", "Tm._diff", "Opp._diff",
    "FG%_diff", "3P%_diff", "TRB_diff", "AST_diff", "STL_diff", "BLK_diff",
    "TOV_diff", "PF_diff",
]

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Tune on the 2022-23 season using every candidate feature.
X_train = df_combined_22_23[features]
Y_train = df_combined_22_23["winner"]

# Maps ("Scaler: ...", "<param>: <value>", ...) -> best mean CV f1_macro.
models = {}

for scaler in scalers:
    pipeline = make_pipeline(scaler, RandomForestClassifier(random_state=42))

    grid_cv = GridSearchCV(
        pipeline,
        param_grid={
            "randomforestclassifier__n_estimators": [50, 100, 200],
            "randomforestclassifier__max_depth": [None, 5, 10, 20],
            "randomforestclassifier__min_samples_split": [2, 5, 10],
        },
        scoring="f1_macro",
        cv=10,
    )
    grid_cv.fit(X_train, Y_train)

    # A tuple is hashable, so scaler + winning settings can be the dict key.
    potential_model = tuple(
        [f"Scaler: {scaler}"]
        + [f"{name}: {setting}" for name, setting in grid_cv.best_params_.items()]
    )
    models[potential_model] = grid_cv.best_score_
    print(scaler)  # progress marker: one line per finished scaler

models

In [None]:
# Mack code
# Finding the best hyperparameters

# DO NOT NEED TO RUN CELL

# Scan `models` for the highest score; the strict '>' keeps the first entry on ties.
best_hypes = ()
score = 0
for candidate, candidate_score in models.items():
  if candidate_score > score:
    best_hypes, score = candidate, candidate_score

best_hypes

In [None]:
# Mack code
# DO NOT NEED TO RUN CELL

# Hyperparameters fixed for the feature search below.
best_scaler = MaxAbsScaler()
best_depth = 5
best_split = 2
best_estimators = 100

# Choosing best input features

from itertools import combinations

best_pipeline = make_pipeline(
    best_scaler,
    RandomForestClassifier(
        max_depth=best_depth,
        min_samples_split=best_split,
        n_estimators=best_estimators,
    ),
)

# Exhaustive search: every non-empty subset of `features` is scored with
# 10-fold CV (f1_macro). The forest has no random_state here, so scores can
# vary between runs.
best_score = 0
best_features = []
for subset_size in range(1, len(features) + 1):
  for combo in combinations(features, subset_size):
    features_combo = list(combo)
    score = cross_val_score(
        best_pipeline,
        X=df_combined_22_23[features_combo],
        y=df_combined_22_23["winner"],
        scoring="f1_macro",
        cv=10,
    ).mean()
    if score > best_score:
      best_score, best_features = score, features_combo

print(f"{best_features}: {best_score}")

In [None]:
# Mack code
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Fit the random forest on 2022-23 and predict the 2023-24 games.
best_features_forests = ['SRS_diff', 'SOS_diff', 'Tm._diff', 'Opp._diff']

best_pipeline_forests = make_pipeline(
    MaxAbsScaler(),
    RandomForestClassifier(max_depth=5, min_samples_split=2, n_estimators=100),
)

# Train on one season, test on the next.
x_train, y_train = df_combined_22_23[best_features_forests], df_combined_22_23["winner"]
x_test, y_test = df_combined_23_24[best_features_forests], df_combined_23_24["winner"]

best_pipeline_forests.fit(x_train, y_train)

# Last expression of the cell: the predicted labels are displayed.
best_pipeline_forests.predict(x_test)

In [None]:
# Mack code
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report

# Hold-out evaluation of the seeded random forest: train 22–23, score 23–24.
best_features_forests = ['SRS_diff', 'SOS_diff', 'Tm._diff', 'Opp._diff']
best_pipeline_forests = make_pipeline(
    MaxAbsScaler(),
    RandomForestClassifier(
        max_depth=5,
        min_samples_split=2,
        n_estimators=100,
        random_state=42,
    ),
)

# Separate training (22–23) and testing (23–24) sets
X_train, y_train = df_combined_22_23[best_features_forests], df_combined_22_23["winner"]
X_test, y_test = df_combined_23_24[best_features_forests], df_combined_23_24["winner"]

# Fit on 22–23 data, then predict 23–24 (fit() returns the pipeline)
y_pred = best_pipeline_forests.fit(X_train, y_train).predict(X_test)

# Evaluate
accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="macro")
cm = confusion_matrix(y_test, y_pred)

print(f"Accuracy on 2023–24 data: {accuracy:.3f}")
print(f"F1-macro on 2023–24 data: {f1:.3f}")
print("Confusion Matrix:\n", cm)
print("\nClassification Report:\n", classification_report(y_test, y_pred))

In [None]:
# Mack code
# Uses `precision_recall_curve` and `plt` imported by an earlier cell.

# Predicted probability for the second class column of predict_proba
y_proba = best_pipeline_forests.predict_proba(X_test)[:, 1]

# Compute precision, recall, and thresholds
precision, recall, thresholds = precision_recall_curve(y_test, y_proba, pos_label=1)

# Plot the precision-recall curve
fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(recall, precision, label="Precision-Recall Curve", color='b')
ax.set_xlabel("Recall")
ax.set_ylabel("Precision")
ax.set_title("Precision-Recall Curve (2023–24 Data)")
ax.legend()
plt.show()

In [None]:
# Mack code
import pandas as pd
import matplotlib.pyplot as plt

# Pull the fitted forest out of the pipeline and read its feature importances.
rf_model = best_pipeline_forests.named_steps["randomforestclassifier"]
importances = rf_model.feature_importances_

# Use the features from your best model (best_features_forests)
feature_names = best_features_forests

# Sort so the most important feature is plotted first.
feat_imp_df = pd.DataFrame(
    {"Feature": feature_names, "Importance": importances}
).sort_values("Importance", ascending=False)

plt.barh(feat_imp_df["Feature"], feat_imp_df["Importance"])
plt.xlabel("Feature Importance")
# A random forest exposes feature importances, not coefficients.
plt.title("RandomForest Feature Importances")
plt.gca().invert_yaxis()  # so top feature is at the top
plt.show()

In [None]:
# Mack code
import itertools
import random
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from tqdm.auto import tqdm

# Ensure reproducibility:
random.seed(42)
np.random.seed(42)

# Search settings
MAX_SUBSET_SIZE = 7        # only subsets with 1..7 features are candidates
SAMPLE_PROBABILITY = 0.5   # chance that any given candidate is actually evaluated

best_score = 0
best_features = None

# Optional: count total possible subsets (excluding empty set).
# A set of n distinct items has 2**n - 1 non-empty subsets, so nothing needs to
# be enumerated. This is the count over ALL sizes, while the search below only
# visits sizes up to MAX_SUBSET_SIZE.
total_subsets = 2 ** len(features) - 1
print("Total subsets:", total_subsets)

count_tested = 0

# Iterate over subset sizes 1..MAX_SUBSET_SIZE (not every possible size)
for i in range(1, MAX_SUBSET_SIZE + 1):
    # List all combinations of size i (wrap with list() so tqdm knows the total)
    for combo in tqdm(list(itertools.combinations(features, i)), desc=f"Testing combinations of size {i}"):
        # Randomly decide whether to test this combination
        if random.random() < SAMPLE_PROBABILITY:
            features_combo = list(combo)
            # Evaluate this combination using 5-fold cross-validation
            cv_score = cross_val_score(
                make_pipeline(StandardScaler(), RandomForestClassifier(n_jobs=-1, random_state=42)),
                X=df_combined_22_23[features_combo],
                y=df_combined_22_23["winner"],
                scoring="f1_macro",
                cv=5,
                n_jobs=-1
            ).mean()
            count_tested += 1
            # Strict '>' keeps the earliest combination among ties
            if cv_score > best_score:
                best_score = cv_score
                best_features = features_combo

print(f"Tested {count_tested} combinations out of {total_subsets} total subsets.")
print(f"Best combination: {best_features} with F1-macro score = {best_score:.3f}")



Tested 8159 combinations out of 16383 total subsets.
Best combination: ['Seed_diff', 'SOS_diff', 'Opp._diff', 'TRB_diff', 'STL_diff', 'BLK_diff', 'PF_diff']
with F1-macro score = 0.804

In [None]:
# Mack code
# DO NOT RUN THIS CELL

# Feature subset reported by the random subset search.
forests_2_best_features = ['Seed_diff', 'SOS_diff', 'Opp._diff', 'TRB_diff', 'STL_diff', 'BLK_diff', 'PF_diff']

X_train = df_combined_22_23[forests_2_best_features]
Y_train = df_combined_22_23["winner"]

# Maps ("Scaler: ...", "<param>: <value>", ...) -> best mean CV f1_macro.
models = {}

for scaler in tqdm(scalers):
  pipeline = make_pipeline(scaler, RandomForestClassifier())

  grid_cv = GridSearchCV(
      pipeline,
      param_grid={
          "randomforestclassifier__n_estimators": [50, 100, 200],
          "randomforestclassifier__max_depth": [None, 5, 10, 20],
          "randomforestclassifier__min_samples_split": [2, 5, 10],
      },
      scoring="f1_macro",
      cv=10,
  )
  grid_cv.fit(X_train, Y_train)

  potential_model = tuple(
      [f"Scaler: {scaler}"]
      + [f"{name}: {setting}" for name, setting in grid_cv.best_params_.items()]
  )
  models[potential_model] = grid_cv.best_score_

In [None]:
# Mack code
# DO NOT RUN THIS CELL

# Pick the highest-scoring entry of `models` (first one wins on ties).
best_hypes = ()
best_score = 0
for candidate, candidate_score in models.items():
  if candidate_score > best_score:
    best_hypes, best_score = candidate, candidate_score

print(f"{best_hypes}: {best_score}")

In [None]:
# Mack code
# RUN THIS ONE!!

# Hard-coded settings for the "input features --> hyperparameters" random forest.
forests_2_best_features = ['Seed_diff', 'SOS_diff', 'Opp._diff', 'TRB_diff', 'STL_diff', 'BLK_diff', 'PF_diff']

X_train, Y_train = df_combined_22_23[forests_2_best_features], df_combined_22_23["winner"]

forests_HypInputs_Scaler = Normalizer()
forests_HypInputs_max_depth = 5
forests_HypInputs_min_samples_split = 5
forests_HypInputs_n_estimators = 100

forests_HypInputs_features = ['Seed_diff', 'SOS_diff', 'Opp._diff', 'TRB_diff', 'STL_diff', 'BLK_diff', 'PF_diff']
forests_HypInputs_Pipeline = make_pipeline(
    forests_HypInputs_Scaler,
    RandomForestClassifier(
        max_depth=forests_HypInputs_max_depth,
        min_samples_split=forests_HypInputs_min_samples_split,
        n_estimators=forests_HypInputs_n_estimators,
    ),
)

In [None]:
# Mack code
# Train on the 2022-23 season ...
forests_HypInputs_Pipeline.fit(
    df_combined_22_23[forests_HypInputs_features],
    df_combined_22_23["winner"],
)

# ... and display the predicted labels for the 2023-24 games.
forests_HypInputs_Pipeline.predict(
    df_combined_23_24[forests_HypInputs_features]
)

In [None]:
# Mack code
# MACKS BEST MODEL --> RANDOMFORESTCLASSIFIER: INPUT FEATURES --> HYPERPARAMS

from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report

# Define features
forests_2_best_features = ['Seed_diff', 'SOS_diff', 'Opp._diff', 'TRB_diff', 'STL_diff', 'BLK_diff', 'PF_diff']

# Training data is 2022–23, test data is 2023–24
X_train, Y_train = df_combined_22_23[forests_2_best_features], df_combined_22_23["winner"]
X_test, Y_test = df_combined_23_24[forests_2_best_features], df_combined_23_24["winner"]

# Define hyperparameters and create pipeline
forests_HypInputs_Scaler = Normalizer()
forests_HypInputs_max_depth = 5
forests_HypInputs_min_samples_split = 5
forests_HypInputs_n_estimators = 100

forests_HypInputs_Pipeline = make_pipeline(
    forests_HypInputs_Scaler,
    RandomForestClassifier(
        max_depth=forests_HypInputs_max_depth,
        min_samples_split=forests_HypInputs_min_samples_split,
        n_estimators=forests_HypInputs_n_estimators,
        random_state=42,
    ),
)

# Fit on the training data, then predict the test data (fit() returns the pipeline)
Y_pred = forests_HypInputs_Pipeline.fit(X_train, Y_train).predict(X_test)

# Evaluate predictions
accuracy = accuracy_score(Y_test, Y_pred)
f1 = f1_score(Y_test, Y_pred, average="macro")
cm = confusion_matrix(Y_test, Y_pred)
report = classification_report(Y_test, Y_pred)

print(f"Accuracy on 2023–24 data: {accuracy:.3f}")
print(f"F1-macro on 2023–24 data: {f1:.3f}")
print("Confusion Matrix:\n", cm)
print("\nClassification Report:\n", report)

In [None]:
# Mack code
# Uses `precision_recall_curve` and `plt` imported by an earlier cell.

# Predicted probability for the second class column of predict_proba
Y_proba = forests_HypInputs_Pipeline.predict_proba(X_test)[:, 1]

# Compute precision, recall, and thresholds
precision, recall, thresholds = precision_recall_curve(Y_test, Y_proba, pos_label=1)

# Plot the precision-recall curve
fig, ax = plt.subplots(figsize=(6, 4))
ax.plot(recall, precision, label="Precision-Recall Curve", color='b')
ax.set_xlabel("Recall")
ax.set_ylabel("Precision")
ax.set_title("Precision-Recall Curve (2023–24 Data)")
ax.legend()
plt.show()

In [None]:
# Mack code
import pandas as pd
import matplotlib.pyplot as plt

# Pull the fitted forest out of the pipeline and read its feature importances.
rf_model = forests_HypInputs_Pipeline.named_steps["randomforestclassifier"]
importances = rf_model.feature_importances_

# Label the bars with this model's own feature list (forests_HypInputs_features)
feature_names = forests_HypInputs_features

# Sort so the most important feature is plotted first.
feat_imp_df = pd.DataFrame(
    {"Feature": feature_names, "Importance": importances}
).sort_values("Importance", ascending=False)

plt.barh(feat_imp_df["Feature"], feat_imp_df["Importance"])
plt.xlabel("Feature Importance")
# A random forest exposes feature importances, not coefficients.
plt.title("RandomForest Feature Importances")
plt.gca().invert_yaxis()  # so top feature is at the top
plt.show()

In [None]:
# Mack code
# ensemble model of randomforestclassifier

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.model_selection import cross_val_score
import numpy as np

# Each member selects the feature subset it was tuned on from a shared input
# frame, then applies its own scaler and forest settings. The forests are
# seeded so the cross-validated scores are reproducible.
pipeline1 = make_pipeline(
    make_column_transformer(("passthrough", best_features_forests)),
    MaxAbsScaler(),
    RandomForestClassifier(max_depth=5, min_samples_split=2, n_estimators=100, random_state=42),
)

pipeline2 = make_pipeline(
    make_column_transformer(("passthrough", forests_HypInputs_features)),
    forests_HypInputs_Scaler,
    RandomForestClassifier(
        max_depth=forests_HypInputs_max_depth,
        min_samples_split=forests_HypInputs_min_samples_split,
        n_estimators=forests_HypInputs_n_estimators,
        random_state=42,
    ),
)

# Create a VotingClassifier that combines the pipelines (using soft voting,
# i.e. averaging the predicted class probabilities)
ensemble_model = VotingClassifier(
    estimators=[
        ("rf_hypes_then_features", pipeline1),
        ("rf_features_then_hypes", pipeline2),
    ],
    voting='soft'
)

# Build the training frame explicitly instead of relying on whichever
# `x_train` / `y_train` an earlier cell happened to leave behind.
# dict.fromkeys de-duplicates the combined feature list while keeping order.
ensemble_features = list(dict.fromkeys(best_features_forests + forests_HypInputs_features))

# Evaluate using cross-validation on the 2022-23 season
scores = cross_val_score(
    ensemble_model,
    df_combined_22_23[ensemble_features],
    df_combined_22_23["winner"],
    cv=10,
    scoring="f1_macro",
    n_jobs=-1,
)
print("Cross-validated F1-macro scores:", scores)
print("Mean F1-macro score:", np.mean(scores))

In [None]:
# Kevin Code
# Base features variable:
features = ["Seed_diff", "W-L%_diff", "SRS_diff", "SOS_diff", "Tm._diff", "Opp._diff", "FG%_diff", "3P%_diff", "TRB_diff", "AST_diff", "STL_diff", "BLK_diff", "TOV_diff", "PF_diff"]


# Model libraries needed along with scalers and metrics
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import (MaxAbsScaler, MinMaxScaler, Normalizer, PowerTransformer, QuantileTransformer, RobustScaler, StandardScaler)
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import VotingClassifier
from itertools import combinations
import numpy as np
from tqdm.auto import tqdm

# Candidate scalers tried by the searches below, and the KNN distance metrics
scalers = [
    MaxAbsScaler(),
    MinMaxScaler(),
    Normalizer(),
    PowerTransformer(),
    QuantileTransformer(output_distribution='normal', n_quantiles=60),
    RobustScaler(),
    StandardScaler(),
]
dist_metrics = [
    "euclidean",
    "manhattan",
    "chebyshev",
    "minkowski",
]

In [None]:
# Kevin Code

# model 3 -- kneighborsclassifier (Tuning Hyperparameters/Scaler, then input features)

# Start from the full feature list; the target is the "winner" column
X_train = df_combined_22_23[features]
Y_train = df_combined_22_23["winner"]

# Running best: highest mean f1_macro seen so far and a description of the
# scaler/hyperparameters that produced it
best_score = 0
best_hypes = []

# The same hyperparameter grid is searched for every scaler
knn_param_grid = {
    "kneighborsclassifier__n_neighbors": range(1, 30),
    "kneighborsclassifier__metric": dist_metrics,
}

for scaler in scalers:
  pipeline = make_pipeline(scaler, KNeighborsClassifier())
  grid_cv = GridSearchCV(pipeline, param_grid=knn_param_grid, scoring="f1_macro", cv=10)
  grid_cv.fit(X_train, Y_train)

  # Human-readable description of this scaler's best configuration
  potential_model = [f"Scaler: {scaler}"] + [
      f"{key}: {value}" for key, value in grid_cv.best_params_.items()
  ]
  score = grid_cv.best_score_

  # Strict ">" keeps the earliest scaler when two scalers tie
  if score > best_score:
    best_score, best_hypes = score, potential_model

print(f"{best_hypes}: {best_score}")

In [None]:
# Kevin Code

# Scaler/hyperparameters used for the feature search below.
# NOTE(review): these are hard-coded, presumably copied from the printed result
# of the hyperparameter search cell -- confirm they still match after a re-run
best_scaler = MinMaxScaler()
best_metric = "chebyshev"
best_neighbors = 5

# Tuning the input features

# Fixed pipeline: chosen scaler followed by KNN with the chosen hyperparameters
best_pipeline = make_pipeline(
    best_scaler,
    KNeighborsClassifier(metric=best_metric, n_neighbors=best_neighbors),
)

# Running best: highest mean f1_macro and the feature subset that produced it
best_score = 0
best_features = []

# Exhaustive search over every non-empty subset of `features`, by subset size
for i in tqdm(range(1, len(features) + 1)):
  for combo in tqdm(combinations(features, i)):
    features_combo = list(combo)
    fold_scores = cross_val_score(
        best_pipeline,
        X=df_combined_22_23[features_combo],
        y=df_combined_22_23["winner"],
        scoring="f1_macro",
        cv=10,
    )
    score = fold_scores.mean()
    # Strict ">" keeps the first subset found when scores tie
    if score > best_score:
      best_score, best_features = score, features_combo

print(f"{best_features}: {best_score}")

In [None]:
# Kevin Code

# KNeighborsClassifier model from the "hyperparameters/scaler first, then input features" tuning order.
# NOTE(review): the values below are hard-coded, presumably from the search cells above -- confirm
kneighborsModel_HypInputs_Scaler = MinMaxScaler()
kneighborsModel_HypInputs_metric = "chebyshev"
kneighborsModel_HypInputs_neighbors = 5
kneighborsModel_HypInputs_features = [
    'SRS_diff', 'SOS_diff', 'Tm._diff', 'Opp._diff', 'TRB_diff',
    'AST_diff', 'BLK_diff', 'TOV_diff', 'PF_diff',
]
kneighborsModel_HypInputs_Pipeline = make_pipeline(
    kneighborsModel_HypInputs_Scaler,
    KNeighborsClassifier(
        metric=kneighborsModel_HypInputs_metric,
        n_neighbors=kneighborsModel_HypInputs_neighbors,
    ),
)

# Training inputs restricted to this model's features, and the known winners
kneighborsModel_HypInputs_XTrain = df_combined_22_23[kneighborsModel_HypInputs_features]
kneighborsModel_HypInputs_YTrain = df_combined_22_23["winner"]

# Mean 10-fold cross-validated f1_macro for this model (displayed as the cell output)
kneighborsModel_HypInputs_Score = cross_val_score(
    kneighborsModel_HypInputs_Pipeline,
    X=kneighborsModel_HypInputs_XTrain,
    y=kneighborsModel_HypInputs_YTrain,
    scoring="f1_macro",
    cv=10,
).mean()
kneighborsModel_HypInputs_Score

In [None]:
# Kevin Code

# model 3 -- kneighborsclassifier (Tuning input features, then Hyperparameters/Scaler)

# Generic starting pipeline (StandardScaler + 10-neighbor KNN) used only to rank feature subsets
start_pipeline = make_pipeline(
    StandardScaler(),
    KNeighborsClassifier(n_neighbors=10),
)

# Running best: highest mean f1_macro and the feature subset that produced it
best_score = 0
best_features = []

# Exhaustive search over every non-empty subset of `features`, by subset size
for i in tqdm(range(1, len(features) + 1)):
  for combo in tqdm(combinations(features, i)):
    features_combo = list(combo)
    fold_scores = cross_val_score(
        start_pipeline,
        X=df_combined_22_23[features_combo],
        y=df_combined_22_23["winner"],
        scoring="f1_macro",
        cv=10,
    )
    score = fold_scores.mean()
    # Strict ">" keeps the first subset found when scores tie
    if score > best_score:
      best_score, best_features = score, features_combo
print(f"{best_features}: {best_score}")

In [None]:
# Kevin Code

# Training data restricted to the best input features found by the search above
# (reads `best_features` as left by the previously executed feature-search cell)
X_train = df_combined_22_23[best_features]
Y_train = df_combined_22_23["winner"]

# Running best: highest mean f1_macro and a description of the configuration that produced it
best_score = 0
best_hypes = []

# The same hyperparameter grid is searched for every scaler
knn_param_grid = {
    "kneighborsclassifier__n_neighbors": range(1, 30),
    "kneighborsclassifier__metric": dist_metrics,
}

for scaler in tqdm(scalers):
  pipeline = make_pipeline(scaler, KNeighborsClassifier())

  grid_cv = GridSearchCV(pipeline, param_grid=knn_param_grid, scoring="f1_macro", cv=10)
  grid_cv.fit(X_train, Y_train)

  # Human-readable description of this scaler's best configuration
  potential_model = [f"Scaler: {scaler}"] + [
      f"{key}: {value}" for key, value in grid_cv.best_params_.items()
  ]
  score = grid_cv.best_score_

  # Strict ">" keeps the earliest scaler when two scalers tie
  if score > best_score:
    best_score, best_hypes = score, potential_model

print(f"{best_hypes}: {best_score}")

In [None]:
# Kevin Code

# KNeighborsClassifier model from the "input features first, then hyperparameters/scaler" tuning order.
# NOTE(review): the values below are hard-coded, presumably from the search cells above -- confirm
kneighborsModel_InputsHyp_Scaler = QuantileTransformer(output_distribution='normal', n_quantiles=60)
kneighborsModel_InputsHyp_metric = "chebyshev"
kneighborsModel_InputsHyp_neighbors = 11
kneighborsModel_InputsHyp_features = [
    'Seed_diff', 'W-L%_diff', 'SRS_diff', 'Tm._diff',
    'Opp._diff', 'STL_diff', 'BLK_diff',
]
kneighborsModel_InputsHyp_Pipeline = make_pipeline(
    kneighborsModel_InputsHyp_Scaler,
    KNeighborsClassifier(
        metric=kneighborsModel_InputsHyp_metric,
        n_neighbors=kneighborsModel_InputsHyp_neighbors,
    ),
)

# Training inputs restricted to this model's features, and the known winners
kneighborsModel_InputsHyp_XTrain = df_combined_22_23[kneighborsModel_InputsHyp_features]
kneighborsModel_InputsHyp_YTrain = df_combined_22_23["winner"]

# Mean 10-fold cross-validated f1_macro for this model (displayed as the cell output)
kneighborsModel_InputsHyp_Score = cross_val_score(
    kneighborsModel_InputsHyp_Pipeline,
    X=kneighborsModel_InputsHyp_XTrain,
    y=kneighborsModel_InputsHyp_YTrain,
    scoring="f1_macro",
    cv=10,
).mean()
kneighborsModel_InputsHyp_Score

In [None]:
# Kevin Code

# Ensembling the two KNeighborsClassifier models using VotingClassifier to see if a better model is produced

# Joining the input features of both models together; dict.fromkeys removes
# duplicates while keeping a stable column order (a set's order can change between kernels)
ensem_features = list(dict.fromkeys(kneighborsModel_HypInputs_features + kneighborsModel_InputsHyp_features))

# Getting the training data according to the joined inputs
X_train = df_combined_22_23[ensem_features]
Y_train = df_combined_22_23["winner"]

# VotingClassifier hands the same X to every voter, so each voter first selects
# only the columns its pipeline was defined with; otherwise each KNN would
# measure distances over the other model's features as well.
kneighborsModel_Ensemble = VotingClassifier([
    ("kneighborsModel_HypInputs",
     make_pipeline(make_column_transformer(("passthrough", kneighborsModel_HypInputs_features)),
                   kneighborsModel_HypInputs_Pipeline)),
    ("kneighborsModel_InputsHyp",
     make_pipeline(make_column_transformer(("passthrough", kneighborsModel_InputsHyp_features)),
                   kneighborsModel_InputsHyp_Pipeline)),
])
# NOTE(review): voting defaults to 'hard'; with only two voters every disagreement
# is a tie -- consider voting='soft' as used by the forest/logistic ensembles

# Mean 10-fold cross-validated f1_macro of the ensemble (displayed as the cell output)
kneighborsModel_Ensemble_Score = cross_val_score(kneighborsModel_Ensemble, X_train, Y_train, scoring= "f1_macro", cv= 10).mean()
kneighborsModel_Ensemble_Score

In [None]:
# Kevin Code

# Side-by-side printout of the cross-validated f1_macro of every KNeighborsClassifier variant
for label, value in (
    ("kneighborsModel_HypInputs_Score", kneighborsModel_HypInputs_Score),
    ("kneighborsModel_InputsHyp_Score", kneighborsModel_InputsHyp_Score),
    ("kneighborsModel_Ensemble_Score", kneighborsModel_Ensemble_Score),
):
  print(f"{label} : {value}")

In [None]:
# Kevin Code

# model 4 -- GaussianNB (Naive Bayes) (Tuning Hyperparameters/Scaler, then input features)
# Same procedure as model 3, applied to GaussianNB

# Start from the full feature list; the target is the "winner" column
X_train = df_combined_22_23[features]
Y_train = df_combined_22_23["winner"]

# Running best: highest mean f1_macro and a description of the configuration that produced it
best_hypes = []
best_score = 0

# 1000 log-spaced var_smoothing candidates between 1e-12 and 1, shared by every scaler
gnb_param_grid = {"gaussiannb__var_smoothing": np.logspace(-12, 0, 1000)}

for scaler in tqdm(scalers):
  pipeline = make_pipeline(scaler, GaussianNB())

  grid_cv = GridSearchCV(pipeline, param_grid=gnb_param_grid, scoring="f1_macro", cv=10)
  grid_cv.fit(X_train, Y_train)

  # Human-readable description of this scaler's best configuration
  potential_model = [f"Scaler: {scaler}"] + [
      f"{key}: {value}" for key, value in grid_cv.best_params_.items()
  ]
  score = grid_cv.best_score_

  # Strict ">" keeps the earliest scaler when two scalers tie
  if score > best_score:
    best_score, best_hypes = score, potential_model

print(f"{best_hypes}: {best_score}")

In [None]:
# Kevin Code

# Scaler/var_smoothing used for the feature search below.
# NOTE(review): hard-coded, presumably copied from the printed result of the
# GaussianNB hyperparameter search cell -- confirm after a re-run
best_scaler = QuantileTransformer(output_distribution='normal', n_quantiles=60)
best_varSmoothing = 1e-12

# Fixed pipeline: chosen scaler followed by GaussianNB with the chosen smoothing
best_pipeline = make_pipeline(
    best_scaler,
    GaussianNB(var_smoothing=best_varSmoothing),
)

# Running best: highest mean f1_macro and the feature subset that produced it
best_score = 0
best_features = []
# Exhaustive search over every non-empty subset of `features`, by subset size
for i in tqdm(range(1, len(features) + 1)):
  for combo in tqdm(combinations(features, i)):
    features_combo = list(combo)
    fold_scores = cross_val_score(
        best_pipeline,
        X=df_combined_22_23[features_combo],
        y=df_combined_22_23["winner"],
        scoring="f1_macro",
        cv=10,
    )
    score = fold_scores.mean()
    # Strict ">" keeps the first subset found when scores tie
    if score > best_score:
      best_score, best_features = score, features_combo

print(f"{best_features}: {best_score}")

In [None]:
# Kevin Code

# GaussianNB model from the "hyperparameters/scaler first, then input features" tuning order.
# NOTE(review): the values below are hard-coded, presumably from the search cells above -- confirm
gaussianModel_HypInputs_Scaler = QuantileTransformer(output_distribution='normal', n_quantiles=60)
gaussianModel_HypInputs_smoothing = 1e-12
gaussianModel_HypInputs_features = [
    'SOS_diff', 'Tm._diff', 'Opp._diff', '3P%_diff',
    'TRB_diff', 'STL_diff', 'BLK_diff',
]
gaussianModel_HypInputs_Pipeline = make_pipeline(
    gaussianModel_HypInputs_Scaler,
    GaussianNB(var_smoothing=gaussianModel_HypInputs_smoothing),
)

# Training inputs restricted to this model's features, and the known winners
gaussianModel_HypInputs_XTrain = df_combined_22_23[gaussianModel_HypInputs_features]
gaussianModel_HypInputs_YTrain = df_combined_22_23["winner"]

# Mean 10-fold cross-validated f1_macro for this model (displayed as the cell output)
gaussianModel_HypInputs_Score = cross_val_score(
    gaussianModel_HypInputs_Pipeline,
    X=gaussianModel_HypInputs_XTrain,
    y=gaussianModel_HypInputs_YTrain,
    scoring="f1_macro",
    cv=10,
).mean()
gaussianModel_HypInputs_Score

In [None]:
# Kevin Code

# model 4 -- GaussianNB (Naive Bayes) (Tuning input features, then Hyperparameters/Scaler)

# Generic starting pipeline (StandardScaler + default GaussianNB) used only to rank feature subsets
start_pipeline = make_pipeline(
    StandardScaler(),
    GaussianNB(),
)

# Running best: highest mean f1_macro and the feature subset that produced it
best_score = 0
best_features = []
# Exhaustive search over every non-empty subset of `features`, by subset size
for i in tqdm(range(1, len(features) + 1)):
  for combo in tqdm(combinations(features, i)):
    features_combo = list(combo)
    fold_scores = cross_val_score(
        start_pipeline,
        X=df_combined_22_23[features_combo],
        y=df_combined_22_23["winner"],
        scoring="f1_macro",
        cv=10,
    )
    score = fold_scores.mean()
    # Strict ">" keeps the first subset found when scores tie
    if score > best_score:
      best_score, best_features = score, features_combo
print(f"{best_features}: {best_score}")


In [None]:
# Kevin Code

# Training data restricted to the best input features found by the search above
# (reads `best_features` as left by the previously executed feature-search cell)
X_train = df_combined_22_23[best_features]
Y_train = df_combined_22_23["winner"]

# Running best: highest mean f1_macro and a description of the configuration that produced it
best_hypes = []
best_score = 0

# 1000 log-spaced var_smoothing candidates between 1e-12 and 1, shared by every scaler
gnb_param_grid = {"gaussiannb__var_smoothing": np.logspace(-12, 0, 1000)}

for scaler in tqdm(scalers):
  pipeline = make_pipeline(scaler, GaussianNB())

  grid_cv = GridSearchCV(pipeline, param_grid=gnb_param_grid, scoring="f1_macro", cv=10)
  grid_cv.fit(X_train, Y_train)

  # Human-readable description of this scaler's best configuration
  potential_model = [f"Scaler: {scaler}"] + [
      f"{key}: {value}" for key, value in grid_cv.best_params_.items()
  ]
  score = grid_cv.best_score_

  # Strict ">" keeps the earliest scaler when two scalers tie
  if score > best_score:
    best_score, best_hypes = score, potential_model

print(f"{best_hypes}: {best_score}")

In [None]:
# Kevin Code

# GaussianNB model from the "input features first, then hyperparameters/scaler" tuning order.
# NOTE(review): the values below are hard-coded, presumably from the search cells above -- confirm
gaussianModel_InputsHyp_Scaler = MaxAbsScaler()
gaussianModel_InputsHyp_smoothing = 1e-12
gaussianModel_InputsHyp_features = [
    'SRS_diff',
    'Tm._diff',
    'PF_diff',
]
gaussianModel_InputsHyp_Pipeline = make_pipeline(
    gaussianModel_InputsHyp_Scaler,
    GaussianNB(var_smoothing=gaussianModel_InputsHyp_smoothing),
)

# Training inputs restricted to this model's features, and the known winners
gaussianModel_InputsHyp_XTrain = df_combined_22_23[gaussianModel_InputsHyp_features]
gaussianModel_InputsHyp_YTrain = df_combined_22_23["winner"]

# Mean 10-fold cross-validated f1_macro for this model (displayed as the cell output)
gaussianModel_InputsHyp_Score = cross_val_score(
    gaussianModel_InputsHyp_Pipeline,
    X=gaussianModel_InputsHyp_XTrain,
    y=gaussianModel_InputsHyp_YTrain,
    scoring="f1_macro",
    cv=10,
).mean()
gaussianModel_InputsHyp_Score

In [None]:
# Kevin Code

# Ensembling the two GaussianNB models using VotingClassifier to see if a better model is produced

# Union of both models' inputs; dict.fromkeys removes duplicates while keeping a
# stable column order (a set's order can change between kernels)
ensem_features = list(dict.fromkeys(gaussianModel_HypInputs_features + gaussianModel_InputsHyp_features))

X_train = df_combined_22_23[ensem_features]
Y_train = df_combined_22_23["winner"]

# VotingClassifier hands the same X to every voter, so each voter first selects
# only the columns its pipeline was defined with; otherwise each GaussianNB
# would also be fitted on the other model's features.
gaussianModel_Ensemble = VotingClassifier([
    ("gaussianModel_HypInputs",
     make_pipeline(make_column_transformer(("passthrough", gaussianModel_HypInputs_features)),
                   gaussianModel_HypInputs_Pipeline)),
    ("gaussianModel_InputsHyp",
     make_pipeline(make_column_transformer(("passthrough", gaussianModel_InputsHyp_features)),
                   gaussianModel_InputsHyp_Pipeline)),
])
# NOTE(review): voting defaults to 'hard'; with only two voters every disagreement
# is a tie -- consider voting='soft' as used by the forest/logistic ensembles

# Mean 10-fold cross-validated f1_macro of the ensemble (displayed as the cell output)
gaussianModel_Ensemble_Score = cross_val_score(gaussianModel_Ensemble, X_train, Y_train, scoring= "f1_macro", cv= 10).mean()
gaussianModel_Ensemble_Score

In [None]:
# Kevin Code

# Side-by-side printout of the cross-validated f1_macro of every GaussianNB variant
for label, value in (
    ("gaussianModel_HypInputs_Score", gaussianModel_HypInputs_Score),
    ("gaussianModel_InputsHyp_Score", gaussianModel_InputsHyp_Score),
    ("gaussianModel_Ensemble_score", gaussianModel_Ensemble_Score),
):
  print(f"{label} : {value}")

In [None]:
# Kevin Code

# Aliases for the KNeighborsClassifier and GaussianNB variants carried forward
# as "best" (the choice is hard-coded here, based on the evaluations above)

# Known outcomes shared by both models: 22-23 season for training, 23-24 for testing
all_YTrain = df_combined_22_23["winner"]
all_YTest = df_combined_23_24["winner"]

# Best KNeighborsClassifier: the "inputs first, then hyperparameters" variant
best_kneighborsModel_Pipeline = kneighborsModel_InputsHyp_Pipeline
best_kneighborsModel_XTrain = kneighborsModel_InputsHyp_XTrain
best_kneighborsModel_XTest = df_combined_23_24[kneighborsModel_InputsHyp_features]

# Best GaussianNB: the "hyperparameters first, then inputs" variant
best_gaussianModel_Pipeline = gaussianModel_HypInputs_Pipeline
best_gaussianModel_XTrain = gaussianModel_HypInputs_XTrain
best_gaussianModel_XTest = df_combined_23_24[gaussianModel_HypInputs_features]

In [None]:
# Kevin Code

# Graphing the importance of each input for the best KNeighborsClassifier model

# Using permutation_importance learned about online

from sklearn.inspection import permutation_importance
import matplotlib.pyplot as plt

# Fitting the training data to the model
best_kneighborsModel_Pipeline.fit(best_kneighborsModel_XTrain, all_YTrain)

# Permutation importance: each input column is shuffled 20 times and the model is re-scored.
# NOTE(review): importance is measured on the same data the model was fitted on
perm_importance = permutation_importance(best_kneighborsModel_Pipeline, best_kneighborsModel_XTrain, all_YTrain, scoring= "f1_macro", n_repeats= 20, random_state= 0)

# Mean drop in f1_macro (the scoring used above) when a feature is shuffled.
# So the higher the value, the more important a feature is to the model.
best_kneighborsModel_InputsImportance = perm_importance.importances_mean

# Plotting the input importance

# Sorting in ASCENDING order: barh draws the first entry at the bottom, so the
# most important input ends up as the top bar without inverting the axis
best_InputNames = best_kneighborsModel_XTrain.columns
sorted_iputs = np.argsort(best_kneighborsModel_InputsImportance)
sorted_importances = best_kneighborsModel_InputsImportance[sorted_iputs]
sorted_InputNames = best_InputNames[sorted_iputs]

# Plotting horizontal bar graph of inputs importance
plt.figure(figsize=(10, 5))
plt.barh(sorted_InputNames, sorted_importances)
plt.xlabel("Input Importance Score")
plt.ylabel("Input")
plt.title("Permutation Input Importance for best KNeighborsClassifier Model")
plt.show()

In [None]:
# Kevin Code

# Graphing the importance of each input for the best GaussianNB model
# Just as done above with KNeighborsClassifier

best_gaussianModel_Pipeline.fit(best_gaussianModel_XTrain, all_YTrain)

# Permutation importance: each input column is shuffled 20 times and the model is re-scored.
# NOTE(review): importance is measured on the same data the model was fitted on
perm_importance = permutation_importance(best_gaussianModel_Pipeline, best_gaussianModel_XTrain, all_YTrain, scoring= "f1_macro", n_repeats= 20, random_state= 0)

# Mean drop in f1_macro when a feature is shuffled (higher = more important)
best_gaussianModel_InputsImportance = perm_importance.importances_mean

# Ascending sort: barh draws the first entry at the bottom, so the most
# important input ends up as the top bar without inverting the axis
best_InputNames = best_gaussianModel_XTrain.columns
sorted_iputs = np.argsort(best_gaussianModel_InputsImportance)
sorted_importances = best_gaussianModel_InputsImportance[sorted_iputs]
sorted_InputNames = best_InputNames[sorted_iputs]

plt.figure(figsize=(10, 5))
plt.barh(sorted_InputNames, sorted_importances)
plt.xlabel("Input Importance Score")
plt.ylabel("Input")
plt.title("Permutation Input Importance for best GaussianNB Model")
plt.show()

In [None]:
# Kevin Code

# Getting the f1_macro score for the best KNeighborsClassifier model and GaussianNB model on the test data to see which is better

from sklearn.metrics import f1_score

# KNeighborsClassifier: fit on the 22-23 training data, predict the 23-24 test games
# (fit() returns the fitted pipeline, so predict can be chained onto it)
best_kneighborsModel_YTest_predictions = best_kneighborsModel_Pipeline.fit(
    best_kneighborsModel_XTrain, all_YTrain
).predict(best_kneighborsModel_XTest)
best_kneighborsModel_YTest_f1macro = f1_score(
    y_true=all_YTest,
    y_pred=best_kneighborsModel_YTest_predictions,
    average="macro",
)

# GaussianNB: same procedure
best_gaussianModel_YTest_predictions = best_gaussianModel_Pipeline.fit(
    best_gaussianModel_XTrain, all_YTrain
).predict(best_gaussianModel_XTest)
best_gaussianModel_YTest_f1macro = f1_score(
    y_true=all_YTest,
    y_pred=best_gaussianModel_YTest_predictions,
    average="macro",
)

# Macro-averaged F1 of each model on the held-out season
print(f"Best KNeighborsClassifier f1_score: {best_kneighborsModel_YTest_f1macro}")
print(f"Best GaussianNB f1_score: {best_gaussianModel_YTest_f1macro}")


In [None]:
# Kevin Code

# Checking the estimated test error if using the 23-24 data as training data
# (10-fold cross-validation run entirely on the 23-24 frame).
# NOTE(review): the two *_YTest_f1macro names assigned here are the same names the
# previous cell uses for the held-out test scores, so those values are overwritten

kneighbors_fold_scores = cross_val_score(
    best_kneighborsModel_Pipeline,
    X=best_kneighborsModel_XTest,
    y=all_YTest,
    scoring="f1_macro",
    cv=10,
)
best_kneighborsModel_YTest_f1macro = kneighbors_fold_scores.mean()

gaussian_fold_scores = cross_val_score(
    best_gaussianModel_Pipeline,
    X=best_gaussianModel_XTest,
    y=all_YTest,
    scoring="f1_macro",
    cv=10,
)
best_gaussianModel_YTest_f1macro = gaussian_fold_scores.mean()

print(f"Best KNeighborsClassifier f1_score: {best_kneighborsModel_YTest_f1macro}")
print(f"Best GaussianNB f1_score: {best_gaussianModel_YTest_f1macro}")

In [None]:
# Kevin Code

# Graphing the precision-recall curve for the best KNeighborsClassifier model on the test data

from sklearn.metrics import precision_recall_curve

# Getting the class probabilities the model predicts for the 23-24 TEST games.
# Relies on the pipeline having been fitted on the training data in an earlier cell.
best_kneighborsModel_Ytest_probs_ = best_kneighborsModel_Pipeline.predict_proba(best_kneighborsModel_XTest)

# Storing the precision and recall values for different thresholds using the precision_recall_curve function.
# Column 1 of predict_proba is used as the score for pos_label=1
# (assumes the classes are ordered so that label 1 is the second column -- TODO confirm)
kneighborsModel_precision, kneighborsModel_recall, kneighborsModel_thresholds = precision_recall_curve(
    all_YTest, best_kneighborsModel_Ytest_probs_[:, 1], pos_label=1)

# Plotting the precision-recall curve with the conventional orientation:
# recall on the x-axis, precision on the y-axis
pd.DataFrame({
    "precision": kneighborsModel_precision,
    "recall": kneighborsModel_recall
}).plot.line(x="recall", y="precision", title="Precision-recall curve: best KNeighborsClassifier (23-24 test data)")

In [None]:
# Kevin Code

# Graphing the precision-recall curve for the best GaussianNB model as done above for the KNeighborsClassifier model

# Class probabilities predicted for the 23-24 test games.
# Relies on the pipeline having been fitted on the training data in an earlier cell.
best_gaussianModel_Ytest_probs_ = best_gaussianModel_Pipeline.predict_proba(best_gaussianModel_XTest)

# Column 1 of predict_proba is used as the score for pos_label=1
# (assumes the classes are ordered so that label 1 is the second column -- TODO confirm)
gaussianModel_precision, gaussianModel_recall, gaussianModel_thresholds = precision_recall_curve(
    all_YTest, best_gaussianModel_Ytest_probs_[:, 1], pos_label=1)

# Conventional orientation: recall on the x-axis, precision on the y-axis
pd.DataFrame({
    "precision": gaussianModel_precision,
    "recall": gaussianModel_recall
}).plot.line(x="recall", y="precision", title="Precision-recall curve: best GaussianNB (23-24 test data)")

In [None]:
# Mack Code (Modified by Kevin)
# Checking the estimated test error if using the 23-24 data as training data for Mack's models

# Logistic model 1: six-feature variant
best_features_logistic1 = [
    "SOS_diff", "Tm._diff", "Opp._diff",
    "FG%_diff", "3P%_diff", "TRB_diff",
]
best_pipeline_logistic1 = make_pipeline(
    RobustScaler(),
    LogisticRegression(penalty="l2", C=0.01, solver="lbfgs", random_state=42),
)

# Logistic model 2: three-feature variant (same scaler and regularisation, no random_state)
best_features_logistic2 = ['SRS_diff', 'Opp._diff', 'PF_diff']
best_pipeline_logistic2 = make_pipeline(
    RobustScaler(),
    LogisticRegression(penalty="l2", C=0.01, solver="lbfgs"),
)

# Mean 10-fold cross-validated f1_macro of each model on the 23-24 frame
logistic1_f1macro = cross_val_score(
    best_pipeline_logistic1,
    X=df_combined_23_24[best_features_logistic1],
    y=all_YTest,
    scoring="f1_macro",
    cv=10,
).mean()
logistic2_f1macro = cross_val_score(
    best_pipeline_logistic2,
    X=df_combined_23_24[best_features_logistic2],
    y=all_YTest,
    scoring="f1_macro",
    cv=10,
).mean()

print(f"Logistic 1 f1_score: {logistic1_f1macro}")
print(f"Logistic 2 f1_score: {logistic2_f1macro}")

In [None]:
# Kevin code
# Testing an ensemble of Mack's logistic regression models

# VotingClassifier hands the same X to every voter, so each voter first selects
# only the columns its own pipeline was defined with; otherwise both models
# would be fitted on the union of the two feature lists.
pipeline1 = make_pipeline(make_column_transformer(("passthrough", best_features_logistic1)), best_pipeline_logistic1)
pipeline2 = make_pipeline(make_column_transformer(("passthrough", best_features_logistic2)), best_pipeline_logistic2)

ensemble_model = VotingClassifier(
    estimators=[('best_pipeline_logistic1', pipeline1), ('best_pipeline_logistic2', pipeline2)],
    voting='soft'
)

# Union of both feature lists; dict.fromkeys removes duplicates while keeping a
# stable column order (a set's order can change between kernels)
logistic_ensemble_features = list(dict.fromkeys(best_features_logistic1 + best_features_logistic2))

# Mean 10-fold cross-validated f1_macro of the ensemble on the 23-24 frame
logistic_ensemble = cross_val_score(ensemble_model, df_combined_23_24[logistic_ensemble_features],
                all_YTest, scoring="f1_macro", cv=10).mean()
# Label fixed: this is the logistic ensemble, not the forest one
print(f"Logistic Ensemble f1_score: {logistic_ensemble}")

In [None]:
# Mack Code (Modified by Kevin)
# Checking the estimated test error if using the 23-24 data as training data for Mack's models

# Forest model 1: four-feature variant. Seeded like forest 2 so its
# cross-validated score is reproducible between runs.
best_features_forests1 = ['SRS_diff', 'SOS_diff', 'Tm._diff', 'Opp._diff']
best_pipeline_forests1 = make_pipeline(MaxAbsScaler(), RandomForestClassifier(max_depth=5, min_samples_split=2,
                                                                              n_estimators=100, random_state=42))

# Forest model 2: seven-feature variant
best_features_forests2 = ['Seed_diff', 'SOS_diff', 'Opp._diff', 'TRB_diff', 'STL_diff', 'BLK_diff', 'PF_diff']
best_pipeline_forests2 = make_pipeline(Normalizer(), RandomForestClassifier(max_depth=5, min_samples_split=5,
                                                                            n_estimators=100, random_state=42))

# Mean 10-fold cross-validated f1_macro of each model on the 23-24 frame
forest1_f1macro = cross_val_score(best_pipeline_forests1, df_combined_23_24[best_features_forests1], all_YTest, scoring="f1_macro", cv=10).mean()
forest2_f1macro = cross_val_score(best_pipeline_forests2, df_combined_23_24[best_features_forests2], all_YTest, scoring="f1_macro", cv=10).mean()

print(f"Forest 1 f1_score: {forest1_f1macro}")
print(f"Forest 2 f1_score: {forest2_f1macro}")

In [None]:
# Kevin code
# Testing an ensemble of Mack's random forest models

# VotingClassifier hands the same X to every voter, so each voter first selects
# only the columns its own pipeline was defined with; otherwise both forests
# would be fitted on the union of the two feature lists.
pipeline1 = make_pipeline(make_column_transformer(("passthrough", best_features_forests1)), best_pipeline_forests1)
pipeline2 = make_pipeline(make_column_transformer(("passthrough", best_features_forests2)), best_pipeline_forests2)

ensemble_model = VotingClassifier(
    estimators=[('best_pipeline_forests1', pipeline1), ('best_pipeline_forests2', pipeline2)],
    voting='soft'
)

# Union of both feature lists; dict.fromkeys removes duplicates while keeping a
# stable column order (a set's order can change between kernels)
forest_ensemble_features = list(dict.fromkeys(best_features_forests1 + best_features_forests2))

# Mean 10-fold cross-validated f1_macro of the ensemble on the 23-24 frame
forest_ensemble = cross_val_score(ensemble_model, df_combined_23_24[forest_ensemble_features],
                all_YTest, scoring="f1_macro", cv=10).mean()
print(f"Forest Ensemble f1_score: {forest_ensemble}")