In [None]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
# Make the Vivid qualitative palette the colorway of the "plotly" template.
pio.templates["plotly"].layout.colorway = px.colors.qualitative.Vivid
# Default figure width for Plotly Express charts.
px.defaults.width = 800

from scipy.optimize import minimize
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
import sklearn.linear_model as lm

In [None]:
# Build one row per game by pairing the first and last records of each GAME_ID.
# Columns from the last record get the "_OPP" suffix.
# NOTE(review): assumes every GAME_ID has exactly two rows (one per team) -- confirm.
basketball = pd.read_csv("data/nba.csv")
first_team = basketball.groupby("GAME_ID").first()
second_team = basketball.groupby("GAME_ID").last()
games = first_team.merge(second_team, left_index=True, right_index=True, suffixes=("", "_OPP"))

# Feature: difference in field-goal percentage between the two records.
games['GOAL_DIFF'] = games["FG_PCT"] - games["FG_PCT_OPP"]
# Label: 1 if the first record's team won, 0 otherwise.
games['WON'] = (games['WL'] == "W").astype(int)

# .copy() so later cells can add columns without writing into a slice of the merged frame.
games = games[['TEAM_NAME', 'TEAM_NAME_OPP', 'MATCHUP', 'WON', 'WL', 'AST', 'GOAL_DIFF']].copy()
games

In [None]:
# Add a small vertical jitter to the 0/1 outcome so overlapping points are visible.
# The jitter lives in its own column: WON must remain a clean 0/1 label because
# later cells fit classifiers on it.
np.random.seed(42)
games["JitterWON"] = games["WON"] + np.random.uniform(-0.1, 0.1, len(games))
px.scatter(games, x="GOAL_DIFF", y="JitterWON", color="WL")

In [None]:
# One-feature logistic regression: WON as a function of GOAL_DIFF.
X = games[["GOAL_DIFF"]]
Y = games["WON"]

# fit() returns the estimator itself, so construction and fitting can be chained.
model = lm.LogisticRegression().fit(X, Y)

print("Slope:", model.coef_[0, 0])
print("Intercept:", model.intercept_[0])

In [None]:
# Predicted class probabilities for the first ten rows of X (one column per class).
model.predict_proba(X)[:10]

In [None]:
# Class labels known to the fitted model.
model.classes_

In [None]:
# Second predict_proba column for every row of X.
p = model.predict_proba(X)[:, 1]

# Manual classification: 1 wherever that probability is at least 0.5, else 0.
(p >= 0.5).astype(int)

In [None]:
# Class predictions straight from the estimator; reused below to colour the scatter plot.
classes = model.predict(X)

classes

In [None]:
# GOAL_DIFF value where intercept + slope * x = 0, i.e. where the predicted probability is 0.5.
-model.intercept_[0]/model.coef_[0][0]

In [None]:
# Store the model's class predictions as a categorical column so Plotly colours them discretely.
games["Predicted Class"] = pd.Categorical(classes)

# Evaluate the fitted probability curve on an evenly spaced grid of GOAL_DIFF values.
test_points = pd.DataFrame({"GOAL_DIFF": np.linspace(-0.3, 0.3, 100)})
test_points["Predicted Prob"] = model.predict_proba(test_points)[:, 1]

fig = px.scatter(games, x="GOAL_DIFF", y="JitterWON", color="Predicted Class")
fig.add_trace(
    go.Scatter(
        x=test_points["GOAL_DIFF"],
        y=test_points["Predicted Prob"],
        mode="lines",
        name="Logistic Regression Model",
        line_color="black",
        line_width=5,
        line_dash="dash",
    )
)
# Vertical line where intercept + slope * x = 0 (predicted probability of 0.5).
fig.add_vline(
    x=-model.intercept_[0]/model.coef_[0][0],
    line_dash="dash",
    line_color="black",
    annotation_text="Decision Boundary",
    annotation_position="right",
)

In [None]:
# GOAL_DIFF value at which the one-feature model's predicted probability is 0.5.
decision_boudary = -model.intercept_[0]/model.coef_[0][0]

# All points are drawn at y = 0: symbol encodes the true outcome (WL),
# colour encodes the predicted class.
fig = px.scatter(
    games,
    x="GOAL_DIFF",
    y=np.zeros(len(games)),
    symbol="WL",
    symbol_sequence=["circle-open", "cross"],
    color="Predicted Class",
    height=300,
    opacity=0.7,
)
fig.update_traces(marker_size=8)
# The y axis carries no information here, so hide its ticks, grid, zero line and title.
fig.update_layout(
    yaxis={"showticklabels": False, "showgrid": False, "zeroline": False, "title": ""},
)
fig.add_vline(
    x=decision_boudary,
    line_dash="dash",
    line_color="black",
    annotation_text="Decision Boundary",
    annotation_position="top right",
)

In [None]:
# Logistic regression on two features: GOAL_DIFF and AST.
X_two_feature = games[["GOAL_DIFF", "AST"]]
Y = games["WON"]

two_feature_model = lm.LogisticRegression().fit(X_two_feature, Y)

# theta0 is taken as the whole intercept_ attribute (not indexed);
# theta1 and theta2 are the two entries of the first coefficient row.
theta0 = two_feature_model.intercept_
theta1, theta2 = two_feature_model.coef_[0]
print(theta0, theta1, theta2)

In [None]:
# Replace the predicted-class column with the two-feature model's predictions.
games["Predicted Class"] = two_feature_model.predict(X_two_feature)
games.head()

In [None]:
# Points on the two-feature decision boundary: for each GOAL_DIFF on the grid, the AST
# value solving theta0 + theta1 * GOAL_DIFF + theta2 * AST = 0.
# Note: this rebinds `decision_boudary` (previously a scalar) to a DataFrame.
decision_boudary = pd.DataFrame({"GOAL_DIFF": np.linspace(-0.3, 0.3, 100)})
decision_boudary["AST"] = (theta0 + theta1*decision_boudary["GOAL_DIFF"])/(-theta2)

In [None]:
# Categorical dtype makes Plotly treat the predicted class as a discrete colour.
games['Predicted Class'] = pd.Categorical(games['Predicted Class'])

# Symbol encodes the true outcome (WL); colour encodes the predicted class.
fig = px.scatter(games, x="GOAL_DIFF", y="AST", symbol="WL",
                 hover_data=['TEAM_NAME', 'TEAM_NAME_OPP'],
                 color="Predicted Class",
                 symbol_sequence=["circle-open", "cross"],
                 opacity=0.7,
                 height=600)
fig.update_traces(marker=dict(size=8))
fig.update_layout(xaxis_range=[-0.3, 0.3], yaxis_range=[5, 50])
# Overlay the boundary line; the legend label's spelling is corrected here.
fig.add_scatter(x=decision_boudary["GOAL_DIFF"], y=decision_boudary["AST"],
                mode="lines", line_color="black", line_dash="dash",
                name="Decision Boundary")

In [None]:
# 50 x 50 grid covering the plotted GOAL_DIFF / AST ranges, flattened to one row per grid point.
goal_diff, ast = np.meshgrid(np.linspace(-0.3, 0.3, 50), np.linspace(5, 50, 50))
pred_grid = pd.DataFrame({"GOAL_DIFF": goal_diff.ravel(), "AST": ast.ravel()})
# Predict before adding the Probability column so the model only sees its two features.
pred_grid['Probability'] = two_feature_model.predict_proba(pred_grid)[:, 1]

# Draw the probability surface as a translucent contour on the existing `fig`.
fig.add_contour(
    x=pred_grid['GOAL_DIFF'],
    y=pred_grid['AST'],
    z=pred_grid['Probability'],
    showscale=False,
    opacity=0.4,
    colorscale="Matter",
)

In [None]:
# seaborn is used here to load its iris example dataset.
import seaborn as sns
iris = sns.load_dataset("iris")

In [None]:
# Petal measurements for every species except virginica.
fig = px.scatter(
    iris[iris["species"] != "virginica"],
    x="petal_length",
    y="petal_width",
    color="species",
    symbol="species",
    symbol_sequence=["circle", "cross"],
    render_mode="svg",
)
fig.update_traces(marker_size=12)
fig

In [None]:
# Petal measurements for every species except setosa.
fig = px.scatter(
    iris[iris["species"] != "setosa"],
    x="petal_length",
    y="petal_width",
    color="species",
    symbol="species",
    symbol_sequence=["circle", "cross"],
    render_mode="svg",
)
fig.update_traces(marker_size=12)
fig

In [None]:
# Two-point toy dataset: x = -1 has y = 0 and x = 1 has y = 1.
# `label` is a categorical column used for colour and symbol.
toy_df = pd.DataFrame({
    "x": [-1, 1],
    "y": [0, 1],
    "label": pd.Categorical([0, 1]),
})
fig = px.scatter(
    toy_df,
    x="x",
    y="y",
    color="label",
    symbol="label",
    symbol_sequence=["circle", "cross"],
    render_mode="svg",
)
fig.update_traces(marker_size=12)

In [None]:
def toy_model(theta1, x):
  """Logistic (sigmoid) function of ``theta1 * x``; no intercept term.

  Works on scalars and on array-like ``x`` that NumPy can broadcast.
  """
  exponent = -theta1 * x
  return 1 / (1 + np.exp(exponent))

def mean_cross_entropy_loss_toy(theta1):
  """Cross-entropy loss of ``toy_model`` with parameter ``theta1`` over ``toy_df``.

  Despite the name, the per-row losses are added with ``np.sum`` and are not
  divided by the number of rows.
  """
  labels = toy_df['y']
  inputs = toy_df['x']
  positive_term = labels * np.log(toy_model(theta1, inputs))
  # 1 - sigmoid(z) equals sigmoid(-z), so the class-0 probability is toy_model(theta1, -x).
  negative_term = (1 - labels) * np.log(toy_model(theta1, -inputs))
  return -np.sum(positive_term + negative_term)

In [None]:
# Evaluate the toy loss on 100 evenly spaced parameter values in [-30, 30].
thetas = np.linspace(-30, 30, 100)
fig = px.line(
    x=thetas,
    y=[mean_cross_entropy_loss_toy(t) for t in thetas],
    render_mode="svg",
)
fig.update_layout(
    title="Mean Cross Entropy Loss for Toy Example",
    xaxis_title="Theta",
    yaxis_title="Mean CE Loss",
)

In [None]:
# Same loss curve with a logarithmic y axis.
fig = px.line(
    x=thetas,
    y=[mean_cross_entropy_loss_toy(t) for t in thetas],
    log_y=True,
    render_mode="svg",
)
fig.update_layout(
    title="Log Scale Mean Cross Entropy Loss for Toy Example",
    xaxis_title="Theta",
    yaxis_title="Log Scale Mean CE Loss",
)

In [None]:
def regularized_loss_toy(theta1, reg):
  """Toy cross-entropy loss plus a squared-parameter penalty ``reg * theta1**2``."""
  penalty = reg * theta1**2
  return mean_cross_entropy_loss_toy(theta1) + penalty

In [None]:
# Penalty weight used for the regularized toy loss curve.
reg = 0.01
fig = px.line(
    x=thetas,
    y=[regularized_loss_toy(t, reg) for t in thetas],
    render_mode="svg",
)
fig.update_layout(
    title=f"Mean Cross Entropy Loss for Toy Example (Regularization = {reg})",
    xaxis_title="Theta",
    yaxis_title="Mean CE Loss",
)

In [None]:
# Fit sklearn's logistic regression on the two toy points with C=10.
# NOTE: this rebinds `toy_model`, shadowing the sigmoid helper function of the same
# name; re-run that helper's cell before re-running the toy loss plots.
toy_model = lm.LogisticRegression(C=10)
toy_model.fit([[-1], [1]], [0,1])

# Predicted probability of class 1 on a dense grid of x values.
xtest = np.linspace(-1.5, 1.5, 1000)[:, np.newaxis]
p = toy_model.predict_proba(xtest)[:,1]

# The title reports the coefficients of the toy estimator fitted above
# (not those of the basketball `model`).
fig = px.scatter(toy_df, x="x", y="y",
                 color="label", symbol="label",
                 symbol_sequence=["circle", "cross"],
                 title = f"LR fit (slope = {toy_model.coef_[0][0]}, intercept = {toy_model.intercept_[0]})",
                 render_mode="svg")
fig.add_scatter(x=np.ravel(xtest), y=p, mode="lines", name="Logistic Regression Model",
                line_color="black", line_width=5, line_dash="dash")

In [None]:
# Same toy fit with C=1000.
# NOTE: this rebinds `toy_model`, shadowing the sigmoid helper function of the same
# name; re-run that helper's cell before re-running the toy loss plots.
toy_model = lm.LogisticRegression(C=1000)
toy_model.fit([[-1], [1]], [0,1])

# Predicted probability of class 1 on a dense grid of x values.
xtest = np.linspace(-1.5, 1.5, 1000)[:, np.newaxis]
p = toy_model.predict_proba(xtest)[:, 1]

# The title reports the coefficients of the toy estimator fitted above
# (not those of the basketball `model`).
fig = px.scatter(toy_df, x="x", y="y",
                 color="label", symbol="label",
                 symbol_sequence=["circle", "cross"],
                 title=f"LR Fit (slope = {toy_model.coef_[0][0]}, intercept = {toy_model.intercept_[0]})",
                 render_mode="svg")
fig.add_scatter(x=np.ravel(xtest), y=p, mode="lines", name="Logistic Regression Model",
                line_color="black", line_width=5, line_dash="dash")

In [None]:
def accuracy(X, Y):
  """Fraction of rows of ``X`` where the global ``model``'s prediction equals ``Y``."""
  predictions = model.predict(X)
  return np.mean(predictions == Y)

accuracy(X, Y)

In [None]:
# The estimator's built-in score on the same data, for comparison with accuracy() above.
model.score(X, Y)

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of true labels (first argument) against model predictions (second argument).
cm = confusion_matrix(Y, model.predict(X))
cm

In [None]:
# Heatmap of the confusion matrix with the cell counts printed on it (text_auto).
fig = px.imshow(cm, x=["0", "1"], y=["0", "1"],
                labels=dict(x="Predicted", y="Actual"),
                text_auto=True,
                color_continuous_scale="Blues",
                width=400, height=400)
# Put the x-axis ("Predicted") labels on top of the matrix.
fig.update_xaxes(side="top")

In [None]:
# Count the four confusion-matrix cells directly from predictions and labels.
Y_hat = model.predict(X)

# Correct predictions.
tp = ((Y_hat == 1) & (Y == 1)).sum()
tn = ((Y_hat == 0) & (Y == 0)).sum()

# Incorrect predictions.
fp = ((Y_hat == 1) & (Y == 0)).sum()
fn = ((Y_hat == 0) & (Y == 1)).sum()

print("True Positives: ", tp)
print("True Negatives: ", tn)
print("False Positives: ", fp)
print("False Negatives: ", fn)

In [None]:
# Precision: true positives out of everything predicted positive.
precision = tp / (tp + fp)
precision

In [None]:
# Recall: true positives out of everything actually positive.
recall = tp / (tp + fn)
recall

In [None]:
# False positive rate: false positives out of everything actually negative.
fpr = fp/(fp + tn)
fpr

In [None]:
# True positive rate: same formula as recall above.
tpr = tp/(tp + fn)
tpr

In [None]:
# Re-create the one-feature model (WON from GOAL_DIFF) for the threshold experiments below.
X = games[["GOAL_DIFF"]]
Y = games["WON"]

# fit() returns the estimator itself, so construction and fitting can be chained.
model = lm.LogisticRegression().fit(X, Y)

print("Slope:", model.coef_[0, 0])
print("Intercept:", model.intercept_[0])

In [None]:
def plot_predictions(threshold = 0.5):
  """Plot the games coloured by the class predicted at a given probability threshold.

  Reads the global ``model``, ``X`` and ``games``; overwrites
  ``games["Predicted Class"]`` with the thresholded predictions.

  Parameters:
    threshold: probability cut-off; a game is predicted positive when the
      second ``predict_proba`` column is >= threshold. Must lie strictly
      between 0 and 1 for the boundary formula below to be defined.

  Returns:
    The Plotly figure with the scatter, the fitted probability curve and a
    vertical line at the decision boundary.
  """
  games["Predicted Class"] = model.predict_proba(X)[:, 1] >= threshold
  games["Predicted Class"] = pd.Categorical(games["Predicted Class"])
  fig = px.scatter(games,
                   x="GOAL_DIFF", y="JitterWON", color="Predicted Class",
                   title=f"Logistic Regression Predictions (Threshold = {threshold})")
  test_points = pd.DataFrame({"GOAL_DIFF": np.linspace(-0.3, 0.3, 100)})
  test_points["Predicted Prob"] = model.predict_proba(test_points)[:, 1]
  fig.add_trace(go.Scatter(x=test_points["GOAL_DIFF"], y=test_points["Predicted Prob"],
                           mode="lines", name="Logistic Regression Model",
                           line_color="black", line_width=5, line_dash="dash"))
  # Solve sigmoid(intercept + slope * x) = threshold for x:
  #   intercept + slope * x = -log(1/threshold - 1)
  decision_boundary = (-np.log(1/threshold - 1) - model.intercept_[0])/model.coef_[0][0]
  # Use the boundary computed above (a local), not a same-looking global.
  fig.add_vline(x=decision_boundary, line_dash="dash", line_color="black",
                annotation_text="Decision Boundary", annotation_position="right")
  return fig

# Predictions at a threshold of 0.5.
plot_predictions(0.5)

In [None]:
# Predictions at a lower threshold of 0.25.
plot_predictions(0.25)

In [None]:
# Predictions at a higher threshold of 0.75.
plot_predictions(0.75)

In [None]:
def predict_threshold(model, X, T):
  """Classify rows of ``X`` as 1 when the model's second probability column is >= ``T``.

  Parameters:
    model: object exposing ``predict_proba(X)`` returning a 2-D array.
    X: feature data passed through to ``predict_proba``.
    T: probability cut-off.

  Returns:
    Integer array of 0/1 predictions, one per row.
  """
  positive_probs = model.predict_proba(X)[:, 1]
  meets_cutoff = positive_probs >= T
  return meets_cutoff.astype(int)

def accuracy_threshold(X, Y, T):
  """Accuracy of the global ``model`` on (X, Y) when classifying at threshold ``T``."""
  predicted = predict_threshold(model, X, T)
  return np.mean(predicted == Y)

def precision_threshold(X, Y, T):
  """Precision of the global ``model`` on (X, Y) when classifying at threshold ``T``.

  Parameters:
    X: feature data passed to the model.
    Y: true 0/1 labels aligned with the rows of ``X``.
    T: probability cut-off.

  Returns:
    True positives divided by predicted positives; NaN when nothing is
    predicted positive (so the ratio is undefined).
  """
  Y_hat = predict_threshold(model, X, T)
  denominator = np.sum(Y_hat == 1)
  if denominator == 0:
    # No positive predictions: make the ratio NaN instead of dividing by zero.
    denominator = np.nan
  return np.sum((Y_hat == 1) & (Y == 1)) / denominator

def recall_threshold(X, Y, T):
  """Recall of the global ``model`` at threshold ``T``: true positives / actual positives."""
  predicted = predict_threshold(model, X, T)
  true_positives = np.sum((predicted == 1) & (Y == 1))
  actual_positives = np.sum(Y == 1)
  return true_positives / actual_positives

def tpr_threshold(X, Y, T):
  """True positive rate of the global ``model`` at threshold ``T`` (same formula as recall)."""
  predicted = predict_threshold(model, X, T)
  true_positives = np.sum((predicted == 1) & (Y == 1))
  actual_positives = np.sum(Y == 1)
  return true_positives / actual_positives

def fpr_threshold(X, Y, T):
  """False positive rate of the global ``model`` at threshold ``T``: false positives / actual negatives."""
  predicted = predict_threshold(model, X, T)
  false_positives = np.sum((predicted == 1) & (Y == 0))
  actual_negatives = np.sum(Y == 0)
  return false_positives / actual_negatives

In [None]:
# Evaluate accuracy, precision and recall on 1000 evenly spaced thresholds in [0, 1].
metrics = pd.DataFrame()
metrics["Threshold"] = np.linspace(0,1, 1000)
# Every column iterates over the same metrics["Threshold"] grid.
metrics["Accuracy"] = [accuracy_threshold(X, Y, t) for t in metrics["Threshold"]]
metrics["Precision"] = [precision_threshold(X, Y, t) for t in metrics["Threshold"]]
metrics["Recall"] = [recall_threshold(X, Y, t) for t in metrics["Threshold"]]
metrics.head()

In [None]:
# Accuracy as a function of the classification threshold.
px.line(metrics,
        x="Threshold", y="Accuracy",
        title="Accuracy vs. Threshold",
        render_mode="svg")

In [None]:
# The five thresholds with the highest accuracy.
metrics.sort_values("Accuracy", ascending=False).head()

In [None]:
# Accuracy, precision and recall on one chart, each as a function of the threshold.
px.line(metrics,
        x="Threshold", y=["Accuracy", "Precision", "Recall"],
        title="Performance Metrics vs. Threshold",
        render_mode="svg")

In [None]:
# Precision-recall curve: each point corresponds to one row (threshold) of `metrics`.
px.line(metrics, x="Recall", y="Precision",
        title="Precision vs. Recall",
        width=600, height=600,
        render_mode="svg")

In [None]:
# F1 = 2 * precision * recall / (precision + recall), computed per threshold.
metrics["F1"] = 2 * metrics["Precision"] * metrics["Recall"] / (metrics["Precision"] + metrics["Recall"])

# Row label of the largest F1 value; reused by the precision-recall plot below.
ind = metrics['F1'].idxmax()

fig = px.line(
    metrics,
    x="Threshold",
    y="F1",
    title="Finding F1 Score Maximum",
    render_mode="svg",
)
# Highlight the maximum with a red marker labelled by its threshold.
fig.add_scatter(
    x=[metrics.loc[ind, 'Threshold']],
    y=[metrics.loc[ind, 'F1']],
    mode='markers',
    marker={"size": 10, "color": 'red'},
    name=f"F1 Max {metrics.loc[ind, 'Threshold']:.5f}",
)

In [None]:
# Precision-recall curve with the row `ind` (the F1 maximum found earlier) highlighted.
fig = px.line(metrics, x="Recall", y="Precision",
              title="Precision vs. Recall", width = 600, height=600,
              render_mode="svg")
fig.add_scatter(x=[metrics.loc[ind, 'Recall']], y=[metrics.loc[ind, 'Precision']],
                mode='markers', marker=dict(size=10, color='red'),
                name=f"F1 Max {metrics.loc[ind, 'Threshold']:.5f}")
# Move the legend inside the plotting area.
fig.update_layout(legend=dict(x=.5, y=.1))

In [None]:
# True and false positive rates per threshold; FPR must come from fpr_threshold,
# otherwise the ROC curve degenerates into the diagonal TPR == FPR.
metrics["TPR"] = [tpr_threshold(X, Y, t) for t in metrics["Threshold"]]
metrics["FPR"] = [fpr_threshold(X, Y, t) for t in metrics["Threshold"]]

In [None]:
# TPR, FPR and accuracy as functions of the threshold.
px.line(metrics, x="Threshold", y=["TPR", "FPR", "Accuracy"],
        render_mode="svg")

In [None]:
# ROC curve: TPR against FPR, one point per threshold row of `metrics`.
px.line(metrics, x="FPR", y="TPR", title="ROC Curve",
        width=600, height=600,
        render_mode="svg")

In [None]:
# ROC curve again, with a reference path through (0,0) -> (0,1) -> (1,1) overlaid.
fig = px.line(metrics, x="FPR", y="TPR", title="ROC Curve",
              width=600, height=600,
              render_mode="svg")
fig.add_scatter(x=[0,0,1], y=[0,1,1], mode='lines',
                line_dash='dash', line_color='black',
                name="Perfect Classifier")
# Move the legend inside the plotting area.
fig.update_layout(legend=dict(x=.5, y=.1))