In [None]:
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
# Use the Vivid qualitative palette as the colorway of the "plotly" template.
pio.templates["plotly"].layout.colorway = px.colors.qualitative.Vivid
# Default width for plotly.express figures. The attribute is `px.defaults`;
# the previous spelling `px.defauts` raised AttributeError on a fresh kernel.
px.defaults.width = 800

from scipy.optimize import minimize
import sklearn.linear_model as lm
from sklearn.metrics import r2_score

In [None]:
# Load the NBA table from the CSV (path is relative to the working directory)
# and preview its first five rows.
basketball = pd.read_csv(filepath_or_buffer="data/nba.csv")
basketball.head(n=5)

In [None]:
# Re-read the raw table so this cell can be run on its own.
basketball = pd.read_csv("data/nba.csv")

# Take the first and the last row of every GAME_ID group.
# NOTE(review): presumably each game has exactly two rows, one per team — confirm.
first_team = basketball.groupby("GAME_ID").first()
# Was assigned to the misspelled name `secont_team`, so the merge below
# raised NameError on `second_team`.
second_team = basketball.groupby("GAME_ID").last()

# Put both rows of a game side by side; the second row's columns get an "_OPP" suffix.
games = first_team.merge(second_team, left_index = True, right_index = True, suffixes = ["", "_OPP"])
# Difference between the first row's FG_PCT and the opponent row's FG_PCT.
games['GOAL_DIFF'] = games["FG_PCT"] - games["FG_PCT_OPP"]
# 1 when the first row's WL is "W", else 0.
games['WON'] = (games['WL'] == "W").astype(int)
# Keep only the columns used downstream. The explicit copy makes `games` an
# independent frame, since later cells add columns to it.
games = games[['TEAM_NAME', 'TEAM_NAME_OPP', 'MATCHUP', 'WON', 'WL', 'GOAL_DIFF']].copy()
games

In [None]:
# 0/1 outcome against GOAL_DIFF, colored by the W/L label; both team names
# are shown on hover.
px.scatter(
    data_frame=games,
    x="GOAL_DIFF",
    y="WON",
    color="WL",
    hover_data=['TEAM_NAME', 'TEAM_NAME_OPP'],
)

In [None]:
# Strip plot of GOAL_DIFF for each W/L category, with "W" ordered before "L".
px.strip(
    data_frame=games,
    x="GOAL_DIFF",
    y="WL",
    color="WL",
    hover_data=['TEAM_NAME', 'TEAM_NAME_OPP'],
    category_orders={"WL": ["W", "L"]},
)

In [None]:
# Fixed seed so the jitter is the same on every run.
np.random.seed(42)
# Add uniform noise drawn from [-0.1, 0.1) to the 0/1 outcome so overlapping
# points separate vertically in the scatter.
jitter = np.random.uniform(low=-0.1, high=0.1, size=len(games))
games["JitterWON"] = games["WON"] + jitter
px.scatter(
    data_frame=games,
    x="GOAL_DIFF",
    y="JitterWON",
    color="WL",
    hover_data=['TEAM_NAME', 'TEAM_NAME_OPP'],
)

In [None]:
# Design matrix (single feature) and 0/1 target shared by the models below.
X = games[["GOAL_DIFF"]]
Y = games["WON"]
# Ordinary least squares on the binary outcome; fit() returns the fitted estimator.
least_squares_model = lm.LinearRegression().fit(X, Y)

# Evenly spaced GOAL_DIFF grid (50 points, linspace's default) to draw the fitted line on.
pred = pd.DataFrame({"GOAL_DIFF": np.linspace(-0.3, 0.3, num=50)})
pred["LS_Pred"] = least_squares_model.predict(pred[["GOAL_DIFF"]])

# Jittered outcomes with the least-squares line drawn on top.
fig = px.scatter(
    data_frame=games,
    x="GOAL_DIFF",
    y="JitterWON",
    color="WL",
    hover_data=['TEAM_NAME', 'TEAM_NAME_OPP'],
)
ls_line = go.Scatter(
    x=pred["GOAL_DIFF"],
    y=pred["LS_Pred"],
    mode="lines",
    name="Least Squares Fit",
)
fig.add_trace(ls_line)

In [None]:
# Split GOAL_DIFF into 20 equal-width bins. With retbins=True, pd.cut returns
# both the per-row interval labels and the array of bin edges.
# The edges were previously bound to `cut`, but the plotting cells below
# iterate over `cuts`, which raised NameError.
bins, cuts = pd.cut(games["GOAL_DIFF"], 20, retbins=True)

# Preview each game next to the interval it fell into (column "GOAL_DIFF_bins").
games.join(bins, rsuffix="_bins").head()

In [None]:
# Jittered outcome scatter with a dashed vertical line at every bin edge.
fig = px.scatter(games,
                 x="GOAL_DIFF", y="JitterWON", color="WL",
                 hover_data=['TEAM_NAME', 'TEAM_NAME_OPP'])
# `cuts` must hold the bin edges (the second value returned by
# pd.cut(..., retbins=True)).
for cut in cuts:
  fig.add_vline(x=cut, line_dash="dash", line_color="black")

# Render once, after all lines are added. This call used to sit inside the
# loop body, which displayed the figure again for every edge.
fig.show()

In [None]:
# Replace each interval label by its midpoint so bins can be placed on a numeric axis.
games['bin_center'] = bins.apply(
    lambda interval: (interval.left + interval.right) / 2
).astype(float)

# Mean of the 0/1 WON flag per bin, i.e. the fraction of games in the bin with WON == 1.
win_rates_by_bin = (
    games.groupby("bin_center")[["WON"]]
    .mean()
    .rename(columns={"WON": "Win Rate"})
)
win_rates_by_bin

In [None]:
# Jittered outcomes, the per-bin win rate drawn through the bin centers, and
# a dashed line at each bin edge.
fig = px.scatter(
    data_frame=games,
    x="GOAL_DIFF",
    y="JitterWON",
    color="WL",
    hover_data=['TEAM_NAME', 'TEAM_NAME_OPP'],
)
win_rate_trace = go.Scatter(
    x=win_rates_by_bin.index,
    y=win_rates_by_bin['Win Rate'],
    mode="markers+lines",
    name="Win Rate by Bin",
)
fig.add_trace(win_rate_trace)
for cut in cuts:
  fig.add_vline(x=cut, line_dash="dash", line_color="black")

fig.show()

In [None]:
# Label the index so plots of this frame get a "GOAL_DIFF" x-axis.
win_rates_by_bin.index.name = "GOAL_DIFF"
# odds = p / (1 - p); a bin whose win rate is exactly 1 divides by zero here.
win_rate = win_rates_by_bin["Win Rate"]
win_rates_by_bin["odds"] = win_rate / (1 - win_rate)
win_rates_by_bin

In [None]:
# Odds of winning per bin; the frame's index supplies the x-axis.
px.line(data_frame=win_rates_by_bin, y="odds")

In [None]:
# Natural log of the odds; an odds value of 0 maps to -inf.
odds = win_rates_by_bin["odds"]
win_rates_by_bin["log(odds)"] = np.log(odds)
px.line(data_frame=win_rates_by_bin, y="log(odds)")

In [None]:
from plotly.subplots import make_subplots

# One row with three panels: win rate, odds and log-odds per bin.
# The keyword is `subplot_titles`; the previous `subplots_title` is not a
# make_subplots argument and raised TypeError.
fig = make_subplots(rows=1, cols=3, subplot_titles=("Win Rates", "Odds", "Log(Odds)"))
# Panel k (1-based) shows the k-th column listed here, in the same order as the titles.
for panel, column in enumerate(["Win Rate", "odds", "log(odds)"], start=1):
  fig.add_trace(
      go.Scatter(x=win_rates_by_bin.index, y=win_rates_by_bin[column], mode="markers+lines"),
      row=1, col=panel)
# Hide the legend for these three traces.
fig.update_layout(showlegend=False)

In [None]:
# Logistic regression on the same X / Y as the least-squares model.
# C is scikit-learn's inverse regularization strength; fit() returns the estimator.
logistic_model = lm.LogisticRegression(C=20).fit(X, Y)
# predict_proba returns one column per class; column 1 is the probability of WON == 1.
class_probabilities = logistic_model.predict_proba(pred[["GOAL_DIFF"]])
pred["Logistic_Pred"] = class_probabilities[:, 1]

In [None]:
# Jittered outcomes, the empirical per-bin win rate, and the fitted logistic
# curve drawn as a thick dashed black line.
fig = px.scatter(
    data_frame=games,
    x="GOAL_DIFF",
    y="JitterWON",
    color="WL",
    hover_data=['TEAM_NAME', 'TEAM_NAME_OPP'],
)

win_rate_trace = go.Scatter(
    x=win_rates_by_bin.index,
    y=win_rates_by_bin['Win Rate'],
    mode="markers+lines",
    name="Win Rate by Bin",
)
logistic_trace = go.Scatter(
    x=pred["GOAL_DIFF"],
    y=pred["Logistic_Pred"],
    mode="lines",
    name="Logistic Regression Model",
    line_color="black",
    line_width=5,
    line_dash="dash",
)
fig.add_trace(win_rate_trace)
fig.add_trace(logistic_trace)

fig.show()

In [None]:
# Six-point toy classification set: one feature x and a 0/1 label y.
toy_df = pd.DataFrame(
    {
        "x": [-4, -2, -0.5, 1, 3, 5],
        "y": [0, 0, 1, 0, 1, 1],
    }
)
# String copy of the label, used as a discrete color in the plots.
toy_df["str_y"] = toy_df["y"].astype(str)
toy_df.sort_values(by="x")

In [None]:
# Toy data colored by label, drawn with enlarged markers.
fig = px.scatter(
    data_frame=toy_df,
    x="x",
    y="y",
    color="str_y",
    width=800,
)
fig.update_traces(marker_size=20)

In [None]:
def sigmoid(z):
  """Logistic function 1 / (1 + exp(-z)).

  Accepts a scalar, NumPy array or pandas Series and applies the function
  elementwise.

  Uses np.exp rather than `np.e ** -z`: for plain Python numbers the power
  form raises OverflowError once -z is large (e.g. z = -1000), whereas
  np.exp saturates to inf and the result correctly becomes 0.0.
  """
  # The overflow to inf is intended, so silence only that floating-point warning.
  with np.errstate(over="ignore"):
    return 1 / (1 + np.exp(-z))

def mse_on_toy_data(theta):
  """Mean squared error of the model sigmoid(x * theta) on `toy_df`.

  theta is the single slope parameter (there is no intercept term).
  Reads the module-level `toy_df` columns "x" and "y".
  """
  residuals = toy_df['y'] - sigmoid(toy_df['x'] * theta)
  return np.mean(residuals**2)

# Evaluate the toy-data MSE on a grid of 100 theta values in [-10, 10].
theta_grid = np.linspace(-10, 10, num=100)
theta_loss = pd.DataFrame({"theta": theta_grid})
theta_loss["MSE"] = theta_loss["theta"].apply(mse_on_toy_data)
px.line(
    data_frame=theta_loss,
    x="theta",
    y="MSE",
    width=800,
    title="MSE on Toy Classification Data",
)

In [None]:
# Minimize the toy-data MSE starting from theta = 0. The optimizer's result
# exposes the solution vector as `.x`; its only entry is the fitted theta.
mse_fit = minimize(mse_on_toy_data, x0=0)
best_theta = mse_fit.x[0]
best_theta

In [None]:
# Toy data with the sigmoid curve for the theta found from x0 = 0.
fig = px.scatter(toy_df, x="x", y="y", color="str_y", width=800)
# Was a bare `linspace`, which is not imported; the function lives in NumPy.
xs = np.linspace(-10, 10, 100)
# Was `fig.add_space`, which is not a Figure method; traces are added with add_trace.
fig.add_trace(go.Scatter(
    x=xs, y=sigmoid(xs * best_theta),
    mode="lines", line_color="black",
    name=f"LR Model: theta = {best_theta:.2f}"))
# Was misspelled `udpate_traces`.
fig.update_traces(marker_size=20)

In [None]:
# Same minimization as before, but started from theta = -5.
mse_fit_2 = minimize(mse_on_toy_data, x0=-5)
best_theta_2 = mse_fit_2.x[0]
best_theta_2

In [None]:
# Toy data with the sigmoid curve for the theta found from x0 = -5.
fig = px.scatter(
    data_frame=toy_df,
    x="x",
    y="y",
    color="str_y",
    width=800,
)
xs = np.linspace(-10, 10, num=100)
model_curve = go.Scatter(
    x=xs,
    y=sigmoid(xs * best_theta_2),
    mode="lines",
    line_color="black",
    name=f"LR Model: theta = {best_theta_2:.2f}",
)
fig.add_trace(model_curve)
fig.update_traces(marker_size=20)

In [None]:
# MSE curve with a red marker at each of the two optimizer results.
fig = px.line(
    data_frame=theta_loss,
    x="theta",
    y="MSE",
    width=800,
    title="MSE on Toy Classification Data",
)
for label, theta_hat in (("Theta_1", best_theta), ("Theta_2", best_theta_2)):
  fig.add_scatter(
      x=[theta_hat],
      y=[mse_on_toy_data(theta_hat)],
      mode="markers",
      marker_size=10,
      marker_color="red",
      name=f"{label}: {theta_hat:.2f}",
  )
fig

In [None]:
# Squared loss (1 - p_hat)^2 of a single observation with y = 1, on a grid of
# predicted probabilities starting at 0.001 in steps of 0.01.
p_hat_grid = np.arange(0.001, 0.999, 0.01)
p_hat_loss = pd.DataFrame({"p_hat": p_hat_grid})
p_hat_loss["L2 Loss"] = (1 - p_hat_loss["p_hat"]) ** 2
px.line(
    data_frame=p_hat_loss,
    x="p_hat",
    y="L2 Loss",
    width=800,
    title="Squared Loss for One Individual when y=1",
)

In [None]:
# Negative log loss of the same y = 1 observation: -log(p_hat).
p_hat_loss["Neg Log Loss"] = np.negative(np.log(p_hat_loss["p_hat"]))

In [None]:
# Long format: one row per (p_hat, loss name) pair, so each loss column
# becomes its own colored line.
losses_long = p_hat_loss.melt(id_vars="p_hat", value_name="Loss")
px.line(
    data_frame=losses_long,
    x="p_hat",
    y="Loss",
    color="variable",
    width=800,
    title="Loss Comparison for One Observation when y = 1",
)

In [None]:
# Same comparison as for y = 1, now for a single observation with y = 0.
p_hat_loss = pd.DataFrame({"p_hat": np.arange(0.001, 0.999, 0.01)})
# Squared loss for y = 0 is (0 - p_hat)^2. The previous expression ended in
# `*2` instead of `**2`, so it plotted the straight line 2 * p_hat.
p_hat_loss["L2 Loss"] = (0 - p_hat_loss["p_hat"])**2
# Negative log loss for y = 0 is -log(1 - p_hat).
p_hat_loss["Neg Log Loss"] = -np.log(1- p_hat_loss["p_hat"])
px.line(p_hat_loss.melt(id_vars="p_hat", value_name="Loss"),
        x="p_hat", y="Loss", color="variable", width=800,
        title="Loss Comparison for One Observation when y = 0")

In [None]:
def cross_entropy(y, p_hat):
  """Elementwise cross-entropy loss -y*log(p_hat) - (1-y)*log(1-p_hat).

  y is the 0/1 label and p_hat the predicted probability of y = 1.
  No clipping is applied, so p_hat equal to 0 or 1 yields inf or nan.
  """
  positive_term = y * np.log(p_hat)
  negative_term = (1 - y) * np.log(1 - p_hat)
  return -positive_term - negative_term
def mean_cross_entropy_on_toy_data(theta):
  """Average cross-entropy of the model sigmoid(x * theta) on `toy_df`.

  Reads the module-level `toy_df` columns "x" and "y".
  """
  predicted = sigmoid(toy_df["x"] * theta)
  per_point_loss = cross_entropy(toy_df["y"], predicted)
  return np.mean(per_point_loss)

In [None]:
# Cross-entropy at every theta on the grid; NaN results are dropped before
# assignment, so those rows end up NaN in the column after index alignment.
ce_values = theta_loss["theta"].apply(mean_cross_entropy_on_toy_data)
theta_loss["Cross-Entropy"] = ce_values.dropna()
px.line(
    data_frame=theta_loss,
    x="theta",
    y="Cross-Entropy",
    width=800,
    title="Cross-Entropy on Toy Classification Data",
)

In [None]:
def mean_cross_entropy_on_toy_data(theta):
  """Average cross-entropy on `toy_df`, written in terms of z = x * theta.

  Uses the algebraically simplified form
      -mean((y - 1) * z - log(1 + exp(-z)))
  so the probabilities are never computed and passed through log directly.
  Redefines the earlier function of the same name.
  """
  z = toy_df["x"] * theta
  log_likelihood = (toy_df["y"] - 1) * z - np.log1p(np.exp(-z))
  return -np.mean(log_likelihood)

In [None]:
# Recompute the cross-entropy column with the rewritten function, then plot
# every loss column of theta_loss as its own line.
ce_values = theta_loss["theta"].apply(mean_cross_entropy_on_toy_data)
theta_loss["Cross-Entropy"] = ce_values.dropna()
losses_long = theta_loss.melt(id_vars="theta", value_name="Loss")
px.line(
    data_frame=losses_long,
    x="theta",
    y="Loss",
    color="variable",
    title="Cross-Entropy on Toy Classification Data",
)

In [None]:
# Minimize the mean cross-entropy from the same start (theta = -5) used for
# the second MSE run.
ce_fit = minimize(mean_cross_entropy_on_toy_data, x0=-5)
best_ce_theta = ce_fit.x[0]
best_ce_theta

In [None]:
# Both loss curves, with the MSE optimum marked in red and the cross-entropy
# optimum in blue.
losses_long = theta_loss.melt(id_vars="theta", value_name="Loss")
fig = px.line(
    data_frame=losses_long,
    x="theta",
    y="Loss",
    color="variable",
    title="Cross-Entropy on Toy Classification Data",
)
fig.add_scatter(
    x=[best_theta],
    y=[mse_on_toy_data(best_theta)],
    mode="markers",
    marker_size=10,
    marker_color="red",
    name=f"Theta_1: {best_theta:.2f}",
)
ce_marker = go.Scatter(
    x=[best_ce_theta],
    y=[mean_cross_entropy_on_toy_data(best_ce_theta)],
    mode="markers",
    marker_size=10,
    marker_color="Blue",
    name=f"CE Theta: {best_ce_theta:.2f}",
)
fig.add_trace(ce_marker)

In [None]:
# Toy data with the two fitted sigmoid curves: theta from the MSE fit in red,
# theta from the cross-entropy fit in blue.
fig = px.scatter(
    data_frame=toy_df,
    x="x",
    y="y",
    color="str_y",
    width=800,
)
xs = np.linspace(-10, 10, num=100)
fitted_models = [
    (best_theta, "red", "LR + MSE Loss"),
    (best_ce_theta, "blue", "LR + CE Loss"),
]
for theta_hat, curve_color, curve_name in fitted_models:
  fig.add_trace(go.Scatter(
      x=xs, y=sigmoid(xs * theta_hat),
      mode="lines", line_color=curve_color,
      name=curve_name))

fig.update_traces(marker_size=20)