In [None]:
import warnings
warnings.filterwarnings('ignore')


import pandas as pd
import numpy as np
from plotnine import *

# modelling
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelBinarizer #Z-score variables

# performance
from sklearn.metrics import accuracy_score, confusion_matrix,\
 f1_score, recall_score, precision_score, roc_auc_score
from sklearn.metrics import ConfusionMatrixDisplay, RocCurveDisplay
from sklearn.calibration import calibration_curve

# model validation imports
from sklearn.model_selection import train_test_split # simple TT split cv
from sklearn.model_selection import KFold # k-fold cv
from sklearn.model_selection import LeaveOneOut #LOO cv


## Metrics
We spoke about several metrics you can use to evaluate how well your logistic regression model performs.

- Accuracy: $\frac{TP + TN}{TP + TN + FP + FN}$
- Confusion Matrix Patterns

|                 | **Actually 1**          | **Actually 0**          |
|-----------------|-------------------------|-------------------------|
| **Predicted 1** | True Positive **(TP)**  | False Positive **(FP)** |
| **Predicted 0** | False Negative **(FN)** | True Negative **(TN)**  |


- **Precision**: $\frac{TP}{TP + FP}$, how many of the predicted positives are true positives?

- **Recall/Sensitivity**: $\frac{TP}{TP + FN}$, how many of the actual positives did we accurately predict?

- **Specificity**: $\frac{TN}{TN + FP}$, how many of the actual negatives did we accurately predict?

- **F1 Score**: $\frac{2 * Precision * Recall}{Precision + Recall}$, a combination of precision and recall.

- **ROC AUC**: The area under the ROC curve which puts the False Positive Rate (FPR) on the x-axis, and the True Positive Rate on the y-axis.


The metrics we talked about were not an exhaustive list. [Here](https://scikit-learn.org/stable/api/sklearn.metrics.html#module-sklearn.metrics) are all of the metrics that sklearn has implemented.


TODO:
If you were designing a Flu test, which of the metrics we spoke about in class would be most important to you (there's no one right answer) and why?

## ROC-AUC

<img src="https://drive.google.com/uc?export=view&id=1n-Pg6y8wD-UM05kPyVKorbmteMpHdmcG" alt="ROCAUC Curve" width = "600"/>


TODO:

Is the ROC AUC of this model closer to 0.5 or 1? How did you decide that?

## Calibration

If you'd like to use a model's predicted probabilities, calibration is important.


TODO:

Discuss with your table group the following question: Is calibration important for an *accurate* prediction?




## Logistic Regression Building

TODO:

Using the Lizzo dataset on GitHub, build a logistic regression model to predict the `mode`. Then check the model's performance and calibration.

In [1]:
# Read the Lizzo track data from the course GitHub repository and, in the same step,
# recode the boolean `explicit` column as 0/1 (not necessary, but avoids True/False confusion).
df = pd.read_csv(
    "https://raw.githubusercontent.com/katherinehansen2/CPSC392Hansen/refs/heads/main/data/Lizzo_data.csv"
).assign(explicit=lambda tracks: tracks["explicit"].astype("int"))

# preview the first few rows
df.head()

In [None]:
# all model inputs, and the subset that is continuous (and therefore z-scored below)
predictors = ["danceability", "energy", "instrumentalness", "explicit"]
contin = ["danceability", "energy", "instrumentalness"]

X = df[predictors]
y = df["mode"]

# TTS: hold out 20% of the rows; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)

# Take explicit copies before writing the scaled columns back. The split frames are
# derived from a slice of `df`, and assigning into such a slice is the pattern pandas
# flags with SettingWithCopyWarning (hidden here because warnings are filtered).
X_train = X_train.copy()
X_test = X_test.copy()

# z-score continuous predictors only.
# The scaler is fit on the training rows and then re-used for the test rows,
# so no information from the test split leaks into the scaling.
scaler = StandardScaler()
X_train[contin] = scaler.fit_transform(X_train[contin])
X_test[contin] = scaler.transform(X_test[contin])

# Create logistic regression model
# (exercise placeholder: assign an estimator to `lr`; this cell is not valid Python until the TODO is replaced)
lr = #TODO

# fit logistic regression model
# (later cells call lr.predict(...), lr.predict_proba(...), lr.coef_ and lr.intercept_)
#TODO


In [None]:
# make predictions for train and test
# with logistic regression, we can return both the predicted class AND the predicted probabilities

# predicted classes, train split first and then test split
y_pred_train, y_pred_test = (lr.predict(features) for features in (X_train, X_test))

# predicted probabilities: keep only column 1 of predict_proba's output
y_pred_train_prob, y_pred_test_prob = (
    lr.predict_proba(features)[:, 1] for features in (X_train, X_test)
)

In [2]:
# assess
# Report the same five metrics for both splits so train and test can be compared line by line.
# Each entry: split label -> (true labels, predicted classes, predicted probabilities).
evaluation_splits = {
    "Train": (y_train, y_pred_train, y_pred_train_prob),
    "Test": (y_test, y_pred_test, y_pred_test_prob),
}

for split_label, (split_true, split_pred, split_prob) in evaluation_splits.items():
    split_scores = {
        "Acc": accuracy_score(split_true, split_pred),
        "Precision": precision_score(split_true, split_pred),
        "Recall": recall_score(split_true, split_pred),
        "F1": f1_score(split_true, split_pred),
        # ROC AUC is computed from the predicted probabilities, not the predicted classes
        "ROC AUC": roc_auc_score(split_true, split_prob),
    }
    for metric_label, metric_value in split_scores.items():
        # pad every label to 16 characters so the colons line up in the printed output
        print(f"{split_label} {metric_label}".ljust(16) + ": ", metric_value)

In [3]:
# confusion matrix for the training split: true labels (y_train) vs. predicted classes (y_pred_train)

ConfusionMatrixDisplay.from_predictions(y_train,y_pred_train)

In [4]:
# ROC curve for the training split, drawn from the predicted probabilities (y_pred_train_prob)
RocCurveDisplay.from_predictions(y_train, y_pred_train_prob)

In [5]:
# Calibration Curve
# Group the training-split predicted probabilities into 10 bins; calibration_curve returns,
# per bin, the observed fraction of positives (prob_true) and the mean predicted probability (prob_pred).
prob_true, prob_pred = calibration_curve(y_train, y_pred_train_prob, n_bins=10)

# plot from a data frame with named columns rather than from loose arrays
calibration_df = pd.DataFrame({"prob_pred": prob_pred, "prob_true": prob_true})

(ggplot(calibration_df)
 + geom_line(aes(x = "prob_pred", y = "prob_true"), color = "blue")
 # dashed diagonal = perfectly calibrated model (predicted probability equals observed frequency)
 + geom_abline(slope = 1, intercept = 0, color = "black", linetype = "dashed")
 + labs(x = "Average Predicted Probability",
        y = "Average Actual Probability",
        title = "Calibration Curve")
 + ylim([0,1]) + xlim([0,1]))

TODO:

Discuss in detail the performance of your model based on at least 3 of the metrics above. Then, discuss how well calibrated your model is.

In [6]:
# Coefficients

# one row per predictor: the first row of lr.coef_ paired with the names in `predictors`
# (assumes `lr` was fit on columns in `predictors` order -- confirm in the fitting cell)
coefficients = pd.DataFrame({"Coef": lr.coef_[0], "Name": predictors})

# single-row frame for the intercept, indexed so it lands directly after the last predictor row
intercept = pd.DataFrame(
    {"Coef": lr.intercept_[0], "Name": "intercept"},
    index=[len(coefficients)],
)

# stack both frames, then add the exponentiated coefficient as the "Odds" column
coefficients_all = pd.concat([coefficients, intercept]).assign(
    Odds=lambda table: np.exp(table["Coef"])
)

coefficients_all

TODO: Interpret the coefficients above.

## Another Logistic Regression example
TODO:

Now let's do the same thing but with [this dataset](https://www.kaggle.com/datasets/ruthgn/wine-quality-data-set-red-white-wine). Download the data from Kaggle, and upload it to Colab.

Let's build a model that predicts whether a wine is `red` or `white` using *all* the other variables.

(NOTE: because the column `type` is a string not a binary `0`/`1` variable, we'll need to use [`LabelBinarizer()`](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelBinarizer.html) to turn it into a binary variable.)

In [None]:
# load in data
# (the CSV must first be downloaded from Kaggle and uploaded next to this notebook)
w = pd.read_csv("wine-quality-white-and-red.csv")
# w.head()

# check missing


# drop


# Binarize and X and Y
# (the `type` column is a string, so it needs to be turned into a binary 0/1 outcome)



# TTS

# Create Empty Model


# fit


# predict


In [None]:
# assess - pick 3+ metrics to assess

In [None]:
# calibration

## The Math of Log Odds, Odds, and Probabilities

In the lecture, you saw a graph demonstrating the way that Log Odds, Odds, and Probabilities change as a result of a 1-unit change in a predictor variable.

Now YOU'RE going to make your own version to help you gain an intuitive sense for the math of log odds, odds, and probabilities.

We're going to write a function that simulates a super simple logistic relationship: 1 predictor (X) and 1 binary outcome (y).

(if it helps, think of this as predicting whether or not someone is registered to vote based on their age.)

In [None]:
def LogisticDataSimulator(intercept, slope, limits = 5, n_points = 1000):
    """Simulate a one-predictor logistic relationship on an evenly spaced grid.

    Parameters
    ----------
    intercept : float
        Intercept of the linear predictor (the log odds when x is 0).
    slope : float
        Change in log odds for a 1-unit increase in x.
    limits : float, default 5
        The predictor grid runs from -limits to limits (both ends included).
    n_points : int, default 1000
        Number of evenly spaced predictor values to generate.

    Returns
    -------
    pandas.DataFrame
        One row per grid value with the columns "x", "logodds", "odds"
        and "probabilities".
    """

    # generate n_points evenly spaced values between -limits and limits
    predictor = np.linspace(-limits, limits, n_points)

    # log odds of being registered to vote
    log_odds = intercept + slope*predictor

    # odds of being registered to vote
    # (for very large log odds exp() overflows to inf, which is the intended value here)
    with np.errstate(over = "ignore"):
        odds = np.exp(log_odds)

    # probability of being registered to vote
    # odds/(1 + odds) turns into inf/inf = NaN once the odds overflow, so compute it from
    # exp(-|log odds|), which always lies in (0, 1]:
    #   log odds >= 0 -> 1 / (1 + exp(-log odds))
    #   log odds <  0 -> exp(log odds) / (1 + exp(log odds))   (same as odds / (1 + odds))
    damped = np.exp(-np.abs(log_odds))
    probabilities = np.where(log_odds >= 0, 1/(1 + damped), damped/(1 + damped))

    # put into a dataframe
    simulated = pd.DataFrame({"x": predictor,
                              "logodds": log_odds,
                              "odds": odds,
                              "probabilities": probabilities})

    return simulated

Call `LogisticDataSimulator()` with an intercept of 1 and a slope of 0.5, then use the resulting dataframe to recreate the graph of the log odds, odds, and probabilities from the lecture (see slide 69). Make separate graphs for logodds, odds, and probabilities to make your life easier.

Be sure to include the colored lines, the titles/axis labels, and the dotted vertical lines! Don't be afraid to google how to do things in ggplot (hint: google `geom_line()` and `geom_vline()`).

In [None]:
# simulated grid with the columns x, logodds, odds and probabilities (one graph per non-x column)
plot_data = LogisticDataSimulator(intercept = 1, slope = 0.5)

#################################
# TODO
#################################

TODO: Describe in detail how the graphs demonstrate what we learned in class: that the change in predicted probability in response to a 1 unit increase of our predictor is *not constant*. Why might this cause an issue?

