In [None]:
import numpy as np
import pandas as pd
from matplotlib.pyplot import subplots
import statsmodels.api as sm
from ISLP import load_data
from ISLP.models import (ModelSpec as MS,
                         summarize)

In [None]:
from ISLP import confusion_table
from ISLP.models import contrast
from sklearn.discriminant_analysis import \
     (LinearDiscriminantAnalysis as LDA,
      QuadraticDiscriminantAnalysis as QDA)
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import RocCurveDisplay
# Short aliases for the two RocCurveDisplay constructors used below:
# `roc_curve_est` is called with a fitted estimator plus features/labels,
# `roc_curve_pred` is called with labels and precomputed scores.
roc_curve_est = RocCurveDisplay.from_estimator 
roc_curve_pred = RocCurveDisplay.from_predictions 

In [None]:
# Load the data

# `load_data` is the ISLP helper imported at the top. The result is used below
# as a DataFrame with (at least) the columns balance, income, student and default.
Default = load_data('Default')
# Bare last expression so the notebook renders the frame.
Default

In [None]:
# Do some preliminary feature engineering prior to any modeling.
# The feature matrix is built in one chained expression so every step works on
# a fresh DataFrame: writing into (or dropping in place from) a column subset
# of `Default` can trigger pandas' SettingWithCopyWarning.
X = (
    Default[['balance', 'income', 'student']]
    # numeric dummy: 1.0 where student == "Yes", 0.0 otherwise
    .assign(student_yes=lambda df: (df['student'] == 'Yes').astype(float))
    # the original text column is no longer needed once the dummy exists
    .drop(columns='student')
)
X

In [None]:
# Create the target: a boolean Series that is True where `default` equals 'Yes'

y = Default['default'].eq('Yes')

In [None]:
# Create train and test split (25% held out, shuffled with a fixed seed)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, shuffle=True, random_state=314
)

# Recombine features and target for each split: align the two on their shared
# index, order the rows by that index, then discard it for a fresh 0..n-1 index.
Train = X_train.join(y_train).sort_index().reset_index(drop=True)
Test = X_test.join(y_test).sort_index().reset_index(drop=True)

## $k$-Nearest Neighbors

In [None]:
# NumPy-array versions of the train and test features; these are what the kNN
# and Naive Bayes models below are fit and evaluated on.
# NOTE(review): the features are passed unscaled even though StandardScaler is
# imported above; kNN distances are scale-sensitive — confirm this is intended.

X_train_knn = np.asarray(X_train)
X_test_knn = np.asarray(X_test)

In [None]:
# First we'll try using only the nearest neighbor (k=1)
knn1 = KNeighborsClassifier(n_neighbors=1)
knn1.fit(X_train_knn, y_train)

knn1_test = knn1.predict(X_test_knn)

In [None]:
confusion_table(knn1_test, y_test)

In [None]:
# We'll create an ROC plot and keep adding curves from new models to it

fig, ax = subplots(figsize=(8,8))
roc_curve_est(knn1,
              X_test_knn,
              y_test,
              name='kNN1 (Test)',
              color='r',
              ax=ax);

In [None]:
# Now we'll try 10 nearest neighbors

knn10 = KNeighborsClassifier(n_neighbors=10)
knn10.fit(X_train_knn, y_train)
knn10_test = knn10.predict(X_test_knn)

In [None]:
confusion_table(knn10_test, y_test)

In [None]:
# Wait, only 7 records were marked true?

knn10_test_prob = knn10.predict_proba(X_test_knn)[:,1]
knn10_test_prob

In [None]:
# What if we change the threshold? 
# Try to get the number of false positives and false negatives to be about the same

knn10_test_pred = np.array([True]*len(y_test))
knn10_test_pred[knn10_test_prob<=0.2] = False

In [None]:
confusion_table(knn10_test_pred, y_test)

In [None]:
roc_curve_est(knn10,
              X_test_knn,
              y_test,
              name='kNN10 (Test)',
              color='y',
              ax=ax);
fig

In [None]:
# How about 20 nearest neighbors?

knn20 = KNeighborsClassifier(n_neighbors=20)
knn20.fit(X_train_knn, y_train)
knn20_test = knn20.predict(X_test_knn)

In [None]:
confusion_table(knn20_test, y_test)

In [None]:
knn20_test_prob = knn20.predict_proba(X_test_knn)[:,1]

knn20_test_pred = np.array([True]*len(y_test))
knn20_test_pred[knn20_test_prob<=0.15] = False

confusion_table(knn20_test_pred, y_test)

In [None]:
# How does the ROC curve look?

roc_curve_est(knn20,
              X_test_knn,
              y_test,
              name='kNN20 (Test)',
              color='g',
              ax=ax);
fig

In [None]:
# How about 100 nearest neighbors?

knn100 = KNeighborsClassifier(n_neighbors=100)
knn100.fit(X_train_knn, y_train)
knn100_test = knn100.predict(X_test_knn)

In [None]:
confusion_table(knn100_test, y_test)

In [None]:
knn100_test_prob = knn100.predict_proba(X_test_knn)[:,1]

knn100_test_pred = np.array([True]*len(y_test))
knn100_test_pred[knn100_test_prob<=0.09] = False

confusion_table(knn100_test_pred, y_test)

In [None]:
roc_curve_est(knn100,
              X_test_knn,
              y_test,
              name='kNN100 (Test)',
              color='c',
              ax=ax);
fig

## Naive Bayes

In [None]:
# Build the model

# Gaussian Naive Bayes, fit on the same array features that the kNN models used.
nb = GaussianNB()
nb.fit(X_train_knn, y_train)

In [None]:
# get predictions and construct confusion matrix on test data

nb_test = nb.predict(X_test_knn)
confusion_table(nb_test, y_test)

In [None]:
# Add the Naive Bayes curve to the shared ROC axes, then re-display the figure.
roc_curve_est(nb,
              X_test_knn,
              y_test,
              name='Naive Bayes (Test)',
              color='m',
              ax=ax);
fig

## Logistic Regression

In [None]:
# We'll need to add an intercept column to our X dataframes.
# `assign` returns a new frame with a constant 1.0 column appended, instead of
# writing into the objects handed back by train_test_split; that sidesteps
# SettingWithCopyWarning and leaves other references to those frames untouched.

X_train = X_train.assign(intercept=1.0)
X_test = X_test.assign(intercept=1.0)

In [None]:
# Run the logistic regression model

# A statsmodels GLM with a Binomial family, fit on the training features
# (which now include the explicit intercept column) against the boolean target.
glm = sm.GLM(y_train,
             X_train,
             family=sm.families.Binomial())
results = glm.fit()
# `summarize` is the ISLP helper imported at the top; its return value is the cell output.
summarize(results)

In [None]:
def predict(X, model):
    """Return the model's predictions for the rows of ``X`` as a Series named ``y_hat``.

    ``model.get_prediction(X).predicted`` carries no row labels, so the values
    are re-wrapped here with the index of ``X``.
    """
    y_hat = model.get_prediction(X).predicted
    return pd.Series(y_hat, index=X.index, name='y_hat')

In [None]:
# Get the predicted probabilities

probs_train=predict(X_train,results)
probs_test=predict(X_test,results)

In [None]:
# We'll use 0.3 as the threshold for True vs. False

predictions_train = np.array([True]*len(y_train))
predictions_train[probs_train<0.3] = False

predictions_test = np.array([True]*len(y_test))
predictions_test[probs_test<0.3] = False


In [None]:
# Confusion table for the 0.3-thresholded logistic-regression predictions on the test set.
confusion_table(predictions_test, y_test)

In [None]:
# Let's look at the ROC curve

# Uses the predicted probabilities directly (not the thresholded labels) and
# draws onto the shared axes created in the kNN section.
roc_curve_pred(y_test,
               probs_test,
               name='Logistic:Prob (Test)',
               color='b',
               ax=ax);
fig

## LDA (optional)

In [None]:
# We'll initialize it here

# LDA is the alias for sklearn's LinearDiscriminantAnalysis (see imports).
# store_covariance=True presumably makes the fitted model keep its covariance
# estimate — confirm against the sklearn docs.
lda = LDA(store_covariance=True)


In [None]:
# Let's remove the intercept column that we needed for logistic regression.
# Rebinding (rather than dropping in place) keeps this cell safe to re-run:
# errors='ignore' turns it into a no-op once the column is already gone,
# where the in-place version raised a KeyError on a second execution.

X_train = X_train.drop(columns='intercept', errors='ignore')
X_test = X_test.drop(columns='intercept', errors='ignore')

In [None]:
# Fit LDA on the training features and the boolean target.
lda.fit(X_train, y_train)

In [None]:
# Here are the centroids for each class on the training set

lda.means_

In [None]:
# Here are the class names

lda.classes_

In [None]:
# Here is the order of the column names
# (shown so the per-feature values in the other outputs can be matched to features)

X_train.columns

In [None]:
# Here are the expected probabilities of the classes
# (no priors were passed to LDA above, so presumably these are estimated from the training data — confirm)

lda.priors_

In [None]:
# This is the linear combination used to decide which class to assign

lda.scalings_

In [None]:
# Calculate predictions on test set (we chose 0.26 in this case)

lda_probs = lda.predict_proba(X_test)[:,1]

lda_test = np.array([True]*len(y_test))
lda_test[lda_probs<0.26] = False
lda_test

In [None]:
confusion_table(lda_test, y_test)

In [None]:
# Same callable as the `roc_curve_est` alias defined with the imports; the
# name is kept in case later cells rely on it.
roc_curve = RocCurveDisplay.from_estimator

# Use a colour that is not already on the shared axes: 'm' is taken by the
# Naive Bayes curve, which made the two curves indistinguishable by colour.
roc_curve(lda,
          X_test,
          y_test,
          name='LDA Test',
          color='k',
          ax=ax);
fig

## Likelihood

In [None]:
# Create helper functions for computing the likelihood

def predict(X, model):
    """Return the model's predictions for the rows of ``X`` as a Series named ``y_hat``.

    ``model.get_prediction(X).predicted`` carries no row labels, so the values
    are re-wrapped here with the index of ``X``.

    NOTE(review): an identical ``predict`` is already defined in the Logistic
    Regression section; this definition shadows it. Consider keeping one copy.
    """
    y_hat = model.get_prediction(X).predicted
    return pd.Series(y_hat, index=X.index, name='y_hat')

def like(y, y_hat):
    """Return the Bernoulli likelihood of outcomes ``y`` under probabilities ``y_hat``.

    Each record contributes ``y_hat`` when its outcome is true/1 and
    ``1 - y_hat`` otherwise; the result is the product of those contributions.

    Parameters
    ----------
    y : NumPy array or pandas Series of bool / 0-1 outcomes.
    y_hat : NumPy array or pandas Series of predicted probabilities, the same
        length as ``y`` (pandas inputs are aligned on their index).

    Returns
    -------
    The product of the per-record likelihoods, as a scalar.
    """
    # Scalar broadcasting replaces the former vector of ones, which was sized
    # from the global X_test and therefore only worked for inputs of exactly
    # the test-set length (and failed outright when X_test was not defined).
    like_indiv = y * y_hat + (1.0 - y) * (1.0 - y_hat)
    # NOTE: a product of many values in (0, 1) shrinks quickly and can
    # underflow to 0.0 for large samples.
    return np.prod(like_indiv)

In [None]:
# Likelihood of the test outcomes under the logistic-regression probabilities
# (`probs_test`) computed in the Logistic Regression section.
likelihood_test = like(y_test,probs_test)

In [None]:
# Show the resulting value as plain text.
print(likelihood_test)