# Logistic Regression with the scikit-learn digits dataset (a small MNIST-style set of 8×8 handwritten digits)

#### Holland Brown

#### 2022-04-21

In [None]:
# Load environment packages and import MNIST dataset
import matplotlib.pyplot as plt
import numpy as np
from sklearn import metrics
from sklearn.datasets import load_digits
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

### Split the dataset into training and testing sets

In [None]:
# Load dataset as (data, target) and make sure the labels are a flat vector.
X, y = load_digits(return_X_y=True)
y = y.ravel()

# Change labels of non-ones to 0 --> now a binary classification problem.
# A single vectorized comparison replaces the element-by-element loop; the
# original integer dtype is kept so the printed labels look the same.
y = (y == 1).astype(y.dtype)
print(y[:30], "\n")

# Split data into 50% train and 50% test subsets. With shuffle=False the
# samples keep their original order: the first half is used for training
# and the second half for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, shuffle=False
)

# Report the shape of every subset, read directly from the arrays rather
# than through their underlying .data buffers.
for subset in (X_train, y_train, X_test, y_test):
    print(subset.shape)

### Create classifier

In [None]:
# Create a classifier: logistic regression, fitted on the training subset.
f = LogisticRegression(solver='liblinear', max_iter=600)
f.fit(X_train, y_train)

# predict_proba returns the probabilities that each image belongs to
# class 0 or 1; predict returns the decision of class 0 or 1 for each image.
prob = f.predict_proba(X_test)
pred = f.predict(X_test)

# Show the first 20 probability rows and decisions as a quick sanity check.
print(prob[:20, :])
print(pred[:20])
print(pred.shape)

# score() returns the mean accuracy on the given test data and labels.
print(f"model score: {f.score(X_test, y_test):.4f}")

### Use the ROC curve to find the best decision threshold λ

In [None]:
# Score every test image by its predicted probability of the positive class
# (column 1 of predict_proba). The ROC curve must be built from these
# continuous scores: passing the hard 0/1 predictions instead collapses the
# curve to a single operating point and makes the thresholds meaningless.
scores = prob[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y_test, scores, pos_label=1)

# Note: the false positive rate equals 1 - specificity, not specificity.
print(f"False Positive Rates (1 - Specificity): {fpr}\nTrue Positive Rates (Sensitivity): {tpr}\nThresholds: {thresholds}\n")

# Binary targets need no multi_class argument for the AUC.
print(f"Area Under ROC Curve: {roc_auc_score(y_test, scores)}\n")

# Pick the threshold that maximizes Youden's J statistic (TPR - FPR), i.e.
# the point on the curve farthest above the chance diagonal, instead of
# reporting a hard-coded value.
best_idx = np.argmax(tpr - fpr)
print(f"The optimal threshold is λ = {thresholds[best_idx]:.4f}.")

### Use confusion matrix to assess performance on test set

In [None]:
# Confusion matrix of the hard predictions: rows are the actual labels,
# columns are the predicted labels.
cm = metrics.confusion_matrix(y_test, pred)

# Draw the matrix as a heat map with a colour scale.
plt.figure(figsize=(5, 5))
plt.imshow(cm, interpolation='nearest', cmap='BuGn')
plt.title('Confusion matrix', size=15)
plt.colorbar()

# One tick per class on each axis.
tick_marks = np.arange(2)
plt.xticks(tick_marks, ["0", "1"], rotation=45, size=10)
plt.yticks(tick_marks, ["0", "1"], size=10)
plt.ylabel('Actual label', size=15)
plt.xlabel('Predicted label', size=15)

# Write each count in the centre of its cell. imshow puts the column index
# on the x axis and the row index on the y axis, hence xy=(col, row).
# The loop variables are deliberately not named x / y so the label array
# `y` defined earlier in the notebook is not overwritten.
n_rows, n_cols = cm.shape
for row in range(n_rows):
    for col in range(n_cols):
        plt.annotate(
            str(cm[row][col]),
            xy=(col, row),
            horizontalalignment='center',
            verticalalignment='center',
        )

# Fit the layout only after every label exists, so that the axis labels are
# accounted for in the saved image, then write the figure to disk.
plt.tight_layout()
plt.savefig('mth4330_ml_hw4_confusionmat_v3.png')