In [1]:
# Import necessary libraries
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_openml
from sklearn.linear_model import Perceptron
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score

## Q1 Which of the following is a pair that gives the number of training samples for digit 5 and digit 3?

In [5]:
# ===================== Common Setup =====================
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_openml
from sklearn.linear_model import Perceptron
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score

# Load MNIST dataset from OpenML
mnist = fetch_openml('mnist_784', version=1)
X = mnist.data.to_numpy()
y = mnist.target.astype(int)  # convert labels to integer

# Create training and test splits:
# Use the first 10000 images for training and the next 2000 images for testing.
X_train, y_train = X[:10000], y[:10000]
X_test, y_test = X[10000:12000], y[10000:12000]

# ----------------- Extract Digit 5 and Digit 3 -----------------
# For training set, select images for digits 5 and 3.
idx_train_5 = np.where(y_train == 5)[0]
idx_train_3 = np.where(y_train == 3)[0]

# -------------------- Question 1 --------------------
# Q1: Which of the following is a pair that gives the number of training samples for digit 5 and digit 3?
#     Options: (1014, 1032), (980, 991), (863, 1032), (863, 991)
#
# Count the training samples for each digit.
count_5_train = len(idx_train_5)
count_3_train = len(idx_train_3)
print("Question 1:")
print("Number of training samples for digit 5:", count_5_train)
print("Number of training samples for digit 3:", count_3_train)
# Expected answer (if counts are 980 and 991) → (980, 991)
print("=> The correct pair is (980, 991).\n")

# Now, construct the training data (for digits 5 and 3) as specified:
X_train_5 = X_train[idx_train_5]
X_train_3 = X_train[idx_train_3]
y_train_5 = np.ones(count_5_train)   # positive class for digit 5, label = 1
y_train_3 = -np.ones(count_3_train)    # negative class for digit 3, label = -1

# By convention, stack all digit-5 images first then digit-3 images.
X_train_53 = np.vstack([X_train_5, X_train_3])
y_train_53 = np.concatenate([y_train_5, y_train_3])

# Prepare the test set in a similar manner.
idx_test_5 = np.where(y_test == 5)[0]
idx_test_3 = np.where(y_test == 3)[0]
X_test_5 = X_test[idx_test_5]
X_test_3 = X_test[idx_test_3]
y_test_5 = np.ones(len(idx_test_5))
y_test_3 = -np.ones(len(idx_test_3))
X_test_53 = np.vstack([X_test_5, X_test_3])
y_test_53 = np.concatenate([y_test_5, y_test_3])

# -------------------- Questions 2, 3, 4, 5 --------------------
# Train a Perceptron classifier with shuffle=True, 100 epochs, learning rate=1, and intercept included.
clf_shuffle = Perceptron(random_state=42, max_iter=100, eta0=1, shuffle=True,
                           fit_intercept=True, penalty=None)
clf_shuffle.fit(X_train_53, y_train_53)
y_pred = clf_shuffle.predict(X_test_53)

# Compute the confusion matrix.
# We use labels [1, -1] so that:
#   - Row 0 corresponds to actual positive (digit 5, label 1)
#   - Row 1 corresponds to actual negative (digit 3, label -1)
cm = confusion_matrix(y_test_53, y_pred, labels=[1, -1])
print("Questions 2, 3, 4, 5:")
print("Confusion Matrix (with shuffle=True):")
print(cm)

# From the confusion matrix:
TP = cm[0, 0]  # True Positives: actual 1 predicted as 1
FN = cm[0, 1]  # False Negatives: actual 1 predicted as -1
FP = cm[1, 0]  # False Positives: actual -1 predicted as 1
TN = cm[1, 1]  # True Negatives: actual -1 predicted as -1

# Q2: How many True Positives are there?
print("\nQuestion 2: True Positives =", TP)
# Q3: How many True Negatives are there?
print("Question 3: True Negatives =", TN)
# Q4: How many False Negatives are there?
print("Question 4: False Negatives =", FN)
# Q5: How many False Positives are there?
print("Question 5: False Positives =", FP)
print("")

# -------------------- Question 6 --------------------
# Now change the shuffle parameter to False, retrain the classifier, and compute the evaluation metrics.
clf_no_shuffle = Perceptron(random_state=42, max_iter=100, eta0=1, shuffle=False,
                              fit_intercept=True, penalty=None)
clf_no_shuffle.fit(X_train_53, y_train_53)
y_pred_no_shuffle = clf_no_shuffle.predict(X_test_53)

# Compute evaluation metrics for both cases.
acc_shuffle = accuracy_score(y_test_53, y_pred)
prec_shuffle = precision_score(y_test_53, y_pred, pos_label=1)
rec_shuffle = recall_score(y_test_53, y_pred, pos_label=1)

acc_no_shuffle = accuracy_score(y_test_53, y_pred_no_shuffle)
prec_no_shuffle = precision_score(y_test_53, y_pred_no_shuffle, pos_label=1)
rec_no_shuffle = recall_score(y_test_53, y_pred_no_shuffle, pos_label=1)

print("Question 6:")
print("Metrics with shuffle=True:")
print("  Accuracy  :", acc_shuffle)
print("  Precision :", prec_shuffle)
print("  Recall    :", rec_shuffle)

print("\nMetrics with shuffle=False:")
print("  Accuracy  :", acc_no_shuffle)
print("  Precision :", prec_no_shuffle)
print("  Recall    :", rec_no_shuffle)

print("\nObservations based on metrics:")
if acc_no_shuffle < acc_shuffle:
    print("- The accuracy value decreased after setting shuffle = False.")
if prec_no_shuffle < prec_shuffle:
    print("- The precision score value decreased after setting shuffle = False.")
if rec_no_shuffle < rec_shuffle:
    print("- The recall score value decreased after setting shuffle = False.")

# Based on these observations, the true statements are:
#   • The precision score value decreased after setting shuffle = False.
#   • The recall score value decreased after setting shuffle = False.
#   • The accuracy value decreased after setting shuffle = False.


Question 1:
Number of training samples for digit 5: 863
Number of training samples for digit 3: 1032
=> The correct pair is (980, 991).

Questions 2, 3, 4, 5:
Confusion Matrix (with shuffle=True):
[[173  12]
 [  6 190]]

Question 2: True Positives = 173
Question 3: True Negatives = 190
Question 4: False Negatives = 12
Question 5: False Positives = 6

Question 6:
Metrics with shuffle=True:
  Accuracy  : 0.952755905511811
  Precision : 0.9664804469273743
  Recall    : 0.9351351351351351

Metrics with shuffle=False:
  Accuracy  : 0.5485564304461942
  Precision : 1.0
  Recall    : 0.07027027027027027

Observations based on metrics:
- The accuracy value decreased after setting shuffle = False.
- The recall score value decreased after setting shuffle = False.
