In [1]:
# Now, we are going to verify how the K-Means algorithm can improve model accuracy
# when it is used as a preprocessing step on all instances

from sklearn.datasets import load_digits

# Fetch the digits dataset, then pull out the image data (X_digits)
# and the matching labels (y_digits).
digits = load_digits()
X_digits, y_digits = digits.data, digits.target

In [2]:
from sklearn.model_selection import train_test_split

# Hold out a test set: 25% of the samples are reserved for testing and the
# remaining 75% are used for training (this matches the default proportions).
# The fixed random_state keeps the split the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X_digits,
    y_digits,
    test_size=0.25,
    random_state=42,
)

In [3]:
from sklearn.linear_model import LogisticRegression

# Baseline model: logistic regression trained directly on the original features.
# max_iter is raised to 1000 so the solver has enough iterations to converge.
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_train, y_train)

# Score the fitted model on the held-out test set and report the accuracy.
baseline_accuracy = log_reg.score(X_test, y_test)
print(f"The accuracy of the Logistic Regressor is: {baseline_accuracy:.4f}")

# The Logistic Regressor achieves around 97.33% accuracy — this will serve as our performance baseline.

The accuracy of the Logistic Regressor is: 0.9733


In [6]:
# Let's see if we can improve classification performance by using K-Means as a preprocessing step.
# The idea is to transform the original image data into a new feature space using cluster distances,
# and then feed this transformed data into a Logistic Regression classifier.

import warnings
from sklearn.exceptions import ConvergenceWarning
from sklearn.pipeline import Pipeline
from sklearn.cluster import KMeans

# Register "ignore" filters for two noisy warning families:
#   - ConvergenceWarning, emitted by LogisticRegression
#   - the joblib/loky message about not finding the number of physical cores
for filter_kwargs in (
    {"category": ConvergenceWarning},
    {"message": "Could not find the number of physical cores*"},
):
    warnings.filterwarnings("ignore", **filter_kwargs)

# Two-stage pipeline:
#   "kmeans"  - K-Means with 50 clusters; each sample is re-expressed as a
#               vector of distances to the cluster centers.
#   "log_reg" - Logistic Regression trained on those transformed features
#               (max_iter raised to 1000 to help convergence).
kmeans_step = ("kmeans", KMeans(n_clusters=50, random_state=42))
log_reg_step = ("log_reg", LogisticRegression(max_iter=1000))
pipeline = Pipeline(steps=[kmeans_step, log_reg_step])

# Train both stages on the training split.
pipeline.fit(X_train, y_train)

# Score the whole pipeline on the test split and report the accuracy.
pipeline_accuracy = pipeline.score(X_test, y_test)
print(f"The accuracy of the classification pipeline is: {pipeline_accuracy:.4f}")

The accuracy of the classification pipeline is: 0.9711


In [5]:
# We established the number of clusters arbitrarily, but we can determine the best value of k
# by finding which one performs best under cross-validation. We can use GridSearchCV to that end.

import warnings
from sklearn.exceptions import ConvergenceWarning
from sklearn.model_selection import GridSearchCV

# Keep LogisticRegression's ConvergenceWarning out of the cell output.
warnings.filterwarnings(action="ignore", category=ConvergenceWarning)

# Candidate cluster counts for the "kmeans" step: every k from 2 through 99.
param_grid = {"kmeans__n_clusters": range(2, 100)}

# Grid search over k, evaluating each candidate with 3-fold cross-validation
# on the training split.
grid_clf = GridSearchCV(estimator=pipeline, param_grid=param_grid, cv=3)
grid_clf.fit(X_train, y_train)

# Report the selected k, then the accuracy of the search object on the test split.
best_k = grid_clf.best_params_["kmeans__n_clusters"]
print(f"The best amount of clusters is: {best_k}")
tuned_accuracy = grid_clf.score(X_test, y_test)
print(f"The accuracy of the new classification pipeline is: {tuned_accuracy:.4f}")

The best amount of clusters is: 88
The accuracy of the new classification pipeline is: 0.9822
