In [1]:
# Python ≥3.5 is required
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
assert sklearn.__version__ >= "0.20"

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "dim_reduction"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGES_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

In [2]:
from sklearn.datasets import fetch_openml

mnist = fetch_openml('mnist_784', version=1, as_frame=False)
mnist.target = mnist.target.astype(np.uint8)

In [3]:
from sklearn.model_selection import train_test_split

X = mnist["data"]
y = mnist["target"]

X_train, X_test, y_train, y_test = train_test_split(X, y)

In [4]:
X_train = mnist['data'][:60000]
y_train = mnist['target'][:60000]

X_test = mnist['data'][60000:]
y_test = mnist['target'][60000:]

In [24]:
import time
import resource 
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score



In [51]:
# Train a random forest classifier 
                 
clf = RandomForestClassifier(max_depth = 2, random_state = 42)


In [30]:

time_start = time.perf_counter()

clf.fit(X_train, y_train)

time_elapsed = (time.perf_counter() - time_start)
memMb=resource.getrusage(resource.RUSAGE_SELF).ru_maxrss/1024.0/1024.0
print ("%5.1f secs %5.1f MByte" % (time_elapsed,memMb))

  6.6 secs 2219.6 MByte


In [35]:
y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print("Accuracy Score for {}: {}".format(clf.__class__.__name__, acc))

Accuracy Score for RandomForestClassifier: 0.6452


In [52]:

pca = PCA(n_components = 0.95, random_state = 42)
X_reduced = pca.fit_transform(X_train)


clf2 = RandomForestClassifier(max_depth = 2, random_state = 42)


In [53]:

time_start = time.perf_counter()

clf2.fit(X_reduced, y_train)

time_elapsed = (time.perf_counter() - time_start)
memMb=resource.getrusage(resource.RUSAGE_SELF).ru_maxrss/1024.0/1024.0
print ("%5.1f secs %5.1f MByte" % (time_elapsed,memMb))

 24.9 secs 2289.0 MByte


In [57]:
pca = PCA(n_components = 0.95, random_state = 42)
X_test_reduced = pca.fit_transform(X_test)


In [58]:

y_pred2 = clf2.predict(X_test_reduced)
acc = accuracy_score(y_test, y_pred2)
print("Accuracy Score for {}: {}".format(clf2.__class__.__name__, acc))

ValueError: X has 149 features, but DecisionTreeClassifier is expecting 154 features as input.