In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier
import time
from sklearn import metrics

Construct the training and test sets.

In [4]:
(X_train, y_train), (X_test, y_test) = tf.keras.datasets.fashion_mnist.load_data()
assert X_train.shape == (60000, 28, 28)
assert X_test.shape == (10000, 28, 28)
assert y_train.shape == (60000,)
assert y_test.shape == (10000,)

Reshape X_train such that it becomes a 2D array.

In [5]:
X_train = np.reshape(X_train, (4, -1))
X_train
y_train = np.reshape(y_train, (4, -1))
y_train

array([[9, 0, 0, ..., 0, 5, 6],
       [2, 9, 2, ..., 7, 0, 8],
       [3, 3, 7, ..., 6, 7, 8],
       [2, 9, 6, ..., 3, 0, 5]], dtype=uint8)

Project X_train onto the hyperplane defined by the first d = 4 principal components. Scale the features.

In [6]:
pca = PCA(n_components=4)
x2d = pd.DataFrame(data = pca.fit_transform(X_train), 
                   columns = ['principal component 1', 
                              'principal component 2',
                              'principal component 3',
                              'principal component 4'
                             ]
                  )
x2d

Unnamed: 0,principal component 1,principal component 2,principal component 3,principal component 4
0,164391.225006,-129965.632373,77626.681957,6.047908e-08
1,88189.855787,163236.620051,-123334.766385,6.047908e-08
2,-113112.30911,88370.423093,170378.718304,6.047908e-08
3,-139468.771683,-121641.410772,-124670.633875,6.047908e-08


In [7]:
pca = PCA(n_components=4)
y2d = pd.DataFrame(data = pca.fit_transform(y_train), 
                   columns = ['principal component 1', 
                              'principal component 2',
                              'principal component 3',
                              'principal component 4'
                             ]
                  )
y2d

Unnamed: 0,principal component 1,principal component 2,principal component 3,principal component 4
0,-189.932489,-170.429953,-164.112479,1.819968e-14
1,36.443076,-125.011688,272.550188,1.819968e-14
2,270.159045,17.203205,-138.961379,1.819968e-14
3,-116.669633,278.238436,30.52367,1.819968e-14


In [8]:
X_scaled = pd.DataFrame(data = StandardScaler().fit_transform(x2d))
X_scaled

Unnamed: 0,0,1,2,3
0,1.269744,-1.010685,0.605199,-1.181401
1,0.681171,1.269419,-0.961552,1.186049
2,-0.87367,0.687218,1.32832,-0.775672
3,-1.077245,-0.945951,-0.971967,0.771954


In [9]:
y_scaled = pd.DataFrame(data = StandardScaler().fit_transform(y2d))
y_scaled

Unnamed: 0,0,1,2,3
0,-1.078743,-0.974335,-0.941798,-6.310887e-30
1,0.206983,-0.714683,1.564093,-3.155444e-30
2,1.534399,0.098349,-0.797462,6.310887e-30
3,-0.662639,1.590669,0.175167,-6.310887e-30


Display explained_variance_ratio_ of the PCA obtained from step 2.

In [10]:
print(pca.explained_variance_ratio_)

[3.37098486e-01 3.32712698e-01 3.30188816e-01 3.60182218e-33]


Compute and display the number of principal components required to obtain
    25% variance
    50% variance
    75% variance
    95% variance

In [11]:
pca = PCA()
pca.fit(X_scaled)
cumsum = np.cumsum(pca.explained_variance_ratio_)
a = np.argmax(cumsum >= 0.25) + 1
a

1

In [12]:
pca = PCA()
pca.fit(X_scaled)
cumsum = np.cumsum(pca.explained_variance_ratio_)
b = np.argmax(cumsum >= 0.50) + 1
b

1

In [13]:
pca = PCA()
pca.fit(X_scaled)
cumsum = np.cumsum(pca.explained_variance_ratio_)
c = np.argmax(cumsum >= 0.75) + 1
c

2

In [14]:
pca = PCA()
pca.fit(X_scaled)
cumsum = np.cumsum(pca.explained_variance_ratio_)
d = np.argmax(cumsum >= 0.95) + 1
d

3

Apply PCA to compress X_train such that 75% of its variance is preserved.

In [15]:
pca = PCA(n_components = 2)
X_reduced = pca.fit_transform(X_scaled)
X_reduced

array([[-1.6710821 , -1.04341417],
       [ 1.67699743, -0.95356974],
       [-1.09729462,  1.06181355],
       [ 1.09137928,  0.93517036]])

In [None]:
pca = PCA(n_components = 2)
y_reduced = pca.fit_transform(y_scaled)
y_reduced

Compare the size of the original X_train and the one obtained from step 5.

In [16]:
X_train.size

47040000

In [17]:
X_reduced.size

8

Pick your favorite classifier. Compare the computation time of the classifier.fit() between using the original dataset and using the compressed dataset obtained from step 5.

In [None]:
# Grab Currrent Time Before Running the Code
start = time.time()

clf = DecisionTreeClassifier(max_depth=2)
clf.fit(X_train,y_train)

# Grab Currrent Time After Running the Code
end = time.time()

#Subtract Start Time from The End Time
total_time = end - start
print("\n"+ str(total_time))

In [None]:
# Grab Currrent Time Before Running the Code
start = time.time()

clf_reduced = DecisionTreeClassifier(max_depth=2)
clf_reduced.fit(X_reduced,y_reduced)

# Grab Currrent Time After Running the Code
end = time.time()

#Subtract Start Time from The End Time
total_time = end - start
print("\n"+ str(total_time))

In [None]:
print(metrics.accuracy_score(y_test, clf.predict(X_test)))

In [None]:
print(metrics.accuracy_score(y_test, clf_reduced.predict(X_test)))