<a href="https://colab.research.google.com/github/hbechara/HertieML/blob/main/Lesson10Lab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Today:

1. Load the MNIST Dataset we used last week and split it into a training set and test set.
2. Train a Random Forest Classifier on the dataset and time how long it takes.
3. Evaluate the model on the test set.
4. Use PCA to reduce the dataset's dimensionality with an explained variance ratio of 95%.
5. Train a new classifier on the reduced dataset and evaluate it on the test set.

# Setup

In [None]:
# Python ≥3.5 is required
import re
import sys
assert sys.version_info >= (3, 5)

# Scikit-Learn ≥0.20 is required
import sklearn
# Compare the numeric (major, minor) pair. A plain string comparison is
# lexicographic, so e.g. "0.9" >= "0.20" would evaluate to True even though
# 0.9 is the older release. The regex tolerates suffixes such as "rc1"/".dev0".
assert tuple(map(int, re.match(r"(\d+)\.(\d+)", sklearn.__version__).groups())) >= (0, 20)

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
# (seeds NumPy's global random generator)
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
# Font sizes for the axis labels and for the x / y tick labels
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "dim_reduction"
# Joined as <root>/images/<chapter>; the directory is created up front
# (exist_ok=True makes re-running this cell harmless)
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current pyplot figure under IMAGES_PATH as <fig_id>.<fig_extension>.

    fig_id: base file name (no extension); also echoed in the log line.
    tight_layout: when true, plt.tight_layout() is applied before saving.
    fig_extension: file extension, also handed to savefig as the format.
    resolution: handed to savefig as dpi.
    """
    file_name = ".".join((fig_id, fig_extension))
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(os.path.join(IMAGES_PATH, file_name), format=fig_extension, dpi=resolution)

# Dataset

In [None]:
from sklearn.datasets import fetch_openml

# Download (or load from the local cache) MNIST as plain arrays rather than a DataFrame
mnist = fetch_openml('mnist_784', version=1, as_frame=False)
# Convert the labels to unsigned 8-bit integers
mnist["target"] = mnist["target"].astype(np.uint8)

In [None]:
from sklearn.model_selection import train_test_split

X = mnist["data"]
y = mnist["target"]

# Shuffled split using train_test_split's default sizes. random_state pins the
# shuffle so the partition no longer depends on the global NumPy RNG state or
# on the order in which cells were executed.
# NOTE(review): the following cell reassigns X_train / X_test / y_train / y_test
# with a fixed 60,000 / 10,000 slice — confirm which split is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
# First 60,000 rows for training, all remaining rows for testing
X_train, X_test = mnist['data'][:60000], mnist['data'][60000:]
y_train, y_test = mnist['target'][:60000], mnist['target'][60000:]

# Train a random forest classifier

In [None]:
# Thilo's solution

from sklearn.ensemble import RandomForestClassifier
import timeit

clasf_RF = RandomForestClassifier(random_state=42)

# Time only the fit call on the full-dimensional training data
print(">... Starting training of", type(clasf_RF).__name__)
start_time = timeit.default_timer()
clasf_RF.fit(X_train, y_train)
time_elapsed = timeit.default_timer() - start_time
print(">... Finished training in {} seconds.".format(round(time_elapsed, 3)))

In [None]:
# Thilo's solution
# Accuracy of the full-dimensional random forest on the test set
from sklearn.metrics import accuracy_score

y_pred = clasf_RF.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print("Accuracy Score for {}: {}".format(type(clasf_RF).__name__, acc))


In [None]:
# Diego's Solution
from sklearn.ensemble import RandomForestClassifier
# NOTE(review): make_classification is not used in this cell.
from sklearn.datasets import make_classification
import time

# Seed the forest so its trees (and therefore timing and accuracy) are
# repeatable and comparable with the other random forest in this notebook.
RF = RandomForestClassifier(random_state=42)
t0 = time.time()
RF.fit(X_train,y_train)
t1 = time.time()

# Elapsed fit time in seconds; report it so the measurement is actually visible.
total = t1-t0
print("Training took {:.3f} seconds.".format(total))

# Calculate Accuracy on the Test Set

Use PCA to reduce the dataset's dimensionality, with an explained variance ratio of 95%

In [None]:
# Thilo's solution
from sklearn.decomposition import PCA

# Fit PCA on the training rows only and reuse that projection for the test
# rows. Fitting on all of X lets test-set statistics leak into the features
# the classifiers are trained on. The existing X_train / X_test split is kept
# (no re-split) so that timings and accuracies stay comparable with the
# models trained on the full-dimensional data.
pca = PCA(n_components=0.95)  # keep enough components to explain 95% of the variance
X_train_red = pca.fit_transform(X_train)
X_test_red = pca.transform(X_test)
# Projection of the complete dataset with the train-fitted PCA; kept so any
# cell that still refers to X_red continues to work.
X_red = pca.transform(X)
print("Full dimensionality: {}".format(X.shape[1]))
print("Reduced dimensionality: {}".format(X_red.shape[1]))

Full dimensionality: 784
Reduced dimensionality: 154


Train a new Random Forest classifier on the reduced dataset and see how long it takes. Was training much faster?

In [None]:
# Thilo's solution
clasf_RF_red = RandomForestClassifier(random_state=42)

# Time only the fit call on the PCA-reduced training features
print(">... Starting training of", type(clasf_RF_red).__name__)
start_time = timeit.default_timer()
clasf_RF_red.fit(X_train_red, y_train)
time_elapsed = timeit.default_timer() - start_time
print(">... Finished training in {} seconds.".format(round(time_elapsed, 3)))

Evaluate the classifier on the test set: how does it compare to the previous classifier?

In [None]:
# Thilo's solution
# Accuracy of the reduced-feature random forest on the reduced test features
y_pred = clasf_RF_red.predict(X_test_red)
acc = accuracy_score(y_test, y_pred)
print("Accuracy Score for {}: {}".format(type(clasf_RF_red).__name__, acc))

Discussion: Why did the PCA version take more time? What happens if we replace the Random Forest Classifier with a Softmax classifier? Try it out.

In [None]:
# Thilo's solution: softmax
from sklearn.linear_model import LogisticRegression

# NOTE(review): recent scikit-learn releases deprecate the multi_class
# argument — confirm against the installed version before relying on it.
clasf_SoftMax = LogisticRegression(multi_class="multinomial", random_state=42)

# Time only the fit call on the full-dimensional training data
print(">... Starting training of", type(clasf_SoftMax).__name__)
start_time = timeit.default_timer()
clasf_SoftMax.fit(X_train, y_train)
time_elapsed = timeit.default_timer() - start_time
print(">... Finished training in {} seconds.".format(round(time_elapsed, 3)))

In [None]:
# Thilo's solution: softmax prediction
# Accuracy of the full-dimensional softmax model on the test set
y_pred = clasf_SoftMax.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print("Accuracy Score for {}: {}".format(type(clasf_SoftMax).__name__, acc))

In [None]:
# Thilo's solution: softmax
# Training on the PCA-reduced training features
# NOTE(review): recent scikit-learn releases deprecate the multi_class
# argument — confirm against the installed version before relying on it.
clasf_SoftMax_red = LogisticRegression(multi_class="multinomial", random_state=42)

# Time only the fit call
print(">... Starting training of", type(clasf_SoftMax_red).__name__)
start_time = timeit.default_timer()
clasf_SoftMax_red.fit(X_train_red, y_train)
time_elapsed = timeit.default_timer() - start_time
print(">... Finished training in {} seconds.".format(round(time_elapsed, 3)))

In [None]:
# Thilo's solution: prediction on the reduced test features
y_pred = clasf_SoftMax_red.predict(X_test_red)
acc = accuracy_score(y_test, y_pred)
print("Accuracy Score for {}: {}".format(type(clasf_SoftMax_red).__name__, acc))

In [None]:
# Thilo's solution: basic SGD classifier
from sklearn.linear_model import SGDClassifier

clasf_SGD = SGDClassifier(random_state=42)

# Time only the fit call on the full-dimensional training data
print(">... Starting training of", type(clasf_SGD).__name__)
start_time = timeit.default_timer()
clasf_SGD.fit(X_train, y_train)
time_elapsed = timeit.default_timer() - start_time
print(">... Finished training in {} seconds.".format(round(time_elapsed, 3)))

In [None]:
# Accuracy of the full-dimensional SGD classifier on the test set
y_pred = clasf_SGD.predict(X_test)
acc = accuracy_score(y_test, y_pred)
print("Accuracy Score for {}: {}".format(type(clasf_SGD).__name__, acc))

In [None]:
# SGD classifier on the PCA-reduced features
from sklearn.linear_model import SGDClassifier

clasf_SGD_red = SGDClassifier(random_state=42)

# Time only the fit call
print(">... Starting training of", type(clasf_SGD_red).__name__)
start_time = timeit.default_timer()
clasf_SGD_red.fit(X_train_red, y_train)
time_elapsed = timeit.default_timer() - start_time
print(">... Finished training in {} seconds.".format(round(time_elapsed, 3)))

In [None]:
# Accuracy of the reduced-feature SGD classifier on the reduced test features
y_pred = clasf_SGD_red.predict(X_test_red)
acc = accuracy_score(y_test, y_pred)
print("Accuracy Score for {}: {}".format(type(clasf_SGD_red).__name__, acc))

In [None]:
#Diego SVM
# Fits one SVC on the PCA-reduced features and one on the full features,
# timing each fit and scoring each on the matching test features.

# NOTE(review): pca2 / X_reduced are not referenced again in this cell; the
# SVCs below are fit on X_train_red / X_train from earlier cells. pca2 is also
# fitted on all of X (train and test rows) — confirm before reusing X_reduced.
pca2 = PCA(n_components=0.95)
X_reduced = pca2.fit_transform(X)

from sklearn.svm import SVC
import time

SVM_clf_reduced = SVC()

# Fit time in seconds for the reduced-feature model
t0 = time.time()
SVM_clf_reduced.fit(X_train_red,y_train)
t1 = time.time()
total_reduced = t1-t0


# Fit time in seconds for the full-feature model (t0 / t1 are reused)
SVM_clf_normal = SVC()
t0 = time.time()
SVM_clf_normal.fit(X_train,y_train)
t1 = time.time()
total_normal = t1-t0


# .score() on a classifier returns mean accuracy on the given data
score_reduced = SVM_clf_reduced.score(X_test_red, y_test)

score_normal =  SVM_clf_normal.score(X_test, y_test)

# Printed in order: reduced fit time, full fit time, reduced accuracy, full accuracy
print(total_reduced)
print(total_normal)
print(score_reduced)
print(score_normal) 



132.16876983642578
455.4890456199646
0.9828
0.9786285714285714


Now let's use the reduced dataset