In [1]:
import sklearn.datasets as dataset
import sklearn.ensemble as ensemble
import sklearn.model_selection as ms
import sklearn.metrics as m
import pandas as pd
import numpy as np
import sklearn.decomposition as decom
import sklearn.linear_model as lm
from timeit import default_timer as timer 

In [2]:
data = dataset.fetch_openml('mnist_784', version=1)

X = data.data
y = data.target

X_train, X_test = X[:60000], X[60000:]
y_train, y_test = y[:60000], y[60000:]

In [3]:
print(X_train.shape)
print(X_train.size)

(60000, 784)
47040000


# Base Model - Random Forest with Full Training Data

In [4]:
ran_clf = ensemble.RandomForestClassifier(random_state=42)

start = timer() # start of timer

ran_clf.fit(X_train, y_train) # training the model

end = timer() # end of timer 

print(end-start)

41.27652709999984


In [5]:
# Score on training data
ran_clf.score(X_train, y_train)

1.0

In [6]:
# Score on testing data
ran_clf.score(X_test, y_test)

0.9705

# Decompositing Data

In [7]:
pca = decom.PCA(n_components=0.95) # retain 95% of the variance in the data
X_reduced = pca.fit_transform(X)

X_train_reduced, X_test_reduced = X_reduced[:60000], X_reduced[60000:]

# Performance of Random Forest Model on Reduced Feature Set

In [8]:
new_ran_clf = ensemble.RandomForestClassifier(random_state=42)

start = timer() # start of timer

new_ran_clf.fit(X_train_reduced, y_train) # training the model

end = timer() # end of timer 

print(end-start)

95.91927740000006


In [9]:
# Performance on training dataset
new_ran_clf.score(X_train_reduced, y_train)

1.0

In [10]:
# Performance on testing dataset
new_ran_clf.score(X_test_reduced, y_test)

0.9498

# Logistic Regression on Full Training Data

In [11]:
log_reg = lm.LogisticRegression(multi_class="multinomial", solver="lbfgs", C=10, random_state=42)

start = timer() # start of timer

log_reg.fit(X_train, y_train) # training the model

end = timer() # end of timer 

print(end-start)

9.848613699999987


In [12]:
print(log_reg.score(X_train, y_train))
print(log_reg.score(X_test, y_test))

0.9341166666666667
0.9243


# Performance of Logistic Regression on Reduced Feature Set

In [13]:
new_log_reg = lm.LogisticRegression(multi_class="multinomial", solver="lbfgs", C=10, random_state=42)

start = timer() # start of timer

new_log_reg.fit(X_train_reduced, y_train) # training the model

end = timer() # end of timer 

print(end-start)

4.912646499999937


In [14]:
print(new_log_reg.score(X_train_reduced, y_train))
print(new_log_reg.score(X_test_reduced, y_test))

0.9217666666666666
0.92
