In [1]:
# check sklearn version
import sklearn
print(sklearn.__version__)

0.23.1


In [None]:
from sklearn.datasets import fetch_openml
mnist = fetch_openml('mnist_784', version=1)
mnist.keys()
X, y = mnist["data"], mnist["target"]

In [3]:
X.shape

(70000, 784)

In [4]:
y.shape

(70000,)

In [5]:
import time
start_time = time.time()   
from datetime import datetime
now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print("Start Time =", current_time)

# standardize the data as z-scores
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X.astype('float64'))


# split training and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.33, random_state=42)

print("--- %s seconds ---" % (time.time() - start_time))
now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print("End Time =", current_time)

Start Time = 13:52:48
--- 12.951472759246826 seconds ---
End Time = 13:53:01


In [8]:
# MLP
import time
start_time = time.time()   
from datetime import datetime
now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print("Start Time =", current_time)
print('start fitting MLP Model')    
from sklearn.neural_network import MLPClassifier
mlp_clf = MLPClassifier(hidden_layer_sizes=(28, 14,), random_state=1)
mlp_clf.fit(X_train, y_train)

mlp_clf.fit(X_train, y_train)
print('start making prediction') 
y_test_pred_mlp = mlp_clf.predict(X_test)
print('start calculating accuracy score') 
from sklearn.metrics import accuracy_score  
mlp_accuracy = accuracy_score(y_test, y_test_pred_mlp)
print("MLP Model Accuracy: %.2f%%" % (mlp_accuracy * 100.0))

# save the model to disk
import pickle
filename = '/Users/jiahuali1991/Dropbox/Machine Learning/Data/mnist/MLP_model_mnist.sav'
pickle.dump(mlp_clf, open(filename, 'wb'))

print("--- %s seconds ---" % (time.time() - start_time))
now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print("End Time =", current_time)

Start Time = 14:10:04
start fitting MLP Model
start making prediction
start calculating accuracy score
MLP Model Accuracy: 95.38%
--- 160.8486168384552 seconds ---
End Time = 14:12:45


In [12]:
# deep learning on original data
import time
start_time = time.time()   
from datetime import datetime
now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print("Start Time =", current_time)

import tensorflow as tf
from tensorflow import keras
tf.__version__
keras.__version__

model = keras.models.Sequential()
model.add(keras.layers.Flatten(input_shape=[784]))
model.add(keras.layers.Dense(512, activation="relu"))
model.add(keras.layers.Dense(256, activation="relu"))
model.add(keras.layers.Dense(128, activation="relu"))
model.add(keras.layers.Dense(64, activation="relu"))
model.add(keras.layers.Dense(32, activation="relu"))
model.add(keras.layers.Dense(16, activation="relu"))
model.add(keras.layers.Dense(10, activation="softmax"))

print(model.summary())

model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["acc"]) # sgd

filepath = '/Users/jiahuali1991/Dropbox/Machine Learning/Data/mnist/DeepNet_Model_mnist.h5'
# model.load_weights(filepath) #load previously trained model


history = model.fit(X_train, y_train, epochs=500, batch_size = 28, verbose = 1,
                    validation_data=(X_test, y_test),
                   callbacks = [
        keras.callbacks.ModelCheckpoint(filepath, monitor='val_loss', verbose=0, save_best_only=True, mode='auto'),
        keras.callbacks.EarlyStopping(monitor='val_loss', patience=15, verbose=0, mode='auto')
    ])

# we re-load the best weights once training is finished
model.load_weights(filepath)
'''
import matplotlib.pyplot as plt
pd.DataFrame(history.history).plot(figsize=(8, 5))
plt.grid(True)
plt.gca().set_ylim(0, 1) # set the vertical range to [0-1]
plt.show()
'''

y_test_pred_dl = model.predict_classes(X_test)
# model.evaluate(X_fft_test, y_test)
from sklearn.metrics import accuracy_score  
print("Deep Learning Model Accuracy: %.2f%%" % (accuracy_score(y_test, y_test_pred_dl) * 100.0))

print("--- %s seconds ---" % (time.time() - start_time))
now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print("End Time =", current_time)

Start Time = 10:58:26
Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
flatten_3 (Flatten)          (None, 784)               0         
_________________________________________________________________
dense_21 (Dense)             (None, 512)               401920    
_________________________________________________________________
dense_22 (Dense)             (None, 256)               131328    
_________________________________________________________________
dense_23 (Dense)             (None, 128)               32896     
_________________________________________________________________
dense_24 (Dense)             (None, 64)                8256      
_________________________________________________________________
dense_25 (Dense)             (None, 32)                2080      
_________________________________________________________________
dense_26 (Dense)             (No

In [None]:
y_test_pred_dl

In [13]:
# Random Forest on original data
import time
start_time = time.time()   
from datetime import datetime
now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print("Start Time =", current_time)
print('start fitting Random Forest Model')    
from sklearn.ensemble import RandomForestClassifier
rf_clf = RandomForestClassifier(n_jobs=-1)
rf_clf.fit(X_train, y_train)
print('start making prediction') 
y_test_pred_rf = rf_clf.predict(X_test)
print('start calculating accuracy score') 
from sklearn.metrics import accuracy_score  
rf_accuracy = accuracy_score(y_test, y_test_pred_rf)
print("Random Forest Model Accuracy: %.2f%%" % (rf_accuracy * 100.0))

# save the model to disk
import pickle
filename = '/Users/jiahuali1991/Dropbox/Machine Learning/Data/mnist/RandomForest_model_mnist.sav'
pickle.dump(rf_clf, open(filename, 'wb'))

print("--- %s seconds ---" % (time.time() - start_time))
now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print("End Time =", current_time)

Start Time = 11:03:29
start fitting Random Forest Model




start making prediction
start calculating accuracy score
Random Forest Model Accuracy: 94.17%
--- 4.685892105102539 seconds ---
End Time = 11:03:34


In [14]:
# Classification and Regression Trees (CART) on original data
import time
start_time = time.time()   
from datetime import datetime
now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print("Start Time =", current_time)
print('start fitting CART Model')    
from sklearn.tree import DecisionTreeClassifier
dt_clf = DecisionTreeClassifier(random_state=0)
dt_clf.fit(X_train, y_train)
print('start making prediction') 
y_test_pred_dt = dt_clf.predict(X_test)
print('start calculating accuracy score') 
from sklearn.metrics import accuracy_score  
dt_accuracy = accuracy_score(y_test, y_test_pred_dt)
print("CART Model Accuracy: %.2f%%" % (dt_accuracy * 100.0))
# save the model to disk
import pickle
filename = '/Users/jiahuali1991/Dropbox/Machine Learning/Data/mnist/CART_model_mnist.sav'
pickle.dump(dt_clf, open(filename, 'wb'))
print("--- %s seconds ---" % (time.time() - start_time))
now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print("End Time =", current_time)

Start Time = 11:03:34
start fitting CART Model
start making prediction
start calculating accuracy score
CART Model Accuracy: 86.40%
--- 22.003356218338013 seconds ---
End Time = 11:03:56


In [15]:
# KNN on original data
import time
start_time = time.time()   
from datetime import datetime
now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print("Start Time =", current_time)
print('start fitting KNN Model')    
from sklearn.neighbors import KNeighborsClassifier
knn_clf = KNeighborsClassifier(n_jobs = -1) 
knn_clf.fit(X_train, y_train)
print('start making prediction') 
y_test_pred_knn = knn_clf.predict(X_test)
print('start calculating accuracy score') 
from sklearn.metrics import accuracy_score  
knn_accuracy = accuracy_score(y_test, y_test_pred_knn)
print("KNN Model Accuracy: %.2f%%" % (knn_accuracy * 100.0))
# save the model to disk
import pickle
filename = '/Users/jiahuali1991/Dropbox/Machine Learning/Data/mnist/KNN_model_mnist.sav'
pickle.dump(knn_clf, open(filename, 'wb'))
print("--- %s seconds ---" % (time.time() - start_time))
now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print("End Time =", current_time)

Start Time = 11:03:56
start fitting KNN Model
start making prediction
start calculating accuracy score
KNN Model Accuracy: 94.29%
--- 754.6038410663605 seconds ---
End Time = 11:16:30


In [23]:
# Naive Bayes on original data
import time
start_time = time.time()   
from datetime import datetime
now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print("Start Time =", current_time)
print('start fitting Gaussian Naive Bayes Model')    
from sklearn.naive_bayes import GaussianNB
gnb_clf = GaussianNB()
gnb_clf.fit(X_train, y_train)
print('start making prediction') 
y_test_pred_gnb = gnb_clf.predict(X_test)
print('start calculating accuracy score') 
from sklearn.metrics import accuracy_score  
gnb_accuracy = accuracy_score(y_test, y_test_pred_gnb)
print("Gaussian Naive Bayes Model Accuracy: %.2f%%" % (gnb_accuracy * 100.0))
# save the model to disk
import pickle
filename = '/Users/jiahuali1991/Dropbox/Machine Learning/Data/mnist/NB_model_mnist.sav'
pickle.dump(gnb_clf, open(filename, 'wb'))
print("--- %s seconds ---" % (time.time() - start_time))
now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print("End Time =", current_time)

Start Time = 16:46:54
start fitting Gaussian Naive Bayes Model
start making prediction
start calculating accuracy score
Gaussian Naive Bayes Model Accuracy: 52.63%
--- 6.246715784072876 seconds ---
End Time = 16:47:00


In [16]:
# Logistic Regression on original data
import time
start_time = time.time()   
from datetime import datetime
now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print("Start Time =", current_time)
print('start fitting Logistic Regression Model')    
from sklearn.linear_model import LogisticRegression
lr_clf = LogisticRegression(random_state=0)
lr_clf.fit(X_train, y_train)
print('start making prediction') 
y_test_pred_lr = lr_clf.predict(X_test)
print('start calculating accuracy score') 
from sklearn.metrics import accuracy_score  
lr_accuracy = accuracy_score(y_test, y_test_pred_lr)
print("Logistic Regression Model Accuracy: %.2f%%" % (lr_accuracy * 100.0))
# save the model to disk
import pickle
filename = '/Users/jiahuali1991/Dropbox/Machine Learning/Data/mnist/LR_model_mnist.sav'
pickle.dump(lr_clf, open(filename, 'wb'))
print("--- %s seconds ---" % (time.time() - start_time))
now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print("End Time =", current_time)

Start Time = 11:16:31
start fitting Logistic Regression Model




start making prediction
start calculating accuracy score
Logistic Regression Model Accuracy: 91.14%
--- 628.726331949234 seconds ---
End Time = 11:26:59


In [17]:
# SGD on original data
import time
start_time = time.time()   
from datetime import datetime
now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print("Start Time =", current_time)
print('start fitting SGD Model') 
from sklearn.linear_model import SGDClassifier
sgd_clf = SGDClassifier()
sgd_clf.fit(X_train, y_train)
print('start making prediction') 
y_test_pred_sgd = sgd_clf.predict(X_test)
print('start calculating accuracy score') 
from sklearn.metrics import accuracy_score  
sgd_accuracy = accuracy_score(y_test, y_test_pred_sgd)
print("SGD Model Accuracy: %.2f%%" % (sgd_accuracy * 100.0))
# save the model to disk
import pickle
filename = '/Users/jiahuali1991/Dropbox/Machine Learning/Data/mnist/SGD_model_mnist.sav'
pickle.dump(sgd_clf, open(filename, 'wb'))
print("--- %s seconds ---" % (time.time() - start_time))
now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print("End Time =", current_time)

Start Time = 11:26:59
start fitting SGD Model
start making prediction
start calculating accuracy score
SGD Model Accuracy: 90.26%
--- 281.60627603530884 seconds ---
End Time = 11:31:41


In [18]:
# Light GBM on original data
import time
start_time = time.time()   
from datetime import datetime
now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print("Start Time =", current_time)
print('start fitting Light GBM Model') 
import lightgbm as lgb
gbm = lgb.LGBMClassifier()
gbm.fit(X_train, y_train)
print('start making prediction') 
y_test_pred_gbm = gbm.predict(X_test)
print('start calculating accuracy score') 
gbm_accuracy = accuracy_score(y_test, y_test_pred_gbm)
print("Light GBM Model Accuracy: %.2f%%" % (gbm_accuracy * 100.0))
# save the model to disk
import pickle
filename = '/Users/jiahuali1991/Dropbox/Machine Learning/Data/mnist/LightGBM_model_mnist.sav'
pickle.dump(gbm, open(filename, 'wb'))
print("--- %s seconds ---" % (time.time() - start_time))
now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print("End Time =", current_time)

Start Time = 11:31:41
start fitting Light GBM Model


This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


start making prediction
start calculating accuracy score
Light GBM Model Accuracy: 96.89%
--- 168.84644770622253 seconds ---
End Time = 11:34:30


In [19]:
# XGBoost on original data
import time
start_time = time.time()   
from datetime import datetime
now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print("Start Time =", current_time)
print('start fitting XGBoost Model') 
from xgboost import XGBClassifier
XGB_clf = XGBClassifier()
XGB_clf.fit(X_train, y_train)
print('start making prediction') 
y_test_pred_XGB = XGB_clf.predict(X_test)
print('start calculating accuracy score') 
XGB_accuracy = accuracy_score(y_test, y_test_pred_XGB)
print("XGBoost Model Accuracy: %.2f%%" % (XGB_accuracy * 100.0))
# save the model to disk
import pickle
filename = '/Users/jiahuali1991/Dropbox/Machine Learning/Data/mnist/XGBoost_model_mnist.sav'
pickle.dump(XGB_clf, open(filename, 'wb'))
print("--- %s seconds ---" % (time.time() - start_time))
now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print("End Time =", current_time)

Start Time = 11:34:30
start fitting XGBoost Model
start making prediction
start calculating accuracy score
XGBoost Model Accuracy: 93.41%
--- 2240.834710121155 seconds ---
End Time = 12:11:51


In [20]:
# CatBoost on original data
import time
start_time = time.time()   
from datetime import datetime
now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print("Start Time =", current_time)
import time
start_time = time.time() 
print('start fitting CatBoost Model') 
from catboost import CatBoostClassifier
catb_clf = CatBoostClassifier()
catb_clf.fit(X_train, y_train, verbose=0)
print('start making prediction') 
y_test_pred_catb = catb_clf.predict(X_test)
from sklearn.metrics import accuracy_score
print('start calculating accuracy score') 
catb_accuracy = accuracy_score(y_test, y_test_pred_catb)
print("Cat Boost Model Accuracy: %.2f%%" % (catb_accuracy * 100.0))
# save the model to disk
import pickle
filename = '/Users/jiahuali1991/Dropbox/Machine Learning/Data/mnist/CatBoost_model_mnist.sav'
pickle.dump(catb_clf, open(filename, 'wb'))
print("--- %s seconds ---" % (time.time() - start_time))
now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print("End Time =", current_time)

Start Time = 12:11:51
start fitting CatBoost Model
start making prediction
start calculating accuracy score
Cat Boost Model Accuracy: 96.77%
--- 1885.4042880535126 seconds ---
End Time = 12:43:16


In [21]:
# Support Vector Machine on original data
import time
start_time = time.time()   
from datetime import datetime
now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print("Start Time =", current_time)
print('start fitting Support Vector Machine Model') 
from sklearn import svm
svm_clf = svm.SVC(C=10000, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.1, kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
svm_clf.fit(X_train, y_train)
print('start making prediction') 
y_test_pred_svm = svm_clf.predict(X_test)
print('start calculating accuracy score') 
from sklearn.metrics import accuracy_score  
svm_accuracy = accuracy_score(y_test, y_test_pred_svm)
print("Support Vector Machine Model Accuracy: %.2f%%" % (svm_accuracy * 100.0))
# save the model to disk
import pickle
filename = '/Users/jiahuali1991/Dropbox/Machine Learning/Data/mnist/SVM_model_mnist.sav'
pickle.dump(svm_clf, open(filename, 'wb'))
print("--- %s seconds ---" % (time.time() - start_time))
now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print("End Time =", current_time)

Start Time = 12:43:16
start fitting Support Vector Machine Model
start making prediction
start calculating accuracy score
Support Vector Machine Model Accuracy: 19.27%
--- 14589.99098277092 seconds ---
End Time = 16:46:26


In [22]:
# AdaBoost on original data
import time
start_time = time.time()   
from datetime import datetime
now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print("Start Time =", current_time)
print('start fitting AdaBoost Model') 
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
ada_clf = AdaBoostClassifier(DecisionTreeClassifier(), n_estimators=200, algorithm='SAMME.R', learning_rate=0.5)
ada_clf.fit(X_train, y_train)
print('start making prediction') 
y_test_pred_ada = ada_clf.predict(X_test)
print('start calculating accuracy score') 
from sklearn.metrics import accuracy_score  
ada_accuracy = accuracy_score(y_test, y_test_pred_ada)
print("AdaBoost Model Accuracy: %.2f%%" % (ada_accuracy * 100.0))
# save the model to disk
import pickle
filename = '/Users/jiahuali1991/Dropbox/Machine Learning/Data/mnist/AdaBoost_model_mnist.sav'
pickle.dump(ada_clf, open(filename, 'wb'))
print("--- %s seconds ---" % (time.time() - start_time))
now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print("End Time =", current_time)

Start Time = 16:46:27
start fitting AdaBoost Model
start making prediction
start calculating accuracy score
AdaBoost Model Accuracy: 86.43%
--- 26.83395004272461 seconds ---
End Time = 16:46:54
