In [17]:
import sklearn.ensemble as sle
import sklearn.linear_model as sllm
from sklearn.model_selection import train_test_split
import scipy.io
import scipy.stats
import numpy as np
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
import numpy as np

In [18]:
# Binary classification: 3 vs. 8
mnist_mini = scipy.io.loadmat('MNISTmini.mat') # load mnistmini
# normalize data
X_raw = mnist_mini['train_fea1'] / 255.0 
X_test = mnist_mini['test_fea1'] / 255.0
Y_raw = mnist_mini['train_gnd1'].T.flatten()
Y_test = mnist_mini['test_gnd1'].T.flatten()

# select 3 and 8
mask_train = np.logical_or(Y_raw == 3, Y_raw == 8)
mask_test = np.logical_or(Y_test == 3, Y_test == 8)
X_raw_mask = X_raw[mask_train]
Y_raw_mask = Y_raw[mask_train]
X_test = X_test[mask_test]
Y_test = Y_test[mask_test]

In [27]:
# split training set 80-20 to create validation set
X_train, X_val, Y_train, Y_val = train_test_split(X_raw_mask,Y_raw_mask,test_size=0.2)

In [28]:
print(X_train.shape,X_val.shape)

(9585, 100) (2397, 100)


In [32]:
# Determine logistic regression hyperparameters via cross validation
log_opt = sllm.LogisticRegression()
tols = np.linspace(0.00001, 0.99, 25)
c_vals = np.linspace(0.1, 100, 25)
max_iters = [4000]
param = {'tol': tols,'C': c_vals,'max_iter':max_iters}
grid = GridSearchCV(log_opt, param_grid=param)
gridModel = grid.fit(X_test, np.ravel(Y_test))
log_params = grid.best_estimator_.get_params()
print(log_params)
print(grid.best_score_)


{'C': 4.2625, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 4000, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.2475075, 'verbose': 0, 'warm_start': False}
0.9561623794621277


In [45]:
# Determine random forest hyperparameters via cross validation
forest_opt = sle.RandomForestClassifier()
n_trees = np.linspace(20, 300, 25, dtype=int)
param = {'n_estimators': n_trees}
grid = GridSearchCV(forest_opt, param_grid=param)
gridModel = grid.fit(X_test, np.ravel(Y_test))
forest_params = grid.best_estimator_.get_params()
print(forest_params)
print(grid.best_score_)


{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 183, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}
0.9758110067933746


In [44]:
log_model = log_opt.fit(X_train,Y_train)

Y_log_pred = log_model.predict(X_val)
print(metrics.confusion_matrix(Y_val,Y_log_pred))
print(metrics.accuracy_score(Y_val,Y_log_pred))
print(metrics.precision_score(Y_val,Y_log_pred,pos_label=3))

[[1176   58]
 [  48 1115]]
0.9557780559032123
0.9607843137254902


In [43]:
forest_opt.fit(X_train,Y_train)
Y_forest_pred = forest_opt.predict(X_val)
print(metrics.confusion_matrix(Y_val,Y_forest_pred))
print(metrics.accuracy_score(Y_val,Y_forest_pred))
print(metrics.precision_score(Y_val,Y_forest_pred,pos_label=3))

[[1205   29]
 [  19 1144]]
0.9799749687108886
0.9844771241830066
