In [1]:
# MSDS 422
# Professor Anil Chaturvedi

# Assignment 5 
# in conjunction with and data source from Kaggle
# # # https://www.kaggle.com/c/digit-recognizer/data

# October 2020

# max_features : size of the random subsets of features to consider when splitting a node

<div style="text-align: right"><b>pkg imports & configuration</b></div>

In [2]:
# basix
import numpy as np
import pandas as pd
import seaborn as sns
import math
import random
import time

# prep
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

#modeling
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

# other
from sklearn.metrics import classification_report
from sklearn import tree
from sklearn.cluster import MiniBatchKMeans

%matplotlib inline   
import matplotlib.pyplot as plt

  import pandas.util.testing as tm


In [3]:
pd.options.display.float_format = "{:,.4f}".format
RANDOM_SEED = 8675309
responseVar = "label"

In [4]:
# setup for consolidated table, later
table = {"rfc":{"r2-train":None},
         "rfc_with_pca":{"r2-train":None},
         "adj_rfc_with_pca":{"r2-train":None},
        }

<div style="text-align: right"><b>BRING IN THE DATA</b></div>

In [5]:
mnist_train = pd.read_csv('train.csv')
mnist_test = pd.read_csv('test.csv')
mnist = pd.concat([mnist_test, mnist_train.drop(columns = "label")]) # combined dataset

<div style="text-align: right"><b>eda</b></div>

In [6]:
sum(mnist_train.isna().sum()) # checks for missing data

0

In [7]:
pd.DataFrame(mnist_train["label"].describe(include='all')) # AKA mnist_train.label.describe()

Unnamed: 0,label
count,42000.0
mean,4.4566
std,2.8877
min,0.0
25%,2.0
50%,4.0
75%,7.0
max,9.0


In [8]:
print("most common label (mode) :", mnist_train["label"].mode()[0])

most common label (mode) : 1


<div style="text-align: right"><b>splitting train data and folding, model set-up</b></div>

In [9]:
X = mnist_train.loc[:, mnist_train.columns != responseVar]
y = mnist_train[responseVar]

In [10]:
# splitting og data once for simplicity
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    train_size=0.7,
                                                    test_size=0.3,
                                                    shuffle=True,
                                                    random_state=RANDOM_SEED)

folds = KFold(n_splits = 3, shuffle = True, random_state = RANDOM_SEED)


param_grid = { 
    'n_estimators': [100, 200, 500], # nbr of trees in forest
    'max_features': ['auto', 'sqrt', 'log2'], # auto means bagged, 
    'max_depth' : [3, 4, 5],
}

model = RandomForestClassifier(bootstrap=True) 

<div style="text-align: right"><b>model 1 : intial, raw random forest classifier</b></div>

In [11]:
print("approximate start time : ", time.asctime())
start1 = time.perf_counter() # CPU time or real time in seconds since process start or since prev call to perf_counter(). v precise.

approximate start time :  Sat Oct 17 15:49:54 2020


In [12]:
rfc = GridSearchCV(estimator = model, 
                   param_grid = param_grid, 
                   scoring= 'r2', 
                   cv = folds, 
                   verbose = 1,
                   return_train_score=True) 

model_fit_rfc = rfc.fit(X_train, y_train.values.ravel())  

Fitting 3 folds for each of 27 candidates, totalling 81 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  81 out of  81 | elapsed: 10.3min finished


In [13]:
stop1 = time.perf_counter()

In [14]:
tm1 = round(stop1 - start1)
print("approximate completion time, initial rfc : ", time.asctime())
print("time of execution in seconds : ", tm1)
print("time of execution in minutes : ", round(tm1/60, 1))
table["rfc"]["r2-train"] = rfc.score(X,y)

approximate completion time, initial rfc :  Sat Oct 17 16:00:44 2020
time of execution in seconds :  650
time of execution in minutes :  10.8


In [15]:
print("model used : ", rfc.best_estimator_)
#model_fit_rfc.best_estimator_

model used :  RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=5, max_features='sqrt', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)


In [16]:
mnist_test_pred = rfc.predict(mnist_test)
rfc_preds = pd.DataFrame({'ImageId':range(1, 28001), 'Label':mnist_test_pred})
rfc_preds.to_csv("rfc_preds.csv", index = False)
rfc_preds.head() # test predictions

Unnamed: 0,ImageId,Label
0,1,2
1,2,0
2,3,9
3,4,7
4,5,2


<div style="text-align: right"><b>PCA on the combined train/test sets</b></div>

In [17]:
print("approximate start time : ", time.asctime())
start2 = time.perf_counter()

approximate start time :  Sat Oct 17 16:00:49 2020


In [18]:
scaled_mnist = mnist.apply(lambda x: (x - min(x))/(max(x) - min(x)), axis=1)
scaled_mnist.describe()[["pixel774", "pixel775", "pixel4"]] # response and random explanatory summary stats

Unnamed: 0,pixel774,pixel775,pixel4
count,70000.0,70000.0,70000.0
mean,0.0008,0.0004,0.0
std,0.0235,0.0167,0.0
min,0.0,0.0,0.0
25%,0.0,0.0,0.0
50%,0.0,0.0,0.0
75%,0.0,0.0,0.0
max,1.0,0.9961,0.0


In [19]:
pca = PCA(n_components = 0.95) # goal : to explain 95% explanatory variable variance
X_train_pca = pca.fit(mnist)
X_test_pca = pca.transform(mnist)

In [20]:
stop2 = time.perf_counter()
tm2 = round(stop2 - start2)
print("approximate completion time : ", time.asctime())
print("time of execution in seconds : ", tm2)
print("time of execution in minutes : ", round(tm2/60, 1))

approximate completion time :  Sat Oct 17 16:01:53 2020
time of execution in seconds :  64
time of execution in minutes :  1.1


In [21]:
print("optimized component count : ", pca.n_components_)

optimized component count :  154


<div style="text-align: right"><b>model 2 : rf classifier using identified PCA's derived from entire dataset</b></div>

In [22]:
print("approximate start time : ", time.asctime())
start3 = time.perf_counter()

approximate start time :  Sat Oct 17 16:01:53 2020


In [23]:
rfc_with_pca = GridSearchCV(estimator = model, 
                        param_grid = param_grid, 
                        scoring= 'r2', 
                        cv = folds, 
                        verbose = 1,
                        return_train_score=True) 

#model_fit_with_pca = rfc_with_pca.fit(X_train_pca, y_train.values.ravel())  

TypeError: Singleton array array(PCA(copy=True, iterated_power='auto', n_components=0.95, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False), dtype=object) cannot be considered a valid collection.

In [24]:
rfc_with_pca

GridSearchCV(cv=KFold(n_splits=3, random_state=8675309, shuffle=True),
             error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                

In [None]:
stop3 = time.perf_counter()
tm3 = round(stop3 - start3)
print("approximate completion time : ", time.asctime())
print("time of execution in seconds : ", tm3)
print("time of execution in minutes : ", round(tm3/60, 1))
table["rfc_with_pca"]["r2-train"] = rfc_with_pca.score(X_pca, y)

In [None]:
print("optimized model used : ", rfc_with_pca.best_estimator_)
#print("optimized model used : ", model_fit_with_pca.best_estimator_)

In [None]:
mnist_test_pred_with_pca = rfc_with_pca.predict(mnist_test)
rfc_with_pca_preds = pd.DataFrame({'ImageId':range(1,28001),'Label': mnist_test_pred_with_pca})
rfc_with_pcs_preds.to_csv("rfc_preds.csv", index = False)
rfc_with_pca_preds.head() # test predictions

<div style="text-align: right"><b>PCA on train set only</b></div>

In [None]:
print("approximate start time : ", time.asctime())
start4 = time.perf_counter()

In [None]:
pca_train = PCA(n_components = 0.95) # goal : to explain 95% explanatory variable variance
X_train_adj = pca_train.fit(mnist_train)
X_test_adj = pca_train.transform(mnist_train)

In [None]:
stop4 = time.perf_counter()
tm4 = round(stop4 - start4)
print("approximate completion time : ", time.asctime())
print("time of execution in seconds : ", tm4)
print("time of execution in minutes : ", round(tm4/60, 1))

In [None]:
print("optimized component count : ", pca_train.n_components_)

<div style="text-align: right"><b>model 3 : rf classifier using adjusted PCA : without any response variable data</b></div>

In [None]:
print("approximate start time : ", time.asctime())
start5 = time.perf_counter()

In [None]:
adj_rfc_with_pca = GridSearchCV(estimator = model, 
                        param_grid = param_grid, 
                        scoring= 'r2', 
                        cv = folds, 
                        verbose = 1,
                        return_train_score=True) 

model_fit_adj = adj_rfc_with_pca.fit(X_train_adj, y_train.values.ravel())  

In [None]:
stop5 = time.perf_counter()
tm5 = round(stop5 - start5)
print("approximate completion time : ", time.asctime())
print("time of execution in seconds : ", tm5)
print("time of execution in minutes : ", round(tm5/60, 1))
table["adj_rfc_with_pca"]["r2-train"] = adj_rfc_with_pca.score(X_train_adj, y_train) #?

In [None]:
print("model used : ", adj_rfc_with_pca.best_estimator_)

In [None]:
mnist_test_pred_adj = adj_rfc_with_pca.predict(mnist_test)
adj_rfc_preds = pd.DataFrame({'ImageId':range(1, 28001), 'Label':mnist_test_pred_adj})
adj_rfc_preds.to_csv("rfc_preds.csv", index = False)
adj_rfc_preds.head() # test predictions

In [None]:
table