# EDA and Model Development

The goal of this notebook is to conduct EDA and develop a model to then shift into a production style format written in a script and containerized with docker.

In [30]:
import pandas as pd
from sklearn.datasets import load_digits
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import silhouette_score, accuracy_score

## Ingest

In [10]:
mnist = pd.DataFrame(load_digits().data)
target = pd.DataFrame(load_digits().target)

In [6]:
mnist.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,54,55,56,57,58,59,60,61,62,63
0,0.0,0.0,5.0,13.0,9.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,6.0,13.0,10.0,0.0,0.0,0.0
1,0.0,0.0,0.0,12.0,13.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,11.0,16.0,10.0,0.0,0.0
2,0.0,0.0,0.0,4.0,15.0,12.0,0.0,0.0,0.0,0.0,...,5.0,0.0,0.0,0.0,0.0,3.0,11.0,16.0,9.0,0.0
3,0.0,0.0,7.0,15.0,13.0,1.0,0.0,0.0,0.0,8.0,...,9.0,0.0,0.0,0.0,7.0,13.0,13.0,9.0,0.0,0.0
4,0.0,0.0,0.0,1.0,11.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,2.0,16.0,4.0,0.0,0.0


In [7]:
mnist.tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,54,55,56,57,58,59,60,61,62,63
1792,0.0,0.0,4.0,10.0,13.0,6.0,0.0,0.0,0.0,1.0,...,4.0,0.0,0.0,0.0,2.0,14.0,15.0,9.0,0.0,0.0
1793,0.0,0.0,6.0,16.0,13.0,11.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,6.0,16.0,14.0,6.0,0.0,0.0
1794,0.0,0.0,1.0,11.0,15.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,9.0,13.0,6.0,0.0,0.0
1795,0.0,0.0,2.0,10.0,7.0,0.0,0.0,0.0,0.0,0.0,...,2.0,0.0,0.0,0.0,5.0,12.0,16.0,12.0,0.0,0.0
1796,0.0,0.0,10.0,14.0,8.0,1.0,0.0,0.0,0.0,2.0,...,8.0,0.0,0.0,1.0,8.0,12.0,14.0,12.0,1.0,0.0


In [11]:
target.head()

Unnamed: 0,0
0,0
1,1
2,2
3,3
4,4


In [12]:
target.tail()

Unnamed: 0,0
1792,9
1793,0
1794,8
1795,9
1796,8


In [8]:
mnist.columns

RangeIndex(start=0, stop=64, step=1)

In [13]:
target.columns

RangeIndex(start=0, stop=1, step=1)

## EDA

In [14]:
target.value_counts()

3    183
1    182
5    182
4    181
6    181
9    180
7    179
0    178
2    177
8    174
dtype: int64

In [15]:
print('percentage of total for each digit')
(target.value_counts()/sum(target.value_counts()))*100

percentage of total for each digit


3    10.183639
1    10.127991
5    10.127991
4    10.072343
6    10.072343
9    10.016694
7     9.961046
0     9.905398
2     9.849750
8     9.682805
dtype: float64

In [17]:
mnist.isnull().sum().sum()

0

In [18]:
mnist.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,54,55,56,57,58,59,60,61,62,63
count,1797.0,1797.0,1797.0,1797.0,1797.0,1797.0,1797.0,1797.0,1797.0,1797.0,...,1797.0,1797.0,1797.0,1797.0,1797.0,1797.0,1797.0,1797.0,1797.0,1797.0
mean,0.0,0.30384,5.204786,11.835838,11.84808,5.781859,1.36227,0.129661,0.005565,1.993879,...,3.725097,0.206455,0.000556,0.279354,5.557596,12.089037,11.809126,6.764051,2.067891,0.364496
std,0.0,0.907192,4.754826,4.248842,4.287388,5.666418,3.325775,1.037383,0.094222,3.19616,...,4.919406,0.984401,0.02359,0.934302,5.103019,4.374694,4.933947,5.900623,4.090548,1.860122
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,1.0,10.0,10.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,11.0,10.0,0.0,0.0,0.0
50%,0.0,0.0,4.0,13.0,13.0,4.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,4.0,13.0,14.0,6.0,0.0,0.0
75%,0.0,0.0,9.0,15.0,15.0,11.0,0.0,0.0,0.0,3.0,...,7.0,0.0,0.0,0.0,10.0,16.0,16.0,12.0,2.0,0.0
max,0.0,8.0,16.0,16.0,16.0,16.0,16.0,15.0,2.0,16.0,...,16.0,13.0,1.0,9.0,16.0,16.0,16.0,16.0,16.0,16.0


## Model Development

In [35]:
X_train, X_test, y_train, y_test = train_test_split(mnist, target, test_size=0.3, random_state=123)

### Standerdized version

Get an understanding of where the base model is before tuning it. Will compare the base model against the tuned model on the test set to ensure through compairson that the tuned model hasn't become overfitted.

In [36]:
rf_sd = Pipeline([("scaler", StandardScaler()), ("model", RandomForestClassifier())])

#### Base model version

In [42]:
rf_sd_cv = cross_val_score(rf_sd, X_train, y_train.values.ravel(), cv=5, scoring = 'accuracy')
print("Scores:", rf_sd_cv)
print("Mean:", rf_sd_cv.mean())
print("Standard deviation:", rf_sd_cv.std())

Scores: [0.97619048 0.97619048 0.97609562 0.98007968 0.97211155]
Mean: 0.9761335609941189
Standard deviation: 0.0025201716468269478


#### Grid Search

In [46]:
rf_sd_grid_params = {'model__n_estimators': [100, 200, 400],
                     'model__criterion': ['gini', 'entropy'],
                     'model__max_features': ['auto', 'sqrt', 'log2'],
                     'model__ccp_alpha': [0, .01, .1, 1, 5]}

In [47]:
rf_sd_gs = GridSearchCV(estimator = rf_sd,
                        scoring = 'accuracy',
                        param_grid = rf_sd_grid_params,
                        n_jobs = 3,
                        cv = 5,
                        verbose = 1)

In [48]:
rf_sd_gs.fit(X_train, y_train.values.ravel())

Fitting 5 folds for each of 90 candidates, totalling 450 fits


GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('model', RandomForestClassifier())]),
             n_jobs=3,
             param_grid={'model__ccp_alpha': [0, 0.01, 0.1, 1, 5],
                         'model__criterion': ['gini', 'entropy'],
                         'model__max_features': ['auto', 'sqrt', 'log2'],
                         'model__n_estimators': [100, 200, 400]},
             scoring='accuracy', verbose=1)

In [49]:
rf_sd_gs.best_score_

0.978527161196484

In [50]:
rf_sd_gs.best_estimator_

Pipeline(steps=[('scaler', StandardScaler()),
                ('model',
                 RandomForestClassifier(ccp_alpha=0, n_estimators=400))])

#### Hyperparameter tuned version

In [51]:
rf_sd_hp = Pipeline([("scaler", StandardScaler()), ("model", RandomForestClassifier(ccp_alpha = 0, n_estimators = 400))])

In [52]:
rf_sd_hp_cv = cross_val_score(rf_sd_hp, X_train, y_train.values.ravel(), cv=5, scoring = 'accuracy')
print("Scores:", rf_sd_cv)
print("Mean:", rf_sd_cv.mean())
print("Standard deviation:", rf_sd_cv.std())

Scores: [0.97619048 0.97619048 0.97609562 0.98007968 0.97211155]
Mean: 0.9761335609941189
Standard deviation: 0.0025201716468269478


### Non-standerdized version