### Bootstrap

#### Packages

In [1]:
import pandas as pkg_pandas
import math as pkg_math
from matplotlib import pyplot as pkg_plot
from sklearn import linear_model as pkg_linear_model
from sklearn import model_selection as pkg_model_selection
from sklearn import preprocessing as pkg_preprocessing
from sklearn import tree as pkg_tree
from sklearn import metrics as pkg_metrics
from sklearn import datasets as pkg_datasets
from sklearn import ensemble as pkg_ensemble
from sklearn import svm as pkg_svm
import seaborn as pkg_seaborn

#### Load Data

In [2]:
# Load scikit-learn's bundled digits dataset and list the attributes the
# returned container exposes (data, target, feature_names, ...).
loaded_data = pkg_datasets.load_digits()
dir(loaded_data)

['DESCR', 'data', 'feature_names', 'frame', 'images', 'target', 'target_names']

In [3]:
# Show the feature (pixel) names and the class labels shipped with the dataset.
# NOTE(review): the trailing "Column" label has no matching placeholder and
# looks like a leftover -- confirm before removing it.
print(
    "\nFeature Names = {}\nTarget Names = {}\nColumn".format(
        loaded_data.feature_names,
        loaded_data.target_names,
    )
)


Feature Names = ['pixel_0_0', 'pixel_0_1', 'pixel_0_2', 'pixel_0_3', 'pixel_0_4', 'pixel_0_5', 'pixel_0_6', 'pixel_0_7', 'pixel_1_0', 'pixel_1_1', 'pixel_1_2', 'pixel_1_3', 'pixel_1_4', 'pixel_1_5', 'pixel_1_6', 'pixel_1_7', 'pixel_2_0', 'pixel_2_1', 'pixel_2_2', 'pixel_2_3', 'pixel_2_4', 'pixel_2_5', 'pixel_2_6', 'pixel_2_7', 'pixel_3_0', 'pixel_3_1', 'pixel_3_2', 'pixel_3_3', 'pixel_3_4', 'pixel_3_5', 'pixel_3_6', 'pixel_3_7', 'pixel_4_0', 'pixel_4_1', 'pixel_4_2', 'pixel_4_3', 'pixel_4_4', 'pixel_4_5', 'pixel_4_6', 'pixel_4_7', 'pixel_5_0', 'pixel_5_1', 'pixel_5_2', 'pixel_5_3', 'pixel_5_4', 'pixel_5_5', 'pixel_5_6', 'pixel_5_7', 'pixel_6_0', 'pixel_6_1', 'pixel_6_2', 'pixel_6_3', 'pixel_6_4', 'pixel_6_5', 'pixel_6_6', 'pixel_6_7', 'pixel_7_0', 'pixel_7_1', 'pixel_7_2', 'pixel_7_3', 'pixel_7_4', 'pixel_7_5', 'pixel_7_6', 'pixel_7_7']
Target Names = [0 1 2 3 4 5 6 7 8 9]
Column


In [4]:
# Build one frame holding the 64 pixel features plus the label as the last column.
loaded_df = (
    pkg_pandas
    .DataFrame(data=loaded_data.data, columns=loaded_data.feature_names)
    .assign(target=loaded_data.target)
)
loaded_df.head()

Unnamed: 0,pixel_0_0,pixel_0_1,pixel_0_2,pixel_0_3,pixel_0_4,pixel_0_5,pixel_0_6,pixel_0_7,pixel_1_0,pixel_1_1,...,pixel_6_7,pixel_7_0,pixel_7_1,pixel_7_2,pixel_7_3,pixel_7_4,pixel_7_5,pixel_7_6,pixel_7_7,target
0,0.0,0.0,5.0,13.0,9.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,6.0,13.0,10.0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,12.0,13.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,11.0,16.0,10.0,0.0,0.0,1
2,0.0,0.0,0.0,4.0,15.0,12.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,11.0,16.0,9.0,0.0,2
3,0.0,0.0,7.0,15.0,13.0,1.0,0.0,0.0,0.0,8.0,...,0.0,0.0,0.0,7.0,13.0,13.0,9.0,0.0,0.0,3
4,0.0,0.0,0.0,1.0,11.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,16.0,4.0,0.0,0.0,4


### Analysis

#### Pre-Work

In [5]:
def get_score(model, train_X, train_Y, test_X, test_Y):
    """Fit ``model`` on the training split and score it on the test split.

    Args:
        model: Object exposing ``fit(X=..., y=...)`` and ``score(X=..., y=...)``.
            It is fitted in place.
        train_X: Training inputs passed to ``model.fit``.
        train_Y: Training targets passed to ``model.fit``.
        test_X: Held-out inputs passed to ``model.score``.
        test_Y: Held-out targets passed to ``model.score``.

    Returns:
        Whatever ``model.score`` returns for the test split.
    """
    model.fit(X=train_X, y=train_Y)
    test_score = model.score(X=test_X, y=test_Y)
    return test_score

#### Model: KFold with Various Estimators

In [6]:
# Initialize: baseline_df is bound to the same DataFrame object as loaded_df
# (plain assignment, no copy is made).
baseline_df = loaded_df

In [7]:
output_column_name = 'target'
# Feature matrix: every column except the label, converted to a NumPy array.
baseline_inputs = baseline_df.drop(output_column_name, axis='columns').to_numpy()
# Labels are kept as a pandas Series taken straight from the frame.
baseline_outputs = baseline_df[output_column_name]

In [8]:
# Peek at the feature row at position 2; slicing (2:3) keeps the result 2-D.
baseline_inputs[2:3]

array([[ 0.,  0.,  0.,  4., 15., 12.,  0.,  0.,  0.,  0.,  3., 16., 15.,
        14.,  0.,  0.,  0.,  0.,  8., 13.,  8., 16.,  0.,  0.,  0.,  0.,
         1.,  6., 15., 11.,  0.,  0.,  0.,  1.,  8., 13., 15.,  1.,  0.,
         0.,  0.,  9., 16., 16.,  5.,  0.,  0.,  0.,  0.,  3., 13., 16.,
        16., 11.,  5.,  0.,  0.,  0.,  0.,  3., 11., 16.,  9.,  0.]])

In [9]:
# Peek at the labels for rows 5 through 9.
baseline_outputs[5:10]

5    5
6    6
7    7
8    8
9    9
Name: target, dtype: int64

In [10]:
# 5-fold splitter; only n_splits is set, everything else is left at its default.
kf = pkg_model_selection.KFold(n_splits=5)
kf

KFold(n_splits=5, random_state=None, shuffle=False)

In [11]:
# Score five candidate estimators on identical K-fold splits.
# NOTE: LinearRegression.score reports R^2 while the classifiers report mean
# accuracy, so scores_lnr is not on the same scale as the other lists.
RANDOM_SEED = 42  # pins the tree/forest randomness so reruns give the same scores

# Factories (not instances) so that every fold trains a brand-new estimator.
model_factories = {
    'lnr': lambda: pkg_linear_model.LinearRegression(),
    'lgr': lambda: pkg_linear_model.LogisticRegression(max_iter=4000),
    'svm': lambda: pkg_svm.SVC(),
    'dst': lambda: pkg_tree.DecisionTreeClassifier(random_state=RANDOM_SEED),
    'rf': lambda: pkg_ensemble.RandomForestClassifier(n_estimators=40, random_state=RANDOM_SEED),
}
fold_scores = {model_key: [] for model_key in model_factories}

for train_index, test_index in kf.split(X=baseline_inputs, y=baseline_outputs):
    train_inputs, test_inputs = baseline_inputs[train_index], baseline_inputs[test_index]
    # KFold yields *positional* indices. Plain `series[idx]` is label-based and
    # only works by accident with a default RangeIndex, so select by position.
    train_outputs = baseline_outputs.iloc[train_index]
    test_outputs = baseline_outputs.iloc[test_index]
    for model_key, make_model in model_factories.items():
        fold_scores[model_key].append(get_score(
            make_model(),
            train_X=train_inputs, train_Y=train_outputs,
            test_X=test_inputs, test_Y=test_outputs,
        ))

# Keep the per-model list names that the reporting cell reads.
scores_lnr = fold_scores['lnr']
scores_lgr = fold_scores['lgr']
scores_svm = fold_scores['svm']
scores_dst = fold_scores['dst']
scores_rf = fold_scores['rf']

In [12]:
# Report the per-fold scores collected for each model, one line per model.
print("= = = ::: Scores ::: = = =")
score_report = [
    ("Linear Regression = {}", scores_lnr),
    ("Logistic Regression = {}", scores_lgr),
    ("Support Vector Machine = {}", scores_svm),
    ("Decision Trees = {}", scores_dst),
    ("Random Forest = {}", scores_rf),
]
for line_template, model_scores in score_report:
    print(line_template.format(model_scores))

= = = ::: Scores ::: = = =
Linear Regression = [0.4834604815239937, 0.5583602990992982, 0.5753452188928303, 0.5056632027016659, 0.40995457309755634]
Logistic Regression = [0.9305555555555556, 0.8777777777777778, 0.9415041782729805, 0.9387186629526463, 0.8997214484679665]
Support Vector Machine = [0.9694444444444444, 0.9472222222222222, 0.9832869080779945, 0.9888579387186629, 0.9415041782729805]
Decision Trees = [0.7722222222222223, 0.7361111111111112, 0.8105849582172702, 0.8328690807799443, 0.7855153203342619]
Random Forest = [0.9388888888888889, 0.9055555555555556, 0.9526462395543176, 0.958217270194986, 0.9331476323119777]


#### Model: Cross Validation

In [13]:
# Cross-validate linear regression with cross_val_score's default settings
# (5 folds; for a non-classifier estimator the default splitter is plain KFold).
pkg_model_selection.cross_val_score(
    estimator=pkg_linear_model.LinearRegression(),
    X=baseline_inputs,
    y=baseline_outputs,
)

array([0.48346048, 0.5583603 , 0.57534522, 0.5056632 , 0.40995457])

In [14]:
# Cross-validate logistic regression with the default 5 folds. For classifiers
# cross_val_score defaults to StratifiedKFold, so the folds are not the same
# as those produced by a plain KFold splitter.
pkg_model_selection.cross_val_score(
    estimator=pkg_linear_model.LogisticRegression(max_iter=5000),
    X=baseline_inputs,
    y=baseline_outputs,
)

array([0.925     , 0.87777778, 0.93871866, 0.93314763, 0.89693593])

In [15]:
# Cross-validate a default SVC with the default 5 (stratified) folds.
pkg_model_selection.cross_val_score(
    estimator=pkg_svm.SVC(),
    X=baseline_inputs,
    y=baseline_outputs,
)

array([0.96111111, 0.94444444, 0.98328691, 0.98885794, 0.93871866])

In [16]:
# Cross-validate a decision tree with the default 5 (stratified) folds.
# random_state pins the tree's internal randomness so a rerun of this cell
# reproduces the same scores.
pkg_model_selection.cross_val_score(
    pkg_tree.DecisionTreeClassifier(random_state=42),
    baseline_inputs,
    baseline_outputs,
)

array([0.78611111, 0.70833333, 0.79665738, 0.8356546 , 0.79387187])

In [17]:
# Sweep the number of trees and print the 5-fold cross-validation scores for
# each forest size. random_state is fixed so the comparison between sizes is
# not blurred by run-to-run randomness and the cell is reproducible.
for num_est in [20, 35, 60, 90, 140]:
    forest = pkg_ensemble.RandomForestClassifier(n_estimators=num_est, random_state=42)
    print(pkg_model_selection.cross_val_score(forest, baseline_inputs, baseline_outputs))

[0.925      0.88888889 0.93314763 0.95264624 0.90529248]
[0.93055556 0.93888889 0.97214485 0.94986072 0.93036212]
[0.93055556 0.89444444 0.96657382 0.95543175 0.92479109]
[0.925      0.9        0.96100279 0.97493036 0.93871866]
[0.92777778 0.90277778 0.95543175 0.96100279 0.91643454]
