# Capstone Project Part 08: Model Benchmarks

## Import libraries and modules

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.decomposition import PCA

from xgboost import XGBClassifier

from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.regularizers import l2
from keras.optimizers import Adam
from keras.wrappers.scikit_learn import KerasClassifier

%matplotlib inline

Using TensorFlow backend.


## Load preprocessed data

Load `preprocessed.csv` from `data` folder into a pandas DataFrame.

In [2]:
df = pd.read_csv('../data/preprocessed.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3452 entries, 0 to 3451
Columns: 171 entries, year to sector_Wireless Telecommunication Services
dtypes: float64(37), int64(133), object(1)
memory usage: 4.5+ MB


---

## Set up X and y

In [3]:
target = 'cap'
X = df[[f for f in df.columns if f not in target and f != 'security' and f != 'year']]
y = df[target]

---

## Train/Test Split

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y)

---

## Instantiate models

In [5]:
# Normalize X
ss = StandardScaler()

In [6]:
lr = RandomizedSearchCV(
    estimator=Pipeline([
        ('ss', ss), 
        ('clf', LogisticRegression(max_iter=10_000, multi_class='multinomial'))
    ]),
    param_distributions={
        'clf__penalty': ['l2', 'none'],
        'clf__C': np.logspace(-5, 2), 
        'clf__class_weight': [None, 'balanced'],
        'clf__solver': ['newton-cg', 'sag', 'lbfgs']
    },
    n_jobs=-1,
    cv=5,
    verbose=1
)

In [7]:
lr_l1 = RandomizedSearchCV(
    estimator=Pipeline([
        ('ss', ss), 
        ('clf', LogisticRegression(penalty='l1', solver='saga', max_iter=10_000, multi_class='multinomial'))
    ]),
    param_distributions={
        'clf__C': np.logspace(-5, 2), 
        'clf__class_weight': [None, 'balanced']
    },
    n_jobs=-1,
    cv=5,
    verbose=1
)

In [8]:
lr_elasticnet = RandomizedSearchCV(
    estimator=Pipeline([
        ('ss', ss), 
        ('clf', LogisticRegression(penalty='elasticnet', solver='saga', max_iter=10_000, multi_class='multinomial'))
    ]),
    param_distributions={
        'clf__C': np.logspace(-5, 2), 
        'clf__class_weight': [None, 'balanced'],
        'clf__l1_ratio': np.linspace(0.1, 1)
    },
    n_jobs=-1,
    cv=5,
    verbose=1
)

In [9]:
knn = RandomizedSearchCV(
    estimator=Pipeline([
        ('ss', ss), 
        ('clf', KNeighborsClassifier())
    ]),
    param_distributions={
        'clf__n_neighbors': np.arange(1, 22, 2), 
        'clf__weights': ['uniform', 'distance'],
        'clf__metric': ['euclidean','manhattan']
    },
    n_jobs=-1,
    cv=5,
    verbose=1
)

In [10]:
bnb = RandomizedSearchCV(
    estimator=Pipeline([
        ('ss', ss), 
        ('clf', BernoulliNB())
    ]),
    param_distributions={
        'clf__alpha': np.logspace(-5, 2)
    },
    n_jobs=-1,
    cv=5,
    verbose=1
)

In [11]:
gnb = RandomizedSearchCV(
    estimator=Pipeline([
        ('ss', ss), 
        ('clf', GaussianNB())
    ]),
    param_distributions={
        'clf__var_smoothing': np.logspace(-9, -4)
    },
    n_jobs=-1,
    cv=5,
    verbose=1
)

In [12]:
tree = RandomizedSearchCV(
    estimator=Pipeline([
        ('ss', ss), 
        ('clf', DecisionTreeClassifier())
    ]),
    param_distributions={
        'clf__max_depth': list(np.arange(1, 5)) + [None],
        'clf__min_samples_split': np.arange(2, 21),
        'clf__min_samples_leaf': np.arange(1, 8),
        'clf__class_weight': [None, 'balanced']
    },
    n_jobs=-1,
    cv=5,
    verbose=1
)

In [13]:
bagged = RandomizedSearchCV(
    estimator=Pipeline([
        ('ss', ss), 
        ('clf', BaggingClassifier(DecisionTreeClassifier()))
    ]),
    param_distributions={
        'clf__base_estimator__max_depth': list(np.arange(1, 5)) + [None],
        'clf__base_estimator__min_samples_split': np.arange(2, 21),
        'clf__base_estimator__min_samples_leaf': np.arange(1, 8),
        'clf__base_estimator__class_weight': [None, 'balanced'],
        'clf__n_estimators': np.arange(1, 150)
    },
    n_jobs=-1,
    cv=5,
    verbose=1
)

In [14]:
rf = RandomizedSearchCV(
    estimator=Pipeline([
        ('ss', ss), 
        ('clf', RandomForestClassifier())
    ]),
    param_distributions={
        'clf__max_depth': np.arange(1, 5),
        'clf__n_estimators': np.arange(1, 150),
        'clf__min_samples_split': np.arange(2, 21),
        'clf__min_samples_leaf': np.arange(1, 8),
        'clf__class_weight': [None, 'balanced', 'balanced_subsample']
    },
    n_jobs=-1,
    cv=5,
    verbose=1
)

In [15]:
et = RandomizedSearchCV(
    estimator=Pipeline([
        ('ss', ss), 
        ('clf', ExtraTreesClassifier())
    ]),
    param_distributions={
        'clf__max_depth': np.arange(1, 5),
        'clf__n_estimators': np.arange(1, 150),
        'clf__min_samples_split': np.arange(2, 21),
        'clf__min_samples_leaf': np.arange(1, 8),
        'clf__class_weight': [None, 'balanced', 'balanced_subsample']
    },
    n_jobs=-1,
    cv=5,
    verbose=1
)

In [16]:
ada = RandomizedSearchCV(
    estimator=Pipeline([
        ('ss', ss), 
        ('clf', AdaBoostClassifier(base_estimator=DecisionTreeClassifier()))
    ]),
    param_distributions={
        'clf__base_estimator__max_depth': list(np.arange(1, 5)) + [None],
        'clf__base_estimator__min_samples_split': np.arange(2, 21),
        'clf__base_estimator__min_samples_leaf': np.arange(1, 8),
        'clf__base_estimator__class_weight': [None, 'balanced'],
        'clf__n_estimators': np.arange(1, 150), 
        'clf__learning_rate': np.linspace(0.1, 1)
    },
    n_jobs=-1,
    cv=5,
    verbose=1
)

In [17]:
gb = RandomizedSearchCV(
    estimator=Pipeline([
        ('ss', ss), 
        ('clf', GradientBoostingClassifier())
    ]),
    param_distributions={
        'clf__max_depth': np.arange(1, 5),
        'clf__n_estimators': np.arange(1, 150), 
        'clf__min_samples_split': np.arange(2, 21),
        'clf__min_samples_leaf': np.arange(1, 8),
        'clf__learning_rate': np.linspace(0.1, 1)
    },
    n_jobs=-1,
    cv=5,
    verbose=1
)

In [18]:
vote = RandomizedSearchCV(
    estimator=Pipeline([
        ('ss', ss), 
        ('clf', VotingClassifier([
            ('tree', DecisionTreeClassifier()), 
            ('ada', AdaBoostClassifier(base_estimator=DecisionTreeClassifier())), 
            ('gb', GradientBoostingClassifier())
        ]))
    ]),
    param_distributions={
        'clf__tree__max_depth': list(np.arange(1, 5)) + [None],
        'clf__tree__min_samples_split': np.arange(2, 21),
        'clf__tree__min_samples_leaf': np.arange(1, 8),
        'clf__tree__class_weight': [None, 'balanced'],
        'clf__ada__base_estimator__max_depth': list(np.arange(1, 5)) + [None],
        'clf__ada__base_estimator__min_samples_split': np.arange(2, 21),
        'clf__ada__base_estimator__min_samples_leaf': np.arange(1, 8),
        'clf__ada__base_estimator__class_weight': [None, 'balanced'],
        'clf__ada__n_estimators': np.arange(1, 150), 
        'clf__ada__learning_rate': np.linspace(0.1, 1),
        'clf__gb__max_depth': np.arange(1, 5),
        'clf__gb__n_estimators': np.arange(1, 150), 
        'clf__gb__min_samples_split': np.arange(2, 21),
        'clf__gb__min_samples_leaf': np.arange(1, 8),
        'clf__gb__learning_rate': np.linspace(0.1, 1)
    },
    n_jobs=-1,
    cv=5,
    verbose=1
)

In [19]:
xgb = RandomizedSearchCV(
    estimator=Pipeline([
        ('ss', ss), 
        ('clf', XGBClassifier(objective='multi:softprob'))
    ]),
    param_distributions={
        'clf__max_depth': np.arange(1, 25),
        'clf__n_estimators': np.arange(1, 1000), 
        'clf__learning_rate': np.linspace(0.1, 1)
    },
    n_jobs=-1,
    cv=5,
    verbose=1
)

In [20]:
svm = RandomizedSearchCV(
    estimator=Pipeline([
        ('ss', ss), 
        ('clf', SVC(decision_function_shape='ovo'))
    ]),
    param_distributions={
        'clf__C': np.logspace(-5, 2),
        'clf__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'clf__gamma': list(np.logspace(-5, 2)) + ['scale'],
        'clf__class_weight': [None, 'balanced']
    },
    n_jobs=-1,
    cv=5,
    verbose=1
)

In [21]:
# Since this is a multiclass classification problem, keras needs y to be a one-hot encoded matrix
y_train_dummies = pd.get_dummies(y_train)

def ffnn_fn(
    layer_one_neurons=18,
    layer_two_neurons=18,
    layer_one_dropout=0.5,
    layer_two_dropout=0.5,
    opt_learning_rate=0.01
):
    # Create a neural network using the Dense and Dropout layers from keras. 
    model = Sequential()
    model.add(Dense(layer_one_neurons, activation='relu', kernel_regularizer=l2(), input_dim=X_train.shape[1]))
    model.add(Dropout(layer_one_dropout))
    model.add(Dense(layer_two_neurons, activation='relu', kernel_regularizer=l2()))
    model.add(Dropout(layer_two_dropout))
    
    # Activation function for the final output layer needs to be softmax 
    # to accomidate the three different classes.
    model.add(Dense(y_train_dummies.shape[1], activation='softmax'))
    
    ad = Adam(learning_rate=opt_learning_rate)
    
    # Compile model
    # Since this is a multiclass classification problem,loss function is categorical_crossentropy.
    model.compile(optimizer=ad, loss='categorical_crossentropy', metrics=['accuracy'])
    return model

ffnn = RandomizedSearchCV(
    estimator=Pipeline([
        ('ss', ss), 
        ('clf', KerasClassifier(build_fn=ffnn_fn, verbose=0))
    ]),
    param_distributions={
        'clf__layer_one_neurons': np.arange(10, X_train.shape[1]),
        'clf__layer_two_neurons': np.arange(10, X_train.shape[1]),
        'clf__layer_one_dropout': np.linspace(0.5, 0.8),
        'clf__layer_two_dropout': np.linspace(0.5, 0.8),
        'clf__opt_learning_rate': np.logspace(-4, -1),
        'clf__batch_size': [128, 256],
        'clf__epochs': np.arange(5, 50)
    },
    n_jobs=-1,
    cv=5,
    verbose=1
)

---

### Fit the models on training data

In [22]:
lr.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  4.2min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  6.6min finished
  "Setting penalty='none' will ignore the C and l1_ratio "


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=Pipeline(memory=None,
                                      steps=[('ss',
                                              StandardScaler(copy=True,
                                                             with_mean=True,
                                                             with_std=True)),
                                             ('clf',
                                              LogisticRegression(C=1.0,
                                                                 class_weight=None,
                                                                 dual=False,
                                                                 fit_intercept=True,
                                                                 intercept_scaling=1,
                                                                 l1_ratio=None,
                                                                 max_iter=1

In [23]:
lr_l1.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  6.4min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  8.4min finished


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=Pipeline(memory=None,
                                      steps=[('ss',
                                              StandardScaler(copy=True,
                                                             with_mean=True,
                                                             with_std=True)),
                                             ('clf',
                                              LogisticRegression(C=1.0,
                                                                 class_weight=None,
                                                                 dual=False,
                                                                 fit_intercept=True,
                                                                 intercept_scaling=1,
                                                                 l1_ratio=None,
                                                                 max_iter=1

In [24]:
lr_elasticnet.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 11.0min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed: 15.8min finished


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=Pipeline(memory=None,
                                      steps=[('ss',
                                              StandardScaler(copy=True,
                                                             with_mean=True,
                                                             with_std=True)),
                                             ('clf',
                                              LogisticRegression(C=1.0,
                                                                 class_weight=None,
                                                                 dual=False,
                                                                 fit_intercept=True,
                                                                 intercept_scaling=1,
                                                                 l1_ratio=None,
                                                                 max_iter=1

In [25]:
knn.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   13.1s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   14.1s finished


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=Pipeline(memory=None,
                                      steps=[('ss',
                                              StandardScaler(copy=True,
                                                             with_mean=True,
                                                             with_std=True)),
                                             ('clf',
                                              KNeighborsClassifier(algorithm='auto',
                                                                   leaf_size=30,
                                                                   metric='minkowski',
                                                                   metric_params=None,
                                                                   n_jobs=None,
                                                                   n_neighbors=5,
                                                         

In [26]:
bnb.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    0.9s finished


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=Pipeline(memory=None,
                                      steps=[('ss',
                                              StandardScaler(copy=True,
                                                             with_mean=True,
                                                             with_std=True)),
                                             ('clf',
                                              BernoulliNB(alpha=1.0,
                                                          binarize=0.0,
                                                          class_prior=None,
                                                          fit_prior=True))],
                                      verbose=False),
                   iid='warn', n_iter=10, n_jobs=-1,
                   param_distributions={'clf__alpha': array([1.00000000e-05, 1.38949549e-05,...
       3.72759372e-01, 5.17947468e-01, 7.19685673e-01, 1.00000

In [27]:
gnb.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    0.8s finished


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=Pipeline(memory=None,
                                      steps=[('ss',
                                              StandardScaler(copy=True,
                                                             with_mean=True,
                                                             with_std=True)),
                                             ('clf',
                                              GaussianNB(priors=None,
                                                         var_smoothing=1e-09))],
                                      verbose=False),
                   iid='warn', n_iter=10, n_jobs=-1,
                   param_distributions={'clf__var_smoothing': array([1.00000000e-09, 1.26485522e-09, 1.59985872e-09, 2...
       1.84206997e-06, 2.32995181e-06, 2.94705170e-06, 3.72759372e-06,
       4.71486636e-06, 5.96362332e-06, 7.54312006e-06, 9.54095476e-06,
       1.20679264e-05, 1.52641797e-05

In [28]:
tree.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    1.4s finished


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=Pipeline(memory=None,
                                      steps=[('ss',
                                              StandardScaler(copy=True,
                                                             with_mean=True,
                                                             with_std=True)),
                                             ('clf',
                                              DecisionTreeClassifier(class_weight=None,
                                                                     criterion='gini',
                                                                     max_depth=None,
                                                                     max_features=None,
                                                                     max_leaf_nodes=None,
                                                                     min_impurity_decrease=0.0,
                         

In [29]:
bagged.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   35.7s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   40.5s finished


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=Pipeline(memory=None,
                                      steps=[('ss',
                                              StandardScaler(copy=True,
                                                             with_mean=True,
                                                             with_std=True)),
                                             ('clf',
                                              BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight=None,
                                                                                                      criterion='gini',
                                                                                                      max_depth=None,
                                                                                                      max_features=None,
                                                                               

In [30]:
rf.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    9.9s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   11.9s finished


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=Pipeline(memory=None,
                                      steps=[('ss',
                                              StandardScaler(copy=True,
                                                             with_mean=True,
                                                             with_std=True)),
                                             ('clf',
                                              RandomForestClassifier(bootstrap=True,
                                                                     class_weight=None,
                                                                     criterion='gini',
                                                                     max_depth=None,
                                                                     max_features='auto',
                                                                     max_leaf_nodes=None,
                                  

In [31]:
et.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   12.3s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   14.7s finished


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=Pipeline(memory=None,
                                      steps=[('ss',
                                              StandardScaler(copy=True,
                                                             with_mean=True,
                                                             with_std=True)),
                                             ('clf',
                                              ExtraTreesClassifier(bootstrap=False,
                                                                   class_weight=None,
                                                                   criterion='gini',
                                                                   max_depth=None,
                                                                   max_features='auto',
                                                                   max_leaf_nodes=None,
                                             

In [32]:
ada.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   42.0s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   45.8s finished


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=Pipeline(memory=None,
                                      steps=[('ss',
                                              StandardScaler(copy=True,
                                                             with_mean=True,
                                                             with_std=True)),
                                             ('clf',
                                              AdaBoostClassifier(algorithm='SAMME.R',
                                                                 base_estimator=DecisionTreeClassifier(class_weight=None,
                                                                                                       criterion='gini',
                                                                                                       max_depth=None,
                                                                                                       max_feat

In [33]:
gb.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   41.2s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   45.3s finished


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=Pipeline(memory=None,
                                      steps=[('ss',
                                              StandardScaler(copy=True,
                                                             with_mean=True,
                                                             with_std=True)),
                                             ('clf',
                                              GradientBoostingClassifier(criterion='friedman_mse',
                                                                         init=None,
                                                                         learning_rate=0.1,
                                                                         loss='deviance',
                                                                         max_depth=3,
                                                                         max_features=None,
                

In [34]:
vote.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  2.4min finished


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=Pipeline(memory=None,
                                      steps=[('ss',
                                              StandardScaler(copy=True,
                                                             with_mean=True,
                                                             with_std=True)),
                                             ('clf',
                                              VotingClassifier(estimators=[('tree',
                                                                            DecisionTreeClassifier(class_weight=None,
                                                                                                   criterion='gini',
                                                                                                   max_depth=None,
                                                                                                   max_features=None,
       

In [35]:
xgb.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  9.6min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed: 10.8min finished


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=Pipeline(memory=None,
                                      steps=[('ss',
                                              StandardScaler(copy=True,
                                                             with_mean=True,
                                                             with_std=True)),
                                             ('clf',
                                              XGBClassifier(base_score=0.5,
                                                            booster='gbtree',
                                                            colsample_bylevel=1,
                                                            colsample_bytree=1,
                                                            gamma=0,
                                                            learning_rate=0.1,
                                                            max_delta_step=0,
                  

In [36]:
svm.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   26.3s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   32.1s finished


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=Pipeline(memory=None,
                                      steps=[('ss',
                                              StandardScaler(copy=True,
                                                             with_mean=True,
                                                             with_std=True)),
                                             ('clf',
                                              SVC(C=1.0, cache_size=200,
                                                  class_weight=None, coef0=0.0,
                                                  decision_function_shape='ovo',
                                                  degree=3,
                                                  gamma='auto_deprecated',
                                                  kernel='rbf', max_iter=-1,
                                                  probability=False,
                                            

In [37]:
ffnn.fit(X_train, y_train_dummies)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  3.1min finished




RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=Pipeline(memory=None,
                                      steps=[('ss',
                                              StandardScaler(copy=True,
                                                             with_mean=True,
                                                             with_std=True)),
                                             ('clf',
                                              <keras.wrappers.scikit_learn.KerasClassifier object at 0x14b944110>)],
                                      verbose=False),
                   iid='warn', n_iter=10, n_jobs=-1,
                   param_distributions={'clf__batch_size': [128, 256],
                                        'clf__epochs': array([ 5,  6,...
       0.00339322, 0.00390694, 0.00449843, 0.00517947, 0.00596362,
       0.00686649, 0.00790604, 0.00910298, 0.01048113, 0.01206793,
       0.01389495, 0.01599859, 0.0184207 , 0.02120951, 0.

---

### Evaluate model on test data

In [38]:
print("Best Combination Of Hyperparameters:", lr.best_params_) 
print("Best Accuracy:", lr.best_score_)
print(
    classification_report(
        y_test,
        lr.predict(X_test), 
        target_names=['large-cap', 'mid-cap', 'small-cap']
    )
)

Best Combination Of Hyperparameters: {'clf__solver': 'lbfgs', 'clf__penalty': 'none', 'clf__class_weight': 'balanced', 'clf__C': 10.0}
Best Accuracy: 0.9057551178061027
              precision    recall  f1-score   support

   large-cap       1.00      1.00      1.00       474
     mid-cap       0.65      0.74      0.69       151
   small-cap       0.82      0.75      0.78       238

    accuracy                           0.89       863
   macro avg       0.82      0.83      0.82       863
weighted avg       0.89      0.89      0.89       863



In [39]:
print("Best Combination Of Hyperparameters:", lr_l1.best_params_) 
print("Best Accuracy:", lr_l1.best_score_)
print(
    classification_report(
        y_test,
        lr_l1.predict(X_test),
        target_names=['large-cap', 'mid-cap', 'small-cap']
    )
)

Best Combination Of Hyperparameters: {'clf__class_weight': 'balanced', 'clf__C': 1.389495494373136}
Best Accuracy: 0.8941676322904596
              precision    recall  f1-score   support

   large-cap       1.00      1.00      1.00       474
     mid-cap       0.65      0.76      0.70       151
   small-cap       0.83      0.74      0.78       238

    accuracy                           0.89       863
   macro avg       0.83      0.83      0.83       863
weighted avg       0.89      0.89      0.89       863



In [40]:
print("Best Combination Of Hyperparameters:", lr_elasticnet.best_params_) 
print("Best Accuracy:", lr_elasticnet.best_score_)
print(
    classification_report(
        y_test, 
        lr_elasticnet.predict(X_test),
        target_names=['large-cap', 'mid-cap', 'small-cap']
    )
)

Best Combination Of Hyperparameters: {'clf__l1_ratio': 0.9081632653061225, 'clf__class_weight': 'balanced', 'clf__C': 5.179474679231202}
Best Accuracy: 0.8941676322904596
              precision    recall  f1-score   support

   large-cap       1.00      0.99      1.00       474
     mid-cap       0.63      0.75      0.69       151
   small-cap       0.82      0.73      0.78       238

    accuracy                           0.88       863
   macro avg       0.82      0.83      0.82       863
weighted avg       0.89      0.88      0.88       863



In [41]:
print("Best Combination Of Hyperparameters:", knn.best_params_) 
print("Best Accuracy:", knn.best_score_)
print(
    classification_report(
        y_test,
        knn.predict(X_test),
        target_names=['large-cap', 'mid-cap', 'small-cap']
    )
)

Best Combination Of Hyperparameters: {'clf__weights': 'distance', 'clf__n_neighbors': 7, 'clf__metric': 'manhattan'}
Best Accuracy: 0.9235225955967555
              precision    recall  f1-score   support

   large-cap       1.00      0.99      1.00       474
     mid-cap       0.84      0.81      0.83       151
   small-cap       0.88      0.91      0.89       238

    accuracy                           0.94       863
   macro avg       0.91      0.91      0.91       863
weighted avg       0.94      0.94      0.94       863



In [42]:
print("Best Combination Of Hyperparameters:", bnb.best_params_) 
print("Best Accuracy:", bnb.best_score_)
print(
    classification_report(
        y_test,
        bnb.predict(X_test),
        target_names=['large-cap', 'mid-cap', 'small-cap']
    )
)

Best Combination Of Hyperparameters: {'clf__alpha': 1e-05}
Best Accuracy: 0.8505214368482039
              precision    recall  f1-score   support

   large-cap       1.00      1.00      1.00       474
     mid-cap       0.58      0.58      0.58       151
   small-cap       0.74      0.74      0.74       238

    accuracy                           0.85       863
   macro avg       0.77      0.77      0.77       863
weighted avg       0.85      0.85      0.85       863



In [43]:
print("Best Combination Of Hyperparameters:", gnb.best_params_) 
print("Best Accuracy:", gnb.best_score_)
print(
    classification_report(
        y_test,
        gnb.predict(X_test),
        target_names=['large-cap', 'mid-cap', 'small-cap']
    )
)

Best Combination Of Hyperparameters: {'clf__var_smoothing': 7.906043210907701e-05}
Best Accuracy: 0.8501351873310158
              precision    recall  f1-score   support

   large-cap       1.00      1.00      1.00       474
     mid-cap       0.65      0.68      0.66       151
   small-cap       0.79      0.77      0.78       238

    accuracy                           0.88       863
   macro avg       0.81      0.81      0.81       863
weighted avg       0.88      0.88      0.88       863



In [44]:
print("Best Combination Of Hyperparameters:", tree.best_params_) 
print("Best Accuracy:", tree.best_score_)
print(
    classification_report(
        y_test,
        tree.predict(X_test), 
        target_names=['large-cap', 'mid-cap', 'small-cap']
    )
)

Best Combination Of Hyperparameters: {'clf__min_samples_split': 7, 'clf__min_samples_leaf': 1, 'clf__max_depth': None, 'clf__class_weight': None}
Best Accuracy: 0.8733101583623021
              precision    recall  f1-score   support

   large-cap       0.96      0.97      0.97       474
     mid-cap       0.74      0.72      0.73       151
   small-cap       0.83      0.84      0.84       238

    accuracy                           0.89       863
   macro avg       0.85      0.84      0.85       863
weighted avg       0.89      0.89      0.89       863



In [45]:
print("Best Combination Of Hyperparameters:", bagged.best_params_) 
print("Best Accuracy:", bagged.best_score_)
print(
    classification_report(
        y_test,
        bagged.predict(X_test),
        target_names=['large-cap', 'mid-cap', 'small-cap']
    )
)

Best Combination Of Hyperparameters: {'clf__n_estimators': 85, 'clf__base_estimator__min_samples_split': 13, 'clf__base_estimator__min_samples_leaf': 1, 'clf__base_estimator__max_depth': 4, 'clf__base_estimator__class_weight': None}
Best Accuracy: 0.8427964465044419
              precision    recall  f1-score   support

   large-cap       0.93      0.95      0.94       474
     mid-cap       0.64      0.54      0.58       151
   small-cap       0.78      0.82      0.80       238

    accuracy                           0.84       863
   macro avg       0.78      0.77      0.77       863
weighted avg       0.84      0.84      0.84       863



In [46]:
print("Best Combination Of Hyperparameters:", rf.best_params_) 
print("Best Accuracy:", rf.best_score_)
print(
    classification_report(
        y_test, 
        rf.predict(X_test),
        target_names=['large-cap', 'mid-cap', 'small-cap']
    )
)

Best Combination Of Hyperparameters: {'clf__n_estimators': 60, 'clf__min_samples_split': 6, 'clf__min_samples_leaf': 7, 'clf__max_depth': 4, 'clf__class_weight': 'balanced_subsample'}
Best Accuracy: 0.8091927385090769
              precision    recall  f1-score   support

   large-cap       0.96      0.86      0.91       474
     mid-cap       0.45      0.62      0.52       151
   small-cap       0.79      0.78      0.78       238

    accuracy                           0.79       863
   macro avg       0.73      0.75      0.74       863
weighted avg       0.82      0.79      0.81       863



In [47]:
print("Best Combination Of Hyperparameters:", et.best_params_) 
print("Best Accuracy:", et.best_score_)
print(
    classification_report(
        y_test, 
        et.predict(X_test),
        target_names=['large-cap', 'mid-cap', 'small-cap']
    )
)

Best Combination Of Hyperparameters: {'clf__n_estimators': 149, 'clf__min_samples_split': 10, 'clf__min_samples_leaf': 1, 'clf__max_depth': 3, 'clf__class_weight': 'balanced'}
Best Accuracy: 0.8122827346465817
              precision    recall  f1-score   support

   large-cap       0.98      0.99      0.99       474
     mid-cap       0.51      0.85      0.64       151
   small-cap       0.81      0.45      0.58       238

    accuracy                           0.82       863
   macro avg       0.77      0.76      0.74       863
weighted avg       0.85      0.82      0.81       863



In [48]:
print("Best Combination Of Hyperparameters:", ada.best_params_) 
print("Best Accuracy:", ada.best_score_)
print(
    classification_report(
        y_test, 
        ada.predict(X_test), 
        target_names=['large-cap', 'mid-cap', 'small-cap']
    )
)

Best Combination Of Hyperparameters: {'clf__n_estimators': 112, 'clf__learning_rate': 0.9081632653061225, 'clf__base_estimator__min_samples_split': 18, 'clf__base_estimator__min_samples_leaf': 2, 'clf__base_estimator__max_depth': None, 'clf__base_estimator__class_weight': None}
Best Accuracy: 0.9602162997296253
              precision    recall  f1-score   support

   large-cap       1.00      0.99      1.00       474
     mid-cap       0.93      0.92      0.92       151
   small-cap       0.95      0.97      0.96       238

    accuracy                           0.97       863
   macro avg       0.96      0.96      0.96       863
weighted avg       0.97      0.97      0.97       863



In [49]:
print("Best Combination Of Hyperparameters:", gb.best_params_) 
print("Best Accuracy:", gb.best_score_)
print(
    classification_report(
        y_test,
        gb.predict(X_test),
        target_names=['large-cap', 'mid-cap', 'small-cap']
    )
)

Best Combination Of Hyperparameters: {'clf__n_estimators': 132, 'clf__min_samples_split': 19, 'clf__min_samples_leaf': 2, 'clf__max_depth': 4, 'clf__learning_rate': 0.7244897959183674}
Best Accuracy: 0.9729625337968327
              precision    recall  f1-score   support

   large-cap       1.00      1.00      1.00       474
     mid-cap       0.96      0.90      0.93       151
   small-cap       0.94      0.97      0.96       238

    accuracy                           0.98       863
   macro avg       0.97      0.96      0.96       863
weighted avg       0.98      0.98      0.98       863



In [50]:
print("Best Combination Of Hyperparameters:", vote.best_params_) 
print("Best Accuracy:", vote.best_score_)
print(
    classification_report(
        y_test,
        vote.predict(X_test), 
        target_names=['large-cap', 'mid-cap', 'small-cap']
    )
)

Best Combination Of Hyperparameters: {'clf__tree__min_samples_split': 3, 'clf__tree__min_samples_leaf': 5, 'clf__tree__max_depth': 4, 'clf__tree__class_weight': 'balanced', 'clf__gb__n_estimators': 146, 'clf__gb__min_samples_split': 14, 'clf__gb__min_samples_leaf': 7, 'clf__gb__max_depth': 4, 'clf__gb__learning_rate': 0.6510204081632653, 'clf__ada__n_estimators': 68, 'clf__ada__learning_rate': 0.5959183673469388, 'clf__ada__base_estimator__min_samples_split': 9, 'clf__ada__base_estimator__min_samples_leaf': 6, 'clf__ada__base_estimator__max_depth': None, 'clf__ada__base_estimator__class_weight': 'balanced'}
Best Accuracy: 0.9636925453843183
              precision    recall  f1-score   support

   large-cap       0.99      1.00      1.00       474
     mid-cap       0.94      0.87      0.90       151
   small-cap       0.92      0.95      0.94       238

    accuracy                           0.96       863
   macro avg       0.95      0.94      0.94       863
weighted avg       0.96  

In [51]:
print("Best Combination Of Hyperparameters:", xgb.best_params_) 
print("Best Accuracy:", xgb.best_score_)
print(
    classification_report(
        y_test,
        xgb.predict(X_test), 
        target_names=['large-cap', 'mid-cap', 'small-cap']
    )
)

Best Combination Of Hyperparameters: {'clf__n_estimators': 958, 'clf__max_depth': 3, 'clf__learning_rate': 0.21020408163265308}
Best Accuracy: 0.9652375434530707
              precision    recall  f1-score   support

   large-cap       1.00      1.00      1.00       474
     mid-cap       0.94      0.91      0.93       151
   small-cap       0.95      0.97      0.96       238

    accuracy                           0.97       863
   macro avg       0.96      0.96      0.96       863
weighted avg       0.97      0.97      0.97       863



In [52]:
print("Best Combination Of Hyperparameters:", svm.best_params_) 
print("Best Accuracy:", svm.best_score_)
print(
    classification_report(
        y_test,
        svm.predict(X_test),
        target_names=['large-cap', 'mid-cap', 'small-cap']
    )
)

Best Combination Of Hyperparameters: {'clf__kernel': 'poly', 'clf__gamma': 1.9306977288832496, 'clf__class_weight': 'balanced', 'clf__C': 5.179474679231202}
Best Accuracy: 0.9312475859405176
              precision    recall  f1-score   support

   large-cap       1.00      1.00      1.00       474
     mid-cap       0.84      0.85      0.84       151
   small-cap       0.90      0.90      0.90       238

    accuracy                           0.95       863
   macro avg       0.92      0.92      0.92       863
weighted avg       0.95      0.95      0.95       863



In [53]:
print("Best Combination Of Hyperparameters:", ffnn.best_params_) 
print("Best Accuracy:", ffnn.best_score_)
print(
    classification_report(
        y_test,
        ffnn.predict(X_test),
        target_names=['large-cap', 'mid-cap', 'small-cap']
    )
)

Best Combination Of Hyperparameters: {'clf__opt_learning_rate': 0.00339322177189533, 'clf__layer_two_neurons': 79, 'clf__layer_two_dropout': 0.5612244897959183, 'clf__layer_one_neurons': 130, 'clf__layer_one_dropout': 0.616326530612245, 'clf__epochs': 6, 'clf__batch_size': 128}
Best Accuracy: 0.8455001856655446
              precision    recall  f1-score   support

   large-cap       1.00      1.00      1.00       474
     mid-cap       0.67      0.24      0.35       151
   small-cap       0.65      0.92      0.77       238

    accuracy                           0.84       863
   macro avg       0.77      0.72      0.71       863
weighted avg       0.85      0.84      0.82       863



---

### Instantiate models

In [54]:
lr_pca = RandomizedSearchCV(
    estimator=Pipeline([
        ('ss', ss), 
        ('pca', PCA()),
        ('clf', LogisticRegression(max_iter=10_000, multi_class='multinomial'))
    ]),
    param_distributions={
        'pca__n_components': np.linspace(0.1, 0.9),
        'clf__penalty': ['l2', 'none'],
        'clf__C': np.logspace(-5, 2), 
        'clf__class_weight': [None, 'balanced'],
        'clf__solver': ['newton-cg', 'sag', 'lbfgs']
    },
    n_jobs=-1,
    cv=5,
    verbose=1
)

In [55]:
lr_l1_pca = RandomizedSearchCV(
    estimator=Pipeline([
        ('ss', ss), 
        ('pca', PCA()),
        ('clf', LogisticRegression(penalty='l1', solver='saga', max_iter=10_000, multi_class='multinomial'))
    ]),
    param_distributions={
        'pca__n_components': np.linspace(0.1, 0.9),
        'clf__C': np.logspace(-5, 2), 
        'clf__class_weight': [None, 'balanced']
    },
    n_jobs=-1,
    cv=5,
    verbose=1
)

In [56]:
lr_elasticnet_pca = RandomizedSearchCV(
    estimator=Pipeline([
        ('ss', ss), 
        ('pca', PCA()),
        ('clf', LogisticRegression(penalty='elasticnet', solver='saga', max_iter=10_000, multi_class='multinomial'))
    ]),
    param_distributions={
        'pca__n_components': np.linspace(0.1, 0.9),
        'clf__C': np.logspace(-5, 2), 
        'clf__class_weight': [None, 'balanced'],
        'clf__l1_ratio': np.linspace(0.1, 1)
    },
    n_jobs=-1,
    cv=5,
    verbose=1
)

In [57]:
knn_pca = RandomizedSearchCV(
    estimator=Pipeline([
        ('ss', ss), 
        ('pca', PCA()),
        ('clf', KNeighborsClassifier())
    ]),
    param_distributions={
        'pca__n_components': np.linspace(0.1, 0.9),
        'clf__n_neighbors': np.arange(1, 22, 2), 
        'clf__weights': ['uniform', 'distance'],
        'clf__metric': ['euclidean','manhattan']
    },
    n_jobs=-1,
    cv=5,
    verbose=1
)

In [58]:
bnb_pca = RandomizedSearchCV(
    estimator=Pipeline([
        ('ss', ss), 
        ('pca', PCA()),
        ('clf', BernoulliNB())
    ]),
    param_distributions={
        'pca__n_components': np.linspace(0.1, 0.9),
        'clf__alpha': np.logspace(-5, 2)
    },
    n_jobs=-1,
    cv=5,
    verbose=1
)

In [59]:
gnb_pca = RandomizedSearchCV(
    estimator=Pipeline([
        ('ss', ss), 
        ('pca', PCA()),
        ('clf', GaussianNB())
    ]),
    param_distributions={
        'pca__n_components': np.linspace(0.1, 0.9),
        'clf__var_smoothing': np.logspace(-9, -4)
    },
    n_jobs=-1,
    cv=5,
    verbose=1
)

In [60]:
tree_pca = RandomizedSearchCV(
    estimator=Pipeline([
        ('ss', ss), 
        ('pca', PCA()),
        ('clf', DecisionTreeClassifier())
    ]),
    param_distributions={
        'pca__n_components': np.linspace(0.1, 0.9),
        'clf__max_depth': list(np.arange(1, 5)) + [None],
        'clf__min_samples_split': np.arange(2, 21),
        'clf__min_samples_leaf': np.arange(1, 8),
        'clf__class_weight': [None, 'balanced']
    },
    n_jobs=-1,
    cv=5,
    verbose=1
)

In [61]:
bagged_pca = RandomizedSearchCV(
    estimator=Pipeline([
        ('ss', ss), 
        ('pca', PCA()),
        ('clf', BaggingClassifier(DecisionTreeClassifier()))
    ]),
    param_distributions={
        'pca__n_components': np.linspace(0.1, 0.9),
        'clf__base_estimator__max_depth': list(np.arange(1, 5)) + [None],
        'clf__base_estimator__min_samples_split': np.arange(2, 21),
        'clf__base_estimator__min_samples_leaf': np.arange(1, 8),
        'clf__base_estimator__class_weight': [None, 'balanced'],
        'clf__n_estimators': np.arange(1, 150)
    },
    n_jobs=-1,
    cv=5,
    verbose=1
)

In [62]:
rf_pca = RandomizedSearchCV(
    estimator=Pipeline([
        ('ss', ss), 
        ('pca', PCA()),
        ('clf', RandomForestClassifier())
    ]),
    param_distributions={
        'pca__n_components': np.linspace(0.1, 0.9),
        'clf__max_depth': np.arange(1, 5),
        'clf__n_estimators': np.arange(1, 150),
        'clf__min_samples_split': np.arange(2, 21),
        'clf__min_samples_leaf': np.arange(1, 8),
        'clf__class_weight': [None, 'balanced', 'balanced_subsample']
    },
    n_jobs=-1,
    cv=5,
    verbose=1
)

In [63]:
et_pca = RandomizedSearchCV(
    estimator=Pipeline([
        ('ss', ss), 
        ('pca', PCA()),
        ('clf', ExtraTreesClassifier())
    ]),
    param_distributions={
        'pca__n_components': np.linspace(0.1, 0.9),
        'clf__max_depth': np.arange(1, 5),
        'clf__n_estimators': np.arange(1, 150),
        'clf__min_samples_split': np.arange(2, 21),
        'clf__min_samples_leaf': np.arange(1, 8),
        'clf__class_weight': [None, 'balanced', 'balanced_subsample']
    },
    n_jobs=-1,
    cv=5,
    verbose=1
)

In [64]:
ada_pca = RandomizedSearchCV(
    estimator=Pipeline([
        ('ss', ss), 
        ('pca', PCA()),
        ('clf', AdaBoostClassifier(base_estimator=DecisionTreeClassifier()))
    ]),
    param_distributions={
        'pca__n_components': np.linspace(0.1, 0.9),
        'clf__base_estimator__max_depth': list(np.arange(1, 5)) + [None],
        'clf__base_estimator__min_samples_split': np.arange(2, 21),
        'clf__base_estimator__min_samples_leaf': np.arange(1, 8),
        'clf__base_estimator__class_weight': [None, 'balanced'],
        'clf__n_estimators': np.arange(1, 150), 
        'clf__learning_rate': np.linspace(0.1, 1)
    },
    n_jobs=-1,
    cv=5,
    verbose=1
)

In [65]:
gb_pca = RandomizedSearchCV(
    estimator=Pipeline([
        ('ss', ss), 
        ('pca', PCA()),
        ('clf', GradientBoostingClassifier())
    ]),
    param_distributions={
        'pca__n_components': np.linspace(0.1, 0.9),
        'clf__max_depth': np.arange(1, 5),
        'clf__n_estimators': np.arange(1, 150), 
        'clf__min_samples_split': np.arange(2, 21),
        'clf__min_samples_leaf': np.arange(1, 8),
        'clf__learning_rate': np.linspace(0.1, 1)
    },
    n_jobs=-1,
    cv=5,
    verbose=1
)

In [66]:
vote_pca = RandomizedSearchCV(
    estimator=Pipeline([
        ('ss', ss), 
        ('pca', PCA()),
        ('clf', VotingClassifier([
            ('tree', DecisionTreeClassifier()), 
            ('ada', AdaBoostClassifier(base_estimator=DecisionTreeClassifier())), 
            ('gb', GradientBoostingClassifier())
        ]))
    ]),
    param_distributions={
        'pca__n_components': np.linspace(0.1, 0.9),
        'clf__tree__max_depth': list(np.arange(1, 5)) + [None],
        'clf__tree__min_samples_split': np.arange(2, 21),
        'clf__tree__min_samples_leaf': np.arange(1, 8),
        'clf__tree__class_weight': [None, 'balanced'],
        'clf__ada__base_estimator__max_depth': list(np.arange(1, 5)) + [None],
        'clf__ada__base_estimator__min_samples_split': np.arange(2, 21),
        'clf__ada__base_estimator__min_samples_leaf': np.arange(1, 8),
        'clf__ada__base_estimator__class_weight': [None, 'balanced'],
        'clf__ada__n_estimators': np.arange(1, 150), 
        'clf__ada__learning_rate': np.linspace(0.1, 1),
        'clf__gb__max_depth': np.arange(1, 5),
        'clf__gb__n_estimators': np.arange(1, 150), 
        'clf__gb__min_samples_split': np.arange(2, 21),
        'clf__gb__min_samples_leaf': np.arange(1, 8),
        'clf__gb__learning_rate': np.linspace(0.1, 1)
    },
    n_jobs=-1,
    cv=5,
    verbose=1
)

In [67]:
xgb_pca = RandomizedSearchCV(
    estimator=Pipeline([
        ('ss', ss), 
        ('pca', PCA()),
        ('clf', XGBClassifier(objective='multi:softprob'))
    ]),
    param_distributions={
        'pca__n_components': np.linspace(0.1, 0.9),
        'clf__max_depth': np.arange(1, 25),
        'clf__n_estimators': np.arange(1, 1000), 
        'clf__learning_rate': np.linspace(0.1, 1)
    },
    n_jobs=-1,
    cv=5,
    verbose=1
)

In [68]:
svm_pca = RandomizedSearchCV(
    estimator=Pipeline([
        ('ss', ss), 
        ('pca', PCA()),
        ('clf', SVC(decision_function_shape='ovo'))
    ]),
    param_distributions={
        'pca__n_components': np.linspace(0.1, 0.9),
        'clf__C': np.logspace(-5, 2),
        'clf__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'clf__gamma': list(np.logspace(-5, 2)) + ['scale'],
        'clf__class_weight': [None, 'balanced']
    },
    n_jobs=-1,
    cv=5,
    verbose=1
)

In [69]:
ffnn_pca = RandomizedSearchCV(
    estimator=Pipeline([
        ('ss', ss), 
        ('pca', PCA()),
        ('clf', KerasClassifier(build_fn=ffnn_fn, verbose=0))
    ]),
    param_distributions={
        'clf__layer_one_neurons': np.arange(10, X_train.shape[1]),
        'clf__layer_two_neurons': np.arange(10, X_train.shape[1]),
        'clf__layer_one_dropout': np.linspace(0.5, 0.8),
        'clf__layer_two_dropout': np.linspace(0.5, 0.8),
        'clf__opt_learning_rate': np.logspace(-4, -1),
        'clf__batch_size': [128, 256],
        'clf__epochs': np.arange(5, 50)
    },
    n_jobs=-1,
    cv=5,
    verbose=1
)

---

### Fit the models on training data

In [70]:
lr_pca.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 16.2min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed: 18.0min finished


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=Pipeline(memory=None,
                                      steps=[('ss',
                                              StandardScaler(copy=True,
                                                             with_mean=True,
                                                             with_std=True)),
                                             ('pca',
                                              PCA(copy=True,
                                                  iterated_power='auto',
                                                  n_components=None,
                                                  random_state=None,
                                                  svd_solver='auto', tol=0.0,
                                                  whiten=False)),
                                             ('clf',
                                              LogisticRegression(C=1.0,
                  

In [71]:
lr_l1_pca.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  3.8min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  5.2min finished


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=Pipeline(memory=None,
                                      steps=[('ss',
                                              StandardScaler(copy=True,
                                                             with_mean=True,
                                                             with_std=True)),
                                             ('pca',
                                              PCA(copy=True,
                                                  iterated_power='auto',
                                                  n_components=None,
                                                  random_state=None,
                                                  svd_solver='auto', tol=0.0,
                                                  whiten=False)),
                                             ('clf',
                                              LogisticRegression(C=1.0,
                  

In [72]:
lr_elasticnet_pca.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  1.8min finished


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=Pipeline(memory=None,
                                      steps=[('ss',
                                              StandardScaler(copy=True,
                                                             with_mean=True,
                                                             with_std=True)),
                                             ('pca',
                                              PCA(copy=True,
                                                  iterated_power='auto',
                                                  n_components=None,
                                                  random_state=None,
                                                  svd_solver='auto', tol=0.0,
                                                  whiten=False)),
                                             ('clf',
                                              LogisticRegression(C=1.0,
                  

In [73]:
knn_pca.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    8.1s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    8.8s finished


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=Pipeline(memory=None,
                                      steps=[('ss',
                                              StandardScaler(copy=True,
                                                             with_mean=True,
                                                             with_std=True)),
                                             ('pca',
                                              PCA(copy=True,
                                                  iterated_power='auto',
                                                  n_components=None,
                                                  random_state=None,
                                                  svd_solver='auto', tol=0.0,
                                                  whiten=False)),
                                             ('clf',
                                              KNeighborsClassifier(algorithm='auto',
     

In [74]:
bnb_pca.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    2.2s finished


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=Pipeline(memory=None,
                                      steps=[('ss',
                                              StandardScaler(copy=True,
                                                             with_mean=True,
                                                             with_std=True)),
                                             ('pca',
                                              PCA(copy=True,
                                                  iterated_power='auto',
                                                  n_components=None,
                                                  random_state=None,
                                                  svd_solver='auto', tol=0.0,
                                                  whiten=False)),
                                             ('clf',
                                              BernoulliNB(alpha=1.0,
                     

In [75]:
gnb_pca.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    2.2s finished


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=Pipeline(memory=None,
                                      steps=[('ss',
                                              StandardScaler(copy=True,
                                                             with_mean=True,
                                                             with_std=True)),
                                             ('pca',
                                              PCA(copy=True,
                                                  iterated_power='auto',
                                                  n_components=None,
                                                  random_state=None,
                                                  svd_solver='auto', tol=0.0,
                                                  whiten=False)),
                                             ('clf',
                                              GaussianNB(priors=None,
                    

In [76]:
tree_pca.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    3.9s finished


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=Pipeline(memory=None,
                                      steps=[('ss',
                                              StandardScaler(copy=True,
                                                             with_mean=True,
                                                             with_std=True)),
                                             ('pca',
                                              PCA(copy=True,
                                                  iterated_power='auto',
                                                  n_components=None,
                                                  random_state=None,
                                                  svd_solver='auto', tol=0.0,
                                                  whiten=False)),
                                             ('clf',
                                              DecisionTreeClassifier(class_weight=None,
  

In [77]:
bagged_pca.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  1.3min finished


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=Pipeline(memory=None,
                                      steps=[('ss',
                                              StandardScaler(copy=True,
                                                             with_mean=True,
                                                             with_std=True)),
                                             ('pca',
                                              PCA(copy=True,
                                                  iterated_power='auto',
                                                  n_components=None,
                                                  random_state=None,
                                                  svd_solver='auto', tol=0.0,
                                                  whiten=False)),
                                             ('clf',
                                              BaggingClassifier(base_estimator=DecisionTre

In [78]:
rf_pca.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    7.3s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    8.6s finished


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=Pipeline(memory=None,
                                      steps=[('ss',
                                              StandardScaler(copy=True,
                                                             with_mean=True,
                                                             with_std=True)),
                                             ('pca',
                                              PCA(copy=True,
                                                  iterated_power='auto',
                                                  n_components=None,
                                                  random_state=None,
                                                  svd_solver='auto', tol=0.0,
                                                  whiten=False)),
                                             ('clf',
                                              RandomForestClassifier(bootstrap=True,
     

In [79]:
et_pca.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    5.5s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    6.2s finished


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=Pipeline(memory=None,
                                      steps=[('ss',
                                              StandardScaler(copy=True,
                                                             with_mean=True,
                                                             with_std=True)),
                                             ('pca',
                                              PCA(copy=True,
                                                  iterated_power='auto',
                                                  n_components=None,
                                                  random_state=None,
                                                  svd_solver='auto', tol=0.0,
                                                  whiten=False)),
                                             ('clf',
                                              ExtraTreesClassifier(bootstrap=False,
      

In [80]:
ada_pca.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  1.4min finished


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=Pipeline(memory=None,
                                      steps=[('ss',
                                              StandardScaler(copy=True,
                                                             with_mean=True,
                                                             with_std=True)),
                                             ('pca',
                                              PCA(copy=True,
                                                  iterated_power='auto',
                                                  n_components=None,
                                                  random_state=None,
                                                  svd_solver='auto', tol=0.0,
                                                  whiten=False)),
                                             ('clf',
                                              AdaBoostClassifier(algorithm='SAMME.R',
    

In [81]:
gb_pca.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   25.9s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   28.9s finished


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=Pipeline(memory=None,
                                      steps=[('ss',
                                              StandardScaler(copy=True,
                                                             with_mean=True,
                                                             with_std=True)),
                                             ('pca',
                                              PCA(copy=True,
                                                  iterated_power='auto',
                                                  n_components=None,
                                                  random_state=None,
                                                  svd_solver='auto', tol=0.0,
                                                  whiten=False)),
                                             ('clf',
                                              GradientBoostingClassifier(criterion='friedm

In [82]:
vote_pca.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  2.9min finished


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=Pipeline(memory=None,
                                      steps=[('ss',
                                              StandardScaler(copy=True,
                                                             with_mean=True,
                                                             with_std=True)),
                                             ('pca',
                                              PCA(copy=True,
                                                  iterated_power='auto',
                                                  n_components=None,
                                                  random_state=None,
                                                  svd_solver='auto', tol=0.0,
                                                  whiten=False)),
                                             ('clf',
                                              VotingClassifier(estimators=[('tree',
      

In [83]:
xgb_pca.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  2.3min finished


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=Pipeline(memory=None,
                                      steps=[('ss',
                                              StandardScaler(copy=True,
                                                             with_mean=True,
                                                             with_std=True)),
                                             ('pca',
                                              PCA(copy=True,
                                                  iterated_power='auto',
                                                  n_components=None,
                                                  random_state=None,
                                                  svd_solver='auto', tol=0.0,
                                                  whiten=False)),
                                             ('clf',
                                              XGBClassifier(base_score=0.5,
              

In [84]:
svm_pca.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   14.6s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   16.2s finished


RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=Pipeline(memory=None,
                                      steps=[('ss',
                                              StandardScaler(copy=True,
                                                             with_mean=True,
                                                             with_std=True)),
                                             ('pca',
                                              PCA(copy=True,
                                                  iterated_power='auto',
                                                  n_components=None,
                                                  random_state=None,
                                                  svd_solver='auto', tol=0.0,
                                                  whiten=False)),
                                             ('clf',
                                              SVC(C=1.0, cache_size=200,
                 

In [85]:
ffnn_pca.fit(X_train, y_train_dummies)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  2.9min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  3.4min finished




RandomizedSearchCV(cv=5, error_score='raise-deprecating',
                   estimator=Pipeline(memory=None,
                                      steps=[('ss',
                                              StandardScaler(copy=True,
                                                             with_mean=True,
                                                             with_std=True)),
                                             ('pca',
                                              PCA(copy=True,
                                                  iterated_power='auto',
                                                  n_components=None,
                                                  random_state=None,
                                                  svd_solver='auto', tol=0.0,
                                                  whiten=False)),
                                             ('clf',
                                              <keras.wrappers.scikit_learn.KerasClassifier

---

### Evaluate model on test data

In [86]:
print("Best Combination Of Hyperparameters:", lr.best_params_) 
print("Best Accuracy:", lr.best_score_)
print(
    classification_report(
        y_test,
        lr.predict(X_test), 
        target_names=['large-cap', 'mid-cap', 'small-cap']
    )
)

Best Combination Of Hyperparameters: {'clf__solver': 'lbfgs', 'clf__penalty': 'none', 'clf__class_weight': 'balanced', 'clf__C': 10.0}
Best Accuracy: 0.9057551178061027
              precision    recall  f1-score   support

   large-cap       1.00      1.00      1.00       474
     mid-cap       0.65      0.74      0.69       151
   small-cap       0.82      0.75      0.78       238

    accuracy                           0.89       863
   macro avg       0.82      0.83      0.82       863
weighted avg       0.89      0.89      0.89       863



In [87]:
print("Best Combination Of Hyperparameters:", lr_l1.best_params_) 
print("Best Accuracy:", lr_l1.best_score_)
print(
    classification_report(
        y_test,
        lr_l1.predict(X_test),
        target_names=['large-cap', 'mid-cap', 'small-cap']
    )
)

Best Combination Of Hyperparameters: {'clf__class_weight': 'balanced', 'clf__C': 1.389495494373136}
Best Accuracy: 0.8941676322904596
              precision    recall  f1-score   support

   large-cap       1.00      1.00      1.00       474
     mid-cap       0.65      0.76      0.70       151
   small-cap       0.83      0.74      0.78       238

    accuracy                           0.89       863
   macro avg       0.83      0.83      0.83       863
weighted avg       0.89      0.89      0.89       863



In [88]:
print("Best Combination Of Hyperparameters:", lr_elasticnet.best_params_) 
print("Best Accuracy:", lr_elasticnet.best_score_)
print(
    classification_report(
        y_test, 
        lr_elasticnet.predict(X_test),
        target_names=['large-cap', 'mid-cap', 'small-cap']
    )
)

Best Combination Of Hyperparameters: {'clf__l1_ratio': 0.9081632653061225, 'clf__class_weight': 'balanced', 'clf__C': 5.179474679231202}
Best Accuracy: 0.8941676322904596
              precision    recall  f1-score   support

   large-cap       1.00      0.99      1.00       474
     mid-cap       0.63      0.75      0.69       151
   small-cap       0.82      0.73      0.78       238

    accuracy                           0.88       863
   macro avg       0.82      0.83      0.82       863
weighted avg       0.89      0.88      0.88       863



In [89]:
print("Best Combination Of Hyperparameters:", knn.best_params_) 
print("Best Accuracy:", knn.best_score_)
print(
    classification_report(
        y_test,
        knn.predict(X_test),
        target_names=['large-cap', 'mid-cap', 'small-cap']
    )
)

Best Combination Of Hyperparameters: {'clf__weights': 'distance', 'clf__n_neighbors': 7, 'clf__metric': 'manhattan'}
Best Accuracy: 0.9235225955967555
              precision    recall  f1-score   support

   large-cap       1.00      0.99      1.00       474
     mid-cap       0.84      0.81      0.83       151
   small-cap       0.88      0.91      0.89       238

    accuracy                           0.94       863
   macro avg       0.91      0.91      0.91       863
weighted avg       0.94      0.94      0.94       863



In [90]:
print("Best Combination Of Hyperparameters:", bnb.best_params_) 
print("Best Accuracy:", bnb.best_score_)
print(
    classification_report(
        y_test,
        bnb.predict(X_test),
        target_names=['large-cap', 'mid-cap', 'small-cap']
    )
)

Best Combination Of Hyperparameters: {'clf__alpha': 1e-05}
Best Accuracy: 0.8505214368482039
              precision    recall  f1-score   support

   large-cap       1.00      1.00      1.00       474
     mid-cap       0.58      0.58      0.58       151
   small-cap       0.74      0.74      0.74       238

    accuracy                           0.85       863
   macro avg       0.77      0.77      0.77       863
weighted avg       0.85      0.85      0.85       863



In [91]:
print("Best Combination Of Hyperparameters:", gnb.best_params_) 
print("Best Accuracy:", gnb.best_score_)
print(
    classification_report(
        y_test,
        gnb.predict(X_test),
        target_names=['large-cap', 'mid-cap', 'small-cap']
    )
)

Best Combination Of Hyperparameters: {'clf__var_smoothing': 7.906043210907701e-05}
Best Accuracy: 0.8501351873310158
              precision    recall  f1-score   support

   large-cap       1.00      1.00      1.00       474
     mid-cap       0.65      0.68      0.66       151
   small-cap       0.79      0.77      0.78       238

    accuracy                           0.88       863
   macro avg       0.81      0.81      0.81       863
weighted avg       0.88      0.88      0.88       863



In [92]:
print("Best Combination Of Hyperparameters:", tree.best_params_) 
print("Best Accuracy:", tree.best_score_)
print(
    classification_report(
        y_test,
        tree.predict(X_test), 
        target_names=['large-cap', 'mid-cap', 'small-cap']
    )
)

Best Combination Of Hyperparameters: {'clf__min_samples_split': 7, 'clf__min_samples_leaf': 1, 'clf__max_depth': None, 'clf__class_weight': None}
Best Accuracy: 0.8733101583623021
              precision    recall  f1-score   support

   large-cap       0.96      0.97      0.97       474
     mid-cap       0.74      0.72      0.73       151
   small-cap       0.83      0.84      0.84       238

    accuracy                           0.89       863
   macro avg       0.85      0.84      0.85       863
weighted avg       0.89      0.89      0.89       863



In [93]:
print("Best Combination Of Hyperparameters:", bagged.best_params_) 
print("Best Accuracy:", bagged.best_score_)
print(
    classification_report(
        y_test,
        bagged.predict(X_test),
        target_names=['large-cap', 'mid-cap', 'small-cap']
    )
)

Best Combination Of Hyperparameters: {'clf__n_estimators': 85, 'clf__base_estimator__min_samples_split': 13, 'clf__base_estimator__min_samples_leaf': 1, 'clf__base_estimator__max_depth': 4, 'clf__base_estimator__class_weight': None}
Best Accuracy: 0.8427964465044419
              precision    recall  f1-score   support

   large-cap       0.93      0.95      0.94       474
     mid-cap       0.64      0.54      0.58       151
   small-cap       0.78      0.82      0.80       238

    accuracy                           0.84       863
   macro avg       0.78      0.77      0.77       863
weighted avg       0.84      0.84      0.84       863



In [94]:
print("Best Combination Of Hyperparameters:", rf.best_params_) 
print("Best Accuracy:", rf.best_score_)
print(
    classification_report(
        y_test, 
        rf.predict(X_test),
        target_names=['large-cap', 'mid-cap', 'small-cap']
    )
)

Best Combination Of Hyperparameters: {'clf__n_estimators': 60, 'clf__min_samples_split': 6, 'clf__min_samples_leaf': 7, 'clf__max_depth': 4, 'clf__class_weight': 'balanced_subsample'}
Best Accuracy: 0.8091927385090769
              precision    recall  f1-score   support

   large-cap       0.96      0.86      0.91       474
     mid-cap       0.45      0.62      0.52       151
   small-cap       0.79      0.78      0.78       238

    accuracy                           0.79       863
   macro avg       0.73      0.75      0.74       863
weighted avg       0.82      0.79      0.81       863



In [95]:
print("Best Combination Of Hyperparameters:", et.best_params_) 
print("Best Accuracy:", et.best_score_)
print(
    classification_report(
        y_test, 
        et.predict(X_test),
        target_names=['large-cap', 'mid-cap', 'small-cap']
    )
)

Best Combination Of Hyperparameters: {'clf__n_estimators': 149, 'clf__min_samples_split': 10, 'clf__min_samples_leaf': 1, 'clf__max_depth': 3, 'clf__class_weight': 'balanced'}
Best Accuracy: 0.8122827346465817
              precision    recall  f1-score   support

   large-cap       0.98      0.99      0.99       474
     mid-cap       0.51      0.85      0.64       151
   small-cap       0.81      0.45      0.58       238

    accuracy                           0.82       863
   macro avg       0.77      0.76      0.74       863
weighted avg       0.85      0.82      0.81       863



In [96]:
print("Best Combination Of Hyperparameters:", ada.best_params_) 
print("Best Accuracy:", ada.best_score_)
print(
    classification_report(
        y_test, 
        ada.predict(X_test), 
        target_names=['large-cap', 'mid-cap', 'small-cap']
    )
)

Best Combination Of Hyperparameters: {'clf__n_estimators': 112, 'clf__learning_rate': 0.9081632653061225, 'clf__base_estimator__min_samples_split': 18, 'clf__base_estimator__min_samples_leaf': 2, 'clf__base_estimator__max_depth': None, 'clf__base_estimator__class_weight': None}
Best Accuracy: 0.9602162997296253
              precision    recall  f1-score   support

   large-cap       1.00      0.99      1.00       474
     mid-cap       0.93      0.92      0.92       151
   small-cap       0.95      0.97      0.96       238

    accuracy                           0.97       863
   macro avg       0.96      0.96      0.96       863
weighted avg       0.97      0.97      0.97       863



In [97]:
print("Best Combination Of Hyperparameters:", gb.best_params_) 
print("Best Accuracy:", gb.best_score_)
print(
    classification_report(
        y_test,
        gb.predict(X_test),
        target_names=['large-cap', 'mid-cap', 'small-cap']
    )
)

Best Combination Of Hyperparameters: {'clf__n_estimators': 132, 'clf__min_samples_split': 19, 'clf__min_samples_leaf': 2, 'clf__max_depth': 4, 'clf__learning_rate': 0.7244897959183674}
Best Accuracy: 0.9729625337968327
              precision    recall  f1-score   support

   large-cap       1.00      1.00      1.00       474
     mid-cap       0.96      0.90      0.93       151
   small-cap       0.94      0.97      0.96       238

    accuracy                           0.98       863
   macro avg       0.97      0.96      0.96       863
weighted avg       0.98      0.98      0.98       863



In [98]:
print("Best Combination Of Hyperparameters:", vote.best_params_) 
print("Best Accuracy:", vote.best_score_)
print(
    classification_report(
        y_test,
        vote.predict(X_test), 
        target_names=['large-cap', 'mid-cap', 'small-cap']
    )
)

Best Combination Of Hyperparameters: {'clf__tree__min_samples_split': 3, 'clf__tree__min_samples_leaf': 5, 'clf__tree__max_depth': 4, 'clf__tree__class_weight': 'balanced', 'clf__gb__n_estimators': 146, 'clf__gb__min_samples_split': 14, 'clf__gb__min_samples_leaf': 7, 'clf__gb__max_depth': 4, 'clf__gb__learning_rate': 0.6510204081632653, 'clf__ada__n_estimators': 68, 'clf__ada__learning_rate': 0.5959183673469388, 'clf__ada__base_estimator__min_samples_split': 9, 'clf__ada__base_estimator__min_samples_leaf': 6, 'clf__ada__base_estimator__max_depth': None, 'clf__ada__base_estimator__class_weight': 'balanced'}
Best Accuracy: 0.9636925453843183
              precision    recall  f1-score   support

   large-cap       0.99      1.00      1.00       474
     mid-cap       0.94      0.87      0.90       151
   small-cap       0.92      0.95      0.94       238

    accuracy                           0.96       863
   macro avg       0.95      0.94      0.94       863
weighted avg       0.96  

In [99]:
print("Best Combination Of Hyperparameters:", xgb.best_params_) 
print("Best Accuracy:", xgb.best_score_)
print(
    classification_report(
        y_test,
        xgb.predict(X_test), 
        target_names=['large-cap', 'mid-cap', 'small-cap']
    )
)

Best Combination Of Hyperparameters: {'clf__n_estimators': 958, 'clf__max_depth': 3, 'clf__learning_rate': 0.21020408163265308}
Best Accuracy: 0.9652375434530707
              precision    recall  f1-score   support

   large-cap       1.00      1.00      1.00       474
     mid-cap       0.94      0.91      0.93       151
   small-cap       0.95      0.97      0.96       238

    accuracy                           0.97       863
   macro avg       0.96      0.96      0.96       863
weighted avg       0.97      0.97      0.97       863



In [100]:
print("Best Combination Of Hyperparameters:", svm.best_params_) 
print("Best Accuracy:", svm.best_score_)
print(
    classification_report(
        y_test,
        svm.predict(X_test),
        target_names=['large-cap', 'mid-cap', 'small-cap']
    )
)

Best Combination Of Hyperparameters: {'clf__kernel': 'poly', 'clf__gamma': 1.9306977288832496, 'clf__class_weight': 'balanced', 'clf__C': 5.179474679231202}
Best Accuracy: 0.9312475859405176
              precision    recall  f1-score   support

   large-cap       1.00      1.00      1.00       474
     mid-cap       0.84      0.85      0.84       151
   small-cap       0.90      0.90      0.90       238

    accuracy                           0.95       863
   macro avg       0.92      0.92      0.92       863
weighted avg       0.95      0.95      0.95       863



In [101]:
print("Best Combination Of Hyperparameters:", ffnn.best_params_) 
print("Best Accuracy:", ffnn.best_score_)
print(
    classification_report(
        y_test,
        ffnn.predict(X_test),
        target_names=['large-cap', 'mid-cap', 'small-cap']
    )
)

Best Combination Of Hyperparameters: {'clf__opt_learning_rate': 0.00339322177189533, 'clf__layer_two_neurons': 79, 'clf__layer_two_dropout': 0.5612244897959183, 'clf__layer_one_neurons': 130, 'clf__layer_one_dropout': 0.616326530612245, 'clf__epochs': 6, 'clf__batch_size': 128}
Best Accuracy: 0.8455001856655446
              precision    recall  f1-score   support

   large-cap       1.00      1.00      1.00       474
     mid-cap       0.67      0.24      0.35       151
   small-cap       0.65      0.92      0.77       238

    accuracy                           0.84       863
   macro avg       0.77      0.72      0.71       863
weighted avg       0.85      0.84      0.82       863



---