# Model Benchmarks
---

## Import Libraries And Modules

In [1]:
# Basic libraries
import numpy as np
import pandas as pd

# Modelling libraries
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.decomposition import PCA

# Popular boosting package, separate from `sklearn`. (Many Kaggle competitions have been won with `XGBoost`.)
from xgboost import XGBClassifier

# Modelling using neural networks
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.regularizers import l2
from keras.optimizers import Adam
from keras.wrappers.scikit_learn import KerasClassifier

Using TensorFlow backend.


## Load Preprocessed Data

Load `preprocessed.csv` from `data` folder into a pandas DataFrame.

In [2]:
# Read the preprocessed dataset (path is relative to the notebook's folder) and
# print a summary of its index, column count, dtypes and memory usage.
df = pd.read_csv('../data/preprocessed.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4969 entries, 0 to 4968
Columns: 172 entries, security to sector_Wireless Telecommunication Services
dtypes: float64(34), int64(137), object(1)
memory usage: 6.5+ MB


---

## Set Up `X` And `y`

In [3]:
target = 'cap'
# Keep every column as a feature except the target itself, the `security`
# column and `year`.
# NOTE: membership is tested against a tuple of exact names. Testing
# `f not in target` would be a *substring* check against the string 'cap' and
# would silently drop any column named e.g. 'a', 'c', 'p', 'ca' or 'ap'.
X = df[[f for f in df.columns if f not in (target, 'security', 'year')]]
y = df[target]

---

## Train/Test Split

Our goal is to:
- avoid overfitting in our model, and
- get an unbiased estimate of model performance on new, 'unseen' data.

We always want to have a holdout set to test our model. Use the `train_test_split` function to split our X and y variables into a training set and a holdout set.

In [4]:
# Stratify on `y` so each split keeps the class proportions, and pin the seed so
# the holdout set (and therefore every score below) is the same on each
# Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

---

## Modelling

Our pipeline will consist of two stages:

1. An instance of `StandardScaler`
2. A classifier instance

Since we want to tune the hyperparameters of each classifier, we'll load our pipeline object into `RandomizedSearchCV`.

In [5]:
# Accumulators filled by every modelling cell below: the fitted search objects
# and their best cross-validated scores, in the same order.
models, best_scores = [], []

In [6]:
def fit_model(
    estimator,
    param_distributions,
    pca=False,
    nn=False,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    random_state=None
):
    """Tune `estimator` inside a scaling pipeline and report holdout performance.

    Builds a `Pipeline` of `StandardScaler` -> (optional `PCA`) -> `estimator`,
    runs a 5-fold `RandomizedSearchCV` over `param_distributions` on the
    training data, prints the best hyperparameters, the best cross-validated
    score and a classification report on the test data.

    Parameters
    ----------
    estimator : classifier instance placed in the pipeline under the name 'clf'.
    param_distributions : dict of search distributions, keyed by pipeline
        parameter name (e.g. 'clf__C'). It is never modified by this function.
    pca : if True, a `PCA` step named 'pca' is inserted after the scaler.
        Unless `nn` is also True, 'pca__n_components' is added to the search
        with values from `np.linspace(0.1, 0.9)`.
    nn : if True, `y_train` is one-hot encoded with `pd.get_dummies` before
        fitting (needed for the keras classifier).
    X_train, X_test, y_train, y_test : data to fit and evaluate on. The
        defaults are bound to the notebook's split when this cell is executed,
        so re-run this cell after re-splitting the data.
    random_state : forwarded to `RandomizedSearchCV`. The default `None` keeps
        the search unseeded; pass an int for a repeatable search.

    Returns
    -------
    The fitted `RandomizedSearchCV` object.
    """
    steps = [
        # Use an instance of `StandardScaler` to scale `X_train` and `X_test` for any model that uses Gradient Descent
        ('ss', StandardScaler()),
        ('clf', estimator)
    ]
    if pca:
        steps.insert(1, ('pca', PCA()))
        if not nn:
            # Build a new dict instead of assigning into the argument, so the
            # caller's `param_distributions` is left untouched.
            param_distributions = {
                **param_distributions,
                'pca__n_components': np.linspace(0.1, 0.9)
            }
    if nn:
        # Since this is a multiclass classification problem, keras needs y to be a one-hot encoded matrix.
        y_train = pd.get_dummies(y_train)

    # Fit the models on training data
    model = RandomizedSearchCV(
        estimator=Pipeline(steps),
        param_distributions=param_distributions,
        n_jobs=-1,
        cv=5,
        verbose=1,
        random_state=random_state
    ).fit(X_train, y_train)

    # Evaluate models on test data
    print("Best Combination Of Hyperparameters:", model.best_params_)
    # `best_score_` is the mean cross-validated score of the best candidate,
    # not the score on the holdout set.
    print("Best Accuracy:", model.best_score_)
    print(
        classification_report(
            y_test,
            model.predict(X_test),
            # NOTE(review): assumes the sorted class labels correspond to
            # large/mid/small cap in this order - confirm against preprocessing.
            target_names=['large-cap', 'mid-cap', 'small-cap']
        )
    )
    return model

In [7]:
# Multinomial logistic regression: search over L2 vs. no penalty,
# regularisation strength, class weighting and solver.
model = fit_model(
    estimator=LogisticRegression(max_iter=10_000, multi_class='multinomial'),
    param_distributions={
        'clf__penalty': ['l2', 'none'],
        'clf__C': np.logspace(-5, 2),
        'clf__class_weight': [None, 'balanced'],
        'clf__solver': ['newton-cg', 'sag', 'lbfgs'],
    },
)
models.append(model)
best_scores.append(model.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  3.3min finished
  "Setting penalty='none' will ignore the C and l1_ratio "


Best Combination Of Hyperparameters: {'clf__solver': 'lbfgs', 'clf__penalty': 'none', 'clf__class_weight': None, 'clf__C': 1.9306977288832496}
Best Accuracy: 0.8588298443370908
              precision    recall  f1-score   support

   large-cap       1.00      1.00      1.00       480
     mid-cap       0.74      0.63      0.68       316
   small-cap       0.76      0.85      0.80       447

    accuracy                           0.85      1243
   macro avg       0.84      0.82      0.83      1243
weighted avg       0.85      0.85      0.85      1243



In [8]:
# L1-penalised multinomial logistic regression (saga solver): search over
# regularisation strength and class weighting.
model = fit_model(
    estimator=LogisticRegression(
        penalty='l1',
        solver='saga',
        max_iter=10_000,
        multi_class='multinomial',
    ),
    param_distributions={
        'clf__C': np.logspace(-5, 2),
        'clf__class_weight': [None, 'balanced'],
    },
)
models.append(model)
best_scores.append(model.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 12.7min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed: 14.0min finished


Best Combination Of Hyperparameters: {'clf__class_weight': None, 'clf__C': 100.0}
Best Accuracy: 0.8548040794417606
              precision    recall  f1-score   support

   large-cap       1.00      0.99      1.00       480
     mid-cap       0.74      0.61      0.67       316
   small-cap       0.75      0.85      0.80       447

    accuracy                           0.85      1243
   macro avg       0.83      0.82      0.82      1243
weighted avg       0.85      0.85      0.84      1243



In [9]:
# Elastic-net multinomial logistic regression (saga solver): search over
# regularisation strength, class weighting and the L1/L2 mixing ratio.
model = fit_model(
    estimator=LogisticRegression(
        penalty='elasticnet',
        solver='saga',
        max_iter=10_000,
        multi_class='multinomial',
    ),
    param_distributions={
        'clf__C': np.logspace(-5, 2),
        'clf__class_weight': [None, 'balanced'],
        'clf__l1_ratio': np.linspace(0.1, 1),
    },
)
models.append(model)
best_scores.append(model.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed: 14.3min finished


Best Combination Of Hyperparameters: {'clf__l1_ratio': 0.6142857142857143, 'clf__class_weight': None, 'clf__C': 5.179474679231202}
Best Accuracy: 0.855072463768116
              precision    recall  f1-score   support

   large-cap       1.00      1.00      1.00       480
     mid-cap       0.73      0.61      0.67       316
   small-cap       0.75      0.84      0.80       447

    accuracy                           0.84      1243
   macro avg       0.83      0.82      0.82      1243
weighted avg       0.84      0.84      0.84      1243



In [10]:
# k-nearest neighbours: search over odd neighbour counts from 1 to 21,
# the vote weighting and the distance metric.
model = fit_model(
    estimator=KNeighborsClassifier(),
    param_distributions={
        'clf__n_neighbors': np.arange(1, 22, 2),
        'clf__weights': ['uniform', 'distance'],
        'clf__metric': ['euclidean', 'manhattan'],
    },
)
models.append(model)
best_scores.append(model.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   12.8s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   15.1s finished


Best Combination Of Hyperparameters: {'clf__weights': 'distance', 'clf__n_neighbors': 5, 'clf__metric': 'euclidean'}
Best Accuracy: 0.8961352657004831
              precision    recall  f1-score   support

   large-cap       1.00      1.00      1.00       480
     mid-cap       0.87      0.76      0.81       316
   small-cap       0.84      0.92      0.88       447

    accuracy                           0.91      1243
   macro avg       0.90      0.89      0.90      1243
weighted avg       0.91      0.91      0.91      1243



In [11]:
# Bernoulli naive Bayes: search over the smoothing parameter `alpha`.
model = fit_model(
    estimator=BernoulliNB(),
    param_distributions={
        'clf__alpha': np.logspace(-5, 2),
    },
)
models.append(model)
best_scores.append(model.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Best Combination Of Hyperparameters: {'clf__alpha': 0.0001}
Best Accuracy: 0.78341384863124
              precision    recall  f1-score   support

   large-cap       1.00      0.95      0.97       480
     mid-cap       0.62      0.58      0.60       316
   small-cap       0.72      0.79      0.75       447

    accuracy                           0.80      1243
   macro avg       0.78      0.77      0.77      1243
weighted avg       0.80      0.80      0.80      1243



[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    0.9s finished


In [12]:
# Gaussian naive Bayes: search over `var_smoothing`.
model = fit_model(
    estimator=GaussianNB(),
    param_distributions={
        'clf__var_smoothing': np.logspace(-9, -4),
    },
)
models.append(model)
best_scores.append(model.best_score_)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best Combination Of Hyperparameters: {'clf__var_smoothing': 7.906043210907701e-05}
Best Accuracy: 0.8215244229736983
              precision    recall  f1-score   support

   large-cap       1.00      1.00      1.00       480
     mid-cap       0.65      0.70      0.68       316
   small-cap       0.78      0.74      0.76       447

    accuracy                           0.83      1243
   macro avg       0.81      0.81      0.81      1243
weighted avg       0.83      0.83      0.83      1243



[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    0.9s finished


In [13]:
# Single decision tree: search over depth (1-4 or unlimited), split/leaf
# minimums and class weighting.
model = fit_model(
    estimator=DecisionTreeClassifier(),
    param_distributions={
        'clf__max_depth': list(np.arange(1, 5)) + [None],
        'clf__min_samples_split': np.arange(2, 21),
        'clf__min_samples_leaf': np.arange(1, 8),
        'clf__class_weight': [None, 'balanced'],
    },
)
models.append(model)
best_scores.append(model.best_score_)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best Combination Of Hyperparameters: {'clf__min_samples_split': 6, 'clf__min_samples_leaf': 6, 'clf__max_depth': None, 'clf__class_weight': None}
Best Accuracy: 0.8099838969404187
              precision    recall  f1-score   support

   large-cap       0.92      0.95      0.94       480
     mid-cap       0.72      0.68      0.70       316
   small-cap       0.80      0.80      0.80       447

    accuracy                           0.83      1243
   macro avg       0.81      0.81      0.81      1243
weighted avg       0.83      0.83      0.83      1243



[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    1.6s finished


In [14]:
# Bagged decision trees: search over the base tree's settings and the number
# of bagged estimators.
model = fit_model(
    estimator=BaggingClassifier(DecisionTreeClassifier()),
    param_distributions={
        'clf__base_estimator__max_depth': list(np.arange(1, 5)) + [None],
        'clf__base_estimator__min_samples_split': np.arange(2, 21),
        'clf__base_estimator__min_samples_leaf': np.arange(1, 8),
        'clf__base_estimator__class_weight': [None, 'balanced'],
        'clf__n_estimators': np.arange(1, 150),
    },
)
models.append(model)
best_scores.append(model.best_score_)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  1.5min finished


Best Combination Of Hyperparameters: {'clf__n_estimators': 101, 'clf__base_estimator__min_samples_split': 10, 'clf__base_estimator__min_samples_leaf': 2, 'clf__base_estimator__max_depth': None, 'clf__base_estimator__class_weight': None}
Best Accuracy: 0.8939881910896403
              precision    recall  f1-score   support

   large-cap       0.97      0.96      0.97       480
     mid-cap       0.81      0.84      0.82       316
   small-cap       0.88      0.88      0.88       447

    accuracy                           0.90      1243
   macro avg       0.89      0.89      0.89      1243
weighted avg       0.90      0.90      0.90      1243



In [15]:
# Random forest: search over tree depth (1-4), forest size, split/leaf
# minimums and class weighting.
model = fit_model(
    estimator=RandomForestClassifier(),
    param_distributions={
        'clf__max_depth': np.arange(1, 5),
        'clf__n_estimators': np.arange(1, 150),
        'clf__min_samples_split': np.arange(2, 21),
        'clf__min_samples_leaf': np.arange(1, 8),
        'clf__class_weight': [None, 'balanced', 'balanced_subsample'],
    },
)
models.append(model)
best_scores.append(model.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   10.3s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   11.4s finished


Best Combination Of Hyperparameters: {'clf__n_estimators': 102, 'clf__min_samples_split': 9, 'clf__min_samples_leaf': 1, 'clf__max_depth': 4, 'clf__class_weight': 'balanced'}
Best Accuracy: 0.773215244229737
              precision    recall  f1-score   support

   large-cap       0.94      0.87      0.90       480
     mid-cap       0.57      0.68      0.62       316
   small-cap       0.79      0.75      0.77       447

    accuracy                           0.78      1243
   macro avg       0.77      0.77      0.77      1243
weighted avg       0.80      0.78      0.79      1243



In [16]:
# Extra-trees ensemble: same search space as the random forest cell.
model = fit_model(
    estimator=ExtraTreesClassifier(),
    param_distributions={
        'clf__max_depth': np.arange(1, 5),
        'clf__n_estimators': np.arange(1, 150),
        'clf__min_samples_split': np.arange(2, 21),
        'clf__min_samples_leaf': np.arange(1, 8),
        'clf__class_weight': [None, 'balanced', 'balanced_subsample'],
    },
)
models.append(model)
best_scores.append(model.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    4.2s


Best Combination Of Hyperparameters: {'clf__n_estimators': 75, 'clf__min_samples_split': 11, 'clf__min_samples_leaf': 6, 'clf__max_depth': 4, 'clf__class_weight': 'balanced_subsample'}
Best Accuracy: 0.7541599570585078
              precision    recall  f1-score   support

   large-cap       0.95      0.99      0.97       480
     mid-cap       0.54      0.89      0.67       316
   small-cap       0.86      0.42      0.56       447

    accuracy                           0.76      1243
   macro avg       0.78      0.77      0.73      1243
weighted avg       0.81      0.76      0.75      1243



[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    4.9s finished


In [17]:
# AdaBoost over decision trees: search over the base tree's settings, the
# number of boosting rounds and the learning rate.
model = fit_model(
    estimator=AdaBoostClassifier(base_estimator=DecisionTreeClassifier()),
    param_distributions={
        'clf__base_estimator__max_depth': list(np.arange(1, 5)) + [None],
        'clf__base_estimator__min_samples_split': np.arange(2, 21),
        'clf__base_estimator__min_samples_leaf': np.arange(1, 8),
        'clf__base_estimator__class_weight': [None, 'balanced'],
        'clf__n_estimators': np.arange(1, 150),
        'clf__learning_rate': np.linspace(0.1, 1),
    },
)
models.append(model)
best_scores.append(model.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   37.4s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   38.8s finished


Best Combination Of Hyperparameters: {'clf__n_estimators': 133, 'clf__learning_rate': 0.5040816326530613, 'clf__base_estimator__min_samples_split': 5, 'clf__base_estimator__min_samples_leaf': 1, 'clf__base_estimator__max_depth': 2, 'clf__base_estimator__class_weight': 'balanced'}
Best Accuracy: 0.8832528180354268
              precision    recall  f1-score   support

   large-cap       1.00      1.00      1.00       480
     mid-cap       0.77      0.77      0.77       316
   small-cap       0.84      0.83      0.84       447

    accuracy                           0.88      1243
   macro avg       0.87      0.87      0.87      1243
weighted avg       0.88      0.88      0.88      1243



In [18]:
# Gradient boosting: search over tree depth, number of stages, split/leaf
# minimums and the learning rate.
model = fit_model(
    estimator=GradientBoostingClassifier(),
    param_distributions={
        'clf__max_depth': np.arange(1, 5),
        'clf__n_estimators': np.arange(1, 150),
        'clf__min_samples_split': np.arange(2, 21),
        'clf__min_samples_leaf': np.arange(1, 8),
        'clf__learning_rate': np.linspace(0.1, 1),
    },
)
models.append(model)
best_scores.append(model.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  1.2min finished


Best Combination Of Hyperparameters: {'clf__n_estimators': 89, 'clf__min_samples_split': 3, 'clf__min_samples_leaf': 2, 'clf__max_depth': 4, 'clf__learning_rate': 0.43061224489795924}
Best Accuracy: 0.9439076757917337
              precision    recall  f1-score   support

   large-cap       1.00      1.00      1.00       480
     mid-cap       0.91      0.90      0.90       316
   small-cap       0.93      0.94      0.93       447

    accuracy                           0.95      1243
   macro avg       0.95      0.94      0.95      1243
weighted avg       0.95      0.95      0.95      1243



In [19]:
# Voting ensemble of a decision tree, AdaBoost and gradient boosting. Each
# member is tuned through its own `clf__<name>__...` parameters.
model = fit_model(
    estimator=VotingClassifier([
        ('tree', DecisionTreeClassifier()),
        ('ada', AdaBoostClassifier(base_estimator=DecisionTreeClassifier())),
        ('gb', GradientBoostingClassifier()),
    ]),
    param_distributions={
        # Decision tree member
        'clf__tree__max_depth': list(np.arange(1, 5)) + [None],
        'clf__tree__min_samples_split': np.arange(2, 21),
        'clf__tree__min_samples_leaf': np.arange(1, 8),
        'clf__tree__class_weight': [None, 'balanced'],
        # AdaBoost member
        'clf__ada__base_estimator__max_depth': list(np.arange(1, 5)) + [None],
        'clf__ada__base_estimator__min_samples_split': np.arange(2, 21),
        'clf__ada__base_estimator__min_samples_leaf': np.arange(1, 8),
        'clf__ada__base_estimator__class_weight': [None, 'balanced'],
        'clf__ada__n_estimators': np.arange(1, 150),
        'clf__ada__learning_rate': np.linspace(0.1, 1),
        # Gradient boosting member
        'clf__gb__max_depth': np.arange(1, 5),
        'clf__gb__n_estimators': np.arange(1, 150),
        'clf__gb__min_samples_split': np.arange(2, 21),
        'clf__gb__min_samples_leaf': np.arange(1, 8),
        'clf__gb__learning_rate': np.linspace(0.1, 1),
    },
)
models.append(model)
best_scores.append(model.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.8min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  1.9min finished


Best Combination Of Hyperparameters: {'clf__tree__min_samples_split': 7, 'clf__tree__min_samples_leaf': 6, 'clf__tree__max_depth': 3, 'clf__tree__class_weight': None, 'clf__gb__n_estimators': 117, 'clf__gb__min_samples_split': 15, 'clf__gb__min_samples_leaf': 1, 'clf__gb__max_depth': 2, 'clf__gb__learning_rate': 0.48571428571428577, 'clf__ada__n_estimators': 32, 'clf__ada__learning_rate': 0.7061224489795919, 'clf__ada__base_estimator__min_samples_split': 19, 'clf__ada__base_estimator__min_samples_leaf': 3, 'clf__ada__base_estimator__max_depth': None, 'clf__ada__base_estimator__class_weight': None}
Best Accuracy: 0.9079441760601181
              precision    recall  f1-score   support

   large-cap       0.98      0.98      0.98       480
     mid-cap       0.83      0.85      0.84       316
   small-cap       0.90      0.89      0.90       447

    accuracy                           0.91      1243
   macro avg       0.91      0.91      0.91      1243
weighted avg       0.92      0.91  

In [20]:
# XGBoost with a multiclass soft-probability objective: search over tree
# depth, number of boosting rounds and the learning rate.
model = fit_model(
    estimator=XGBClassifier(objective='multi:softprob'),
    param_distributions={
        'clf__max_depth': np.arange(1, 25),
        'clf__n_estimators': np.arange(1, 1000),
        'clf__learning_rate': np.linspace(0.1, 1),
    },
)
models.append(model)
best_scores.append(model.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed: 12.7min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed: 15.9min finished


Best Combination Of Hyperparameters: {'clf__n_estimators': 616, 'clf__max_depth': 12, 'clf__learning_rate': 0.3938775510204082}
Best Accuracy: 0.9452495974235104
              precision    recall  f1-score   support

   large-cap       1.00      0.99      0.99       480
     mid-cap       0.93      0.91      0.92       316
   small-cap       0.93      0.95      0.94       447

    accuracy                           0.95      1243
   macro avg       0.95      0.95      0.95      1243
weighted avg       0.95      0.95      0.95      1243



In [21]:
# Support vector classifier (one-vs-one decision function): search over C,
# kernel, gamma (numeric grid plus 'scale') and class weighting.
model = fit_model(
    estimator=SVC(decision_function_shape='ovo'),
    param_distributions={
        'clf__C': np.logspace(-5, 2),
        'clf__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
        'clf__gamma': list(np.logspace(-5, 2)) + ['scale'],
        'clf__class_weight': [None, 'balanced'],
    },
)
models.append(model)
best_scores.append(model.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   58.2s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  1.2min finished


Best Combination Of Hyperparameters: {'clf__kernel': 'poly', 'clf__gamma': 1.9306977288832496, 'clf__class_weight': 'balanced', 'clf__C': 0.7196856730011514}
Best Accuracy: 0.8939881910896403
              precision    recall  f1-score   support

   large-cap       1.00      1.00      1.00       480
     mid-cap       0.83      0.79      0.81       316
   small-cap       0.86      0.89      0.87       447

    accuracy                           0.91      1243
   macro avg       0.90      0.89      0.89      1243
weighted avg       0.91      0.91      0.91      1243



In [22]:
def ffnn_fn(
    layer_one_neurons=18,
    layer_two_neurons=18,
    layer_one_dropout=0.5,
    layer_two_dropout=0.5,
    opt_learning_rate=0.01
):
    """Build and compile a feed-forward network with two hidden layers.

    Each hidden layer is a ReLU `Dense` layer with an L2 kernel regulariser,
    followed by `Dropout`. The input width is read from the notebook-level
    `X_train`.

    Parameters
    ----------
    layer_one_neurons, layer_two_neurons : units in the first / second hidden layer.
    layer_one_dropout, layer_two_dropout : dropout rate applied after each hidden layer.
    opt_learning_rate : learning rate passed to the `Adam` optimizer.

    Returns
    -------
    The compiled keras `Sequential` model.
    """
    n_features = X_train.shape[1]

    network = Sequential([
        Dense(layer_one_neurons, activation='relu', kernel_regularizer=l2(), input_dim=n_features),
        Dropout(layer_one_dropout),
        Dense(layer_two_neurons, activation='relu', kernel_regularizer=l2()),
        Dropout(layer_two_dropout),
        # The output layer uses softmax to accommodate the three different classes.
        Dense(3, activation='softmax'),
    ])

    # Since this is a multiclass classification problem, the loss function is
    # categorical_crossentropy.
    network.compile(
        optimizer=Adam(learning_rate=opt_learning_rate),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return network

# Feed-forward neural network: search over layer widths, dropout rates,
# optimizer learning rate, batch size and number of epochs. `nn=True` makes
# `fit_model` one-hot encode the training target.
model = fit_model(
    estimator=KerasClassifier(build_fn=ffnn_fn, verbose=0),
    param_distributions={
        'clf__layer_one_neurons': np.arange(10, X_train.shape[1]),
        'clf__layer_two_neurons': np.arange(10, X_train.shape[1]),
        'clf__layer_one_dropout': np.linspace(0.5, 0.8),
        'clf__layer_two_dropout': np.linspace(0.5, 0.8),
        'clf__opt_learning_rate': np.logspace(-4, -1),
        'clf__batch_size': [128, 256],
        'clf__epochs': np.arange(5, 50),
    },
    nn=True,
)
models.append(model)
best_scores.append(model.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  4.1min




[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  4.9min finished


Best Combination Of Hyperparameters: {'clf__opt_learning_rate': 0.0029470517025518097, 'clf__layer_two_neurons': 162, 'clf__layer_two_dropout': 0.5918367346938775, 'clf__layer_one_neurons': 30, 'clf__layer_one_dropout': 0.5612244897959183, 'clf__epochs': 37, 'clf__batch_size': 128}
Best Accuracy: 0.8247450452239937
              precision    recall  f1-score   support

   large-cap       1.00      1.00      1.00       480
     mid-cap       0.64      0.67      0.66       316
   small-cap       0.76      0.73      0.74       447

    accuracy                           0.82      1243
   macro avg       0.80      0.80      0.80      1243
weighted avg       0.82      0.82      0.82      1243



---

## Modelling With `PCA`

Use `PCA` (Principal component analysis) to simplify data, reduce noise, and find unmeasured latent variables.

Our pipeline will consist of three stages:

1. An instance of `StandardScaler`
2. An instance of `PCA`
3. A classifier instance

Since we want to tune the hyperparameters of each classifier, we'll load our pipeline object into `RandomizedSearchCV`.

In [23]:
# PCA + multinomial logistic regression: search over L2 vs. no penalty,
# regularisation strength, class weighting and solver.
model = fit_model(
    estimator=LogisticRegression(max_iter=10_000, multi_class='multinomial'),
    param_distributions={
        'clf__penalty': ['l2', 'none'],
        'clf__C': np.logspace(-5, 2),
        'clf__class_weight': [None, 'balanced'],
        'clf__solver': ['newton-cg', 'sag', 'lbfgs'],
    },
    pca=True,
)
models.append(model)
best_scores.append(model.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  1.8min finished
  "Setting penalty='none' will ignore the C and l1_ratio "


Best Combination Of Hyperparameters: {'pca__n_components': 0.6224489795918368, 'clf__solver': 'sag', 'clf__penalty': 'none', 'clf__class_weight': None, 'clf__C': 5.1794746792312125e-05}
Best Accuracy: 0.8467525496511004
              precision    recall  f1-score   support

   large-cap       0.99      1.00      1.00       480
     mid-cap       0.74      0.61      0.67       316
   small-cap       0.76      0.85      0.80       447

    accuracy                           0.85      1243
   macro avg       0.83      0.82      0.82      1243
weighted avg       0.84      0.85      0.84      1243



In [24]:
# PCA + L1-penalised multinomial logistic regression (saga solver).
model = fit_model(
    estimator=LogisticRegression(
        penalty='l1',
        solver='saga',
        max_iter=10_000,
        multi_class='multinomial',
    ),
    param_distributions={
        'clf__C': np.logspace(-5, 2),
        'clf__class_weight': [None, 'balanced'],
    },
    pca=True,
)
models.append(model)
best_scores.append(model.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  3.4min finished


Best Combination Of Hyperparameters: {'pca__n_components': 0.8836734693877552, 'clf__class_weight': 'balanced', 'clf__C': 0.7196856730011514}
Best Accuracy: 0.8462157809983897
              precision    recall  f1-score   support

   large-cap       1.00      1.00      1.00       480
     mid-cap       0.69      0.75      0.72       316
   small-cap       0.82      0.77      0.79       447

    accuracy                           0.85      1243
   macro avg       0.84      0.84      0.84      1243
weighted avg       0.85      0.85      0.85      1243



In [25]:
# PCA + elastic-net multinomial logistic regression (saga solver).
model = fit_model(
    estimator=LogisticRegression(
        penalty='elasticnet',
        solver='saga',
        max_iter=10_000,
        multi_class='multinomial',
    ),
    param_distributions={
        'clf__C': np.logspace(-5, 2),
        'clf__class_weight': [None, 'balanced'],
        'clf__l1_ratio': np.linspace(0.1, 1),
    },
    pca=True,
)
models.append(model)
best_scores.append(model.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   14.5s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   35.3s finished


Best Combination Of Hyperparameters: {'pca__n_components': 0.7857142857142857, 'clf__l1_ratio': 0.17346938775510207, 'clf__class_weight': None, 'clf__C': 0.05179474679231207}
Best Accuracy: 0.8298443370907139
              precision    recall  f1-score   support

   large-cap       0.99      1.00      0.99       480
     mid-cap       0.70      0.56      0.63       316
   small-cap       0.73      0.83      0.78       447

    accuracy                           0.83      1243
   macro avg       0.81      0.80      0.80      1243
weighted avg       0.82      0.83      0.82      1243



In [26]:
# PCA + k-nearest neighbours: search over odd neighbour counts from 1 to 21,
# the vote weighting and the distance metric.
model = fit_model(
    estimator=KNeighborsClassifier(),
    param_distributions={
        'clf__n_neighbors': np.arange(1, 22, 2),
        'clf__weights': ['uniform', 'distance'],
        'clf__metric': ['euclidean', 'manhattan'],
    },
    pca=True,
)
models.append(model)
best_scores.append(model.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    4.4s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    5.0s finished


Best Combination Of Hyperparameters: {'pca__n_components': 0.8346938775510204, 'clf__weights': 'distance', 'clf__n_neighbors': 1, 'clf__metric': 'euclidean'}
Best Accuracy: 0.9133118625872249
              precision    recall  f1-score   support

   large-cap       1.00      0.99      1.00       480
     mid-cap       0.84      0.81      0.83       316
   small-cap       0.87      0.89      0.88       447

    accuracy                           0.91      1243
   macro avg       0.90      0.90      0.90      1243
weighted avg       0.91      0.91      0.91      1243



In [27]:
# PCA + Bernoulli naive Bayes: search over the smoothing parameter `alpha`.
model = fit_model(
    estimator=BernoulliNB(),
    param_distributions={
        'clf__alpha': np.logspace(-5, 2),
    },
    pca=True,
)
models.append(model)
best_scores.append(model.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    3.3s


Best Combination Of Hyperparameters: {'pca__n_components': 0.589795918367347, 'clf__alpha': 0.003727593720314938}
Best Accuracy: 0.7106816961889426
              precision    recall  f1-score   support

   large-cap       0.85      1.00      0.92       480
     mid-cap       0.53      0.70      0.61       316
   small-cap       0.77      0.45      0.57       447

    accuracy                           0.73      1243
   macro avg       0.72      0.72      0.70      1243
weighted avg       0.74      0.73      0.71      1243



[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    3.7s finished


In [28]:
# PCA + Gaussian naive Bayes: search over `var_smoothing`.
model = fit_model(
    estimator=GaussianNB(),
    param_distributions={
        'clf__var_smoothing': np.logspace(-9, -4),
    },
    pca=True,
)
models.append(model)
best_scores.append(model.best_score_)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    2.8s


Best Combination Of Hyperparameters: {'pca__n_components': 0.4591836734693878, 'clf__var_smoothing': 3.5564803062231287e-07}
Best Accuracy: 0.7283950617283951
              precision    recall  f1-score   support

   large-cap       0.91      1.00      0.95       480
     mid-cap       0.53      0.88      0.66       316
   small-cap       0.90      0.38      0.54       447

    accuracy                           0.75      1243
   macro avg       0.78      0.75      0.72      1243
weighted avg       0.81      0.75      0.73      1243



[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    3.3s finished


In [29]:
# Single decision tree with the PCA option enabled.
# `max_depth` covers shallow trees (1-4) plus None, i.e. no depth limit.
decision_tree_space = {
    'clf__max_depth': [*np.arange(1, 5), None],
    'clf__min_samples_split': np.arange(2, 21),
    'clf__min_samples_leaf': np.arange(1, 8),
    'clf__class_weight': [None, 'balanced'],
}
model = fit_model(DecisionTreeClassifier(), decision_tree_space, pca=True)

# Keep the fitted search and its best cross-validated score for the final comparison.
models.append(model)
best_scores.append(model.best_score_)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    3.8s


Best Combination Of Hyperparameters: {'pca__n_components': 0.2469387755102041, 'clf__min_samples_split': 6, 'clf__min_samples_leaf': 5, 'clf__max_depth': None, 'clf__class_weight': 'balanced'}
Best Accuracy: 0.8193773483628556
              precision    recall  f1-score   support

   large-cap       0.99      0.96      0.97       480
     mid-cap       0.67      0.71      0.69       316
   small-cap       0.78      0.77      0.77       447

    accuracy                           0.83      1243
   macro avg       0.81      0.81      0.81      1243
weighted avg       0.83      0.83      0.83      1243



[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    4.7s finished


In [30]:
# Bagged decision trees with the PCA option enabled.
# `clf__base_estimator__*` keys tune the wrapped tree; `clf__n_estimators`
# tunes the size of the bag.
bagging_space = {
    'clf__base_estimator__max_depth': [*np.arange(1, 5), None],
    'clf__base_estimator__min_samples_split': np.arange(2, 21),
    'clf__base_estimator__min_samples_leaf': np.arange(1, 8),
    'clf__base_estimator__class_weight': [None, 'balanced'],
    'clf__n_estimators': np.arange(1, 150),
}
bagged_trees = BaggingClassifier(DecisionTreeClassifier())
model = fit_model(bagged_trees, bagging_space, pca=True)

# Keep the fitted search and its best cross-validated score for the final comparison.
models.append(model)
best_scores.append(model.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  1.9min finished


Best Combination Of Hyperparameters: {'pca__n_components': 0.8020408163265307, 'clf__n_estimators': 60, 'clf__base_estimator__min_samples_split': 4, 'clf__base_estimator__min_samples_leaf': 2, 'clf__base_estimator__max_depth': None, 'clf__base_estimator__class_weight': 'balanced'}
Best Accuracy: 0.9001610305958132
              precision    recall  f1-score   support

   large-cap       1.00      1.00      1.00       480
     mid-cap       0.82      0.79      0.81       316
   small-cap       0.86      0.88      0.87       447

    accuracy                           0.90      1243
   macro avg       0.89      0.89      0.89      1243
weighted avg       0.90      0.90      0.90      1243



In [31]:
# Random forest with the PCA option enabled; trees are kept shallow (depth 1-4).
random_forest_space = {
    'clf__max_depth': np.arange(1, 5),
    'clf__n_estimators': np.arange(1, 150),
    'clf__min_samples_split': np.arange(2, 21),
    'clf__min_samples_leaf': np.arange(1, 8),
    'clf__class_weight': [None, 'balanced', 'balanced_subsample'],
}
model = fit_model(RandomForestClassifier(), random_forest_space, pca=True)

# Keep the fitted search and its best cross-validated score for the final comparison.
models.append(model)
best_scores.append(model.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   10.2s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   11.5s finished


Best Combination Of Hyperparameters: {'pca__n_components': 0.7693877551020408, 'clf__n_estimators': 131, 'clf__min_samples_split': 3, 'clf__min_samples_leaf': 6, 'clf__max_depth': 3, 'clf__class_weight': 'balanced'}
Best Accuracy: 0.8215244229736983
              precision    recall  f1-score   support

   large-cap       0.98      1.00      0.99       480
     mid-cap       0.63      0.77      0.69       316
   small-cap       0.80      0.66      0.72       447

    accuracy                           0.82      1243
   macro avg       0.81      0.81      0.80      1243
weighted avg       0.83      0.82      0.82      1243



In [32]:
# Extremely randomised trees with the PCA option enabled; same search space
# as the random forest cell (shallow trees, depth 1-4).
extra_trees_space = {
    'clf__max_depth': np.arange(1, 5),
    'clf__n_estimators': np.arange(1, 150),
    'clf__min_samples_split': np.arange(2, 21),
    'clf__min_samples_leaf': np.arange(1, 8),
    'clf__class_weight': [None, 'balanced', 'balanced_subsample'],
}
model = fit_model(ExtraTreesClassifier(), extra_trees_space, pca=True)

# Keep the fitted search and its best cross-validated score for the final comparison.
models.append(model)
best_scores.append(model.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    4.0s


Best Combination Of Hyperparameters: {'pca__n_components': 0.4755102040816327, 'clf__n_estimators': 34, 'clf__min_samples_split': 19, 'clf__min_samples_leaf': 6, 'clf__max_depth': 4, 'clf__class_weight': 'balanced_subsample'}
Best Accuracy: 0.7659688674181427
              precision    recall  f1-score   support

   large-cap       0.98      0.99      0.99       480
     mid-cap       0.53      0.94      0.68       316
   small-cap       0.91      0.41      0.57       447

    accuracy                           0.77      1243
   macro avg       0.81      0.78      0.75      1243
weighted avg       0.84      0.77      0.76      1243



[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    5.0s finished


In [33]:
# AdaBoost over decision trees with the PCA option enabled.
# `clf__base_estimator__*` keys tune the boosted tree; the remaining keys tune
# the booster itself (learning rate: 50 evenly spaced values in [0.1, 1]).
adaboost_space = {
    'clf__base_estimator__max_depth': [*np.arange(1, 5), None],
    'clf__base_estimator__min_samples_split': np.arange(2, 21),
    'clf__base_estimator__min_samples_leaf': np.arange(1, 8),
    'clf__base_estimator__class_weight': [None, 'balanced'],
    'clf__n_estimators': np.arange(1, 150),
    'clf__learning_rate': np.linspace(0.1, 1, num=50),
}
boosted_trees = AdaBoostClassifier(base_estimator=DecisionTreeClassifier())
model = fit_model(boosted_trees, adaboost_space, pca=True)

# Keep the fitted search and its best cross-validated score for the final comparison.
models.append(model)
best_scores.append(model.best_score_)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  2.3min finished


Best Combination Of Hyperparameters: {'pca__n_components': 0.5408163265306123, 'clf__n_estimators': 100, 'clf__learning_rate': 0.8530612244897959, 'clf__base_estimator__min_samples_split': 18, 'clf__base_estimator__min_samples_leaf': 7, 'clf__base_estimator__max_depth': None, 'clf__base_estimator__class_weight': 'balanced'}
Best Accuracy: 0.9114331723027376
              precision    recall  f1-score   support

   large-cap       1.00      1.00      1.00       480
     mid-cap       0.85      0.80      0.82       316
   small-cap       0.86      0.90      0.88       447

    accuracy                           0.91      1243
   macro avg       0.90      0.90      0.90      1243
weighted avg       0.91      0.91      0.91      1243



In [34]:
# Gradient boosting with the PCA option enabled; shallow trees (depth 1-4) and
# a learning rate drawn from 50 evenly spaced values in [0.1, 1].
gradient_boosting_space = {
    'clf__max_depth': np.arange(1, 5),
    'clf__n_estimators': np.arange(1, 150),
    'clf__min_samples_split': np.arange(2, 21),
    'clf__min_samples_leaf': np.arange(1, 8),
    'clf__learning_rate': np.linspace(0.1, 1, num=50),
}
model = fit_model(GradientBoostingClassifier(), gradient_boosting_space, pca=True)

# Keep the fitted search and its best cross-validated score for the final comparison.
models.append(model)
best_scores.append(model.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   37.6s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   41.3s finished


Best Combination Of Hyperparameters: {'pca__n_components': 0.5571428571428572, 'clf__n_estimators': 106, 'clf__min_samples_split': 18, 'clf__min_samples_leaf': 7, 'clf__max_depth': 4, 'clf__learning_rate': 0.41224489795918373}
Best Accuracy: 0.8918411164787976
              precision    recall  f1-score   support

   large-cap       1.00      1.00      1.00       480
     mid-cap       0.83      0.77      0.80       316
   small-cap       0.85      0.89      0.87       447

    accuracy                           0.90      1243
   macro avg       0.89      0.89      0.89      1243
weighted avg       0.90      0.90      0.90      1243



In [35]:
# Voting ensemble of a single tree, AdaBoost over trees and gradient boosting,
# with the PCA option enabled. Each member is tuned through its own
# `clf__<member name>__...` prefix.
voting_members = [
    ('tree', DecisionTreeClassifier()),
    ('ada', AdaBoostClassifier(base_estimator=DecisionTreeClassifier())),
    ('gb', GradientBoostingClassifier()),
]
voting_space = {
    # Stand-alone decision tree member.
    'clf__tree__max_depth': [*np.arange(1, 5), None],
    'clf__tree__min_samples_split': np.arange(2, 21),
    'clf__tree__min_samples_leaf': np.arange(1, 8),
    'clf__tree__class_weight': [None, 'balanced'],
    # AdaBoost member: wrapped tree first, then the booster itself.
    'clf__ada__base_estimator__max_depth': [*np.arange(1, 5), None],
    'clf__ada__base_estimator__min_samples_split': np.arange(2, 21),
    'clf__ada__base_estimator__min_samples_leaf': np.arange(1, 8),
    'clf__ada__base_estimator__class_weight': [None, 'balanced'],
    'clf__ada__n_estimators': np.arange(1, 150),
    'clf__ada__learning_rate': np.linspace(0.1, 1, num=50),
    # Gradient boosting member.
    'clf__gb__max_depth': np.arange(1, 5),
    'clf__gb__n_estimators': np.arange(1, 150),
    'clf__gb__min_samples_split': np.arange(2, 21),
    'clf__gb__min_samples_leaf': np.arange(1, 8),
    'clf__gb__learning_rate': np.linspace(0.1, 1, num=50),
}
model = fit_model(VotingClassifier(voting_members), voting_space, pca=True)

# Keep the fitted search and its best cross-validated score for the final comparison.
models.append(model)
best_scores.append(model.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  3.1min finished


Best Combination Of Hyperparameters: {'pca__n_components': 0.6061224489795919, 'clf__tree__min_samples_split': 16, 'clf__tree__min_samples_leaf': 7, 'clf__tree__max_depth': 4, 'clf__tree__class_weight': None, 'clf__gb__n_estimators': 18, 'clf__gb__min_samples_split': 10, 'clf__gb__min_samples_leaf': 4, 'clf__gb__max_depth': 4, 'clf__gb__learning_rate': 0.6142857142857143, 'clf__ada__n_estimators': 88, 'clf__ada__learning_rate': 0.11836734693877551, 'clf__ada__base_estimator__min_samples_split': 13, 'clf__ada__base_estimator__min_samples_leaf': 7, 'clf__ada__base_estimator__max_depth': None, 'clf__ada__base_estimator__class_weight': 'balanced'}
Best Accuracy: 0.8918411164787976
              precision    recall  f1-score   support

   large-cap       1.00      1.00      1.00       480
     mid-cap       0.84      0.77      0.80       316
   small-cap       0.85      0.89      0.87       447

    accuracy                           0.90      1243
   macro avg       0.89      0.89      0.8

In [36]:
# XGBoost (multi-class soft-probability objective) with the PCA option enabled.
# This is the widest search in the notebook: depth 1-24 and up to 999 boosting
# rounds, which is why the cell takes minutes rather than seconds.
xgb_space = {
    'clf__max_depth': np.arange(1, 25),
    'clf__n_estimators': np.arange(1, 1000),
    'clf__learning_rate': np.linspace(0.1, 1, num=50),
}
xgb_clf = XGBClassifier(objective='multi:softprob')
model = fit_model(xgb_clf, xgb_space, pca=True)

# Keep the fitted search and its best cross-validated score for the final comparison.
models.append(model)
best_scores.append(model.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  9.9min
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed: 10.5min finished


Best Combination Of Hyperparameters: {'pca__n_components': 0.49183673469387756, 'clf__n_estimators': 304, 'clf__max_depth': 21, 'clf__learning_rate': 0.5408163265306123}
Best Accuracy: 0.9143853998926462
              precision    recall  f1-score   support

   large-cap       1.00      1.00      1.00       480
     mid-cap       0.85      0.82      0.83       316
   small-cap       0.87      0.90      0.89       447

    accuracy                           0.92      1243
   macro avg       0.91      0.90      0.91      1243
weighted avg       0.92      0.92      0.92      1243



In [37]:
# Support vector classifier (one-vs-one decision function) with the PCA
# option enabled. `C` and `gamma` share the same 50-point log grid
# (1e-5 .. 1e2); `gamma` may additionally take the string 'scale'.
svc_space = {
    'clf__C': np.logspace(-5, 2, num=50),
    'clf__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'clf__gamma': [*np.logspace(-5, 2, num=50), 'scale'],
    'clf__class_weight': [None, 'balanced'],
}
svc_clf = SVC(decision_function_shape='ovo')
model = fit_model(svc_clf, svc_space, pca=True)

# Keep the fitted search and its best cross-validated score for the final comparison.
models.append(model)
best_scores.append(model.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   24.5s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   28.0s finished


Best Combination Of Hyperparameters: {'pca__n_components': 0.589795918367347, 'clf__kernel': 'linear', 'clf__gamma': 1.9306977288832496, 'clf__class_weight': 'balanced', 'clf__C': 2.682695795279722}
Best Accuracy: 0.8475577026301664
              precision    recall  f1-score   support

   large-cap       1.00      1.00      1.00       480
     mid-cap       0.70      0.75      0.72       316
   small-cap       0.81      0.77      0.79       447

    accuracy                           0.85      1243
   macro avg       0.84      0.84      0.84      1243
weighted avg       0.86      0.85      0.85      1243



In [38]:
# Feed-forward neural network built by `ffnn_fn`, wrapped for scikit-learn.
# Hidden-layer widths range from 10 up to (but excluding) the number of
# training features.
n_train_features = X_train.shape[1]
ffnn_space = {
    'clf__layer_one_neurons': np.arange(10, n_train_features),
    'clf__layer_two_neurons': np.arange(10, n_train_features),
    'clf__layer_one_dropout': np.linspace(0.5, 0.8, num=50),
    'clf__layer_two_dropout': np.linspace(0.5, 0.8, num=50),
    'clf__opt_learning_rate': np.logspace(-4, -1, num=50),
    'clf__batch_size': [128, 256],
    'clf__epochs': np.arange(5, 50),
}
ffnn_clf = KerasClassifier(build_fn=ffnn_fn, verbose=0)
model = fit_model(ffnn_clf, ffnn_space, pca=True, nn=True)

# Keep the fitted search and its best cross-validated score for the final comparison.
models.append(model)
best_scores.append(model.best_score_)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  3.3min




[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:  3.6min finished


Best Combination Of Hyperparameters: {'clf__opt_learning_rate': 0.0019306977288832496, 'clf__layer_two_neurons': 134, 'clf__layer_two_dropout': 0.7020408163265306, 'clf__layer_one_neurons': 113, 'clf__layer_one_dropout': 0.7142857142857143, 'clf__epochs': 22, 'clf__batch_size': 128}
Best Accuracy: 0.8177670447882694
              precision    recall  f1-score   support

   large-cap       1.00      1.00      1.00       480
     mid-cap       0.62      0.75      0.68       316
   small-cap       0.80      0.68      0.73       447

    accuracy                           0.82      1243
   macro avg       0.81      0.81      0.81      1243
weighted avg       0.83      0.82      0.82      1243



---

## Best Model

In [39]:
# `models` and `best_scores` are appended to in lockstep, so the position of
# the highest score identifies the winning search (first one wins on ties).
top_score_position = best_scores.index(max(best_scores))
best_model = models[top_score_position]

# Display the winning classifier step of the best pipeline.
best_model.best_estimator_.named_steps['clf']

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bytree=1, gamma=0, learning_rate=0.3938775510204082,
              max_delta_step=0, max_depth=12, min_child_weight=1, missing=None,
              n_estimators=616, n_jobs=1, nthread=None,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, seed=None, silent=True,
              subsample=1)

In [40]:
# Score found by the hyperparameter search for the winning model.
print("Best Accuracy:", best_model.best_score_)

# Per-class precision / recall / F1 of the winning model on the test split.
cap_class_names = ['large-cap', 'mid-cap', 'small-cap']
test_predictions = best_model.predict(X_test)
test_report = classification_report(
    y_test,
    test_predictions,
    target_names=cap_class_names
)
print(test_report)

Best Accuracy: 0.9452495974235104
              precision    recall  f1-score   support

   large-cap       1.00      0.99      0.99       480
     mid-cap       0.93      0.91      0.92       316
   small-cap       0.93      0.95      0.94       447

    accuracy                           0.95      1243
   macro avg       0.95      0.95      0.95      1243
weighted avg       0.95      0.95      0.95      1243



The best model is the one that overfits the least while achieving the highest training and test accuracies. In this case, `XGBClassifier` has a cross-validated training accuracy and a test accuracy of about 0.95 each, which makes it the best-performing model.

Compared with the [baseline accuracy of 0.39](09_preprocessing_and_feature_engineering.ipynb#Baseline-Accuracy), `XGBClassifier` is considerably more accurate.

---

## Export To `csv`

In [41]:
# Predict a label for every row of X with the winning model and save the
# predictions without the index column.
all_predictions = pd.Series(best_model.predict(X))
all_predictions.to_csv('../data/predict.csv', index=False)

In [42]:
# Export the winning classifier's feature importances, most important first.
#
# `best_model` is whichever search scored highest, so it is not guaranteed to be
# a model that (a) exposes `feature_importances_` and (b) was trained directly
# on the columns of `X`. Both assumptions are checked explicitly so that a
# different winner fails with an actionable message instead of a bare attribute
# error or a pandas length-mismatch error.
best_clf = best_model.best_estimator_.named_steps['clf']

importances = getattr(best_clf, 'feature_importances_', None)
if importances is None:
    raise AttributeError(
        "{} does not expose `feature_importances_`; "
        "feature importances cannot be exported for this model.".format(
            type(best_clf).__name__
        )
    )

# Importances can only be attributed to the columns of `X` when the classifier
# saw those columns directly (for example, not components produced by a PCA step).
if len(importances) != len(X.columns):
    raise ValueError(
        "The best classifier reports {} feature importances but X has {} columns; "
        "the importances cannot be mapped back to the original features "
        "(was the best pipeline fitted with a PCA step?).".format(
            len(importances), len(X.columns)
        )
    )

feature_importance = pd.DataFrame({
    'Features': X.columns,
    'Feature Importance': importances,
}).sort_values('Feature Importance', ascending=False)
feature_importance.to_csv('../data/feature_importance.csv', index=False)