# Performance Baselines

In [1]:
%reload_ext watermark

In [2]:
%load_ext watermark
%watermark -p scikit-learn,mlxtend,xgboost

The watermark extension is already loaded. To reload it, use:
  %reload_ext watermark
scikit-learn: 1.0.1
mlxtend     : 0.19.0
xgboost     : 1.5.0



## Dataset

Source: https://archive.ics.uci.edu/ml/datasets/Dry+Bean+Dataset

In [3]:
import pandas as pd


X_train = pd.read_csv('https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs21/main/hw02-starter/dataset/X_train.csv', header=None).values
y_train = pd.read_csv('https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs21/main/hw02-starter/dataset/y_train.csv', header=None).values.ravel().astype(int)

X_test = pd.read_csv('https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs21/main/hw02-starter/dataset/X_test.csv', header=None).values
y_test = pd.read_csv('https://raw.githubusercontent.com/rasbt/stat451-machine-learning-fs21/main/hw02-starter/dataset/y_test.csv', header=None).values.ravel().astype(int)

print('X_train.shape:', X_train.shape)
print('y_train.shape:', y_train.shape)
print('X_test.shape:', X_test.shape)
print('y_test.shape:', y_test.shape)

X_train.shape: (9119, 16)
y_train.shape: (9119,)
X_test.shape: (4492, 16)
y_test.shape: (4492,)


In [4]:
from sklearn import model_selection
from sklearn.model_selection import train_test_split


X_train_sub, X_valid, y_train_sub, y_valid = \
    train_test_split(X_train, y_train, test_size=0.2, random_state=1, stratify=y_train)

print('Train/Valid/Test sizes:', y_train.shape[0], y_valid.shape[0], y_test.shape[0])

Train/Valid/Test sizes: 9119 1824 4492


## Baselines

Compare hyperparameter settings on validation set:

In [5]:
from sklearn.neighbors import KNeighborsClassifier


knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_sub, y_train_sub)
print(f"Train Accuracy: {knn.score(X_train_sub, y_train_sub)*100:0.3f}%")
print(f"Valid Accuracy: {knn.score(X_valid, y_valid)*100:0.3f}%")

Train Accuracy: 79.657%
Valid Accuracy: 71.162%


In [6]:
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train_sub, y_train_sub)
print(f"Train Accuracy: {knn.score(X_train_sub, y_train_sub)*100:0.3f}%")
print(f"Valid Accuracy: {knn.score(X_valid, y_valid)*100:0.3f}%")

Train Accuracy: 84.003%
Valid Accuracy: 71.930%


In [7]:
knn = KNeighborsClassifier(n_neighbors=7)
knn.fit(X_train_sub, y_train_sub)
print(f"Train Accuracy: {knn.score(X_train_sub, y_train_sub)*100:0.3f}%")
print(f"Valid Accuracy: {knn.score(X_valid, y_valid)*100:0.3f}%")

Train Accuracy: 77.478%
Valid Accuracy: 69.518%


Choose best model and train on whole training set:

In [8]:
model = KNeighborsClassifier(n_neighbors=3)
model.fit(X_train, y_train)
print(f"Train Accuracy: {model.score(X_train, y_train)*100:0.3f}%")
print(f"Test Accuracy: {model.score(X_test, y_test)*100:0.3f}%")

Train Accuracy: 84.965%
Test Accuracy: 71.305%


In [9]:
# 4.3

from optuna.integration import LightGBMPruningCallback
import numpy as np
import lightgbm
import optuna

from sklearn.metrics import log_loss
from sklearn.model_selection import StratifiedKFold

import warnings

warnings.filterwarnings("ignore", category=UserWarning)
#optuna.logging.set_verbosity(optuna.logging.WARNING)


def objective(trial, X_train, y_train, cv=5):
    
    param_grid = {
        "n_estimators": trial.suggest_categorical("n_estimators", [10, 100]),
        "learning_rate": trial.suggest_categorical("learning_rate", [0.01]),
    }
    
    cv_iterator = StratifiedKFold(n_splits=cv, shuffle=True, random_state=123)

    cv_scores = np.zeros(cv)
    for idx, (train_sub_idx, valid_idx) in enumerate(cv_iterator.split(X_train, y_train)):
        
        X_train_sub, X_valid = X_train[train_sub_idx], X_train[valid_idx]
        y_train_sub, y_valid = y_train[train_sub_idx], y_train[valid_idx]

        model = lightgbm.LGBMClassifier(objective="multi_logloss", **param_grid)
        model.fit(
            X_train_sub,
            y_train_sub,
            eval_set=[(X_valid, y_valid)],
            eval_metric="multi_logloss",
            verbose=-1,
            early_stopping_rounds=50,
            callbacks=[
                LightGBMPruningCallback(trial=trial, metric="multi_logloss")
            ],  # Add a pruning callback to eliminate unpromising candidates
        )
        preds = model.score(X_valid, y_valid)
        
        cv_scores[idx] = preds

    return 1-np.mean(cv_scores)

In [10]:
study = optuna.create_study(direction="minimize", study_name="LGBM Classifier")

def func(trial):
    return objective(trial, X_train, y_train)

study.optimize(func, n_trials=50);

[32m[I 2021-11-16 14:36:37,102][0m A new study created in memory with name: LGBM Classifier[0m
[32m[I 2021-11-16 14:36:39,875][0m Trial 0 finished with value: 0.07511800964286763 and parameters: {'n_estimators': 100, 'learning_rate': 0.01}. Best is trial 0 with value: 0.07511800964286763.[0m
[32m[I 2021-11-16 14:36:40,165][0m Trial 1 finished with value: 0.48239960158212314 and parameters: {'n_estimators': 10, 'learning_rate': 0.01}. Best is trial 0 with value: 0.07511800964286763.[0m
[32m[I 2021-11-16 14:36:42,683][0m Trial 2 finished with value: 0.07511800964286763 and parameters: {'n_estimators': 100, 'learning_rate': 0.01}. Best is trial 0 with value: 0.07511800964286763.[0m
[32m[I 2021-11-16 14:36:42,979][0m Trial 3 finished with value: 0.48239960158212314 and parameters: {'n_estimators': 10, 'learning_rate': 0.01}. Best is trial 0 with value: 0.07511800964286763.[0m
[32m[I 2021-11-16 14:36:43,285][0m Trial 4 finished with value: 0.48239960158212314 and parameters

[32m[I 2021-11-16 14:38:12,098][0m Trial 41 finished with value: 0.07511800964286763 and parameters: {'n_estimators': 100, 'learning_rate': 0.01}. Best is trial 0 with value: 0.07511800964286763.[0m
[32m[I 2021-11-16 14:38:15,413][0m Trial 42 finished with value: 0.07511800964286763 and parameters: {'n_estimators': 100, 'learning_rate': 0.01}. Best is trial 0 with value: 0.07511800964286763.[0m
[32m[I 2021-11-16 14:38:18,386][0m Trial 43 finished with value: 0.07511800964286763 and parameters: {'n_estimators': 100, 'learning_rate': 0.01}. Best is trial 0 with value: 0.07511800964286763.[0m
[32m[I 2021-11-16 14:38:21,276][0m Trial 44 finished with value: 0.07511800964286763 and parameters: {'n_estimators': 100, 'learning_rate': 0.01}. Best is trial 0 with value: 0.07511800964286763.[0m
[32m[I 2021-11-16 14:38:21,623][0m Trial 45 finished with value: 0.48239960158212314 and parameters: {'n_estimators': 10, 'learning_rate': 0.01}. Best is trial 0 with value: 0.07511800964286

In [11]:
print(f"\tBest value: {study.best_value:.5f}")
print(f"\tBest params:")

for key, value in study.best_params.items():
    print(f"\t\t{key}: {value}")

	Best value: 0.07512
	Best params:
		n_estimators: 100
		learning_rate: 0.01


In [12]:
model = lightgbm.LGBMClassifier(objective="multi_logloss", **study.best_params)
model.fit(X_train, y_train)

LGBMClassifier(learning_rate=0.01, objective='multi_logloss')

In [19]:
print(f"Training Accuracy: {model.score(X_train, y_train)*100:0.5f}")
print(f"Valid Accuracy: {model.score(X_valid, y_valid)*100:0.5f}")
print(f"Test Accuracy: {model.score(X_test, y_test)*100:0.5f}")

Training Accuracy: 95.64645
Valid Accuracy: 95.83333
Test Accuracy: 92.16385


In [22]:
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingClassifier
from mlxtend.classifier import StackingClassifier
from sklearn.ensemble import StackingClassifier


clf1 = KNeighborsClassifier(n_neighbors=5)
clf2 = RandomForestClassifier(random_state=123)
clf3 = HistGradientBoostingClassifier(random_state=123)
clf4 = AdaBoostClassifier(random_state=123)
clf5 = DecisionTreeClassifier(random_state=123,
                              max_depth=None)

lr = LogisticRegression(random_state=123)

estimators = [('clf1', clf1),
              ('clf2', clf2),
              ('clf3', clf3),
              ('clf4', clf4),
              ('clf5', clf5)]

sclf = StackingClassifier(estimators=estimators, 
                          final_estimator=lr, 
                          cv=10)


sclf.fit(X_train, y_train)
print("Training Accuracy: %0.2f" % sclf.score(X_train, y_train))
print("Validation Accuracy: %0.2f" % sclf.score(X_valid, y_valid))
print("Test Accuracy: %0.2f" % sclf.score(X_test, y_test))

Training Accuracy: 1.00
Validation Accuracy: 1.00
Test Accuracy: 0.92


In [23]:
print("Training Accuracy: %0.5f" % sclf.score(X_train, y_train))
print("Validation Accuracy: %0.5f" % sclf.score(X_valid, y_valid))
print("Test Accuracy: %0.5f" % sclf.score(X_test, y_test))

Training Accuracy: 1.00000
Validation Accuracy: 1.00000
Test Accuracy: 0.92053


In [24]:
from mlxtend.classifier import StackingCVClassifier


sclf = StackingCVClassifier(classifiers=[clf1, clf2, clf3, clf4, clf5], 
                            meta_classifier=lr, 
                            use_probas=True, # changed
                            drop_proba_col='last',
                            #use_features_in_secondary=True,
                            cv=10,
                            random_state=123)


sclf.fit(X_train, y_train)
print("Training Accuracy: %0.5f" % sclf.score(X_train, y_train))
print("Validation Accuracy: %0.5f" % sclf.score(X_valid, y_valid))
print("Test Accuracy: %0.5f" % sclf.score(X_test, y_test))

Training Accuracy: 1.00000
Validation Accuracy: 1.00000
Test Accuracy: 0.91986
