In [2]:
from sklearn.datasets import make_classification
import numpy as np

In [3]:
RANDOM_SEED = 6
X,y = make_classification(
    n_samples = 1000, 
    n_features= 20,
    n_classes = 2,
    random_state = 6,
    n_redundant = 5, # Redundant features
    n_informative = 15 # Independent features
)

In [4]:
X.shape,y.shape

((1000, 20), (1000,))

In [16]:
# Check label distribution
import pandas as pd
df = pd.DataFrame(y, columns = ['label'])
df['label'].value_counts()


1    500
0    500
Name: label, dtype: int64

In [27]:
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score
from sklearn.ensemble import AdaBoostClassifier
# Choose the cv strategy
cv = RepeatedStratifiedKFold(n_splits = 10,  n_repeats = 3, random_state = 1)

# Select model with default hyper params
model = AdaBoostClassifier()

# Apply cv on model
scores = cross_val_score(model, X, y, scoring='accuracy', cv = cv, n_jobs = -1)

# Returns the score for each pass (Total: 10*3 = 30) in a list 
print(scores, len(scores))
print('Accuracy: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))

[0.79 0.82 0.84 0.75 0.88 0.79 0.77 0.84 0.83 0.84 0.83 0.84 0.81 0.79
 0.72 0.76 0.84 0.76 0.79 0.82 0.84 0.8  0.78 0.82 0.8  0.84 0.81 0.89
 0.71 0.79] 30
Accuracy: 0.806 (0.041)


In [None]:
# cv on the training set only
#  TODO: Review https://www.ritchieng.com/machine-learning-cross-validation/

In [21]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state = 50)

dtc = DecisionTreeClassifier()
adaboost = AdaBoostClassifier(base_estimator = dtc)

hyperparams = {
    'n_estimators' : [10,50,100,500,1000,5000],
    'learning_rate': np.arange(0.2,1, 0.1)
}

rgs = RandomizedSearchCV(
        model,
        param_distributions = hyperparams,
        n_iter = 10,
        random_state = 50,
        n_jobs = -1,
        scoring = 'accuracy'
)
for cv_num_folds in range(3,6):
    rgs.cv = cv_num_folds
    rgs.fit(X_train, y_train)
    
    score = rgs.score(X_test, y_test)
    print("Num Folds: ", cv_num_folds)
    print("Score ", score)
    print("Model params ", rgs.best_params_)

    

Num Folds:  3
Score  0.795
Model params  {'n_estimators': 100, 'learning_rate': 0.30000000000000004}
Num Folds:  4
Score  0.795
Model params  {'n_estimators': 100, 'learning_rate': 0.30000000000000004}
Num Folds:  5
Score  0.795
Model params  {'n_estimators': 100, 'learning_rate': 0.30000000000000004}


In [None]:
# Run predictions on best model

best_model = 