In [1]:
!pip install --upgrade pip
!pip install pandas scikit-learn matplotlib seaborn ipywidgets
!pip install fancyimpute optuna
# !pip install xgboost catboost lightgbm

!mkdir model_params

Collecting pip
  Downloading pip-23.3.1-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 23.1.2
    Uninstalling pip-23.1.2:
      Successfully uninstalled pip-23.1.2
Successfully installed pip-23.3.1
Collecting jedi>=0.16 (from ipython>=4.0.0->ipywidgets)
  Downloading jedi-0.19.1-py2.py3-none-any.whl.metadata (22 kB)
Downloading jedi-0.19.1-py2.py3-none-any.whl (1.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: jedi
Successfully installed jedi-0.19.1
[0mCollecting fancyimpute
  Downloading fancyimpute-0.7.0.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting optuna
  Downloading optuna-3.5.0-py3-none-any.whl.metadata (17 kB)
Collecting knnimpute>=0

In [2]:
import json
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, roc_auc_score

# Data Preparation

In [3]:
orig_df = pd.read_csv("https://www.cpe.ku.ac.th/~cnc/customer_data.csv")

In [4]:
df = orig_df[~orig_df['Segmentation'].isna()]

In [5]:
df['Segmentation'] = df['Segmentation'].map({'A':0,'B':1,'C':2,'D':3})

In [6]:
df.dtypes

ID                   int64
Gender              object
Ever_Married        object
Age                  int64
Graduated           object
Profession          object
Work_Experience    float64
Spending_Score      object
Family_Size        float64
Var_1               object
Segmentation         int64
dtype: object

In [7]:
df.describe()

Unnamed: 0,ID,Age,Work_Experience,Family_Size,Segmentation
count,10695.0,10695.0,9597.0,10247.0,10695.0
mean,463468.08864,43.511828,2.619777,2.844052,1.530902
std,2600.966411,16.774158,3.39079,1.536427,1.158536
min,458982.0,18.0,0.0,1.0,0.0
25%,461220.5,30.0,0.0,2.0,0.0
50%,463451.0,41.0,1.0,3.0,2.0
75%,465733.5,53.0,4.0,4.0,3.0
max,467974.0,89.0,14.0,9.0,3.0


In [8]:
df.nunique()

ID                 8363
Gender                2
Ever_Married          2
Age                  67
Graduated             2
Profession            9
Work_Experience      15
Spending_Score        3
Family_Size           9
Var_1                 7
Segmentation          4
dtype: int64

# Preprocessing

## Encode Categorical Columns

In [9]:
df_target = df['Segmentation']
df = df.drop('Segmentation', axis=1)

In [10]:
categorical_cols = df.select_dtypes(include='object').columns.tolist()

In [11]:
encoders = dict()

for col in categorical_cols:
    series = df[col]
    label_encoder = LabelEncoder()
    df[col] = pd.Series(
        label_encoder.fit_transform(series[series.notnull()]),
        index=series[series.notnull()].index
    )
    encoders[col] = label_encoder

In [12]:
for encoder in encoders:
    print(encoder, encoders[encoder].classes_)

Gender ['Female' 'Male']
Ever_Married ['No' 'Yes']
Graduated ['No' 'Yes']
Profession ['Artist' 'Doctor' 'Engineer' 'Entertainment' 'Executive' 'Healthcare'
 'Homemaker' 'Lawyer' 'Marketing']
Spending_Score ['Average' 'High' 'Low']
Var_1 ['Cat_1' 'Cat_2' 'Cat_3' 'Cat_4' 'Cat_5' 'Cat_6' 'Cat_7']


## Missing Value Analysis

In [13]:
df.isna().sum()

ID                    0
Gender                0
Ever_Married        190
Age                   0
Graduated           102
Profession          162
Work_Experience    1098
Spending_Score        0
Family_Size         448
Var_1               108
dtype: int64

In [14]:
df = df.ffill()

df.isna().sum()

ID                 0
Gender             0
Ever_Married       0
Age                0
Graduated          0
Profession         0
Work_Experience    0
Spending_Score     0
Family_Size        0
Var_1              0
dtype: int64

In [15]:
# from fancyimpute import KNN

# df = pd.DataFrame(KNN(k=5).fit_transform(df), columns=df.columns)

# df.isna().sum()

In [16]:
for col in categorical_cols:
    print(df)

           ID  Gender  Ever_Married  Age  Graduated  Profession  \
0      462809       1           0.0   22        0.0         5.0   
1      462643       0           1.0   38        1.0         2.0   
2      466315       0           1.0   67        1.0         2.0   
3      461735       1           1.0   67        1.0         7.0   
4      462669       0           1.0   40        1.0         3.0   
...       ...     ...           ...  ...        ...         ...   
10690  467954       1           0.0   29        0.0         5.0   
10691  467958       0           0.0   35        1.0         1.0   
10692  467960       0           0.0   53        1.0         3.0   
10693  467961       1           1.0   47        1.0         4.0   
10694  467968       0           0.0   43        1.0         5.0   

       Work_Experience  Spending_Score  Family_Size  Var_1  
0                  1.0               2          4.0    3.0  
1                  1.0               0          3.0    3.0  
2           

# ML

In [17]:
import optuna
from sklearn.metrics import accuracy_score

In [18]:
df.head()

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1
0,462809,1,0.0,22,0.0,5.0,1.0,2,4.0,3.0
1,462643,0,1.0,38,1.0,2.0,1.0,0,3.0,3.0
2,466315,0,1.0,67,1.0,2.0,1.0,2,1.0,5.0
3,461735,1,1.0,67,1.0,7.0,0.0,1,2.0,5.0
4,462669,0,1.0,40,1.0,3.0,0.0,1,6.0,5.0


In [19]:
X =  df
X = X.drop('ID', axis=1)
X.shape

(10695, 9)

In [20]:
y = df_target
y.shape

(10695,)

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=69)

## Gradent Boosting

In [31]:
from sklearn.ensemble import GradientBoostingClassifier

def objective(trial: optuna.Trial) -> float:
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 5000, step = 100),
        "learning_rate": trial.suggest_float("learning_rate", 1e-4, 0.3, log = True),
        "max_depth": trial.suggest_int("max_depth", 1, 9),
        "subsample": trial.suggest_float("subsample", 0.5, 0.9, step = 0.05),
        "max_features": trial.suggest_categorical("max_features", [None, "sqrt", "log2"]),
        "random_state": 42,
        }

    _clf = GradientBoostingClassifier(**params)
    _clf.fit(X_train, y_train)
    return accuracy_score(y_test, _clf.predict(X_test))

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

[I 2023-12-11 09:52:13,930] A new study created in memory with name: no-name-d3a12af9-e3d8-487f-90d8-dff6054861da
[I 2023-12-11 09:53:20,329] Trial 0 finished with value: 0.49181860682561945 and parameters: {'n_estimators': 3600, 'learning_rate': 0.009039163911464339, 'max_depth': 3, 'subsample': 0.5, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.49181860682561945.
[I 2023-12-11 09:53:30,877] Trial 1 finished with value: 0.4946236559139785 and parameters: {'n_estimators': 600, 'learning_rate': 0.11016360371763322, 'max_depth': 3, 'subsample': 0.5, 'max_features': 'sqrt'}. Best is trial 1 with value: 0.4946236559139785.
[I 2023-12-11 09:55:50,328] Trial 2 finished with value: 0.48761103319308086 and parameters: {'n_estimators': 4200, 'learning_rate': 0.003556681447966854, 'max_depth': 4, 'subsample': 0.8, 'max_features': None}. Best is trial 1 with value: 0.4946236559139785.
[I 2023-12-11 09:56:54,266] Trial 3 finished with value: 0.4834034595605423 and parameters: {'n_estimato

In [33]:
gb_best_params = study.best_params
gb_best_score = study.best_value

In [34]:
with open(f'model_params/GradientBoostingClassifier-params-{gb_best_score:.4f}.json', 'w') as f:
    json.dump(gb_best_params, f)

In [None]:
gb_clf = GradientBoostingClassifier(**gb_best_params)

gb_clf.fit(X_train, y_train)

print("Train Accuracy: %.2f%%" % (gb_clf.score(X_train, y_train) * 100.0))
print("Test Accuracy: %.2f%%" % (gb_clf.score(X_test, y_test) * 100.0))

## Random Forest

In [22]:
from sklearn.ensemble import RandomForestClassifier

def objective(trial: optuna.Trial) -> float:
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 1, 200),
        'max_depth': trial.suggest_int('max_depth', 2, 20),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 200),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'max_features': trial.suggest_categorical('max_features', [None, 'sqrt', 'log2']),
        'random_state': trial.suggest_categorical('random_state', [42]),
    }

    _clf = RandomForestClassifier(**params)
    _clf.fit(X_train, y_train)
    return accuracy_score(y_test, _clf.predict(X_test))

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=500)

[I 2023-12-11 09:36:48,901] A new study created in memory with name: no-name-35ba4fe0-0a3e-45de-8847-c916bc1359bd
[I 2023-12-11 09:36:49,941] Trial 0 finished with value: 0.4899485741000468 and parameters: {'n_estimators': 144, 'max_depth': 17, 'min_samples_split': 92, 'min_samples_leaf': 10, 'max_features': 'log2', 'random_state': 42}. Best is trial 0 with value: 0.4899485741000468.
[I 2023-12-11 09:36:50,062] Trial 1 finished with value: 0.49555867227676487 and parameters: {'n_estimators': 8, 'max_depth': 13, 'min_samples_split': 112, 'min_samples_leaf': 8, 'max_features': None, 'random_state': 42}. Best is trial 1 with value: 0.49555867227676487.
[I 2023-12-11 09:36:52,037] Trial 2 finished with value: 0.49555867227676487 and parameters: {'n_estimators': 165, 'max_depth': 9, 'min_samples_split': 89, 'min_samples_leaf': 10, 'max_features': 'log2', 'random_state': 42}. Best is trial 1 with value: 0.49555867227676487.
[I 2023-12-11 09:36:53,378] Trial 3 finished with value: 0.491351098

In [23]:
rf_best_params = study.best_params
rf_best_score = study.best_value

In [24]:
with open(f'model_params/RandomForestClassifier-params-{rf_best_score:.4f}.json', 'w') as f:
    json.dump(rf_best_params, f)

In [25]:
rf_clf = RandomForestClassifier(**rf_best_params)

rf_clf.fit(X_train, y_train)

print("Train Accuracy: %.2f%%" % (rf_clf.score(X_train, y_train) * 100.0))
print("Test Accuracy: %.2f%%" % (rf_clf.score(X_test, y_test) * 100.0))

Train Accuracy: 52.23%
Test Accuracy: 50.26%


## Gradient Boosting

## KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

def objective(trial: optuna.Trial) -> float:
    params = {
        'n_neighbors': trial.suggest_int('n_neighbors', 1, 100),
        'weights': trial.suggest_categorical('weights', ['uniform', 'distance']),
        'algorithm': trial.suggest_categorical('algorithm', ['auto', 'ball_tree', 'kd_tree', 'brute']),
    }

    _clf = KNeighborsClassifier(**params)
    return accuracy_score(y_test, _clf.fit(X_train, y_train).predict(X_test))

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

In [None]:
knn_best_params = study.best_params
knn_best_score = study.best_value

with open(f'model_params/KNeighborsClassifier-params-{knn_best_score:.4f}.json', 'w') as f:
    json.dump(knn_best_params, f)

In [None]:
knn_clf = KNeighborsClassifier(**knn_best_params)

knn_clf.fit(X_train, y_train)

print("Train Accuracy: %.2f%%" % (knn_clf.score(X_train, y_train) * 100.0))
print("Test Accuracy: %.2f%%" % (knn_clf.score(X_test, y_test) * 100.0))

Train Accuracy: 47.01%
Test Accuracy: 46.10%


## XGB

In [None]:
from xgboost import XGBClassifier

def objective(trial: optuna.Trial) -> float:
    param_grid = {
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_categorical('gamma', [0.5, 1, 1.5, 2, 5]),
        'subsample': trial.suggest_categorical('subsample', [0.6, 0.8, 1.0]),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.6, 0.8, 1.0]),
        'max_depth': trial.suggest_int('max_depth', 3, 5),
        'alpha': trial.suggest_categorical('alpha', [0, 0.1, 0.5, 1]),
        'n_estimators': trial.suggest_int('n_estimators', 500, 1000, 100),
        }

    _clf = XGBClassifier(**param_grid)
    return accuracy_score(y_test, _clf.fit(X_train, y_train).predict(X_test))

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

[I 2023-12-09 23:22:40,847] A new study created in memory with name: no-name-6b21cdeb-7d32-4a23-90de-5beea0f7d4c7
[I 2023-12-09 23:22:42,871] Trial 0 finished with value: 0.4908835904628331 and parameters: {'min_child_weight': 6, 'gamma': 2, 'subsample': 1.0, 'colsample_bytree': 1.0, 'max_depth': 4, 'alpha': 1, 'n_estimators': 1000}. Best is trial 0 with value: 0.4908835904628331.
[I 2023-12-09 23:22:45,040] Trial 1 finished with value: 0.4983637213651239 and parameters: {'min_child_weight': 8, 'gamma': 1, 'subsample': 1.0, 'colsample_bytree': 0.6, 'max_depth': 5, 'alpha': 1, 'n_estimators': 800}. Best is trial 1 with value: 0.4983637213651239.
[I 2023-12-09 23:22:48,355] Trial 2 finished with value: 0.4927536231884058 and parameters: {'min_child_weight': 10, 'gamma': 2, 'subsample': 0.8, 'colsample_bytree': 0.8, 'max_depth': 3, 'alpha': 0.1, 'n_estimators': 1000}. Best is trial 1 with value: 0.4983637213651239.
[I 2023-12-09 23:22:50,779] Trial 3 finished with value: 0.496493688639551

In [None]:
study.best_params, study.best_value

({'min_child_weight': 4,
  'gamma': 0.5,
  'subsample': 1.0,
  'colsample_bytree': 0.8,
  'max_depth': 4,
  'alpha': 0,
  'n_estimators': 800},
 0.501168770453483)

In [None]:
xgb_best_params = study.best_params
xgb_best_score = study.best_value
with open(f'XGBoostClassifier-params-{xgb_best_score:.4f}.json', 'w') as f:
    json.dump(xgb_best_params, f)

In [None]:
xgb_clf = XGBClassifier(**study.best_params)

xgb_clf.fit(X_train, y_train)

print("Train Accuracy: %.2f%%" % (accuracy_score(y_train, xgb_clf.predict(X_train)) * 100.0))
print("Test Accuracy: %.2f%%" % (accuracy_score(y_test, xgb_clf.predict(X_test)) * 100.0))

Train Accuracy: 52.07%
Test Accuracy: 50.12%


## LightGBM

In [None]:
from lightgbm import LGBMClassifier

def objective(trial: optuna.Trial) -> float:
    param_grid = {
        'num_leaves': trial.suggest_int('num_leaves', 2, 256),
        'max_depth': trial.suggest_int('max_depth', 3, 8),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.05, 0.1]),
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000, 50),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'subsample': trial.suggest_float('subsample', 0.1, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1.0),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.0, 10.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.0, 10.0),
        'random_state': trial.suggest_int('random_state', 69, 69),
        'eval_metric': 'multi_error',
        'eval_set': [(X_test, y_test)],
        }

    _clf = LGBMClassifier(**param_grid, verbose=-1)
    return accuracy_score(y_test, _clf.fit(X_train, y_train).predict(X_test))

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

[I 2023-12-09 23:26:14,265] A new study created in memory with name: no-name-ed424b00-4226-4db3-9a0f-473b37bc0dd7
[I 2023-12-09 23:26:14,874] Trial 0 finished with value: 0.4838709677419355 and parameters: {'num_leaves': 122, 'max_depth': 7, 'learning_rate': 0.1, 'n_estimators': 350, 'min_child_samples': 89, 'subsample': 0.25009726969462265, 'colsample_bytree': 0.6542703657345444, 'reg_alpha': 0.9050827428256514, 'reg_lambda': 1.489746481471943, 'random_state': 69}. Best is trial 0 with value: 0.4838709677419355.
[I 2023-12-09 23:26:15,661] Trial 1 finished with value: 0.49883122954651704 and parameters: {'num_leaves': 48, 'max_depth': 7, 'learning_rate': 0.05, 'n_estimators': 800, 'min_child_samples': 59, 'subsample': 0.8914190521083324, 'colsample_bytree': 0.656153157034567, 'reg_alpha': 2.429350433619173, 'reg_lambda': 9.983253124382184, 'random_state': 69}. Best is trial 1 with value: 0.49883122954651704.
[I 2023-12-09 23:26:17,245] Trial 2 finished with value: 0.45161290322580644 

In [None]:
lgbm_best_params = study.best_params
lgbm_best_score = study.best_value

In [None]:
with open(f'model_params/LGBMClassifier-params-{lgbm_best_score:.4f}.json', 'w') as f:
    json.dump(lgbm_best_params, f)

In [None]:
lgb_clf = LGBMClassifier(**lgbm_best_params, verbose=-1)

lgb_clf.fit(X_train, y_train)

print("Train Accuracy: %.2f%%" % (accuracy_score(y_train, lgb_clf.predict(X_train)) * 100.0))
print("Test Accuracy: %.2f%%" % (accuracy_score(y_test, lgb_clf.predict(X_test)) * 100.0))

Train Accuracy: 52.19%
Test Accuracy: 51.29%


## CatBoost

In [None]:
from catboost import CatBoostClassifier

def objective(trial: optuna.Trial) -> float:
    params = {
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'depth': trial.suggest_int('depth', 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        'l2_leaf_reg': trial.suggest_int('l2_leaf_reg', 1, 10),
        'border_count': trial.suggest_int('border_count', 5, 255),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100),
        'eval_metric': 'Accuracy',
        'verbose': False,
    }

    _clf = CatBoostClassifier(**params)
    return accuracy_score(y_test, _clf.fit(X_train, y_train).predict(X_test))

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

[I 2023-12-09 23:29:09,709] A new study created in memory with name: no-name-dc1a5b47-def8-4b41-85c4-9c6126ae2e6f
[I 2023-12-09 23:29:10,142] Trial 0 finished with value: 0.4801309022907901 and parameters: {'iterations': 119, 'depth': 8, 'learning_rate': 0.006242208415007259, 'l2_leaf_reg': 5, 'border_count': 205, 'min_data_in_leaf': 48}. Best is trial 0 with value: 0.4801309022907901.
[I 2023-12-09 23:29:11,884] Trial 1 finished with value: 0.49696119682094436 and parameters: {'iterations': 907, 'depth': 7, 'learning_rate': 0.002691125367876088, 'l2_leaf_reg': 10, 'border_count': 247, 'min_data_in_leaf': 3}. Best is trial 1 with value: 0.49696119682094436.
[I 2023-12-09 23:29:13,737] Trial 2 finished with value: 0.4927536231884058 and parameters: {'iterations': 762, 'depth': 8, 'learning_rate': 0.005843088453837564, 'l2_leaf_reg': 10, 'border_count': 113, 'min_data_in_leaf': 87}. Best is trial 1 with value: 0.49696119682094436.
[I 2023-12-09 23:29:16,273] Trial 3 finished with value: 

In [None]:
cat_best_params = study.best_params
cat_best_score = study.best_value

In [None]:
with open(f'model_params/CatBoostClassifier-params-{cat_best_score:.4f}.json', 'w') as f:
    json.dump(cat_best_params, f)

In [None]:
cat_clf = CatBoostClassifier(**cat_best_params, verbose=False)

cat_clf.fit(X_train, y_train)

print("Train Accuracy: %.2f%%" % (cat_clf.score(X_train, y_train) * 100.0))
print("Test Accuracy: %.2f%%" % (cat_clf.score(X_test, y_test) * 100.0))

Train Accuracy: 49.70%
Test Accuracy: 50.54%


## Ada Boost

In [26]:
from sklearn.ensemble import AdaBoostClassifier

def objective(trial: optuna.Trial) -> float:
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 10, 1000, 50),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 1, log=True),
        'algorithm': trial.suggest_categorical('algorithm', ['SAMME', 'SAMME.R']),
    }

    _clf = AdaBoostClassifier(**params)
    return accuracy_score(y_test, _clf.fit(X_train, y_train).predict(X_test))

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

[I 2023-12-11 09:42:03,507] A new study created in memory with name: no-name-7c36d628-8db9-47c3-97a1-eba347ab63c3
  'n_estimators': trial.suggest_int('n_estimators', 10, 1000, 50),
[I 2023-12-11 09:42:06,836] Trial 0 finished with value: 0.4796633941093969 and parameters: {'n_estimators': 610, 'learning_rate': 0.052218083692412834, 'algorithm': 'SAMME.R'}. Best is trial 0 with value: 0.4796633941093969.
  'n_estimators': trial.suggest_int('n_estimators', 10, 1000, 50),
[I 2023-12-11 09:42:07,507] Trial 1 finished with value: 0.38429172510518933 and parameters: {'n_estimators': 160, 'learning_rate': 0.0034989905959886035, 'algorithm': 'SAMME'}. Best is trial 0 with value: 0.4796633941093969.
  'n_estimators': trial.suggest_int('n_estimators', 10, 1000, 50),
[I 2023-12-11 09:42:08,109] Trial 2 finished with value: 0.4067321178120617 and parameters: {'n_estimators': 110, 'learning_rate': 0.0035555208554698494, 'algorithm': 'SAMME.R'}. Best is trial 0 with value: 0.4796633941093969.
  'n_e

KeyboardInterrupt: ignored

In [None]:
ada_best_params = study.best_params
ada_best_score = study.best_value

In [None]:
with open(f'model_params/AdaBoostClassifier-params-{ada_best_score:.4f}.json', 'w') as f:
    json.dump(ada_best_params, f)

In [None]:
ada_clf = AdaBoostClassifier(**ada_best_params)

ada_clf.fit(X_train, y_train)

print("Train Accuracy: %.2f%%" % (ada_clf.score(X_train, y_train) * 100.0))
print("Test Accuracy: %.2f%%" % (ada_clf.score(X_test, y_test) * 100.0))