In [2]:
# Get the training data
# Clean/preprocess/transform the data
# Train a machine learning model
# Evaluate and optimise the model
# Clean/preprocess/transform new data
# Fit the model on new data to make predictions.

In [22]:
import pandas as pd
import numpy as np
import load_data

from sklearn.datasets import make_classification
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score,classification_report,recall_score,confusion_matrix, roc_auc_score, precision_score, f1_score, roc_curve, auc, plot_confusion_matrix,plot_roc_curve
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import Normalizer
from sklearn import metrics

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

# Import Libraries for Modelling
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

import optuna

pd.set_option('display.max_rows', 999)
pd.set_option('display.max_columns', 999)

In [42]:
# Features to preprocess

# Categorical Features to Preprocess
categorical_features = [
    'homebanking_active_x', 'homebanking_active_y', 'homebanking_active',
    'has_homebanking_x', 'has_homebanking_y', 'has_homebanking',
    'has_insurance_21_x', 'has_insurance_21_y', 'has_insurance_21',
    'has_insurance_23_x', 'has_insurance_23_y', 'has_insurance_23',
    'has_life_insurance_fixed_cap_x', 'has_life_insurance_fixed_cap_y', 'has_life_insurance_fixed_cap',
    'has_life_insurance_decreasing_cap_x', 'has_life_insurance_decreasing_cap_y', 'has_life_insurance_decreasing_cap',
    'has_fire_car_other_insurance_x', 'has_fire_car_other_insurance_y', 'has_fire_car_other_insurance',
    'has_personal_loan_x', 'has_personal_loan_y', 'has_personal_loan',
    'has_mortgage_loan_x', 'has_mortgage_loan_y', 'has_mortgage_loan',
    'has_current_account_x', 'has_current_account_y', 'has_current_account',
    'has_pension_saving_x', 'has_pension_saving_y', 'has_pension_saving',
    'has_savings_account_x', 'has_savings_account_y', 'has_savings_account',
    'has_savings_account_starter_x', 'has_savings_account_starter',
    'has_current_account_starter_x', 'has_current_account_starter_y', 'has_current_account_starter',
    'visits_distinct_so_x', 'visits_distinct_so_y', 'visits_distinct_so',
    'visits_distinct_so_areas_x', 'visits_distinct_so_areas_y', 'visits_distinct_so_areas',
    'customer_gender_x',
#     'customer_postal_code_x', drop this, causes error...
    'customer_occupation_code_x',
    'customer_self_employed_x', 'customer_self_employed_y', 'customer_self_employed',
    'customer_education_x',
    'customer_children_x', 'customer_children_y', 'customer_children',
    'customer_relationship_x', 'customer_relationship_y', 'customer_relationship',
    'area_cat',
]

# Numerical Features to Preprocess
numeric_features = [
    'bal_insurance_21_x', 'bal_insurance_21_y', 'bal_insurance_21', 
    'bal_insurance_23_x', 'bal_insurance_23_y', 'bal_insurance_23',
    'cap_life_insurance_fixed_cap_x', 'cap_life_insurance_fixed_cap_y', 'cap_life_insurance_fixed_cap',
    'cap_life_insurance_decreasing_cap_x', 'cap_life_insurance_decreasing_cap_y', 'cap_life_insurance_decreasing_cap',
    'prem_fire_car_other_insurance_x', 'prem_fire_car_other_insurance_y', 'prem_fire_car_other_insurance',
    'bal_personal_loan_x', 'bal_personal_loan_y', 'bal_personal_loan',
    'bal_mortgage_loan_x', 'bal_mortgage_loan_y', 'bal_mortgage_loan',
    'bal_current_account_x', 'bal_current_account_y', 'bal_current_account',
    'bal_pension_saving_x', 'bal_pension_saving_y', 'bal_pension_saving',
    'bal_savings_account_x', 'bal_savings_account_y', 'bal_savings_account', 
    'bal_savings_account_starter_x', 'bal_savings_account_starter_y', 'bal_savings_account_starter',
    'bal_current_account_starter_x', 'bal_current_account_starter_y', 'bal_current_account_starter',    
    'customer_since_all_x', 
    'customer_since_bank_x',
    'customer_birth_date_x',
]

In [43]:
def preprocess(categorical_features, numeric_features, df):
    
    # Replace NA with column mean, normalize numerical values
    si = SimpleImputer(strategy='mean')
    norm = Normalizer()
    
    df[numeric_features] = si.fit_transform(df[numeric_features])
    df[numeric_features] = norm.fit_transform(df[numeric_features])

    # Replace NA with column mode, encode categorical value to 0/1    
    sic = SimpleImputer(strategy='most_frequent')
    enc = OrdinalEncoder() #OneHotEncoder()
    
    df[categorical_features] = sic.fit_transform(df[categorical_features])
    df[categorical_features] = enc.fit_transform(df[categorical_features])

    return df

In [44]:
def objective(trial):
    #Load Data
    mypath = "../data/"
    mydata = load_data.get_file_names(mypath)
    data_files = load_data.load_copy_data(mydata, mypath)

    data = data_files['data_merged']
    X_train = data_files['X_train'].copy()
    X_val = data_files['X_val'].copy()
    y_train = data_files['y_train']['target'].copy()
    y_val = data_files['y_val']['target'].copy()

    X_train = preprocess(categorical_features, numeric_features, X_train)
    X_val = preprocess(categorical_features, numeric_features, X_val)
    
    categorical_features_indices = np.where(X_train.dtypes != np.float)[0]
    
    # define oversampling strategy
    oversample = RandomOverSampler(sampling_strategy=0.8)
    # fit and apply the transform
    X_over, y_over = oversample.fit_resample(X_train, y_train)
    # summarize class distribution

    param = {
        "objective": "Logloss",
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1),
        "depth": trial.suggest_int("depth", 1, 12),
        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "bootstrap_type": trial.suggest_categorical(
            "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]
        ),
        "used_ram_limit": "3gb",
    }
    
    if param["bootstrap_type"] == "Bayesian":
        param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif param["bootstrap_type"] == "Bernoulli":
        param["subsample"] = trial.suggest_float("subsample", 0.1, 1)

#     cat_cls = CatBoostClassifier(verbose=False,random_state=0,scale_pos_weight=1.2, **param)
    cat_cls = CatBoostClassifier(verbose=False,random_state=0,**param)
    cat_cls.fit(X_over, y_over, eval_set=[(X_val, y_val)], cat_features=categorical_features_indices,verbose=0, early_stopping_rounds=100)

    preds = cat_cls.predict(X_val)
    pred_labels = np.rint(preds)
    accuracy = accuracy_score(y_val, pred_labels)
    return accuracy


if __name__ == "__main__":
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=100, timeout=600)

    print("Number of finished trials: {}".format(len(study.trials)))

    print("Best trial:")
    trial = study.best_trial

    print("  Value: {}".format(trial.value))

    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

[32m[I 2022-04-01 00:53:26,539][0m A new study created in memory with name: no-name-b1a77e5b-f3fa-408e-8cf4-df3b2b9cc951[0m


file name: train_month_1
file name: data_merged
file name: train_month_2
file name: y_val
file name: X_train
file name: X_val
file name: test_month_1
file name: test_month_3
file name: y_train
file name: test_month_2
file name: train_month_3_with_target


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  categorical_features_indices = np.where(X_train.dtypes != np.float)[0]
CatBoost is using more CPU RAM (3.67GiB) than the limit (3GiB)
CatBoost is using more CPU RAM (3.67GiB) than the limit (3GiB)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)

CatBoost is using more CPU RAM (3.7GiB) than the limit (3GiB)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12744) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12744) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12744) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12744) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12744) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12744) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12744) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12744) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12744) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12744) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12744) > ResourceQuota(0)
Resource CPU R

CatBoost is using more CPU RAM (3.89GiB) than the limit (3GiB)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > Resource

[32m[I 2022-04-01 00:56:25,925][0m Trial 0 finished with value: 0.9572998430141287 and parameters: {'colsample_bylevel': 0.08935852593590027, 'depth': 10, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 5.351678914534944}. Best is trial 0 with value: 0.9572998430141287.[0m


file name: train_month_1
file name: data_merged
file name: train_month_2
file name: y_val
file name: X_train
file name: X_val
file name: test_month_1
file name: test_month_3
file name: y_train
file name: test_month_2
file name: train_month_3_with_target


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  categorical_features_indices = np.where(X_train.dtypes != np.float)[0]
CatBoost is using more CPU RAM (4.73GiB) than the limit (3GiB)
CatBoost is using more CPU RAM (4.73GiB) than the limit (3GiB)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)

CatBoost is using more CPU RAM (4.74GiB) than the limit (3GiB)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(25480) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12740) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12740) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12740) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12740) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12740) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12740) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12740) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12740) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12740) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12740) > ResourceQuota(0)
Resource CPU 

file name: train_month_1
file name: data_merged
file name: train_month_2
file name: y_val
file name: X_train
file name: X_val
file name: test_month_1
file name: test_month_3
file name: y_train
file name: test_month_2
file name: train_month_3_with_target


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  categorical_features_indices = np.where(X_train.dtypes != np.float)[0]
CatBoost is using more CPU RAM (4.29GiB) than the limit (3GiB)
CatBoost is using more CPU RAM (4.29GiB) than the limit (3GiB)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)

CatBoost is using more CPU RAM (4.3GiB) than the limit (3GiB)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(25480) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12740) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12740) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12740) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12740) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12740) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12740) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12740) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12740) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12740) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12740) > ResourceQuota(0)
Resource CPU R

[32m[I 2022-04-01 00:57:49,326][0m Trial 2 finished with value: 0.9583202511773941 and parameters: {'colsample_bylevel': 0.0970617717313076, 'depth': 5, 'boosting_type': 'Ordered', 'bootstrap_type': 'MVS'}. Best is trial 2 with value: 0.9583202511773941.[0m


file name: train_month_1
file name: data_merged
file name: train_month_2
file name: y_val
file name: X_train
file name: X_val
file name: test_month_1
file name: test_month_3
file name: y_train
file name: test_month_2
file name: train_month_3_with_target


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  categorical_features_indices = np.where(X_train.dtypes != np.float)[0]
CatBoost is using more CPU RAM (4.49GiB) than the limit (3GiB)
CatBoost is using more CPU RAM (4.49GiB) than the limit (3GiB)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)

CatBoost is using more CPU RAM (4.49GiB) than the limit (3GiB)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(25480) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12740) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12740) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12740) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12740) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12740) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12740) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12740) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12740) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12740) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12740) > ResourceQuota(0)
Resource CPU 

[32m[I 2022-04-01 00:59:38,859][0m Trial 3 finished with value: 0.9587912087912088 and parameters: {'colsample_bylevel': 0.0274207675421404, 'depth': 12, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bernoulli', 'subsample': 0.4965688142128988}. Best is trial 3 with value: 0.9587912087912088.[0m


file name: train_month_1
file name: data_merged
file name: train_month_2
file name: y_val
file name: X_train
file name: X_val
file name: test_month_1
file name: test_month_3
file name: y_train
file name: test_month_2
file name: train_month_3_with_target


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  categorical_features_indices = np.where(X_train.dtypes != np.float)[0]
CatBoost is using more CPU RAM (5.57GiB) than the limit (3GiB)
CatBoost is using more CPU RAM (5.57GiB) than the limit (3GiB)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)

CatBoost is using more CPU RAM (5.57GiB) than the limit (3GiB)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12744) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12744) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12744) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12744) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12744) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12744) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12744) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12744) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12744) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12744) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12744) > ResourceQuota(0)
Resource CPU 

CatBoost is using more CPU RAM (4.88GiB) than the limit (3GiB)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > Resource

[32m[I 2022-04-01 00:59:59,566][0m Trial 4 finished with value: 0.9519623233908948 and parameters: {'colsample_bylevel': 0.06574906659011832, 'depth': 4, 'boosting_type': 'Plain', 'bootstrap_type': 'Bernoulli', 'subsample': 0.9845508007836096}. Best is trial 3 with value: 0.9587912087912088.[0m


file name: train_month_1
file name: data_merged
file name: train_month_2
file name: y_val
file name: X_train
file name: X_val
file name: test_month_1
file name: test_month_3
file name: y_train
file name: test_month_2
file name: train_month_3_with_target


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  categorical_features_indices = np.where(X_train.dtypes != np.float)[0]
CatBoost is using more CPU RAM (5.17GiB) than the limit (3GiB)
CatBoost is using more CPU RAM (5.17GiB) than the limit (3GiB)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)

CatBoost is using more CPU RAM (5.16GiB) than the limit (3GiB)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12744) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12744) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12744) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12744) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12744) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12744) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12744) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12744) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12744) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12744) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12744) > ResourceQuota(0)
Resource CPU 

CatBoost is using more CPU RAM (4.99GiB) than the limit (3GiB)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > Resource

[32m[I 2022-04-01 01:00:29,298][0m Trial 5 finished with value: 0.9547095761381476 and parameters: {'colsample_bylevel': 0.07994286881136901, 'depth': 4, 'boosting_type': 'Plain', 'bootstrap_type': 'Bernoulli', 'subsample': 0.2615395519556478}. Best is trial 3 with value: 0.9587912087912088.[0m


file name: train_month_1
file name: data_merged
file name: train_month_2
file name: y_val
file name: X_train
file name: X_val
file name: test_month_1
file name: test_month_3
file name: y_train
file name: test_month_2
file name: train_month_3_with_target


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  categorical_features_indices = np.where(X_train.dtypes != np.float)[0]
CatBoost is using more CPU RAM (4.77GiB) than the limit (3GiB)
CatBoost is using more CPU RAM (4.77GiB) than the limit (3GiB)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)

CatBoost is using more CPU RAM (4.77GiB) than the limit (3GiB)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(25480) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12740) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12740) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12740) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12740) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12740) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12740) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12740) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12740) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12740) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12740) > ResourceQuota(0)
Resource CPU 

[32m[I 2022-04-01 01:01:48,275][0m Trial 6 finished with value: 0.9648351648351648 and parameters: {'colsample_bylevel': 0.07105308738868985, 'depth': 12, 'boosting_type': 'Plain', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 4.340964045900418}. Best is trial 6 with value: 0.9648351648351648.[0m


file name: train_month_1
file name: data_merged
file name: train_month_2
file name: y_val
file name: X_train
file name: X_val
file name: test_month_1
file name: test_month_3
file name: y_train
file name: test_month_2
file name: train_month_3_with_target


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  categorical_features_indices = np.where(X_train.dtypes != np.float)[0]
CatBoost is using more CPU RAM (5.32GiB) than the limit (3GiB)
CatBoost is using more CPU RAM (5.32GiB) than the limit (3GiB)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)

CatBoost is using more CPU RAM (5.32GiB) than the limit (3GiB)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(25480) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12740) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12740) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12740) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12740) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12740) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12740) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12740) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12740) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12740) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12740) > ResourceQuota(0)
Resource CPU 

[32m[I 2022-04-01 01:02:30,426][0m Trial 7 finished with value: 0.9602825745682888 and parameters: {'colsample_bylevel': 0.03916725767354713, 'depth': 7, 'boosting_type': 'Plain', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 2.2212988807326295}. Best is trial 6 with value: 0.9648351648351648.[0m


file name: train_month_1
file name: data_merged
file name: train_month_2
file name: y_val
file name: X_train
file name: X_val
file name: test_month_1
file name: test_month_3
file name: y_train
file name: test_month_2
file name: train_month_3_with_target


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  categorical_features_indices = np.where(X_train.dtypes != np.float)[0]
CatBoost is using more CPU RAM (4.12GiB) than the limit (3GiB)
CatBoost is using more CPU RAM (4.12GiB) than the limit (3GiB)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)

CatBoost is using more CPU RAM (4.13GiB) than the limit (3GiB)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12744) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12744) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12744) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12744) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12744) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12744) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12744) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12744) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12744) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12744) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12744) > ResourceQuota(0)
Resource CPU 

CatBoost is using more CPU RAM (4.34GiB) than the limit (3GiB)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > Resource

[32m[I 2022-04-01 01:03:08,892][0m Trial 8 finished with value: 0.9600470957613815 and parameters: {'colsample_bylevel': 0.08469912601135574, 'depth': 5, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS'}. Best is trial 6 with value: 0.9648351648351648.[0m


file name: train_month_1
file name: data_merged
file name: train_month_2
file name: y_val
file name: X_train
file name: X_val
file name: test_month_1
file name: test_month_3
file name: y_train
file name: test_month_2
file name: train_month_3_with_target


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  categorical_features_indices = np.where(X_train.dtypes != np.float)[0]
CatBoost is using more CPU RAM (4.55GiB) than the limit (3GiB)
CatBoost is using more CPU RAM (4.55GiB) than the limit (3GiB)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)

CatBoost is using more CPU RAM (4.56GiB) than the limit (3GiB)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12744) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12744) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12744) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12744) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12744) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12744) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12744) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12744) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12744) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12744) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12744) > ResourceQuota(0)
Resource CPU 

CatBoost is using more CPU RAM (3.36GiB) than the limit (3GiB)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > Resource

[32m[I 2022-04-01 01:03:31,673][0m Trial 9 finished with value: 0.9540031397174255 and parameters: {'colsample_bylevel': 0.03815816610971732, 'depth': 6, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bernoulli', 'subsample': 0.2921246577520282}. Best is trial 6 with value: 0.9648351648351648.[0m


Number of finished trials: 10
Best trial:
  Value: 0.9648351648351648
  Params: 
    colsample_bylevel: 0.07105308738868985
    depth: 12
    boosting_type: Plain
    bootstrap_type: Bayesian
    bagging_temperature: 4.340964045900418


In [45]:
accuracy= []
recall =[]
roc_auc= []
precision = []

mypath = "../data/"
mydata = load_data.get_file_names(mypath)
data_files = load_data.load_copy_data(mydata, mypath)

data = data_files['data_merged']
X_train = data_files['X_train'].copy()
X_val = data_files['X_val'].copy()
y_train = data_files['y_train']['target'].copy()
y_val = data_files['y_val']['target'].copy()

X_train = preprocess(categorical_features, numeric_features, X_train)
X_val = preprocess(categorical_features, numeric_features, X_val)

categorical_features_indices = np.where(X_train.dtypes != np.float)[0]

# define oversampling strategy
oversample = RandomOverSampler(sampling_strategy=0.8)
# fit and apply the transform
X_over, y_over = oversample.fit_resample(X_train, y_train)
# summarize class distribution

catboost_5 = CatBoostClassifier(verbose=False,random_state=0,
                                 colsample_bylevel=0.07105308738868985,
                                 depth=12,
                                 boosting_type="Plain",
                                 bootstrap_type="Bayesian",
                                 bagging_temperature=4.340964045900418,
                                )

catboost_5.fit(X_over, y_over,cat_features=categorical_features_indices,eval_set=(X_val, y_val), early_stopping_rounds=100)
y_pred = catboost_5.predict(X_val)

accuracy.append(round(accuracy_score(y_val, y_pred),4))
recall.append(round(recall_score(y_val, y_pred),4))
roc_auc.append(round(roc_auc_score(y_val, y_pred),4))
precision.append(round(precision_score(y_val, y_pred),4))

print(accuracy, recall, roc_auc, precision)

file name: train_month_1
file name: data_merged
file name: train_month_2
file name: y_val
file name: X_train
file name: X_val
file name: test_month_1
file name: test_month_3
file name: y_train
file name: test_month_2
file name: train_month_3_with_target


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  categorical_features_indices = np.where(X_train.dtypes != np.float)[0]


[0.9663] [0.0027] [0.4987] [0.0149]
