In [2]:
# Get the training data
# Clean/preprocess/transform the data
# Train a machine learning model
# Evaluate and optimise the model
# Clean/preprocess/transform new data
# Fit the model on new data to make predictions.

In [22]:
import pandas as pd
import numpy as np
import load_data

from sklearn.datasets import make_classification
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score,classification_report,recall_score,confusion_matrix, roc_auc_score, precision_score, f1_score, roc_curve, auc, plot_confusion_matrix,plot_roc_curve
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import Normalizer
from sklearn import metrics

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler

# Import Libraries for Modelling
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

import optuna

# from imblearn.under_sampling import RandomUnderSampler
# from sklearn.pipeline import Pipeline
# from sklearn.base import BaseEstimator
# from sklearn.model_selection import train_test_split
# import imblearn
# from collections import Counter

pd.set_option('display.max_rows', 999)
pd.set_option('display.max_columns', 999)

In [3]:
# Features to preprocess

# Features to preprocess

# Categorical Features to Preprocess
categorical_features = [
    'customer_relationship', 'customer_relationship_y', 'customer_relationship_x', #unknown
    'customer_occupation_code_x',
    'customer_children', 'customer_children_y', 'customer_children_x',
    'customer_gender_x',
    'area_cat',
    'customer_self_employed', 'customer_self_employed_y', 'customer_self_employed_x',
    'visits_distinct_so', 'visits_distinct_so_y', 'visits_distinct_so_x',
    'visits_distinct_so_areas', 'visits_distinct_so_areas_y', 'visits_distinct_so_areas_x',
    'customer_education_x', #
    'homebanking_active', 'homebanking_active_y', 'homebanking_active_x',
    'has_homebanking', 'has_homebanking_y', 'has_homebanking_x',
    'has_insurance_21', 'has_insurance_21_y', 'has_insurance_21_x',
    'has_insurance_23', 'has_insurance_23_y', 'has_insurance_23_x',
    'has_life_insurance_fixed_cap', 'has_life_insurance_fixed_cap_y', 'has_life_insurance_fixed_cap_x',
    'has_life_insurance_decreasing_cap', 'has_life_insurance_decreasing_cap_y', 'has_life_insurance_decreasing_cap_x',
    'has_fire_car_other_insurance', 'has_fire_car_other_insurance_y', 'has_fire_car_other_insurance_x',
    'has_personal_loan', 'has_personal_loan_y', 'has_personal_loan_x',
    'has_mortgage_loan', 'has_mortgage_loan_y', 'has_mortgage_loan_x',
    'has_current_account', 'has_current_account_y', 'has_current_account_x',
    'has_pension_saving', 'has_pension_saving_y', 'has_pension_saving_x',
    'has_savings_account', 'has_savings_account_y', 'has_savings_account_x',
    'has_current_account_starter', 'has_current_account_starter_x', #'has_current_account_starter_y',
    'has_savings_account_starter', 'has_savings_account_starter_x', #'has_savings_account_starter_y',
]

# Numerical Features to Preprocess
numeric_features = [
    'bal_insurance_21', 'bal_insurance_21_y', 'bal_insurance_21_x', 
    'bal_insurance_23', 'bal_insurance_23_y', 'bal_insurance_23_x',
    'cap_life_insurance_fixed_cap', 'cap_life_insurance_fixed_cap_y', 'cap_life_insurance_fixed_cap_x',
    'cap_life_insurance_decreasing_cap', 'cap_life_insurance_decreasing_cap_y', 'cap_life_insurance_decreasing_cap_x',
    'prem_fire_car_other_insurance', 'prem_fire_car_other_insurance_y', 'prem_fire_car_other_insurance_x',
    'bal_personal_loan', 'bal_personal_loan_y', 'bal_personal_loan_x',
    'bal_mortgage_loan', 'bal_mortgage_loan_y', 'bal_mortgage_loan_x',
    'bal_current_account', 'bal_current_account_y', 'bal_current_account_x',
    'bal_pension_saving', 'bal_pension_saving_y', 'bal_pension_saving_x', 
    'bal_savings_account', 'bal_savings_account_y', 'bal_savings_account_x',
    'bal_current_account_starter', 'bal_current_account_starter_y', 'bal_current_account_starter_x',
    'bal_savings_account_starter', 'bal_savings_account_starter_y', 'bal_savings_account_starter_x',
    'customer_since_all_x', 'customer_since_bank_x', 'customer_birth_date_x',
]

In [24]:
def preprocess(categorical_features, numeric_features, df):
    
    # Replace NA with column mean, normalize numerical values
    si = SimpleImputer(strategy='mean')
    norm = Normalizer()
    
    df[numeric_features] = si.fit_transform(df[numeric_features])
    df[numeric_features] = norm.fit_transform(df[numeric_features])

    # Replace NA with column mode, encode categorical value to 0/1    
    sic = SimpleImputer(strategy='most_frequent')
    enc = OrdinalEncoder() #OneHotEncoder()
    
    df[categorical_features] = sic.fit_transform(df[categorical_features])
    df[categorical_features] = enc.fit_transform(df[categorical_features])

    return df

In [38]:
def objective(trial):
    #Load Data
    mypath = "../data/"
    mydata = load_data.get_file_names(mypath)
    data_files = load_data.load_copy_data(mydata, mypath)

    data = data_files['data_merged']
    X_train = data_files['X_train'].copy()
    X_val = data_files['X_val'].copy()
    y_train = data_files['y_train']['target'].copy()
    y_val = data_files['y_val']['target'].copy()

    X_train = preprocess(categorical_features, numeric_features, X_train)
    X_val = preprocess(categorical_features, numeric_features, X_val)
    
    categorical_features_indices = np.where(X_train.dtypes != np.float)[0]
    
    # define oversampling strategy
    oversample = RandomOverSampler(sampling_strategy=0.8)
    # fit and apply the transform
    X_over, y_over = oversample.fit_resample(X_train, y_train)
    # summarize class distribution

    param = {
        "objective": "Logloss",
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1),
        "depth": trial.suggest_int("depth", 1, 12),
        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "bootstrap_type": trial.suggest_categorical(
            "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]
        ),
        "used_ram_limit": "3gb",
    }
    
    if param["bootstrap_type"] == "Bayesian":
        param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif param["bootstrap_type"] == "Bernoulli":
        param["subsample"] = trial.suggest_float("subsample", 0.1, 1)

#     cat_cls = CatBoostClassifier(verbose=False,random_state=0,scale_pos_weight=1.2, **param)
    cat_cls = CatBoostClassifier(verbose=False,random_state=0,**param)
    cat_cls.fit(X_over, y_over, eval_set=[(X_val, y_val)], cat_features=categorical_features_indices,verbose=0, early_stopping_rounds=100)

    preds = cat_cls.predict(X_val)
    pred_labels = np.rint(preds)
    accuracy = accuracy_score(y_val, pred_labels)
    return accuracy


if __name__ == "__main__":
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=100, timeout=600)

    print("Number of finished trials: {}".format(len(study.trials)))

    print("Best trial:")
    trial = study.best_trial

    print("  Value: {}".format(trial.value))

    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

[32m[I 2022-03-31 15:05:09,909][0m A new study created in memory with name: no-name-7755e396-ed79-4ff9-ae93-1adebf06ea72[0m


file name: train_month_1
file name: data_merged
file name: train_month_2
file name: y_val
file name: X_train
file name: X_val
file name: test_month_1
file name: test_month_3
file name: y_train
file name: test_month_2
file name: train_month_3_with_target


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  categorical_features_indices = np.where(X_train.dtypes != np.float)[0]
CatBoost is using more CPU RAM (3.83GiB) than the limit (3GiB)
CatBoost is using more CPU RAM (3.83GiB) than the limit (3GiB)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)

CatBoost is using more CPU RAM (3.87GiB) than the limit (3GiB)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12744) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12744) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12744) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12744) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12744) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12744) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12744) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12744) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12744) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12744) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12744) > ResourceQuota(0)
Resource CPU 

CatBoost is using more CPU RAM (5.04GiB) than the limit (3GiB)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > Resource

[32m[I 2022-03-31 15:05:23,703][0m Trial 0 finished with value: 0.9612244897959183 and parameters: {'colsample_bylevel': 0.021452900425393193, 'depth': 5, 'boosting_type': 'Ordered', 'bootstrap_type': 'MVS'}. Best is trial 0 with value: 0.9612244897959183.[0m


file name: train_month_1
file name: data_merged
file name: train_month_2
file name: y_val
file name: X_train
file name: X_val
file name: test_month_1
file name: test_month_3
file name: y_train
file name: test_month_2
file name: train_month_3_with_target


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  categorical_features_indices = np.where(X_train.dtypes != np.float)[0]
CatBoost is using more CPU RAM (4.98GiB) than the limit (3GiB)
CatBoost is using more CPU RAM (4.98GiB) than the limit (3GiB)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)

CatBoost is using more CPU RAM (5.35GiB) than the limit (3GiB)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > Resource

[32m[I 2022-03-31 15:05:38,936][0m Trial 1 finished with value: 0.9677394034536891 and parameters: {'colsample_bylevel': 0.039598082502671654, 'depth': 8, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS'}. Best is trial 1 with value: 0.9677394034536891.[0m


file name: train_month_1
file name: data_merged
file name: train_month_2
file name: y_val
file name: X_train
file name: X_val
file name: test_month_1
file name: test_month_3
file name: y_train
file name: test_month_2
file name: train_month_3_with_target


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  categorical_features_indices = np.where(X_train.dtypes != np.float)[0]
CatBoost is using more CPU RAM (5.13GiB) than the limit (3GiB)
CatBoost is using more CPU RAM (5.13GiB) than the limit (3GiB)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)

CatBoost is using more CPU RAM (5.27GiB) than the limit (3GiB)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > Resource

[32m[I 2022-03-31 15:05:55,646][0m Trial 2 finished with value: 0.9612244897959183 and parameters: {'colsample_bylevel': 0.01021793009756367, 'depth': 5, 'boosting_type': 'Plain', 'bootstrap_type': 'Bernoulli', 'subsample': 0.2511803903057829}. Best is trial 1 with value: 0.9677394034536891.[0m


file name: train_month_1
file name: data_merged
file name: train_month_2
file name: y_val
file name: X_train
file name: X_val
file name: test_month_1
file name: test_month_3
file name: y_train
file name: test_month_2
file name: train_month_3_with_target


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  categorical_features_indices = np.where(X_train.dtypes != np.float)[0]
CatBoost is using more CPU RAM (4.98GiB) than the limit (3GiB)
CatBoost is using more CPU RAM (4.98GiB) than the limit (3GiB)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)

CatBoost is using more CPU RAM (5.14GiB) than the limit (3GiB)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > Resource

[32m[I 2022-03-31 15:06:06,342][0m Trial 3 finished with value: 0.9642072213500785 and parameters: {'colsample_bylevel': 0.07191871311423008, 'depth': 5, 'boosting_type': 'Plain', 'bootstrap_type': 'Bernoulli', 'subsample': 0.24282599657338025}. Best is trial 1 with value: 0.9677394034536891.[0m


file name: train_month_1
file name: data_merged
file name: train_month_2
file name: y_val
file name: X_train
file name: X_val
file name: test_month_1
file name: test_month_3
file name: y_train
file name: test_month_2
file name: train_month_3_with_target


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  categorical_features_indices = np.where(X_train.dtypes != np.float)[0]
CatBoost is using more CPU RAM (5.09GiB) than the limit (3GiB)
CatBoost is using more CPU RAM (5.09GiB) than the limit (3GiB)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)

CatBoost is using more CPU RAM (4.68GiB) than the limit (3GiB)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > Resource

[32m[I 2022-03-31 15:06:19,945][0m Trial 4 finished with value: 0.9660125588697017 and parameters: {'colsample_bylevel': 0.02700463162825497, 'depth': 6, 'boosting_type': 'Ordered', 'bootstrap_type': 'MVS'}. Best is trial 1 with value: 0.9677394034536891.[0m


file name: train_month_1
file name: data_merged
file name: train_month_2
file name: y_val
file name: X_train
file name: X_val
file name: test_month_1
file name: test_month_3
file name: y_train
file name: test_month_2
file name: train_month_3_with_target


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  categorical_features_indices = np.where(X_train.dtypes != np.float)[0]
CatBoost is using more CPU RAM (4.55GiB) than the limit (3GiB)
CatBoost is using more CPU RAM (4.55GiB) than the limit (3GiB)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)

CatBoost is using more CPU RAM (4.55GiB) than the limit (3GiB)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > Resource

[32m[I 2022-03-31 15:06:39,874][0m Trial 5 finished with value: 0.9547095761381476 and parameters: {'colsample_bylevel': 0.02539923017779416, 'depth': 6, 'boosting_type': 'Plain', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 8.973528623502675}. Best is trial 1 with value: 0.9677394034536891.[0m


file name: train_month_1
file name: data_merged
file name: train_month_2
file name: y_val
file name: X_train
file name: X_val
file name: test_month_1
file name: test_month_3
file name: y_train
file name: test_month_2
file name: train_month_3_with_target


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  categorical_features_indices = np.where(X_train.dtypes != np.float)[0]
CatBoost is using more CPU RAM (4.61GiB) than the limit (3GiB)
CatBoost is using more CPU RAM (4.61GiB) than the limit (3GiB)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)

CatBoost is using more CPU RAM (4.74GiB) than the limit (3GiB)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > Resource

[32m[I 2022-03-31 15:12:49,788][0m Trial 6 finished with value: 0.9691522762951335 and parameters: {'colsample_bylevel': 0.0703735052929811, 'depth': 12, 'boosting_type': 'Ordered', 'bootstrap_type': 'MVS'}. Best is trial 6 with value: 0.9691522762951335.[0m


file name: train_month_1
file name: data_merged
file name: train_month_2
file name: y_val
file name: X_train
file name: X_val
file name: test_month_1
file name: test_month_3
file name: y_train
file name: test_month_2
file name: train_month_3_with_target


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  categorical_features_indices = np.where(X_train.dtypes != np.float)[0]
CatBoost is using more CPU RAM (5.7GiB) than the limit (3GiB)
CatBoost is using more CPU RAM (5.7GiB) than the limit (3GiB)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
R

CatBoost is using more CPU RAM (5.88GiB) than the limit (3GiB)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > Resource

[32m[I 2022-03-31 15:13:08,044][0m Trial 7 finished with value: 0.9653846153846154 and parameters: {'colsample_bylevel': 0.09780508674950031, 'depth': 7, 'boosting_type': 'Plain', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 5.848020456192863}. Best is trial 6 with value: 0.9691522762951335.[0m


file name: train_month_1
file name: data_merged
file name: train_month_2
file name: y_val
file name: X_train
file name: X_val
file name: test_month_1
file name: test_month_3
file name: y_train
file name: test_month_2
file name: train_month_3_with_target


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  categorical_features_indices = np.where(X_train.dtypes != np.float)[0]
CatBoost is using more CPU RAM (6.23GiB) than the limit (3GiB)
CatBoost is using more CPU RAM (6.23GiB) than the limit (3GiB)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)

CatBoost is using more CPU RAM (6.7GiB) than the limit (3GiB)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQ

[32m[I 2022-03-31 15:13:22,014][0m Trial 8 finished with value: 0.9669544740973313 and parameters: {'colsample_bylevel': 0.07491338316880052, 'depth': 8, 'boosting_type': 'Plain', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 0.45252042379789414}. Best is trial 6 with value: 0.9691522762951335.[0m


file name: train_month_1
file name: data_merged
file name: train_month_2
file name: y_val
file name: X_train
file name: X_val
file name: test_month_1
file name: test_month_3
file name: y_train
file name: test_month_2
file name: train_month_3_with_target


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  categorical_features_indices = np.where(X_train.dtypes != np.float)[0]
CatBoost is using more CPU RAM (5.82GiB) than the limit (3GiB)
CatBoost is using more CPU RAM (5.82GiB) than the limit (3GiB)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)

CatBoost is using more CPU RAM (6.16GiB) than the limit (3GiB)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(8051504) > Resource

[32m[I 2022-03-31 15:13:35,136][0m Trial 9 finished with value: 0.9656985871271585 and parameters: {'colsample_bylevel': 0.08910376957692859, 'depth': 8, 'boosting_type': 'Plain', 'bootstrap_type': 'Bernoulli', 'subsample': 0.37191042787947937}. Best is trial 6 with value: 0.9691522762951335.[0m


file name: train_month_1
file name: data_merged
file name: train_month_2
file name: y_val
file name: X_train
file name: X_val
file name: test_month_1
file name: test_month_3
file name: y_train
file name: test_month_2
file name: train_month_3_with_target


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  categorical_features_indices = np.where(X_train.dtypes != np.float)[0]
CatBoost is using more CPU RAM (5.27GiB) than the limit (3GiB)
CatBoost is using more CPU RAM (5.27GiB) than the limit (3GiB)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(481248) > ResourceQuota(0)

CatBoost is using more CPU RAM (5.27GiB) than the limit (3GiB)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(25480) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12740) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12740) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12740) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12740) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12740) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12740) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12740) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12740) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12740) > ResourceQuota(0)
Resource CPU RAM: functionWithResourceUsage.ResourceUsage(12740) > ResourceQuota(0)
Resource CPU 

[32m[I 2022-03-31 15:15:34,703][0m Trial 10 finished with value: 0.9676609105180534 and parameters: {'colsample_bylevel': 0.054125120205273186, 'depth': 12, 'boosting_type': 'Ordered', 'bootstrap_type': 'MVS'}. Best is trial 6 with value: 0.9691522762951335.[0m


Number of finished trials: 11
Best trial:
  Value: 0.9691522762951335
  Params: 
    colsample_bylevel: 0.0703735052929811
    depth: 12
    boosting_type: Ordered
    bootstrap_type: MVS


In [39]:
accuracy= []
recall =[]
roc_auc= []
precision = []

mypath = "../data/"
mydata = load_data.get_file_names(mypath)
data_files = load_data.load_copy_data(mydata, mypath)

data = data_files['data_merged']
X_train = data_files['X_train'].copy()
X_val = data_files['X_val'].copy()
y_train = data_files['y_train']['target'].copy()
y_val = data_files['y_val']['target'].copy()

X_train = preprocess(categorical_features, numeric_features, X_train)
X_val = preprocess(categorical_features, numeric_features, X_val)

categorical_features_indices = np.where(X_train.dtypes != np.float)[0]

# define oversampling strategy
oversample = RandomOverSampler(sampling_strategy=0.8)
# fit and apply the transform
X_over, y_over = oversample.fit_resample(X_train, y_train)
# summarize class distribution

catboost_5 = CatBoostClassifier(verbose=False,random_state=0,
                                 colsample_bylevel=0.0703735052929811,
                                 depth=12,
                                 boosting_type="Ordered",
                                 bootstrap_type="MVS",
                                )

catboost_5.fit(X_over, y_over,cat_features=categorical_features_indices,eval_set=(X_val, y_val), early_stopping_rounds=100)
y_pred = catboost_5.predict(X_val)

accuracy.append(round(accuracy_score(y_val, y_pred),4))
recall.append(round(recall_score(y_val, y_pred),4))
roc_auc.append(round(roc_auc_score(y_val, y_pred),4))
precision.append(round(precision_score(y_val, y_pred),4))

print(accuracy, recall, roc_auc, precision)

file name: train_month_1
file name: data_merged
file name: train_month_2
file name: y_val
file name: X_train
file name: X_val
file name: test_month_1
file name: test_month_3
file name: y_train
file name: test_month_2
file name: train_month_3_with_target


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  categorical_features_indices = np.where(X_train.dtypes != np.float)[0]


[0.9665] [0.0027] [0.4988] [0.0154]


In [None]:
# Predict using test data and export data