In [2]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.3-cp310-cp310-manylinux2014_x86_64.whl (98.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.5/98.5 MB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.3


In [1]:
config = {
    'root': "."
    , 'train_path': "./train.csv"
    , 'submit_path': "./submission.csv"
    , 'seed_list': [42, 137, 56, 89, 24, 75 ,88 ,36 ,71]
    , 'k_fold': 5
    , 'thresholds': {'product_category': 10, 'expected_timeline': 3}
}

cbt_params = {
    'random_seed': config['seed_list'][0],
    'objective': 'Logloss',
    'auto_class_weights': 'Balanced',
    'verbose': 0
}


tuning_params = [
    {
        'learning_rate': 0.05
        , 'n_estimators': 3000

        , 'early_stopping_rounds': 50

        # regularizations
        , 'max_depth': 6
        , 'l2_leaf_reg': 1
        , 'min_data_in_leaf': 2
        , 'subsample': 0.5
        # ,'grow_policy': 'Depthwise' # 'SymmetricTree'(default)
    },
]

In [2]:
import random
import pandas as pd
import numpy as np
from datetime import datetime
import os

from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import StratifiedKFold

from catboost import CatBoostClassifier
from functools import partial

In [3]:
def set_seed(seed: int):
    # Set the seed for reproducibility.
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    # torch.manual_seed(seed)
    # torch.cuda.manual_seed(seed)  # type: ignore
    # torch.backends.cudnn.deterministic = True  # type: ignore
    # torch.backends.cudnn.benchmark = True  # type: ignore


def read_data(config):
    # Load training, testing, and submission CSV files
    df_train = pd.read_csv(config['train_path'])  # 학습용 데이터
    df_test = pd.read_csv(config['submit_path'])  # 테스트 데이터(제출파일의 데이터)
    df_sub = pd.read_csv(config['submit_path'])

    return df_train, df_test, df_sub


def get_clf_eval(y_test, y_pred=None, fold_no=None):
    # Calculate and print evaluation metrics and confusion matrix,
    # accuracy, precision, recall, and F1 score.
    # Optionally includes fold number in the output.
    confusion = confusion_matrix(y_test, y_pred, labels=[True, False])
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, labels=[True, False])
    recall = recall_score(y_test, y_pred)
    F1 = f1_score(y_test, y_pred, labels=[True, False])

    fold_info = f'Fold #{fold_no}' if fold_no is not None else ''
    print(f'{fold_info} ACC: {accuracy:.4f}, PRE: {precision:.4f}, REC: {recall:.4f}, F1: {F1:.4f}')
    return F1


def new_business_area(cur_area):
    # Categorizes current business area into a new, simplified business area group.
    # Returns the original area if it doesn't match the above categories.
    if cur_area in ['corporate / office', 'government department']:
        return 'Office'
    elif cur_area in ['education', 'public facility']:
        return 'Public'
    elif cur_area in ['hotel & accommodation', 'residential (home)']:
        return 'Amenity'
    elif cur_area in ['factory', 'power plant / renewable energy', 'transportation']:
        return 'Industry'
    else:
        return cur_area


def make_value_count(value_count_dict, val):
     # Returns the count of occurrences of val from a precomputed dictionary
     # otherwise returns NaN.
    if val in value_count_dict:
        return value_count_dict[val]
    return np.NAN


def make_value_count_dict(df, col_names):
    # Creates a dictionary of value counts for each feature name
    value_count_dict = dict()
    for feat_name in col_names:
        total_count = df[feat_name].value_counts()

        # Converts the Series object to a DataFrame and then to a dictionary.
        count_df = pd.DataFrame(total_count).reset_index(drop=False)

        value_count_dict[feat_name] = dict(zip(count_df.iloc[:, 0], count_df.iloc[:, 1]))

    return value_count_dict


def make_continent(rc):
    # Maps response corporate (rc) codes to their respective continent.
    # Each if statement checks for membership in a list of codes corresponding to a continent.
    if rc in ('lgein', 'lgeml', 'lgeph', 'lgeth', 'lgevh', 'lgeil', 'lgekr', 'lgett', 'lgejp', 'lgech', 'lgeir', 'lgesj',
              'lgegf', 'lgetk', 'lgelf', 'lgehk', 'lgeyk'):
        return 'asia'
    if rc in ('lgeaf', 'lgesa', 'lgemc', 'lgeas', 'lgeeg', 'lgeef'):
        return 'africa'
    if rc in ('lgeus', 'lgeci'):
        return 'northamerica'
    if rc in ('lgesp', 'lgecb', 'lgems', 'lgecl', 'lgeps', 'lgear', 'lgepr'):
        return 'southamerica'
    if rc in ('lgeuk', 'lgees', 'lgefs', 'lgebn', 'lgebt', 'lgedg', 'lgero', 'lgemk', 'lgepl', 'lgecz', 'lgehs', 'lgesw',
              'lgeag', 'lgeeb', 'lgera', 'lgeur', 'lgept', 'lgeis', 'lgela'):
        return 'europe'
    if rc in ('lgesl', 'lgeap'):
        return 'oceania'
    return np.NaN


def feature_engineering(df_input, is_train=False):
    df = df_input.copy()

    # drop_duplicates
    df = df.drop_duplicates(keep='first')

    # converting text to lowercase, removing commas and periods
    # and replacing underscores and hyphens with spaces.
    for feat in df.columns:
        # Applies transformations only to string-type columns.
        if df[feat].dtype == 'object':
            df[feat] = df[feat].str.lower()
            df[feat] = df[feat].str.replace('[,\.]', '', regex=True)
            df[feat] = df[feat].str.replace('[_-]', ' ', regex=True)

    # Recategorizes 'Solution' and 'CM' in 'business_unit' column to 'Others'.
    df['business_unit'] = np.where(df['business_unit'].isin(['Solution', 'CM']), 'Others', df['business_unit'])

    # map ['other', 'others', 'etc'] -> 'Others'
    unify_others_columns = ['customer_type', 'customer_job', 'inquiry_type', 'product_category', 'product_subcategory',
                            'customer_position', 'expected_timeline']
    for column in unify_others_columns:
        df[column] = df[column].replace(['other', 'others', 'etc'], 'others')

    # make new columns
    df['continent'] = df['response_corporate'].map(make_continent)
    df['business_area_group'] = df['business_area'].map(new_business_area)
    df['product_category_count'] = df['product_category'].apply(
        lambda x: len(str(x).split(',')) if not pd.isna(x) else np.nan)

    make_value_count_columns = ['lead_owner']
    if is_train:
        global value_count_dict
        value_count_dict = make_value_count_dict(df_input, make_value_count_columns)

    for feat_name in value_count_dict:
        func = partial(make_value_count, value_count_dict[feat_name])
        df[f'{feat_name}_count'] = df[feat_name].map(func)

    # correct data type
    for feat in ['customer_idx', 'lead_owner', 'ver_cus', 'ver_pro']:
        df[feat] = df[feat].astype(object)

    return df


def make_feature_lists(df):
    base_features = []     # all features except target variable.
    base_num_features = [] # numerical features
    base_cat_features = [] # categorical features

    for feat in df.columns:
        # skip the target
        if feat == 'is_converted':
            continue

        base_features.append(feat)

        if df[feat].dtype == 'object':
            base_cat_features.append(feat)
        else:
            base_num_features.append(feat)

    # features to be removed from data analysis
    removal_features = {
        'id', 'bant_submit', 'id_strategic_ver', 'it_strategic_ver', 'idit_strategic_ver',
        'customer_country', 'customer_country.1', 'business_subarea', 'business_area'
    }

    # remove the specified features
    base_num_features = [i for i in base_num_features if i not in removal_features]
    base_cat_features = [i for i in base_cat_features if i not in removal_features]
    base_features = [i for i in base_features if i not in removal_features]

    return base_num_features, base_cat_features, base_features


def filling_missing_values(df_input, base_cat_features, base_num_features):
    df = df_input.copy()

    # Fill missing values for categorical features with 'UNK'
    # and ensure their data type is string.
    for base_cat_feat in base_cat_features:
        df[base_cat_feat] = df[base_cat_feat].fillna('UNK')
        df[base_cat_feat] = df[base_cat_feat].astype(str)

    # Fill missing values for numerical features with -1.
    for base_num_feat in base_num_features:
        df[base_num_feat] = df[base_num_feat].fillna(-1)

    return df


def model_kfold(df, config, cbt_params, base_features, base_cat_features):
    target = 'is_converted'

    skf = StratifiedKFold(n_splits=config['k_fold'], shuffle=True, random_state=config['seed'])
    cbt_models = [] # trained models
    f1_scores = []  # f1-scores for validation sets

    for k_fold, (train_idx, valid_idx) in enumerate(skf.split(df[base_features], df[target])):
        print(f'Fold #{k_fold + 1}')
        X_train, y_train = df[base_features].iloc[train_idx], df[target].iloc[train_idx].astype(int)
        X_valid, y_valid = df[base_features].iloc[valid_idx], df[target].iloc[valid_idx].astype(int)

        # initialize CatBoost model with provided parameters
        cbt = CatBoostClassifier(**cbt_params)

        cbt.fit(
            X_train, y_train,
            eval_set=[(X_valid, y_valid)],
            cat_features=base_cat_features, # specify categorical features
        )

        # save the trained model
        cbt_models.append(cbt)

        # evaluate the model
        # --- train-set
        print('[Train] ', end='')
        y_pred = cbt.predict(X_train)
        _ = get_clf_eval(y_train, y_pred, k_fold + 1)

        # --- valid-set
        print('[Valid] ', end='')
        y_pred = cbt.predict(X_valid)
        y_pred = y_pred.astype(y_valid.dtype) # ensure matching dtype
        f1 = get_clf_eval(y_valid, y_pred, k_fold + 1)

        f1_scores.append(f1)

    print(f'Avg. F1 of validset: {np.mean(f1_scores)}')
    print(f'Var. F1 of validset: {np.var(f1_scores)}')

    return cbt_models


def kfold_submission(df_train, df_test, df_sub, cbt_models, config):
    folder_path = f"{config['root']}/FeatureImportance"

    if not os.path.exists(folder_path):
        os.makedirs(folder_path)

    # get current date and time
    now = datetime.now()

    # record the year, month, day, hour, and minute for naming files.
    year = now.year
    month = now.month
    day = now.day
    hour = now.hour
    minute = now.minute

    # file format
    submission_time = f"{year:04d}{month:02d}{day:02d}_{hour:02d}{minute:02d}"[2:]
    target = 'is_converted'

    # apply feature engineering
    df_test = feature_engineering(df_test, is_train=False)
    base_num_features, base_cat_features, base_features = make_feature_lists(df_test)

    df_test = filling_missing_values(df_test, base_cat_features, base_num_features)

    X_test = df_test[base_features]

    # a matrix to store the probabilities of each class for the test set.
    y_probs = np.zeros((X_test.shape[0], 2))

    # dataframe for feature importances
    df_feature_importance_all = pd.DataFrame({'features': base_features})

    for i, cbt_model in enumerate(cbt_models):
        y_probs += cbt_model.predict_proba(X_test) / len(cbt_models)

        # save feature importance of current model
        df_feature_importance_all[f'model_{i}'] = cbt_model.get_feature_importance()

    # compute avarege, rank
    df_feature_importance_all['average'] = df_feature_importance_all.iloc[:, 1:].mean(axis=1).values
    df_feature_importance_all['rank'] = df_feature_importance_all['average'].rank(ascending=False)

    # save the feature importance as CSV
    df_feature_importance_all.to_csv(f'{folder_path}/feat_import_{submission_time}.csv', index=False)

    # create submission file
    # threshold: 0.5 to determine the class
    df_sub[target] = (y_probs[:, 1] >= 0.5).astype(bool)

    # save submission file as CSV
    df_sub.to_csv(f"{config['root']}/submission.csv", index=False)

In [4]:
def main(config, cbt_params, tuning_params):
    cbt_models = []

    for seed in config['seed_list']:
        for tuning_param in tuning_params:
            config['seed'] = seed
            cbt_params['random_seed'] = seed

            for param in cbt_params:
                tuning_param[param] = cbt_params[param]
            print(tuning_param)

            # seed 설정
            set_seed(config['seed'])

            # 데이터 셋 읽어오기
            df_train, df_test, df_sub = read_data(config)

            # 데이터 전처리
            df_train = feature_engineering(df_train, is_train=True)
            base_num_features, base_cat_features, base_features = make_feature_lists(df_train)

            # 결측치 채우기
            df_train = filling_missing_values(df_train, base_cat_features, base_num_features)

            # 모델 성능 확인(train, valid 데이터 활용)
            cbt_model = model_kfold(df_train, config, tuning_param, base_features, base_cat_features)

            cbt_models.extend(cbt_model)

    # 제출하기
    kfold_submission(df_train, df_test, df_sub, cbt_models, config)

In [5]:
main(config, cbt_params, tuning_params)

{'learning_rate': 0.05, 'n_estimators': 3000, 'early_stopping_rounds': 50, 'max_depth': 6, 'l2_leaf_reg': 1, 'min_data_in_leaf': 2, 'subsample': 0.5, 'random_seed': 42, 'objective': 'Logloss', 'auto_class_weights': 'Balanced', 'verbose': 0}


  df[base_cat_feat] = df[base_cat_feat].fillna('UNK')
  df[base_cat_feat] = df[base_cat_feat].fillna('UNK')
  df[base_cat_feat] = df[base_cat_feat].fillna('UNK')
  df[base_cat_feat] = df[base_cat_feat].fillna('UNK')


Fold #1
[Train] Fold #1 ACC: 0.9907, PRE: 0.9020, REC: 0.9959, F1: 0.9466
[Valid] Fold #1 ACC: 0.9555, PRE: 0.6624, REC: 0.9448, F1: 0.7788
Fold #2
[Train] Fold #2 ACC: 0.9868, PRE: 0.8637, REC: 0.9976, F1: 0.9258
[Valid] Fold #2 ACC: 0.9539, PRE: 0.6537, REC: 0.9437, F1: 0.7724
Fold #3
[Train] Fold #3 ACC: 0.9781, PRE: 0.7919, REC: 0.9978, F1: 0.8830
[Valid] Fold #3 ACC: 0.9516, PRE: 0.6422, REC: 0.9383, F1: 0.7625
Fold #4
[Train] Fold #4 ACC: 0.9772, PRE: 0.7857, REC: 0.9968, F1: 0.8787
[Valid] Fold #4 ACC: 0.9526, PRE: 0.6458, REC: 0.9470, F1: 0.7679
Fold #5
[Train] Fold #5 ACC: 0.9846, PRE: 0.8454, REC: 0.9970, F1: 0.9150
[Valid] Fold #5 ACC: 0.9514, PRE: 0.6400, REC: 0.9448, F1: 0.7631
Avg. F1 of validset: 0.7689318800102057
Var. F1 of validset: 3.689543177453125e-05
{'learning_rate': 0.05, 'n_estimators': 3000, 'early_stopping_rounds': 50, 'max_depth': 6, 'l2_leaf_reg': 1, 'min_data_in_leaf': 2, 'subsample': 0.5, 'random_seed': 137, 'objective': 'Logloss', 'auto_class_weights': '

  df[base_cat_feat] = df[base_cat_feat].fillna('UNK')
  df[base_cat_feat] = df[base_cat_feat].fillna('UNK')
  df[base_cat_feat] = df[base_cat_feat].fillna('UNK')
  df[base_cat_feat] = df[base_cat_feat].fillna('UNK')


Fold #1
[Train] Fold #1 ACC: 0.9885, PRE: 0.8798, REC: 0.9981, F1: 0.9352
[Valid] Fold #1 ACC: 0.9549, PRE: 0.6584, REC: 0.9470, F1: 0.7767
Fold #2
[Train] Fold #2 ACC: 0.9886, PRE: 0.8808, REC: 0.9973, F1: 0.9354
[Valid] Fold #2 ACC: 0.9508, PRE: 0.6370, REC: 0.9437, F1: 0.7606
Fold #3
[Train] Fold #3 ACC: 0.9881, PRE: 0.8762, REC: 0.9978, F1: 0.9331
[Valid] Fold #3 ACC: 0.9581, PRE: 0.6746, REC: 0.9556, F1: 0.7909
Fold #4
[Train] Fold #4 ACC: 0.9893, PRE: 0.8873, REC: 0.9973, F1: 0.9391
[Valid] Fold #4 ACC: 0.9554, PRE: 0.6580, REC: 0.9600, F1: 0.7808
Fold #5
[Train] Fold #5 ACC: 0.9854, PRE: 0.8524, REC: 0.9968, F1: 0.9189
[Valid] Fold #5 ACC: 0.9509, PRE: 0.6395, REC: 0.9329, F1: 0.7588
Avg. F1 of validset: 0.7735589541690493
Var. F1 of validset: 0.00014969856718657177
{'learning_rate': 0.05, 'n_estimators': 3000, 'early_stopping_rounds': 50, 'max_depth': 6, 'l2_leaf_reg': 1, 'min_data_in_leaf': 2, 'subsample': 0.5, 'random_seed': 56, 'objective': 'Logloss', 'auto_class_weights': '

  df[base_cat_feat] = df[base_cat_feat].fillna('UNK')
  df[base_cat_feat] = df[base_cat_feat].fillna('UNK')
  df[base_cat_feat] = df[base_cat_feat].fillna('UNK')
  df[base_cat_feat] = df[base_cat_feat].fillna('UNK')


Fold #1
[Train] Fold #1 ACC: 0.9879, PRE: 0.8741, REC: 0.9976, F1: 0.9318
[Valid] Fold #1 ACC: 0.9537, PRE: 0.6516, REC: 0.9491, F1: 0.7727
Fold #2
[Train] Fold #2 ACC: 0.9846, PRE: 0.8453, REC: 0.9965, F1: 0.9147
[Valid] Fold #2 ACC: 0.9546, PRE: 0.6550, REC: 0.9535, F1: 0.7766
Fold #3
[Train] Fold #3 ACC: 0.9863, PRE: 0.8594, REC: 0.9976, F1: 0.9234
[Valid] Fold #3 ACC: 0.9532, PRE: 0.6478, REC: 0.9535, F1: 0.7715
Fold #4
[Train] Fold #4 ACC: 0.9913, PRE: 0.9072, REC: 0.9973, F1: 0.9501
[Valid] Fold #4 ACC: 0.9570, PRE: 0.6702, REC: 0.9459, F1: 0.7846
Fold #5
[Train] Fold #5 ACC: 0.9832, PRE: 0.8336, REC: 0.9965, F1: 0.9078
[Valid] Fold #5 ACC: 0.9547, PRE: 0.6591, REC: 0.9394, F1: 0.7747
Avg. F1 of validset: 0.7759817357103549
Var. F1 of validset: 2.140727620938135e-05
{'learning_rate': 0.05, 'n_estimators': 3000, 'early_stopping_rounds': 50, 'max_depth': 6, 'l2_leaf_reg': 1, 'min_data_in_leaf': 2, 'subsample': 0.5, 'random_seed': 89, 'objective': 'Logloss', 'auto_class_weights': 'B

  df[base_cat_feat] = df[base_cat_feat].fillna('UNK')
  df[base_cat_feat] = df[base_cat_feat].fillna('UNK')
  df[base_cat_feat] = df[base_cat_feat].fillna('UNK')
  df[base_cat_feat] = df[base_cat_feat].fillna('UNK')


Fold #1
[Train] Fold #1 ACC: 0.9837, PRE: 0.8371, REC: 0.9970, F1: 0.9101
[Valid] Fold #1 ACC: 0.9503, PRE: 0.6330, REC: 0.9502, F1: 0.7598
Fold #2
[Train] Fold #2 ACC: 0.9907, PRE: 0.9006, REC: 0.9976, F1: 0.9466
[Valid] Fold #2 ACC: 0.9531, PRE: 0.6480, REC: 0.9502, F1: 0.7705
Fold #3
[Train] Fold #3 ACC: 0.9913, PRE: 0.9072, REC: 0.9968, F1: 0.9499
[Valid] Fold #3 ACC: 0.9509, PRE: 0.6374, REC: 0.9437, F1: 0.7609
Fold #4
[Train] Fold #4 ACC: 0.9808, PRE: 0.8140, REC: 0.9959, F1: 0.8958
[Valid] Fold #4 ACC: 0.9523, PRE: 0.6461, REC: 0.9383, F1: 0.7652
Fold #5
[Train] Fold #5 ACC: 0.9841, PRE: 0.8414, REC: 0.9962, F1: 0.9123
[Valid] Fold #5 ACC: 0.9530, PRE: 0.6488, REC: 0.9437, F1: 0.7690
Avg. F1 of validset: 0.7650899225055513
Var. F1 of validset: 1.788306038141294e-05
{'learning_rate': 0.05, 'n_estimators': 3000, 'early_stopping_rounds': 50, 'max_depth': 6, 'l2_leaf_reg': 1, 'min_data_in_leaf': 2, 'subsample': 0.5, 'random_seed': 24, 'objective': 'Logloss', 'auto_class_weights': 'B

  df[base_cat_feat] = df[base_cat_feat].fillna('UNK')
  df[base_cat_feat] = df[base_cat_feat].fillna('UNK')
  df[base_cat_feat] = df[base_cat_feat].fillna('UNK')
  df[base_cat_feat] = df[base_cat_feat].fillna('UNK')


Fold #1
[Train] Fold #1 ACC: 0.9838, PRE: 0.8383, REC: 0.9970, F1: 0.9108
[Valid] Fold #1 ACC: 0.9521, PRE: 0.6440, REC: 0.9437, F1: 0.7656
Fold #2
[Train] Fold #2 ACC: 0.9767, PRE: 0.7823, REC: 0.9965, F1: 0.8765
[Valid] Fold #2 ACC: 0.9532, PRE: 0.6520, REC: 0.9329, F1: 0.7676
Fold #3
[Train] Fold #3 ACC: 0.9903, PRE: 0.8979, REC: 0.9968, F1: 0.9447
[Valid] Fold #3 ACC: 0.9543, PRE: 0.6552, REC: 0.9459, F1: 0.7741
Fold #4
[Train] Fold #4 ACC: 0.9866, PRE: 0.8627, REC: 0.9962, F1: 0.9247
[Valid] Fold #4 ACC: 0.9529, PRE: 0.6468, REC: 0.9491, F1: 0.7693
Fold #5
[Train] Fold #5 ACC: 0.9917, PRE: 0.9106, REC: 0.9973, F1: 0.9520
[Valid] Fold #5 ACC: 0.9562, PRE: 0.6674, REC: 0.9383, F1: 0.7800
Avg. F1 of validset: 0.7713264613116447
Var. F1 of validset: 2.693423829798386e-05
{'learning_rate': 0.05, 'n_estimators': 3000, 'early_stopping_rounds': 50, 'max_depth': 6, 'l2_leaf_reg': 1, 'min_data_in_leaf': 2, 'subsample': 0.5, 'random_seed': 75, 'objective': 'Logloss', 'auto_class_weights': 'B

  df[base_cat_feat] = df[base_cat_feat].fillna('UNK')
  df[base_cat_feat] = df[base_cat_feat].fillna('UNK')
  df[base_cat_feat] = df[base_cat_feat].fillna('UNK')
  df[base_cat_feat] = df[base_cat_feat].fillna('UNK')


Fold #1
[Train] Fold #1 ACC: 0.9783, PRE: 0.7932, REC: 0.9981, F1: 0.8839
[Valid] Fold #1 ACC: 0.9561, PRE: 0.6677, REC: 0.9351, F1: 0.7791
Fold #2
[Train] Fold #2 ACC: 0.9800, PRE: 0.8071, REC: 0.9973, F1: 0.8922
[Valid] Fold #2 ACC: 0.9534, PRE: 0.6496, REC: 0.9491, F1: 0.7713
Fold #3
[Train] Fold #3 ACC: 0.9879, PRE: 0.8754, REC: 0.9957, F1: 0.9316
[Valid] Fold #3 ACC: 0.9549, PRE: 0.6579, REC: 0.9491, F1: 0.7771
Fold #4
[Train] Fold #4 ACC: 0.9903, PRE: 0.8976, REC: 0.9959, F1: 0.9442
[Valid] Fold #4 ACC: 0.9523, PRE: 0.6439, REC: 0.9491, F1: 0.7673
Fold #5
[Train] Fold #5 ACC: 0.9851, PRE: 0.8496, REC: 0.9965, F1: 0.9172
[Valid] Fold #5 ACC: 0.9521, PRE: 0.6419, REC: 0.9545, F1: 0.7676
Avg. F1 of validset: 0.7724898426601781
Var. F1 of validset: 2.3442972016827796e-05
{'learning_rate': 0.05, 'n_estimators': 3000, 'early_stopping_rounds': 50, 'max_depth': 6, 'l2_leaf_reg': 1, 'min_data_in_leaf': 2, 'subsample': 0.5, 'random_seed': 88, 'objective': 'Logloss', 'auto_class_weights': '

  df[base_cat_feat] = df[base_cat_feat].fillna('UNK')
  df[base_cat_feat] = df[base_cat_feat].fillna('UNK')
  df[base_cat_feat] = df[base_cat_feat].fillna('UNK')
  df[base_cat_feat] = df[base_cat_feat].fillna('UNK')


Fold #1
[Train] Fold #1 ACC: 0.9870, PRE: 0.8661, REC: 0.9973, F1: 0.9271
[Valid] Fold #1 ACC: 0.9519, PRE: 0.6434, REC: 0.9394, F1: 0.7637
Fold #2
[Train] Fold #2 ACC: 0.9854, PRE: 0.8513, REC: 0.9976, F1: 0.9186
[Valid] Fold #2 ACC: 0.9553, PRE: 0.6594, REC: 0.9513, F1: 0.7789
Fold #3
[Train] Fold #3 ACC: 0.9886, PRE: 0.8810, REC: 0.9976, F1: 0.9357
[Valid] Fold #3 ACC: 0.9520, PRE: 0.6435, REC: 0.9437, F1: 0.7652
Fold #4
[Train] Fold #4 ACC: 0.9858, PRE: 0.8555, REC: 0.9976, F1: 0.9211
[Valid] Fold #4 ACC: 0.9511, PRE: 0.6386, REC: 0.9448, F1: 0.7621
Fold #5
[Train] Fold #5 ACC: 0.9808, PRE: 0.8141, REC: 0.9962, F1: 0.8960
[Valid] Fold #5 ACC: 0.9537, PRE: 0.6504, REC: 0.9524, F1: 0.7729
Avg. F1 of validset: 0.76859315960625
Var. F1 of validset: 4.041067779156198e-05
{'learning_rate': 0.05, 'n_estimators': 3000, 'early_stopping_rounds': 50, 'max_depth': 6, 'l2_leaf_reg': 1, 'min_data_in_leaf': 2, 'subsample': 0.5, 'random_seed': 36, 'objective': 'Logloss', 'auto_class_weights': 'Bal

  df[base_cat_feat] = df[base_cat_feat].fillna('UNK')
  df[base_cat_feat] = df[base_cat_feat].fillna('UNK')
  df[base_cat_feat] = df[base_cat_feat].fillna('UNK')
  df[base_cat_feat] = df[base_cat_feat].fillna('UNK')


Fold #1
[Train] Fold #1 ACC: 0.9823, PRE: 0.8267, REC: 0.9940, F1: 0.9027
[Valid] Fold #1 ACC: 0.9537, PRE: 0.6529, REC: 0.9405, F1: 0.7707
Fold #2
[Train] Fold #2 ACC: 0.9866, PRE: 0.8630, REC: 0.9968, F1: 0.9250
[Valid] Fold #2 ACC: 0.9483, PRE: 0.6235, REC: 0.9481, F1: 0.7523
Fold #3
[Train] Fold #3 ACC: 0.9877, PRE: 0.8722, REC: 0.9976, F1: 0.9307
[Valid] Fold #3 ACC: 0.9525, PRE: 0.6446, REC: 0.9502, F1: 0.7682
Fold #4
[Train] Fold #4 ACC: 0.9843, PRE: 0.8428, REC: 0.9968, F1: 0.9134
[Valid] Fold #4 ACC: 0.9552, PRE: 0.6599, REC: 0.9470, F1: 0.7778
Fold #5
[Train] Fold #5 ACC: 0.9788, PRE: 0.7977, REC: 0.9962, F1: 0.8859
[Valid] Fold #5 ACC: 0.9500, PRE: 0.6350, REC: 0.9318, F1: 0.7553
Avg. F1 of validset: 0.7648361620176131
Var. F1 of validset: 9.263986275090917e-05
{'learning_rate': 0.05, 'n_estimators': 3000, 'early_stopping_rounds': 50, 'max_depth': 6, 'l2_leaf_reg': 1, 'min_data_in_leaf': 2, 'subsample': 0.5, 'random_seed': 71, 'objective': 'Logloss', 'auto_class_weights': 'B

  df[base_cat_feat] = df[base_cat_feat].fillna('UNK')
  df[base_cat_feat] = df[base_cat_feat].fillna('UNK')
  df[base_cat_feat] = df[base_cat_feat].fillna('UNK')
  df[base_cat_feat] = df[base_cat_feat].fillna('UNK')


Fold #1
[Train] Fold #1 ACC: 0.9844, PRE: 0.8454, REC: 0.9940, F1: 0.9137
[Valid] Fold #1 ACC: 0.9502, PRE: 0.6361, REC: 0.9307, F1: 0.7557
Fold #2
[Train] Fold #2 ACC: 0.9880, PRE: 0.8744, REC: 0.9981, F1: 0.9322
[Valid] Fold #2 ACC: 0.9563, PRE: 0.6649, REC: 0.9513, F1: 0.7827
Fold #3
[Train] Fold #3 ACC: 0.9883, PRE: 0.8778, REC: 0.9970, F1: 0.9336
[Valid] Fold #3 ACC: 0.9540, PRE: 0.6517, REC: 0.9556, F1: 0.7749
Fold #4
[Train] Fold #4 ACC: 0.9877, PRE: 0.8728, REC: 0.9970, F1: 0.9308
[Valid] Fold #4 ACC: 0.9536, PRE: 0.6499, REC: 0.9524, F1: 0.7726
Fold #5
[Train] Fold #5 ACC: 0.9875, PRE: 0.8713, REC: 0.9962, F1: 0.9296
[Valid] Fold #5 ACC: 0.9522, PRE: 0.6441, REC: 0.9459, F1: 0.7663
Avg. F1 of validset: 0.7704551997911007
Var. F1 of validset: 8.18651204692394e-05


  df[base_cat_feat] = df[base_cat_feat].fillna('UNK')
  df[base_cat_feat] = df[base_cat_feat].fillna('UNK')
  df[base_cat_feat] = df[base_cat_feat].fillna('UNK')
  df[base_cat_feat] = df[base_cat_feat].fillna('UNK')
