In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install catboost



In [21]:
config = {
    'root': "."
    , 'train_path':  '/content/drive/MyDrive/lg_aimers/data/train.csv'  #"./train.csv"
    , 'submit_path': '/content/drive/MyDrive/lg_aimers/data/submission.csv' #"./submission.csv"
    , 'seed_list': [42, 137] # , 56, 89, 24, 75 ,88 ,36 ,71]
    , 'k_fold': 5
    , 'thresholds': {'product_category': 10, 'expected_timeline': 3}
}

cbt_params = {
    'random_seed': config['seed_list'],
    'objective': 'Logloss',
    'auto_class_weights': 'Balanced',
    'verbose': 0
}


tuning_params = [
    {
        'learning_rate': 0.05
        , 'n_estimators': 3000

        , 'early_stopping_rounds': 50

        # regularizations
        , 'max_depth': 6
        , 'l2_leaf_reg': 1
        , 'min_data_in_leaf': 2
        , 'subsample': 0.5
        # ,'grow_policy': 'Depthwise' # 'SymmetricTree'(default)
    },
]

In [4]:
import random
import pandas as pd
import numpy as np
from datetime import datetime
import os

from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import StratifiedKFold

from catboost import CatBoostClassifier
from functools import partial

In [13]:
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder

In [6]:
def set_seed(seed: int):
    # Set the seed for reproducibility.
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    # torch.manual_seed(seed)
    # torch.cuda.manual_seed(seed)  # type: ignore
    # torch.backends.cudnn.deterministic = True  # type: ignore
    # torch.backends.cudnn.benchmark = True  # type: ignore


def read_data(config):
    # Load training, testing, and submission CSV files
    df_train = pd.read_csv(config['train_path'])  # 학습용 데이터
    df_test = pd.read_csv(config['submit_path'])  # 테스트 데이터(제출파일의 데이터)
    df_sub = pd.read_csv(config['submit_path'])

    return df_train, df_test, df_sub


def get_clf_eval(y_test, y_pred=None, fold_no=None):
    # Calculate and print evaluation metrics and confusion matrix,
    # accuracy, precision, recall, and F1 score.
    # Optionally includes fold number in the output.
    confusion = confusion_matrix(y_test, y_pred, labels=[True, False])
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, labels=[True, False])
    recall = recall_score(y_test, y_pred)
    F1 = f1_score(y_test, y_pred, labels=[True, False])

    fold_info = f'Fold #{fold_no}' if fold_no is not None else ''
    print(f'{fold_info} ACC: {accuracy:.4f}, PRE: {precision:.4f}, REC: {recall:.4f}, F1: {F1:.4f}')
    return F1


def new_business_area(cur_area):
    # Categorizes current business area into a new, simplified business area group.
    # Returns the original area if it doesn't match the above categories.
    if cur_area in ['corporate / office', 'government department']:
        return 'Office'
    elif cur_area in ['education', 'public facility']:
        return 'Public'
    elif cur_area in ['hotel & accommodation', 'residential (home)']:
        return 'Amenity'
    elif cur_area in ['factory', 'power plant / renewable energy', 'transportation']:
        return 'Industry'
    else:
        return cur_area


def make_value_count(value_count_dict, val):
     # Returns the count of occurrences of val from a precomputed dictionary
     # otherwise returns NaN.
    if val in value_count_dict:
        return value_count_dict[val]
    return np.NAN


def make_value_count_dict(df, col_names):
    # Creates a dictionary of value counts for each feature name
    value_count_dict = dict()
    for feat_name in col_names:
        total_count = df[feat_name].value_counts()

        # Converts the Series object to a DataFrame and then to a dictionary.
        count_df = pd.DataFrame(total_count).reset_index(drop=False)

        value_count_dict[feat_name] = dict(zip(count_df.iloc[:, 0], count_df.iloc[:, 1]))

    return value_count_dict


def make_continent(rc):
    # Maps response corporate (rc) codes to their respective continent.
    # Each if statement checks for membership in a list of codes corresponding to a continent.
    if rc in ('lgein', 'lgeml', 'lgeph', 'lgeth', 'lgevh', 'lgeil', 'lgekr', 'lgett', 'lgejp', 'lgech', 'lgeir', 'lgesj',
              'lgegf', 'lgetk', 'lgelf', 'lgehk', 'lgeyk'):
        return 'asia'
    if rc in ('lgeaf', 'lgesa', 'lgemc', 'lgeas', 'lgeeg', 'lgeef'):
        return 'africa'
    if rc in ('lgeus', 'lgeci'):
        return 'northamerica'
    if rc in ('lgesp', 'lgecb', 'lgems', 'lgecl', 'lgeps', 'lgear', 'lgepr'):
        return 'southamerica'
    if rc in ('lgeuk', 'lgees', 'lgefs', 'lgebn', 'lgebt', 'lgedg', 'lgero', 'lgemk', 'lgepl', 'lgecz', 'lgehs', 'lgesw',
              'lgeag', 'lgeeb', 'lgera', 'lgeur', 'lgept', 'lgeis', 'lgela'):
        return 'europe'
    if rc in ('lgesl', 'lgeap'):
        return 'oceania'
    return np.NaN


def feature_engineering(df_input, is_train=False):
    df = df_input.copy()

    # drop_duplicates
    df = df.drop_duplicates(keep='first')

    # converting text to lowercase, removing commas and periods
    # and replacing underscores and hyphens with spaces.
    for feat in df.columns:
        # Applies transformations only to string-type columns.
        if df[feat].dtype == 'object':
            df[feat] = df[feat].str.lower()
            df[feat] = df[feat].str.replace('[,\.]', '', regex=True)
            df[feat] = df[feat].str.replace('[_-]', ' ', regex=True)

    # Recategorizes 'Solution' and 'CM' in 'business_unit' column to 'Others'.
    df['business_unit'] = np.where(df['business_unit'].isin(['Solution', 'CM']), 'Others', df['business_unit'])

    # map ['other', 'others', 'etc'] -> 'Others'
    unify_others_columns = ['customer_type', 'customer_job', 'inquiry_type', 'product_category', 'product_subcategory',
                            'customer_position', 'expected_timeline']
    for column in unify_others_columns:
        df[column] = df[column].replace(['other', 'others', 'etc'], 'others')

    # make new columns
    df['continent'] = df['response_corporate'].map(make_continent)
    df['business_area_group'] = df['business_area'].map(new_business_area)
    df['product_category_count'] = df['product_category'].apply(
        lambda x: len(str(x).split(',')) if not pd.isna(x) else np.nan)

    make_value_count_columns = ['lead_owner']
    if is_train:
        global value_count_dict
        value_count_dict = make_value_count_dict(df_input, make_value_count_columns)

    for feat_name in value_count_dict:
        func = partial(make_value_count, value_count_dict[feat_name])
        df[f'{feat_name}_count'] = df[feat_name].map(func)

    # correct data type
    for feat in ['customer_idx', 'lead_owner', 'ver_cus', 'ver_pro']:
        df[feat] = df[feat].astype(object)

    return df


def make_feature_lists(df):
    base_features = []     # all features except target variable.
    base_num_features = [] # numerical features
    base_cat_features = [] # categorical features

    for feat in df.columns:
        # skip the target
        if feat == 'is_converted':
            continue

        base_features.append(feat)

        if df[feat].dtype == 'object':
            base_cat_features.append(feat)
        else:
            base_num_features.append(feat)

    # features to be removed from data analysis
    removal_features = {
        'id', 'bant_submit', 'id_strategic_ver', 'it_strategic_ver', 'idit_strategic_ver',
        'customer_country', 'customer_country.1', 'business_subarea', 'business_area'
    }

    # remove the specified features
    base_num_features = [i for i in base_num_features if i not in removal_features]
    base_cat_features = [i for i in base_cat_features if i not in removal_features]
    base_features = [i for i in base_features if i not in removal_features]

    return base_num_features, base_cat_features, base_features


def filling_missing_values(df_input, base_cat_features, base_num_features):
    df = df_input.copy()

    # Fill missing values for categorical features with 'UNK'
    # and ensure their data type is string.
    for base_cat_feat in base_cat_features:
        df[base_cat_feat] = df[base_cat_feat].fillna('UNK')
        df[base_cat_feat] = df[base_cat_feat].astype(str)

    # Fill missing values for numerical features with -1.
    for base_num_feat in base_num_features:
        df[base_num_feat] = df[base_num_feat].fillna(-1)

    return df


def model_kfold(df, config, cbt_params, base_features, base_cat_features):
    target = 'is_converted'

    skf = StratifiedKFold(n_splits=config['k_fold'], shuffle=True, random_state=config['seed'])
    cbt_models = [] # trained models
    f1_scores = []  # f1-scores for validation sets

    for k_fold, (train_idx, valid_idx) in enumerate(skf.split(df[base_features], df[target])):
        print(f'Fold #{k_fold + 1}')
        X_train, y_train = df[base_features].iloc[train_idx], df[target].iloc[train_idx].astype(int)
        X_valid, y_valid = df[base_features].iloc[valid_idx], df[target].iloc[valid_idx].astype(int)

        # initialize CatBoost model with provided parameters
        cbt = CatBoostClassifier(**cbt_params)

        cbt.fit(
            X_train, y_train,
            eval_set=[(X_valid, y_valid)],
            cat_features=base_cat_features, # specify categorical features
        )

        # save the trained model
        cbt_models.append(cbt)

        # evaluate the model
        # --- train-set
        print('[Train] ', end='')
        y_pred = cbt.predict(X_train)
        _ = get_clf_eval(y_train, y_pred, k_fold + 1)

        # --- valid-set
        print('[Valid] ', end='')
        y_pred = cbt.predict(X_valid)
        y_pred = y_pred.astype(y_valid.dtype) # ensure matching dtype
        f1 = get_clf_eval(y_valid, y_pred, k_fold + 1)

        f1_scores.append(f1)

    print(f'Avg. F1 of validset: {np.mean(f1_scores)}')
    print(f'Var. F1 of validset: {np.var(f1_scores)}')

    return cbt_models


def kfold_submission(df_train, df_test, df_sub, cbt_models, config):
    folder_path = f"{config['root']}/FeatureImportance"

    if not os.path.exists(folder_path):
        os.makedirs(folder_path)

    # get current date and time
    now = datetime.now()

    # record the year, month, day, hour, and minute for naming files.
    year = now.year
    month = now.month
    day = now.day
    hour = now.hour
    minute = now.minute

    # file format
    submission_time = f"{year:04d}{month:02d}{day:02d}_{hour:02d}{minute:02d}"[2:]
    target = 'is_converted'

    # apply feature engineering
    df_test = feature_engineering(df_test, is_train=False)
    base_num_features, base_cat_features, base_features = make_feature_lists(df_test)

    df_test = filling_missing_values(df_test, base_cat_features, base_num_features)

    X_test = df_test[base_features]

    # a matrix to store the probabilities of each class for the test set.
    y_probs = np.zeros((X_test.shape[0], 2))

    # dataframe for feature importances
    df_feature_importance_all = pd.DataFrame({'features': base_features})

    for i, cbt_model in enumerate(cbt_models):
        y_probs += cbt_model.predict_proba(X_test) / len(cbt_models)

        # save feature importance of current model
        df_feature_importance_all[f'model_{i}'] = cbt_model.get_feature_importance()

    # compute avarege, rank
    df_feature_importance_all['average'] = df_feature_importance_all.iloc[:, 1:].mean(axis=1).values
    df_feature_importance_all['rank'] = df_feature_importance_all['average'].rank(ascending=False)

    # save the feature importance as CSV
    df_feature_importance_all.to_csv(f'{folder_path}/feat_import_{submission_time}.csv', index=False)

    # create submission file
    # threshold: 0.5 to determine the class
    df_sub[target] = (y_probs[:, 1] >= 0.5).astype(bool)

    # save submission file as CSV
    df_sub.to_csv(f"{config['root']}/submission.csv", index=False)

In [7]:
def main(config, cbt_params, tuning_params):
    cbt_models = []

    for seed in config['seed_list']:
        for tuning_param in tuning_params:
            config['seed'] = seed
            cbt_params['random_seed'] = seed

            for param in cbt_params:
                tuning_param[param] = cbt_params[param]
            print(tuning_param)

            # seed 설정
            set_seed(config['seed'])

            # 데이터 셋 읽어오기
            df_train, df_test, df_sub = read_data(config)

            # 데이터 전처리
            df_train = feature_engineering(df_train, is_train=True)
            base_num_features, base_cat_features, base_features = make_feature_lists(df_train)

            # 결측치 채우기
            df_train = filling_missing_values(df_train, base_cat_features, base_num_features)

            # 모델 성능 확인(train, valid 데이터 활용)
            cbt_model = model_kfold(df_train, config, tuning_param, base_features, base_cat_features)

            cbt_models.extend(cbt_model)

    # 제출하기
    kfold_submission(df_train, df_test, df_sub, cbt_models, config)

In [35]:
def lgb_kfold_submission(df_train, df_test, df_sub, lgb_models, config):
    folder_path = f"{config['root']}/FeatureImportance"

    if not os.path.exists(folder_path):
        os.makedirs(folder_path)

    # get current date and time
    now = datetime.now()

    # record the year, month, day, hour, and minute for naming files.
    year = now.year
    month = now.month
    day = now.day
    hour = now.hour
    minute = now.minute

    # file format
    submission_time = f"{year:04d}{month:02d}{day:02d}_{hour:02d}{minute:02d}"[2:]
    target = 'is_converted'

    # apply feature engineering
    df_test = feature_engineering(df_test, is_train=False)
    base_num_features, base_cat_features, base_features = make_feature_lists(df_test)

    df_test = filling_missing_values(df_test, base_cat_features, base_num_features)

    X_test = df_test[base_features].copy()
    for cat_feat in base_cat_features:
        X_test[cat_feat] = X_test[cat_feat].astype('category')
        #X_test[cat_feat] = X_test[cat_feat].map(lambda s: config['le_dict'][cat_feat].transform([s])[0] if s in config['le_dict'][cat_feat].classes_ else -1)

    # a matrix to store the probabilities of each class for the test set.
    y_probs = np.zeros((X_test.shape[0], 2))

    # dataframe for feature importances
    df_feature_importance_all = pd.DataFrame({'features': base_features})

    for i, lgb_model in enumerate(lgb_models):
        y_probs += lgb_model.predict_proba(X_test) / len(lgb_models)

        # save feature importance of current model
        df_feature_importance_all[f'model_{i}'] = lgb_model.booster_.feature_importance(importance_type='gain')

    # compute avarege, rank
    df_feature_importance_all['average'] = df_feature_importance_all.iloc[:, 1:].mean(axis=1).values
    df_feature_importance_all['rank'] = df_feature_importance_all['average'].rank(ascending=False)

    # save the feature importance as CSV
    df_feature_importance_all.to_csv(f'{folder_path}/feat_import_{submission_time}.csv', index=False)

    # create submission file
    # threshold: 0.5 to determine the class
    df_sub[target] = (y_probs[:, 1] >= 0.5).astype(bool)

    # save submission file as CSV
    df_sub.to_csv(f"{config['root']}/submission.csv", index=False)

In [31]:
def lgb_model_kfold(df, config, lgb_params, base_features, base_cat_features):
    target = 'is_converted'

    skf = StratifiedKFold(n_splits=config['k_fold'], shuffle=True, random_state=config['seed'])
    lgb_models = [] # trained models
    f1_scores = []  # f1-scores for validation sets

    for k_fold, (train_idx, valid_idx) in enumerate(skf.split(df[base_features], df[target])):
        print(f'Fold #{k_fold + 1}')
        X_train, y_train = df[base_features].iloc[train_idx], df[target].iloc[train_idx].astype(int)
        X_valid, y_valid = df[base_features].iloc[valid_idx], df[target].iloc[valid_idx].astype(int)

        # initialize CatBoost model with provided parameters
        lgb_clf = lgb.LGBMClassifier(**lgb_params)

        # lgb_train = lgb.Dataset(X_train, label=y_train, categorical_feature =base_cat_features)
        # lgb_valid = lgb.Dataset(X_valid, label=y_valid, reference=lgb_train)

        # lgb_clf = lgb.train(
        #     lgb_params,
        #     lgb_train,
        #     valid_sets=[lgb_valid],
        # )

        lgb_clf.fit(
            X_train, y_train,
            eval_set=[(X_valid, y_valid)],
            categorical_feature =base_cat_features, # specify categorical features
        )

        # save the trained model
        lgb_models.append(lgb_clf)

        # evaluate the model
        # --- train-set
        print('[Train] ', end='')
        y_pred = lgb_clf.predict(X_train)
        _ = get_clf_eval(y_train, y_pred, k_fold + 1)

        # --- valid-set
        print('[Valid] ', end='')
        y_pred = lgb_clf.predict(X_valid)
        y_pred = y_pred.astype(y_valid.dtype) # ensure matching dtype
        f1 = get_clf_eval(y_valid, y_pred, k_fold + 1)

        f1_scores.append(f1)

    print(f'Avg. F1 of validset: {np.mean(f1_scores)}')
    print(f'Var. F1 of validset: {np.var(f1_scores)}')

    return lgb_models

In [32]:
def main_lgb(config, lgb_params, lgb_tuning_params):
    lgb_models = []

    for seed in config['seed_list']:
        for tuning_param in lgb_tuning_params:
            config['seed'] = seed
            lgb_params['random_seed'] = seed

            for param in lgb_params:
                tuning_param[param] = lgb_params[param]
            print('tuning_param', tuning_param)

            # seed 설정
            set_seed(config['seed'])

            # 데이터 셋 읽어오기
            df_train, df_test, df_sub = read_data(config)

            # 데이터 전처리
            df_train = feature_engineering(df_train, is_train=True)
            base_num_features, base_cat_features, base_features = make_feature_lists(df_train)

            # 결측치 채우기
            df_train = filling_missing_values(df_train, base_cat_features, base_num_features)

            # LGBM을 위한 전처리: label encoding
            le_dict = {}
            for cat_feat in base_cat_features:
                df_train[cat_feat] = df_train[cat_feat].astype('category')
            #     le = LabelEncoder()
            #     df_train[cat_feat] = le.fit_transform(df_train[cat_feat])
            #     le_dict[cat_feat] = le
            # config['le_dict'] = le_dict

            # 모델 성능 확인(train, valid 데이터 활용)
            lgb_model = lgb_model_kfold(df_train, config, tuning_param, base_features, base_cat_features)

            lgb_models.extend(lgb_model)

    # 제출하기
    lgb_kfold_submission(df_train, df_test, df_sub, lgb_models, config)

In [33]:
lgb_params = {
    'random_seed': config['seed_list'],
    'objective': 'binary',
    'is_unbalance': 'true',
    'verbose': 0
}

lgb_tuning_params = [
    {
        'learning_rate': 0.05,
        'n_estimators': 3000,
        'early_stopping_round': 50,
        'max_depth': -1,
        #'num_leaves': 80,
        'reg_lambda': 1,
        'subsample': 0.5,
        #'n_jobs': -1,
    },
]

In [36]:
main_lgb(config, lgb_params, lgb_tuning_params)

tuning_param {'learning_rate': 0.05, 'n_estimators': 3000, 'early_stopping_round': 50, 'max_depth': -1, 'reg_lambda': 1, 'subsample': 0.5, 'random_seed': 42, 'objective': 'binary', 'is_unbalance': 'true', 'verbose': 0}
Fold #1
[Train] Fold #1 ACC: 0.9881, PRE: 0.8746, REC: 1.0000, F1: 0.9331
[Valid] Fold #1 ACC: 0.9682, PRE: 0.7676, REC: 0.8831, F1: 0.8213
Fold #2
[Train] Fold #2 ACC: 0.9910, PRE: 0.9021, REC: 0.9995, F1: 0.9483
[Valid] Fold #2 ACC: 0.9699, PRE: 0.7940, REC: 0.8593, F1: 0.8254
Fold #3
[Train] Fold #3 ACC: 0.9866, PRE: 0.8607, REC: 0.9997, F1: 0.9250
[Valid] Fold #3 ACC: 0.9675, PRE: 0.7624, REC: 0.8820, F1: 0.8179
Fold #4
[Train] Fold #4 ACC: 0.9891, PRE: 0.8838, REC: 0.9997, F1: 0.9382
[Valid] Fold #4 ACC: 0.9668, PRE: 0.7633, REC: 0.8690, F1: 0.8128
Fold #5
[Train] Fold #5 ACC: 0.9871, PRE: 0.8655, REC: 0.9995, F1: 0.9277
[Valid] Fold #5 ACC: 0.9696, PRE: 0.7832, REC: 0.8755, F1: 0.8268
Avg. F1 of validset: 0.8208187518467576
Var. F1 of validset: 2.6041615547458325e-

In [None]:
#main(config, cbt_params, tuning_params)