In [1]:
!pip install catboost

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import random
from pathlib import Path

import numpy as np
import pandas as pd

from catboost import CatBoostClassifier
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    precision_recall_curve
)
from sklearn.model_selection import StratifiedKFold

In [3]:
def fix_seed(seed=42):
    """Seed the global RNGs of Python's ``random`` module and of NumPy.

    Args:
        seed: Value handed to both ``random.seed`` and ``np.random.seed``
            (default 42).
    """
    # Same order as before: Python's generator first, then NumPy's.
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)
    
    
def get_clf_eval(y_test, y_pred=None, fold_no=None):
    """Print accuracy, precision, recall and F1 for a set of predictions.

    Args:
        y_test: Ground-truth labels.
        y_pred: Predicted labels aligned with ``y_test``. The ``None`` default
            is kept only for signature compatibility; a value is required.
        fold_no: Optional fold number; when given, the printed line is
            prefixed with ``Fold #<fold_no>``.

    Returns:
        The F1 score returned by ``f1_score``.

    Raises:
        ValueError: If ``y_pred`` is not supplied.
    """
    if y_pred is None:
        # Fail early with a clear message instead of an obscure error from sklearn.
        raise ValueError('y_pred is required to evaluate predictions')

    # All scorers use their default binary averaging. The `labels` argument is
    # only consulted when average != 'binary', so it is no longer passed to
    # precision/F1 (recall never received it). The unused confusion matrix
    # computation was dropped as well.
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    F1 = f1_score(y_test, y_pred)

    fold_info = f'Fold #{fold_no}' if fold_no is not None else ''
    print(f'{fold_info} ACC: {accuracy:.4f}, PRE: {precision:.4f}, REC: {recall:.4f}, F1: {F1:.4f}\n')
    return F1

def new_business_area(cur_area):
    """Collapse a raw business-area value into a coarser group label.

    Values that belong to one of the known groups are replaced by that
    group's name; any other value is returned unchanged.
    """
    area_groups = (
        ('Office', ('corporate / office', 'government department')),
        ('Public', ('education', 'public facility')),
        ('Amenity', ('hotel & accommodation', 'residential (home)')),
        ('Industry', ('factory', 'power plant / renewable energy', 'transportation')),
    )
    for group_name, members in area_groups:
        if cur_area in members:
            return group_name
    return cur_area
    
def feature_engineering(df_input):
    """Return a new frame with the derived modelling columns.

    * ``business_unit``: the values 'Solution' and 'CM' are replaced by 'Others'.
    * ``business_area_group``: ``business_area`` mapped through
      ``new_business_area``.

    ``df_input`` itself is not modified.
    """
    minor_unit_mask = df_input['business_unit'].isin(['Solution', 'CM'])
    merged_units = np.where(minor_unit_mask, 'Others', df_input['business_unit'])
    area_groups = df_input['business_area'].map(new_business_area)

    # assign() works on a copy, so the caller's frame is left untouched
    return df_input.assign(business_unit=merged_units, business_area_group=area_groups)

def filling_missing_values(df_input, base_cat_features, base_num_features):
    """Return a copy of ``df_input`` with missing values replaced by sentinels.

    Args:
        df_input: Source frame; it is copied, not modified.
        base_cat_features: Columns whose missing values become 'UNK' and which
            are then cast to ``str``.
        base_num_features: Columns whose missing values become -1.

    Returns:
        The filled copy.
    """
    filled = df_input.copy()

    for cat_col in base_cat_features:
        filled[cat_col] = filled[cat_col].fillna('UNK').astype(str)

    for num_col in base_num_features:
        filled[num_col] = filled[num_col].fillna(-1)

    return filled

In [4]:
# Notebook-wide settings: input paths, random seed and number of CV folds
config = {
    'train_path': 'train.csv',
    'submit_path': 'original_submission.csv',
    'seed': 42,
    'k_fold': 5,
}

In [5]:
# Apply the configured seed to the Python and NumPy global RNGs
fix_seed(config['seed'])

In [6]:
# Load the training data and count rows that are exact duplicates of an earlier row
df = pd.read_csv(config['train_path'])
df.duplicated().sum()

3519

In [7]:
# Remove exact duplicate rows, keeping the first occurrence of each; then show the new shape
df = df.drop_duplicates(keep='first')
df.shape

(55780, 29)

In [8]:
# Add the derived columns (merged business_unit values, business_area_group)
df = feature_engineering(df)

In [9]:
base_features = []
base_num_features = []
base_cat_features = []
target = 'is_converted'

# These columns are cast to object so that they end up in the categorical list
strategic_ver_features = ['id_strategic_ver', 'it_strategic_ver', 'idit_strategic_ver']

total_rows = len(df)
for feat in df.columns:
    # Skip the label column (compare against `target` instead of repeating the literal)
    if feat == target:
        continue

    # Percentage of rows in which this column is missing
    missing_count = df[feat].isna().sum()
    missing_rate = (missing_count / total_rows) * 100

    base_features.append(feat)

    if feat in strategic_ver_features:
        df[feat] = df[feat].astype(object)
    print(f'{feat}, {missing_rate:.3f}(%), [{df[feat].dtype}]')

    # Route by "is numeric" rather than "dtype == object": the result is the
    # same for int/float/object columns, and string or category dtypes are
    # classified as categorical instead of falling into the numeric list.
    if pd.api.types.is_numeric_dtype(df[feat]):
        base_num_features.append(feat)
    else:
        base_cat_features.append(feat)

bant_submit, 0.000(%), [float64]
customer_country, 1.746(%), [object]
business_unit, 0.000(%), [object]
com_reg_ver_win_rate, 74.697(%), [float64]
customer_idx, 0.000(%), [int64]
customer_type, 74.145(%), [object]
enterprise, 0.000(%), [object]
historical_existing_cnt, 77.824(%), [float64]
id_strategic_ver, 94.014(%), [object]
it_strategic_ver, 98.014(%), [object]
idit_strategic_ver, 92.028(%), [object]
customer_job, 30.125(%), [object]
lead_desc_length, 0.000(%), [int64]
inquiry_type, 1.597(%), [object]
product_category, 30.789(%), [object]
product_subcategory, 83.854(%), [object]
product_modelname, 83.810(%), [object]
customer_country.1, 1.746(%), [object]
customer_position, 0.000(%), [object]
response_corporate, 0.000(%), [object]
expected_timeline, 50.721(%), [object]
ver_cus, 0.000(%), [int64]
ver_pro, 0.000(%), [int64]
ver_win_rate_x, 68.112(%), [float64]
ver_win_ratio_per_bu, 73.317(%), [float64]
business_area, 68.112(%), [object]
business_subarea, 90.714(%), [object]
lead_owner

In [10]:
# customer_country == customer_country.1, so the former is listed for removal
removal_features = {
    'customer_country', 'id_strategic_ver', 'it_strategic_ver', 'idit_strategic_ver',
    'business_area', 'business_subarea',
    'ver_cus', 'ver_pro', 'bant_submit', 'ver_win_ratio_per_bu', 'product_modelname',
    'product_subcategory', 'enterprise', 'ver_win_rate_x',
}

# Filter each feature list in turn, preserving its original order
base_num_features, base_cat_features, base_features = (
    [feat for feat in feature_list if feat not in removal_features]
    for feature_list in (base_num_features, base_cat_features, base_features)
)

print(base_num_features, base_cat_features, base_features, sep='\n')

['com_reg_ver_win_rate', 'customer_idx', 'historical_existing_cnt', 'lead_desc_length', 'lead_owner']
['business_unit', 'customer_type', 'customer_job', 'inquiry_type', 'product_category', 'customer_country.1', 'customer_position', 'response_corporate', 'expected_timeline', 'business_area_group']
['business_unit', 'com_reg_ver_win_rate', 'customer_idx', 'customer_type', 'historical_existing_cnt', 'customer_job', 'lead_desc_length', 'inquiry_type', 'product_category', 'customer_country.1', 'customer_position', 'response_corporate', 'expected_timeline', 'lead_owner', 'business_area_group']


In [11]:
# Fill missing values: 'UNK' (cast to str) for categorical features, -1 for numeric ones
df = filling_missing_values(df, base_cat_features, base_num_features)

In [18]:
cbt_params = {
    'random_seed': config['seed'],
    'learning_rate': 0.05,
    'n_estimators': 3000,

    'early_stopping_rounds': 50,
    'verbose': 100,
    'auto_class_weights': 'Balanced',  # alternative: 'SqrtBalanced'

    # regularizations
    'max_depth': 6,
    'l2_leaf_reg': 1,
    # NOTE(review): CatBoost documents min_data_in_leaf as applying to the
    # Depthwise/Lossguide grow policies only - confirm it has any effect here.
    'min_data_in_leaf': 2,
    'subsample': 0.5,
    # 'grow_policy': 'Depthwise',  # 'SymmetricTree' (default)
}

# The feature matrix and the integer target do not change between folds,
# so build them once instead of re-slicing `df` on every iteration.
X_all = df[base_features]
y_all = df[target].astype(int)

skf = StratifiedKFold(n_splits=config['k_fold'], shuffle=True, random_state=config['seed'])
cbt_models = []
f1_scores = []

for k_fold, (train_idx, valid_idx) in enumerate(skf.split(X_all, y_all)):
    fold_no = k_fold + 1
    print(f'Fold #{fold_no}')
    X_train, y_train = X_all.iloc[train_idx], y_all.iloc[train_idx]
    X_valid, y_valid = X_all.iloc[valid_idx], y_all.iloc[valid_idx]

    cbt = CatBoostClassifier(**cbt_params)

    # The validation fold doubles as the early-stopping eval set
    cbt.fit(
        X_train, y_train,
        eval_set=[(X_valid, y_valid)],
        cat_features=base_cat_features,
    )

    cbt_models.append(cbt)

    # train-set (return value not needed; avoid assigning to IPython's `_`)
    print('[Train] ', end='')
    y_pred = cbt.predict(X_train)
    get_clf_eval(y_train, y_pred, fold_no)

    # valid-set
    print('[Valid] ', end='')
    y_pred = cbt.predict(X_valid).astype(y_valid.dtype)
    f1 = get_clf_eval(y_valid, y_pred, fold_no)

    f1_scores.append(f1)

print(f'Avg. F1 of validset: {np.mean(f1_scores)}')

Fold #1
0:	learn: 0.6459792	test: 0.6459291	best: 0.6459291 (0)	total: 95.3ms	remaining: 4m 45s
100:	learn: 0.2031152	test: 0.2060926	best: 0.2060926 (100)	total: 5.36s	remaining: 2m 33s
200:	learn: 0.1753576	test: 0.1835409	best: 0.1835409 (200)	total: 9.32s	remaining: 2m 9s
300:	learn: 0.1464599	test: 0.1652782	best: 0.1652782 (300)	total: 13.6s	remaining: 2m 2s
400:	learn: 0.1312784	test: 0.1581168	best: 0.1581111 (399)	total: 17.8s	remaining: 1m 55s
500:	learn: 0.1188149	test: 0.1533148	best: 0.1532870 (499)	total: 22.1s	remaining: 1m 50s
600:	learn: 0.1094384	test: 0.1523595	best: 0.1523554 (598)	total: 26.4s	remaining: 1m 45s
700:	learn: 0.1010921	test: 0.1517707	best: 0.1512395 (651)	total: 30.8s	remaining: 1m 40s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.1512394553
bestIteration = 651

Shrink model to first 652 iterations.
[Train] Fold #1 ACC: 0.9604, PRE: 0.6796, REC: 0.9878, F1: 0.8052

[Valid] Fold #1 ACC: 0.9460, PRE: 0.6155, REC: 0.9286, F1: 0.740

In [13]:
# Collect the importance table of every fold model and stack them into one frame
df_fi_list = [
    fold_model.get_feature_importance(prettified=True) for fold_model in cbt_models
]
df_fi_all = pd.concat(df_fi_list)

# Mean importance per feature across the fold models, highest first
df_avg_importance = (
    df_fi_all
    .groupby('Feature Id')['Importances']
    .mean()
    .reset_index()
    .rename(columns={'Importances': 'avg.importance'})
    .sort_values(by='avg.importance', ascending=False, ignore_index=True)
)

print(df_avg_importance)
df_avg_importance

                 Feature Id  avg.importance
0              customer_idx       27.319018
1                lead_owner       18.987136
2        response_corporate        9.252858
3             customer_type        8.777788
4          lead_desc_length        5.466157
5          product_category        5.187749
6             business_unit        4.426346
7              inquiry_type        4.376118
8      com_reg_ver_win_rate        2.961865
9       business_area_group        2.879332
10             customer_job        2.757379
11       customer_country.1        2.430351
12  historical_existing_cnt        2.161900
13        customer_position        1.629770
14        expected_timeline        1.386234


Unnamed: 0,Feature Id,avg.importance
0,customer_idx,27.319018
1,lead_owner,18.987136
2,response_corporate,9.252858
3,customer_type,8.777788
4,lead_desc_length,5.466157
5,product_category,5.187749
6,business_unit,4.426346
7,inquiry_type,4.376118
8,com_reg_ver_win_rate,2.961865
9,business_area_group,2.879332


In [14]:
# Load the submission frame and apply the same feature engineering and
# missing-value filling that were applied to the training frame
df_test = (
    pd.read_csv(config['submit_path'])
    .pipe(feature_engineering)
    .pipe(filling_missing_values, base_cat_features, base_num_features)
)
X_test = df_test[base_features]

# Average the class probabilities predicted by the fold models
n_models = len(cbt_models)
y_probs = np.zeros((len(X_test), 2))
for cbt_model in cbt_models:
    y_probs += cbt_model.predict_proba(X_test) / n_models

# Share of test rows whose averaged positive-class probability reaches 0.5
(y_probs[:, 1] >= 0.5).sum() / len(y_probs)

0.29804591159172833

In [15]:
# Threshold the averaged positive-class probability at 0.5 and store it as the target column
is_positive = y_probs[:, 1] >= 0.5
df_test[target] = is_positive.astype(bool)
df_test.to_csv('submission.csv', index=True)


In [16]:
# Keep a second copy of the submission under results/. The folder is created
# first so the write does not fail when it does not exist yet.
archive_path = Path('results/submission_0205_2212.csv')
archive_path.parent.mkdir(parents=True, exist_ok=True)
df_test.to_csv(archive_path, index=True)