# 영업 성공 여부 분류 경진대회

## 1. 데이터 확인

### 필수 라이브러리

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [2]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from catboost import CatBoostClassifier, Pool, metrics, cv
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.svm import SVC
import xgboost
from lightgbm import LGBMClassifier

import matplotlib.pyplot as plt

In [3]:
df_train = pd.read_csv("train_processed_0219_1.csv") # 학습용 데이터
df_test = pd.read_csv("submission_0220_1.csv") # 테스트 데이터(제출파일의 데이터)

In [4]:
true_count = df_train['is_converted'].sum()
print(true_count)

4850


In [5]:
df_train = df_train.astype({ 'customer_country' : 'str' })
df_train = df_train.astype({ 'bant_submit' : 'str' })
df_train = df_train.astype({ 'customer_job' : 'str' })
df_train = df_train.astype({ 'customer_type' : 'str' })
df_train = df_train.astype({ 'expected_timeline' : 'str' })
df_train = df_train.astype({ 'ver_pro' : 'str' })
df_train = df_train.astype({ 'inquiry_type' : 'str' })
df_train = df_train.astype({ 'lead_owner' : 'str' })
df_train = df_train.astype({ 'product_category' : 'str' })
df_train = df_train.astype({ 'business_area' : 'str' })
df_train = df_train.astype({ 'is_converted' : 'bool' })

df_test = df_test.astype({ 'customer_country' : 'str' })
df_test = df_test.astype({ 'bant_submit' : 'str' })
df_test = df_test.astype({ 'customer_job' : 'str' })
df_test = df_test.astype({ 'customer_type' : 'str' })
df_test = df_test.astype({ 'customer_position' : 'str' })
df_test = df_test.astype({ 'expected_timeline' : 'str' })
df_test = df_test.astype({ 'id' : 'int' })
df_test = df_test.astype({ 'lead_owner' : 'str' })
df_test = df_test.astype({ 'ver_pro' : 'str' })
df_test = df_test.astype({ 'inquiry_type' : 'str' })
df_test = df_test.astype({ 'product_category' : 'str' })
df_test = df_test.astype({ 'business_area' : 'str' })
df_test = df_test.astype({ 'is_converted' : 'bool' })

In [6]:
df_train = df_train.drop('customer_country.1', axis=1)
df_train = df_train.drop('product_modelname', axis=1)
df_train = df_train.drop('product_subcategory', axis=1)
df_train = df_train.drop('business_subarea', axis=1)
df_train = df_train.drop('customer_idx', axis=1)
df_train = df_train.drop('lead_owner', axis=1)

df_test = df_test.drop('customer_country.1', axis=1)
df_test = df_test.drop('product_modelname', axis=1)
df_test = df_test.drop('product_subcategory', axis=1)
df_test = df_test.drop('business_subarea', axis=1)
df_test = df_test.drop('customer_idx', axis=1)
df_test = df_test.drop('lead_owner', axis=1)

In [7]:
for column in df_train.columns:
    if df_train[column].dtype != 'float64' and column != 'is_converted':
        try:
            df_train[column] = df_train[column].astype(str)
        except ValueError:
            pass
for column in df_test.columns:
    if df_test[column].dtype != 'float64' and column != 'is_converted':
        try:
            df_test[column] = df_test[column].astype(str)
        except ValueError:
            pass

In [8]:
true_count = df_train['is_converted'].sum()
print(true_count)

4850


In [9]:
def get_clf_eval(y_test, y_pred=None):
    confusion = confusion_matrix(y_test, y_pred, labels=[True, False])
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, labels=[True, False])
    recall = recall_score(y_test, y_pred)
    F1 = f1_score(y_test, y_pred, labels=[True, False])

    # print("오차행렬:\n", confusion)
    # print("\n정확도: {:.4f}".format(accuracy))
    # print("정밀도: {:.4f}".format(precision))
    # print("재현율: {:.4f}".format(recall))
    print("F1: {:.4f}".format(F1))

In [10]:
uni = df_train['is_converted'].unique()

# for CatBoost
categorical_features_indices = np.where(df_train.drop("is_converted", axis=1).dtypes != float)[0]
print(len(categorical_features_indices))
print(categorical_features_indices)

15
[ 0  1  2  4  5 10 11 12 13 14 15 16 17 18 21]


In [11]:
true_count = df_train['is_converted'].sum()
print(true_count)

4850


In [12]:
x_train, x_val, y_train, y_val = train_test_split(
    df_train.drop("is_converted", axis=1),
    df_train["is_converted"],
    test_size=0.35,
    shuffle=True,
    random_state=400,
)

print('After upsampling: \nX_train shape: {} \ny_train shape: {}'.format(x_train.shape, y_train.shape))

print(df_train['is_converted'].value_counts())

After upsampling: 
X_train shape: (38544, 22) 
y_train shape: (38544,)
is_converted
False    54449
True      4850
Name: count, dtype: int64


In [16]:
true_count = y_val.value_counts()[True]
print(true_count)

1739


In [17]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59299 entries, 0 to 59298
Data columns (total 23 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   bant_submit              59299 non-null  object 
 1   customer_country         59299 non-null  object 
 2   business_unit            59299 non-null  object 
 3   com_reg_ver_win_rate     14568 non-null  float64
 4   customer_type            59299 non-null  object 
 5   enterprise               59299 non-null  object 
 6   historical_existing_cnt  13756 non-null  float64
 7   id_strategic_ver         3444 non-null   float64
 8   it_strategic_ver         1121 non-null   float64
 9   idit_strategic_ver       4565 non-null   float64
 10  customer_job             59299 non-null  object 
 11  lead_desc_length         59299 non-null  object 
 12  inquiry_type             59299 non-null  object 
 13  product_category         59299 non-null  object 
 14  customer_position     

In [44]:
accuracy = []
model_names = []
model = CatBoostClassifier(verbose=False,random_state=1111)
model.fit(x_train, y_train,cat_features=categorical_features_indices,eval_set=(x_val, y_val))
y_pred = model.predict(x_val)
y_pred_bool = [pred == 'True' for pred in y_pred]
accuracy.append(round(accuracy_score(y_val, y_pred_bool),4))

model_names = ['Catboost_default']
result_df5 = pd.DataFrame({'Accuracy':accuracy}, index=model_names)
result_df5

Unnamed: 0,Accuracy
Catboost_default,0.9349


In [36]:
def objective(trial):
    param = {
        # "eval_metric":"F1",
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.5),
        "objective": trial.suggest_categorical("objective", ["Logloss", "CrossEntropy"]),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1),
        "depth": trial.suggest_int("depth", 1, 12),
        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "bootstrap_type": trial.suggest_categorical(
            "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]
        )
    }

    if param["bootstrap_type"] == "Bayesian":
        param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif param["bootstrap_type"] == "Bernoulli":
        param["subsample"] = trial.suggest_float("subsample", 0.1, 1)

    cat_cls = CatBoostClassifier(**param)
    y_train_numeric = [1 if label == True else 0 for label in y_train]
    y_val_numeric = [1 if label == True else 0 for label in y_val]
    cat_cls.fit(x_train, y_train_numeric, eval_set=[(x_val,  y_val_numeric)], cat_features=categorical_features_indices,verbose=200, early_stopping_rounds=20)

    pred_labels = cat_cls.predict(x_val)
    # pred_labels = [pred == 'True' for pred in pred_labels]
    # pred_labels = np.rint(preds)
    accuracy = f1_score(y_val, pred_labels)
    return accuracy

In [37]:
import optuna

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=10, timeout=600)

print("Number of finished trials: {}".format(len(study.trials)))

print("Best trial:")
trial = study.best_trial

print("  Value: {}".format(trial.value))

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[I 2024-02-21 23:01:19,832] A new study created in memory with name: no-name-2469a00c-c304-4d26-af8f-7af006c24186


0:	learn: 0.5967797	test: 0.5975425	best: 0.5975425 (0)	total: 178ms	remaining: 2m 57s
200:	learn: 0.2391560	test: 0.2469637	best: 0.2469637 (200)	total: 10.3s	remaining: 41s
400:	learn: 0.2232130	test: 0.2311197	best: 0.2311197 (400)	total: 18.8s	remaining: 28s
600:	learn: 0.2149711	test: 0.2229772	best: 0.2229772 (600)	total: 27.7s	remaining: 18.4s
800:	learn: 0.2076399	test: 0.2158888	best: 0.2158888 (800)	total: 36s	remaining: 8.95s


[I 2024-02-21 23:02:04,722] Trial 0 finished with value: 0.1301405517959396 and parameters: {'learning_rate': 0.10200496829939348, 'objective': 'Logloss', 'colsample_bylevel': 0.018823050867624022, 'depth': 6, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 5.900878799326746}. Best is trial 0 with value: 0.1301405517959396.


999:	learn: 0.2043978	test: 0.2124837	best: 0.2124837 (993)	total: 44.2s	remaining: 0us

bestTest = 0.2124837194
bestIteration = 993

Shrink model to first 994 iterations.
0:	learn: 0.5876481	test: 0.5887506	best: 0.5887506 (0)	total: 77ms	remaining: 1m 16s
200:	learn: 0.1997713	test: 0.2096226	best: 0.2096226 (200)	total: 11.5s	remaining: 45.7s
400:	learn: 0.1864268	test: 0.1980398	best: 0.1980398 (400)	total: 23.1s	remaining: 34.5s
600:	learn: 0.1776184	test: 0.1907883	best: 0.1907871 (597)	total: 35.7s	remaining: 23.7s
800:	learn: 0.1722601	test: 0.1864891	best: 0.1864891 (800)	total: 47.3s	remaining: 11.8s


[I 2024-02-21 23:03:04,422] Trial 1 finished with value: 0.38187976291278575 and parameters: {'learning_rate': 0.09730840499964691, 'objective': 'CrossEntropy', 'colsample_bylevel': 0.0575724584940038, 'depth': 5, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 5.673763698184242}. Best is trial 1 with value: 0.38187976291278575.


999:	learn: 0.1679873	test: 0.1837314	best: 0.1837282 (998)	total: 59s	remaining: 0us

bestTest = 0.1837281957
bestIteration = 998

Shrink model to first 999 iterations.
0:	learn: 0.5762920	test: 0.5772221	best: 0.5772221 (0)	total: 72.2ms	remaining: 1m 12s
200:	learn: 0.1936330	test: 0.2033549	best: 0.2033549 (200)	total: 10.4s	remaining: 41.3s
400:	learn: 0.1805671	test: 0.1911012	best: 0.1911012 (400)	total: 20s	remaining: 29.9s
600:	learn: 0.1737608	test: 0.1848137	best: 0.1848137 (600)	total: 30.1s	remaining: 20s


[I 2024-02-21 23:03:40,058] Trial 2 finished with value: 0.3631647211413748 and parameters: {'learning_rate': 0.12592272660863538, 'objective': 'CrossEntropy', 'colsample_bylevel': 0.028211575339376183, 'depth': 7, 'boosting_type': 'Ordered', 'bootstrap_type': 'MVS'}. Best is trial 1 with value: 0.38187976291278575.


Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.1829618145
bestIteration = 682

Shrink model to first 683 iterations.
0:	learn: 0.4623190	test: 0.4642995	best: 0.4642995 (0)	total: 59.3ms	remaining: 59.3s
200:	learn: 0.1567169	test: 0.1745747	best: 0.1745238 (199)	total: 12.6s	remaining: 50.3s


[I 2024-02-21 23:04:02,845] Trial 3 finished with value: 0.47102803738317756 and parameters: {'learning_rate': 0.2743309615187561, 'objective': 'Logloss', 'colsample_bylevel': 0.07970369505778575, 'depth': 5, 'boosting_type': 'Plain', 'bootstrap_type': 'Bernoulli', 'subsample': 0.7673862381510218}. Best is trial 3 with value: 0.47102803738317756.


Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.1711584544
bestIteration = 339

Shrink model to first 340 iterations.
0:	learn: 0.5145945	test: 0.5163180	best: 0.5163180 (0)	total: 70.3ms	remaining: 1m 10s
200:	learn: 0.1632280	test: 0.1796179	best: 0.1796179 (200)	total: 12.4s	remaining: 49.3s
400:	learn: 0.1473807	test: 0.1737370	best: 0.1737370 (400)	total: 25.2s	remaining: 37.6s
600:	learn: 0.1370446	test: 0.1707779	best: 0.1707779 (600)	total: 37s	remaining: 24.5s


[I 2024-02-21 23:04:44,585] Trial 4 finished with value: 0.48184568835098335 and parameters: {'learning_rate': 0.1952068445800979, 'objective': 'Logloss', 'colsample_bylevel': 0.061574747318220616, 'depth': 6, 'boosting_type': 'Plain', 'bootstrap_type': 'Bernoulli', 'subsample': 0.2605474819260579}. Best is trial 4 with value: 0.48184568835098335.


Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.1703458358
bestIteration = 654

Shrink model to first 655 iterations.
0:	learn: 0.6326818	test: 0.6331395	best: 0.6331395 (0)	total: 46.6ms	remaining: 46.6s
200:	learn: 0.2257561	test: 0.2328670	best: 0.2328670 (200)	total: 7.25s	remaining: 28.8s
400:	learn: 0.2095552	test: 0.2165634	best: 0.2165634 (400)	total: 14.3s	remaining: 21.3s
600:	learn: 0.2022771	test: 0.2098530	best: 0.2098530 (600)	total: 21.2s	remaining: 14s
800:	learn: 0.1960987	test: 0.2036183	best: 0.2036183 (800)	total: 27.9s	remaining: 6.94s


[I 2024-02-21 23:05:20,529] Trial 5 finished with value: 0.2222222222222222 and parameters: {'learning_rate': 0.06201308587040219, 'objective': 'Logloss', 'colsample_bylevel': 0.02875778549404224, 'depth': 4, 'boosting_type': 'Plain', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 2.0848053049543913}. Best is trial 4 with value: 0.48184568835098335.


999:	learn: 0.1930634	test: 0.2011287	best: 0.2011286 (998)	total: 34.3s	remaining: 0us

bestTest = 0.2011285556
bestIteration = 998

Shrink model to first 999 iterations.
0:	learn: 0.4508793	test: 0.4531398	best: 0.4531398 (0)	total: 43.7ms	remaining: 43.6s
200:	learn: 0.1966502	test: 0.2055671	best: 0.2055671 (200)	total: 7.5s	remaining: 29.8s
400:	learn: 0.1910330	test: 0.2009902	best: 0.2009902 (400)	total: 15s	remaining: 22.5s
600:	learn: 0.1862643	test: 0.1967844	best: 0.1967413 (596)	total: 24.8s	remaining: 16.5s
800:	learn: 0.1821191	test: 0.1930915	best: 0.1930642 (796)	total: 32.3s	remaining: 8.03s


[I 2024-02-21 23:05:57,030] Trial 6 finished with value: 0.3561290322580645 and parameters: {'learning_rate': 0.2885467682229827, 'objective': 'Logloss', 'colsample_bylevel': 0.07421110597689759, 'depth': 2, 'boosting_type': 'Plain', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 1.6984081943911022}. Best is trial 4 with value: 0.48184568835098335.


Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.1925715719
bestIteration = 883

Shrink model to first 884 iterations.
0:	learn: 0.3385066	test: 0.3435258	best: 0.3435258 (0)	total: 58.8ms	remaining: 58.7s


[I 2024-02-21 23:06:04,707] Trial 7 finished with value: 0.47636632200886264 and parameters: {'learning_rate': 0.4695115811550748, 'objective': 'Logloss', 'colsample_bylevel': 0.061928937231304645, 'depth': 9, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS'}. Best is trial 4 with value: 0.48184568835098335.


Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.1742208182
bestIteration = 78

Shrink model to first 79 iterations.
0:	learn: 0.3856686	test: 0.3890712	best: 0.3890712 (0)	total: 50.3ms	remaining: 50.3s
200:	learn: 0.1673249	test: 0.1803461	best: 0.1802637 (198)	total: 9.25s	remaining: 36.8s
400:	learn: 0.1551934	test: 0.1738503	best: 0.1737840 (394)	total: 20.3s	remaining: 30.3s
600:	learn: 0.1478413	test: 0.1719165	best: 0.1719165 (600)	total: 33.3s	remaining: 22.1s
Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.170904431
bestIteration = 679

Shrink model to first 680 iterations.


[I 2024-02-21 23:06:43,884] Trial 8 finished with value: 0.466211085801063 and parameters: {'learning_rate': 0.403055173237873, 'objective': 'CrossEntropy', 'colsample_bylevel': 0.06319806002219425, 'depth': 4, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS'}. Best is trial 4 with value: 0.48184568835098335.


0:	learn: 0.3850507	test: 0.3887779	best: 0.3887779 (0)	total: 103ms	remaining: 1m 42s


[I 2024-02-21 23:07:01,730] Trial 9 finished with value: 0.45977011494252873 and parameters: {'learning_rate': 0.4182237436409738, 'objective': 'Logloss', 'colsample_bylevel': 0.06509482024485133, 'depth': 10, 'boosting_type': 'Ordered', 'bootstrap_type': 'Bernoulli', 'subsample': 0.9670838437379145}. Best is trial 4 with value: 0.48184568835098335.


Stopped by overfitting detector  (20 iterations wait)

bestTest = 0.173403745
bestIteration = 112

Shrink model to first 113 iterations.
Number of finished trials: 10
Best trial:
  Value: 0.48184568835098335
  Params: 
    learning_rate: 0.1952068445800979
    objective: Logloss
    colsample_bylevel: 0.061574747318220616
    depth: 6
    boosting_type: Plain
    bootstrap_type: Bernoulli
    subsample: 0.2605474819260579


In [38]:
best_params = study.best_trial.params
# best_params["eval_metric"] = 'F1'
# best_params["random_state"] = 57
# best_cat = CatBoostClassifier(task_type="GPU", **best_params)
best_cat = CatBoostClassifier(**best_params)
y_train_numeric = [1 if label == True else 0 for label in y_train]
y_val_numeric = [1 if label == True else 0 for label in y_val]
best_cat.fit(x_train, y_train_numeric,
             eval_set=[(x_val, y_val_numeric)],
             early_stopping_rounds=35,
             cat_features=categorical_features_indices,
             verbose=200)


y_pred = best_cat.predict(x_val)

# y_pred_bool = np.where(y_pred == 'True', True, False)
print(y_pred)

get_clf_eval(y_val, y_pred)

0:	learn: 0.5145945	test: 0.5163180	best: 0.5163180 (0)	total: 228ms	remaining: 3m 47s
200:	learn: 0.1632280	test: 0.1796179	best: 0.1796179 (200)	total: 13.4s	remaining: 53.3s
400:	learn: 0.1473807	test: 0.1737370	best: 0.1737370 (400)	total: 25.6s	remaining: 38.2s
600:	learn: 0.1370446	test: 0.1707779	best: 0.1707779 (600)	total: 37.2s	remaining: 24.7s
Stopped by overfitting detector  (35 iterations wait)

bestTest = 0.1702249692
bestIteration = 709

Shrink model to first 710 iterations.
[1 0 0 ... 0 0 0]
F1: 0.4796


In [40]:
# df_test = df_test.drop(["is_converted", "id"], axis=1)
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5271 entries, 0 to 5270
Data columns (total 22 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   bant_submit              5271 non-null   object 
 1   customer_country         5271 non-null   object 
 2   business_unit            5271 non-null   object 
 3   com_reg_ver_win_rate     5271 non-null   float64
 4   customer_type            5271 non-null   object 
 5   enterprise               5271 non-null   object 
 6   historical_existing_cnt  5271 non-null   float64
 7   id_strategic_ver         5271 non-null   float64
 8   it_strategic_ver         5271 non-null   float64
 9   idit_strategic_ver       5271 non-null   float64
 10  customer_job             5271 non-null   object 
 11  lead_desc_length         5271 non-null   object 
 12  inquiry_type             5271 non-null   object 
 13  product_category         5271 non-null   object 
 14  customer_position       

In [41]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59299 entries, 0 to 59298
Data columns (total 23 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   bant_submit              59299 non-null  object 
 1   customer_country         59299 non-null  object 
 2   business_unit            59299 non-null  object 
 3   com_reg_ver_win_rate     14568 non-null  float64
 4   customer_type            59299 non-null  object 
 5   enterprise               59299 non-null  object 
 6   historical_existing_cnt  13756 non-null  float64
 7   id_strategic_ver         3444 non-null   float64
 8   it_strategic_ver         1121 non-null   float64
 9   idit_strategic_ver       4565 non-null   float64
 10  customer_job             59299 non-null  object 
 11  lead_desc_length         59299 non-null  object 
 12  inquiry_type             59299 non-null  object 
 13  product_category         59299 non-null  object 
 14  customer_position     

In [42]:
df_test = df_test.fillna(0)
test_pred = best_cat.predict(df_test)

# pred_bool = np.where(test_pred == 'True', True, False)
# sum(pred_bool) # True로 예측된 개수
sum(test_pred)
# sum(1 if pred == 'True' else 0 for pred in test_pred)

386

In [31]:
# 제출 데이터 읽어오기 (df_test는 전처리된 데이터가 저장됨)
df_sub = pd.read_csv("submission.csv")
df_sub["is_converted"] = test_pred

# 제출 파일 저장
df_sub.to_csv("submission.csv", index=False)

### Model Training, Evaluation

In [19]:
import optuna
from optuna.samplers import TPESampler
import warnings
warnings.filterwarnings('ignore')

In [44]:
# Lenear Regression
lr_model = LogisticRegression(solver='lbfgs')
lr_model.fit(x_train, y_train)
print('---LR--- ')
get_clf_eval(y_val, lr_model.predict(x_val))


---LR--- 
F1: 0.0360


In [45]:
# KNN
knn_model = KNeighborsClassifier(n_neighbors=2)
knn_model.fit(x_train, y_train)
print('---KNN---')
get_clf_eval(y_val, knn_model.predict(x_val))

---KNN---
F1: 0.4917


In [46]:
# SVC
svc_model = SVC(C=10, kernel = 'rbf', random_state = 300)
svc_model.fit(x_train, y_train)
print('---SVC---')
get_clf_eval(y_val, svc_model.predict(x_val))

---SVC---
F1: 0.5221


In [47]:
# DT - baseline
dt_model = DecisionTreeClassifier()
dt_model.fit(x_train, y_train)
print('---DT---')
get_clf_eval(y_val, dt_model.predict(x_val))

---DT---
F1: 0.7833


In [48]:
# RF
rf_model = RandomForestClassifier()
rf_model.fit(x_train, y_train)
print('---RF---')
get_clf_eval(y_val, rf_model.predict(x_val))

---RF---
F1: 0.8251


In [49]:
# Ensemble
ensemble = VotingClassifier(estimators=[('rf',rf_model),
                                         ('knn',knn_model),
                                           ('dt',dt_model),
                                        ('svc', svc_model)])
ensemble.fit(x_train, y_train)
print('---ENSEMBLE---')
get_clf_eval(y_val, ensemble.predict(x_val))

---ENSEMBLE---
F1: 0.6505


In [50]:
df_train2 = df_train.drop('is_converted', axis=1)
feature_importance_rf = pd.DataFrame(rf_model.feature_importances_, index=df_train2.columns, columns=['Feature Importance'])

feature_importance_rf_sorted = feature_importance_rf.sort_values(by='Feature Importance', ascending=False)

# 정렬된 feature importance를 출력합니다.
print("RF Feature Importance (Sorted):\n", feature_importance_rf_sorted)
# print("Feature Importance:\n", feature_importance_rf)

# feature_importance_rf = pd.DataFrame({'Feature Importance': rf_model.feature_importances_})
# print("Feature Importance:\n", feature_importance_rf)

RF Feature Importance (Sorted):
                          Feature Importance
customer_idx                       0.406988
lead_owner                         0.135523
customer_country                   0.049865
response_corporate                 0.043000
customer_type                      0.037585
lead_desc_length                   0.037385
product_category                   0.033237
customer_job                       0.033026
com_reg_ver_win_rate               0.031768
bant_submit                        0.023888
business_unit                      0.020121
customer_position                  0.020006
ver_win_rate_x                     0.019574
expected_timeline                  0.019206
historical_existing_cnt            0.018538
business_area                      0.016447
ver_win_ratio_per_bu               0.015198
enterprise                         0.014011
inquiry_type                       0.012872
ver_cus                            0.005509
ver_pro                            0.002109

In [51]:
from sklearn.metrics import f1_score

def objective_RF(trial):
    param = {
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        # 'criterion': 'gini',  # 기준을 'gini'로 변경
        # 'max_depth' : trial.suggest_int('max_depth', 3, 10),
        'min_samples_split': trial.suggest_int('min_samples_split', 1, 10),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10),
        'min_weight_fraction_leaf': 0.0,
        'max_features': trial.suggest_int('min_samples_leaf', 1.0, 5.0),
        # 'max_leaf_nodes': None,
        # 'min_impurity_decrease': 0.0,
        # 'bootstrap': True,
        # 'n_jobs': None,
        'random_state': 28
    }

    model = RandomForestClassifier(**param)
    model.fit(x_train, y_train)

    pred_y = model.predict(x_val)

    # F1 score를 평가 지표로 사용
    return f1_score(y_val, pred_y)

RF_study = optuna.create_study(direction='maximize')
RF_study.optimize(objective_RF, n_trials=10)

print('Number of finished trials: {}'.format(len(RF_study.trials)))
print('Best trial:')
trial = RF_study.best_trial

print('  Value: {}'.format(trial.value))
print('  Params: ')

for key, value in trial.params.items():
    print('    {}: {}'.format(key, value))

[I 2024-02-20 21:07:55,830] A new study created in memory with name: no-name-4321052b-ce76-4f4b-8fbb-995f35e4c5a4
[I 2024-02-20 21:08:01,286] Trial 0 finished with value: 0.7424144609425436 and parameters: {'n_estimators': 109, 'min_samples_split': 6, 'min_samples_leaf': 7}. Best is trial 0 with value: 0.7424144609425436.
[I 2024-02-20 21:08:28,908] Trial 1 finished with value: 0.7413127413127413 and parameters: {'n_estimators': 429, 'min_samples_split': 8, 'min_samples_leaf': 10}. Best is trial 0 with value: 0.7424144609425436.
[I 2024-02-20 21:08:43,943] Trial 2 finished with value: 0.7449967721110394 and parameters: {'n_estimators': 413, 'min_samples_split': 10, 'min_samples_leaf': 6}. Best is trial 2 with value: 0.7449967721110394.
[I 2024-02-20 21:08:46,488] Trial 3 finished with value: 0.7518987341772152 and parameters: {'n_estimators': 144, 'min_samples_split': 4, 'min_samples_leaf': 1}. Best is trial 3 with value: 0.7518987341772152.
[I 2024-02-20 21:08:56,902] Trial 4 finished

InvalidParameterError: The 'min_samples_split' parameter of RandomForestClassifier must be an int in the range [2, inf) or a float in the range (0.0, 1.0]. Got 1 instead.

In [52]:
params = trial.params

best_rf = RandomForestClassifier(**params)
best_rf.fit(x_train, y_train)

y_pred = best_rf.predict(x_val)
# y_pred_bool = np.where(y_pred == 'True', True, False)
print(y_pred)
# 평가
get_clf_eval(y_val, y_pred)

NameError: name 'trial' is not defined

In [53]:
# XGBoost
xgb = xgboost.XGBClassifier().fit(x_train, y_train)
print('---XGBoost---')
get_clf_eval(y_val, xgb.predict(x_val))

---XGBoost---
F1: 0.8018


In [54]:
df_train2 = df_train.drop('is_converted', axis=1)
feature_importance_xgb = pd.DataFrame({'Feature Importance': xgb.feature_importances_}, index=df_train2.columns)
feature_importance_xgb_sorted = feature_importance_xgb.sort_values(by='Feature Importance', ascending=False)

# 정렬된 feature importance를 출력합니다.
print("XGB Feature Importance (Sorted):\n", feature_importance_xgb_sorted)

XGB Feature Importance (Sorted):
                          Feature Importance
customer_idx                       0.193005
customer_type                      0.149983
business_unit                      0.070901
com_reg_ver_win_rate               0.063205
id_strategic_ver                   0.054102
lead_owner                         0.046948
bant_submit                        0.046775
response_corporate                 0.035189
inquiry_type                       0.034716
customer_country                   0.031521
enterprise                         0.025936
ver_win_rate_x                     0.025122
historical_existing_cnt            0.024128
idit_strategic_ver                 0.023905
business_area                      0.023335
expected_timeline                  0.022283
product_category                   0.021590
ver_win_ratio_per_bu               0.020082
lead_desc_length                   0.019189
customer_job                       0.018430
ver_cus                            0.01640

In [70]:
from sklearn.utils.class_weight import compute_class_weight

classes = np.unique(y_train)
weights = compute_class_weight(class_weight='balanced', classes = classes, y = y_train)
class_weights = dict(zip(classes, weights))
print(class_weights)

{False: 0.5448249724366042, True: 6.077248270561107}


In [71]:

def objective(trial):
    """Define the objective function"""

    params = {
        'class_weight': class_weights,
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_loguniform('learning_rate', 0.001, 0.1),
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_loguniform('gamma', 1e-8, 1.0),
        'subsample': trial.suggest_loguniform('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_loguniform('colsample_bytree', 0.01, 1.0),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-8, 1.0),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-8, 1.0),
        'eval_metric': 'auc',
        'scale_pos_weight': trial.suggest_loguniform('scales_pos_weight', 0.2, 1.0),
        # 'eval_metric': 'F1',
        'use_label_encoder': False
    }

    # Fit the model
    optuna_model = xgboost.XGBClassifier(**params)
    optuna_model.fit(x_train, y_train)

    # Make predictions
    y_pred = optuna_model.predict(x_val)

    # Evaluate predictions
    F_1 = accuracy_score(y_val, y_pred)
    return F_1

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

print('Number of finished trials: {}'.format(len(study.trials)))
print('Best trial:')
trial = study.best_trial

print('  Value: {}'.format(trial.value))
print('  Params: ')

for key, value in trial.params.items():
    print('    {}: {}'.format(key, value))

[I 2024-02-21 21:54:46,675] A new study created in memory with name: no-name-c08dfa66-d4a7-4e66-ada3-78fe2db674e6
[I 2024-02-21 21:54:46,929] Trial 0 finished with value: 0.9600337268128162 and parameters: {'max_depth': 10, 'learning_rate': 0.08881706254138509, 'n_estimators': 144, 'min_child_weight': 10, 'gamma': 0.03453221072449202, 'subsample': 0.5904715175572246, 'colsample_bytree': 0.08346419913735163, 'reg_alpha': 4.391586502424888e-06, 'reg_lambda': 2.141954563141683e-07, 'scales_pos_weight': 0.4537785767986721}. Best is trial 0 with value: 0.9600337268128162.
[I 2024-02-21 21:54:47,446] Trial 1 finished with value: 0.9211635750421585 and parameters: {'max_depth': 8, 'learning_rate': 0.012651384918107788, 'n_estimators': 263, 'min_child_weight': 4, 'gamma': 3.931476844411423e-06, 'subsample': 0.5553132056283949, 'colsample_bytree': 0.11249644662648992, 'reg_alpha': 1.1831877317347588e-06, 'reg_lambda': 0.038586443624964106, 'scales_pos_weight': 0.568261800971502}. Best is trial 

Number of finished trials: 50
Best trial:
  Value: 0.9698988195615514
  Params: 
    max_depth: 8
    learning_rate: 0.08290687170178082
    n_estimators: 341
    min_child_weight: 8
    gamma: 7.705816242014445e-07
    subsample: 0.8651211674861486
    colsample_bytree: 0.6517026248954959
    reg_alpha: 0.15608214784434282
    reg_lambda: 0.027025025317903874
    scales_pos_weight: 0.6626033469345955


In [20]:
params = trial.params

best_xgb = xgboost.XGBClassifier(**params)
best_xgb.fit(x_train, y_train)

y_pred = best_xgb.predict(x_val)
# y_pred_bool = np.where(y_pred == 'True', True, False)
print(y_pred)
# 평가
get_clf_eval(y_val, y_pred)

NameError: name 'trial' is not defined

In [21]:
# LightGBM
lgbm = LGBMClassifier(metric='auc', boost_from_average=False).fit(x_train, y_train)
print('---LightGBM---')
get_clf_eval(y_val, lgbm.predict(x_val))

[LightGBM] [Info] Number of positive: 3903, number of negative: 43536
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001722 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1100
[LightGBM] [Info] Number of data points in the train set: 47439, number of used features: 24
---LightGBM---
F1: 0.7886


In [22]:
from optuna.pruners import SuccessiveHalvingPruner
def objective(trial):
    params = {
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024, step=1, log=True),
        'max_depth': trial.suggest_int('max_depth', 1, 15, step=1, log=False),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 100, 800, step=1, log=True),
        'objective': 'binary',
        'boost_from_average': False,
        # 'boosting_type': 'dart',
        'is_unbalance': True,
        # 'class_weight': trial.suggest_categorical('class_weight', ['balanced', None]),
        'class_weight': 'balanced',
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 20, step=1, log=False),
        'subsample': trial.suggest_uniform('subsample', 0.7, 1.0),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.7, 1.0),
        'reg_alpha': trial.suggest_uniform('reg_alpha', 0.0, 1.0),
        'reg_lambda': trial.suggest_uniform('reg_lambda', 0.0, 1.0),
        'random_state': 0
    }

    # score = []
    # for df_train, df_valid in zip(df_trains, df_valids):
    #     clf = LGBMClassifier(**params)
    #     clf.fit(df_train[features], df_train[target])
    #
    #     pred = clf.predict(df_valid[features])
    #     true = df_valid[target].values
    #     score.append(accuracy(true, pred))
    # score = np.mean(score)
    optuna_model = LGBMClassifier(**params)
    optuna_model.fit(x_train, y_train,eval_set=[(x_val, y_val)])

# Make predictions
    y_pred = optuna_model.predict(x_val)

# Evaluate predictions
    F_1 = accuracy_score(y_val, y_pred)
    return F_1

# Hyperparameter Tuning
# study = optuna.create_study(direction='maximize', sampler=TPESampler(seed=0), pruner=SuccessiveHalvingPruner())
study = optuna.create_study(direction='maximize', sampler=TPESampler(seed=0))
study.optimize(objective, n_trials=20)

[I 2024-02-21 22:04:05,266] A new study created in memory with name: no-name-f6f23c87-372d-4727-a5ce-c9b120b8b57e


[LightGBM] [Info] Number of positive: 3903, number of negative: 43536
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001647 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1100
[LightGBM] [Info] Number of data points in the train set: 47439, number of used features: 24


[I 2024-02-21 22:04:06,136] Trial 0 finished with value: 0.9112141652613828 and parameters: {'num_leaves': 54, 'max_depth': 11, 'learning_rate': 0.040064836619643954, 'n_estimators': 310, 'min_child_samples': 9, 'subsample': 0.8937682339199968, 'colsample_bytree': 0.8312761633788077, 'reg_alpha': 0.8917730007820798, 'reg_lambda': 0.9636627605010293}. Best is trial 0 with value: 0.9112141652613828.


[LightGBM] [Info] Number of positive: 3903, number of negative: 43536
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001571 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1100
[LightGBM] [Info] Number of data points in the train set: 47439, number of used features: 24


[I 2024-02-21 22:04:06,575] Trial 1 finished with value: 0.8336424957841484 and parameters: {'num_leaves': 18, 'max_depth': 12, 'learning_rate': 0.0337983049214134, 'n_estimators': 325, 'min_child_samples': 19, 'subsample': 0.7213108174593661, 'colsample_bytree': 0.7261387899104622, 'reg_alpha': 0.02021839744032572, 'reg_lambda': 0.832619845547938}. Best is trial 0 with value: 0.9112141652613828.


[LightGBM] [Info] Number of positive: 3903, number of negative: 43536
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001532 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1100
[LightGBM] [Info] Number of data points in the train set: 47439, number of used features: 24


[I 2024-02-21 22:04:09,309] Trial 2 finished with value: 0.9714165261382799 and parameters: {'num_leaves': 241, 'max_depth': 14, 'learning_rate': 0.09519592150539828, 'n_estimators': 527, 'min_child_samples': 10, 'subsample': 0.9341587528859366, 'colsample_bytree': 0.7354823277606799, 'reg_alpha': 0.6399210213275238, 'reg_lambda': 0.1433532874090464}. Best is trial 2 with value: 0.9714165261382799.


[LightGBM] [Info] Number of positive: 3903, number of negative: 43536
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001332 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1100
[LightGBM] [Info] Number of data points in the train set: 47439, number of used features: 24


[I 2024-02-21 22:04:09,958] Trial 3 finished with value: 0.8758010118043845 and parameters: {'num_leaves': 714, 'max_depth': 8, 'learning_rate': 0.025981363554663423, 'n_estimators': 173, 'min_child_samples': 16, 'subsample': 0.8368450996649646, 'colsample_bytree': 0.8705301846605945, 'reg_alpha': 0.018789800436355142, 'reg_lambda': 0.6176354970758771}. Best is trial 2 with value: 0.9714165261382799.


[LightGBM] [Info] Number of positive: 3903, number of negative: 43536
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001277 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1100
[LightGBM] [Info] Number of data points in the train set: 47439, number of used features: 24


[I 2024-02-21 22:04:11,050] Trial 4 finished with value: 0.9608768971332209 and parameters: {'num_leaves': 81, 'max_depth': 10, 'learning_rate': 0.08785127695211986, 'n_estimators': 412, 'min_child_samples': 8, 'subsample': 0.8311095861398023, 'colsample_bytree': 0.9092893587781794, 'reg_alpha': 0.06022547162926983, 'reg_lambda': 0.6667667154456677}. Best is trial 2 with value: 0.9714165261382799.
[I 2024-02-21 22:04:11,285] Trial 5 finished with value: 0.5187183811129848 and parameters: {'num_leaves': 119, 'max_depth': 4, 'learning_rate': 0.01345631972964045, 'n_estimators': 192, 'min_child_samples': 8, 'subsample': 0.8710590311253639, 'colsample_bytree': 0.8315804540386961, 'reg_alpha': 0.9883738380592262, 'reg_lambda': 0.10204481074802807}. Best is trial 2 with value: 0.9714165261382799.


[LightGBM] [Info] Number of positive: 3903, number of negative: 43536
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001621 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1100
[LightGBM] [Info] Number of data points in the train set: 47439, number of used features: 24


[I 2024-02-21 22:04:11,460] Trial 6 finished with value: 0.606239460370995 and parameters: {'num_leaves': 6, 'max_depth': 3, 'learning_rate': 0.044989205684622634, 'n_estimators': 169, 'min_child_samples': 10, 'subsample': 0.7733276776004808, 'colsample_bytree': 0.7476908750936558, 'reg_alpha': 0.11037514116430513, 'reg_lambda': 0.6563295894652734}. Best is trial 2 with value: 0.9714165261382799.


[LightGBM] [Info] Number of positive: 3903, number of negative: 43536
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001621 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1100
[LightGBM] [Info] Number of data points in the train set: 47439, number of used features: 24
[LightGBM] [Info] Number of positive: 3903, number of negative: 43536
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001495 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1100
[LightGBM] [Info] Number of data points in the train set: 47439, number of used features: 24


[I 2024-02-21 22:04:11,829] Trial 7 finished with value: 0.6290893760539629 and parameters: {'num_leaves': 4, 'max_depth': 3, 'learning_rate': 0.023373576487028164, 'n_estimators': 551, 'min_child_samples': 2, 'subsample': 0.9513834722496411, 'colsample_bytree': 0.7288295223681889, 'reg_alpha': 0.9764594650133958, 'reg_lambda': 0.4686512016477016}. Best is trial 2 with value: 0.9714165261382799.


[LightGBM] [Info] Number of positive: 3903, number of negative: 43536
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001662 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1100
[LightGBM] [Info] Number of data points in the train set: 47439, number of used features: 24


[I 2024-02-21 22:04:12,614] Trial 8 finished with value: 0.9297639123102867 and parameters: {'num_leaves': 880, 'max_depth': 10, 'learning_rate': 0.054860982288095536, 'n_estimators': 108, 'min_child_samples': 6, 'subsample': 0.7360589683639507, 'colsample_bytree': 0.7888420592566434, 'reg_alpha': 0.11872771895424405, 'reg_lambda': 0.317983179393976}. Best is trial 2 with value: 0.9714165261382799.


[LightGBM] [Info] Number of positive: 3903, number of negative: 43536
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001452 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1100
[LightGBM] [Info] Number of data points in the train set: 47439, number of used features: 24


[I 2024-02-21 22:04:12,813] Trial 9 finished with value: 0.1760539629005059 and parameters: {'num_leaves': 22, 'max_depth': 1, 'learning_rate': 0.049257472025760515, 'n_estimators': 324, 'min_child_samples': 6, 'subsample': 0.8569744160400099, 'colsample_bytree': 0.7281821532275324, 'reg_alpha': 0.5759464955561793, 'reg_lambda': 0.9292961975762141}. Best is trial 2 with value: 0.9714165261382799.


[LightGBM] [Info] Number of positive: 3903, number of negative: 43536
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001724 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1100
[LightGBM] [Info] Number of data points in the train set: 47439, number of used features: 24


[I 2024-02-21 22:04:15,653] Trial 10 finished with value: 0.9719224283305228 and parameters: {'num_leaves': 258, 'max_depth': 15, 'learning_rate': 0.08969707083284607, 'n_estimators': 691, 'min_child_samples': 14, 'subsample': 0.9958953351745876, 'colsample_bytree': 0.955986640722914, 'reg_alpha': 0.5874424969009222, 'reg_lambda': 0.013048945064446538}. Best is trial 10 with value: 0.9719224283305228.


[LightGBM] [Info] Number of positive: 3903, number of negative: 43536
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001599 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1100
[LightGBM] [Info] Number of data points in the train set: 47439, number of used features: 24


[I 2024-02-21 22:04:18,399] Trial 11 finished with value: 0.9716694772344013 and parameters: {'num_leaves': 294, 'max_depth': 15, 'learning_rate': 0.09758074597008058, 'n_estimators': 799, 'min_child_samples': 14, 'subsample': 0.9858697752138047, 'colsample_bytree': 0.993716802755091, 'reg_alpha': 0.5926938891782882, 'reg_lambda': 0.005596828096247286}. Best is trial 10 with value: 0.9719224283305228.


[LightGBM] [Info] Number of positive: 3903, number of negative: 43536
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001674 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1100
[LightGBM] [Info] Number of data points in the train set: 47439, number of used features: 24


[I 2024-02-21 22:04:22,481] Trial 12 finished with value: 0.9720067453625633 and parameters: {'num_leaves': 238, 'max_depth': 15, 'learning_rate': 0.07293331351360245, 'n_estimators': 747, 'min_child_samples': 14, 'subsample': 0.9921505919606595, 'colsample_bytree': 0.9911136724383788, 'reg_alpha': 0.36711885069757555, 'reg_lambda': 0.007899429029182219}. Best is trial 12 with value: 0.9720067453625633.


[LightGBM] [Info] Number of positive: 3903, number of negative: 43536
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001646 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1100
[LightGBM] [Info] Number of data points in the train set: 47439, number of used features: 24


[I 2024-02-21 22:04:26,983] Trial 13 finished with value: 0.9722596964586847 and parameters: {'num_leaves': 225, 'max_depth': 13, 'learning_rate': 0.06515905080012946, 'n_estimators': 758, 'min_child_samples': 14, 'subsample': 0.9931895571891459, 'colsample_bytree': 0.9979442196243827, 'reg_alpha': 0.34778767310727365, 'reg_lambda': 0.26288731059569437}. Best is trial 13 with value: 0.9722596964586847.


[LightGBM] [Info] Number of positive: 3903, number of negative: 43536
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001428 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1100
[LightGBM] [Info] Number of data points in the train set: 47439, number of used features: 24


[I 2024-02-21 22:04:29,628] Trial 14 finished with value: 0.9677908937605396 and parameters: {'num_leaves': 146, 'max_depth': 13, 'learning_rate': 0.058606007989168124, 'n_estimators': 557, 'min_child_samples': 20, 'subsample': 0.9347602803276877, 'colsample_bytree': 0.9954839326760775, 'reg_alpha': 0.33011128543422125, 'reg_lambda': 0.27552911084576603}. Best is trial 13 with value: 0.9722596964586847.


[LightGBM] [Info] Number of positive: 3903, number of negative: 43536
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001533 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1100
[LightGBM] [Info] Number of data points in the train set: 47439, number of used features: 24


[I 2024-02-21 22:04:32,131] Trial 15 finished with value: 0.9676222596964587 and parameters: {'num_leaves': 480, 'max_depth': 8, 'learning_rate': 0.06457098792942602, 'n_estimators': 795, 'min_child_samples': 13, 'subsample': 0.959246137010887, 'colsample_bytree': 0.937844873247205, 'reg_alpha': 0.3199434030287674, 'reg_lambda': 0.2597610599728337}. Best is trial 13 with value: 0.9722596964586847.


[LightGBM] [Info] Number of positive: 3903, number of negative: 43536
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001608 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1100
[LightGBM] [Info] Number of data points in the train set: 47439, number of used features: 24


[I 2024-02-21 22:04:32,892] Trial 16 finished with value: 0.934822934232715 and parameters: {'num_leaves': 30, 'max_depth': 12, 'learning_rate': 0.07115796951476112, 'n_estimators': 442, 'min_child_samples': 17, 'subsample': 0.9073378861599921, 'colsample_bytree': 0.9550177032321719, 'reg_alpha': 0.37899970684044026, 'reg_lambda': 0.1752485618283931}. Best is trial 13 with value: 0.9722596964586847.


[LightGBM] [Info] Number of positive: 3903, number of negative: 43536
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001587 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1100
[LightGBM] [Info] Number of data points in the train set: 47439, number of used features: 24


[I 2024-02-21 22:04:33,218] Trial 17 finished with value: 0.6575885328836425 and parameters: {'num_leaves': 12, 'max_depth': 6, 'learning_rate': 0.014752546545958752, 'n_estimators': 245, 'min_child_samples': 12, 'subsample': 0.7877424067478546, 'colsample_bytree': 0.8804764563306592, 'reg_alpha': 0.24177009547203127, 'reg_lambda': 0.3848663811972053}. Best is trial 13 with value: 0.9722596964586847.


[LightGBM] [Info] Number of positive: 3903, number of negative: 43536
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001666 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1100
[LightGBM] [Info] Number of data points in the train set: 47439, number of used features: 24


[I 2024-02-21 22:04:34,742] Trial 18 finished with value: 0.877318718381113 and parameters: {'num_leaves': 48, 'max_depth': 14, 'learning_rate': 0.010441805395210542, 'n_estimators': 648, 'min_child_samples': 17, 'subsample': 0.9726754743323941, 'colsample_bytree': 0.9321621953388086, 'reg_alpha': 0.4779929776329695, 'reg_lambda': 0.48338224452754264}. Best is trial 13 with value: 0.9722596964586847.


[LightGBM] [Info] Number of positive: 3903, number of negative: 43536
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001489 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1100
[LightGBM] [Info] Number of data points in the train set: 47439, number of used features: 24


[I 2024-02-21 22:04:36,603] Trial 19 finished with value: 0.9654300168634065 and parameters: {'num_leaves': 139, 'max_depth': 10, 'learning_rate': 0.06962984287103297, 'n_estimators': 439, 'min_child_samples': 15, 'subsample': 0.9103123061428822, 'colsample_bytree': 0.9733597876226461, 'reg_alpha': 0.7167308458432986, 'reg_lambda': 0.18693582504406941}. Best is trial 13 with value: 0.9722596964586847.


In [23]:
params = study.best_trial.params

best_lgb = LGBMClassifier(**params)
best_lgb.fit(x_train, y_train)

y_pred = best_lgb.predict(x_val)
# y_pred_bool = np.where(y_pred == 'True', True, False)
print(y_pred)
# 평가
get_clf_eval(y_val, y_pred)

[LightGBM] [Info] Number of positive: 3903, number of negative: 43536
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001522 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1100
[LightGBM] [Info] Number of data points in the train set: 47439, number of used features: 24
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.082274 -> initscore=-2.411843
[LightGBM] [Info] Start training from score -2.411843
[False False False ... False False False]
F1: 0.8395


In [60]:
feature_importance_lgb = pd.DataFrame({'Feature Importance': best_lgb.feature_importances_}, index=df_train2.columns)
feature_importance_lgb_sorted = feature_importance_lgb.sort_values(by='Feature Importance', ascending=False)

# 정렬된 feature importance를 출력합니다.
print("LGB Feature Importance (Sorted):\n", feature_importance_lgb_sorted)

LGB Feature Importance (Sorted):
                          Feature Importance
customer_idx                          32168
lead_owner                            26065
customer_country                       9629
response_corporate                     8875
customer_job                           6727
product_category                       6237
ver_win_ratio_per_bu                   4550
historical_existing_cnt                4168
ver_win_rate_x                         2676
inquiry_type                           2673
bant_submit                            2503
business_area                          2439
expected_timeline                      2055
customer_type                          1737
lead_desc_length                       1563
com_reg_ver_win_rate                   1488
customer_position                      1415
business_unit                           976
enterprise                              557
idit_strategic_ver                      116
id_strategic_ver                         7

### 테스트 데이터 예측

In [24]:
# 예측에 필요한 데이터 분리
x_test = df_test.drop(["is_converted", "id"], axis=1)

In [25]:
print(df_test)

         id  bant_submit  customer_country  business_unit  \
0     19844         0.00               537              2   
1      9738         0.25                 0              3   
2      8491         1.00                73              2   
3     19895         0.50                 0              2   
4     10465         1.00               537              2   
...     ...          ...               ...            ...   
5266  13855         0.50               537              0   
5267   7979         0.25                 0              3   
5268  12887         0.75               537              0   
5269  17530         0.00                53              3   
5270   4330         0.25                19              0   

      com_reg_ver_win_rate  customer_idx  customer_type  enterprise  \
0                 0.073248         47466              1           0   
1                      NaN          5405              1           1   
2                      NaN         13597              

In [28]:
test_pred = best_lgb.predict(sc.transform(x_test.fillna(0)))
# test_pred = rf_model.predict(x_test.fillna(0))
# pred_bool = np.where(test_pred == 'True', True, False)
# count_true = sum(1 for item in pred_bool if item)  # True의 개수를 세는 방법
# print("True의 개수:", count_true)
sum(test_pred) # True로 예측된 개수
# print(pred_bool)

549

### 제출 파일 작성

In [29]:
# 제출 데이터 읽어오기 (df_test는 전처리된 데이터가 저장됨)
df_sub = pd.read_csv("submission.csv")
df_sub["is_converted"] = test_pred

# 제출 파일 저장
df_sub.to_csv("submission.csv", index=False)

**우측 상단의 제출 버튼을 클릭해 결과를 확인하세요**