In [46]:
!pip install catboost



In [47]:
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    classification_report
)

In [48]:
# Mount Google Drive if using Google Colab
from google.colab import drive
drive.mount('/content/drive')

# Load datasets
train_file_path = '/content/drive/MyDrive/LGdata/train.csv'
test_file_path = '/content/drive/MyDrive/LGdata/submission.csv'
df_train = pd.read_csv(train_file_path)
df_test = pd.read_csv(test_file_path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [49]:
# Drop unnecessary columns
drop_col = ['customer_country.1', 'customer_country', 'id_strategic_ver', 'it_strategic_ver', 'idit_strategic_ver']
df_train.drop(drop_col, axis=1, inplace=True)
df_test.drop(drop_col, axis=1, inplace=True)

In [50]:
label_columns = list(df_train.columns)

label_columns.remove("is_converted")

In [51]:
'''
# 수치형 변수 전체 Standardizaiton, Normalization 실행
'''
# Initialize scalers
scaler_standard = StandardScaler()
scaler_minmax = MinMaxScaler()
columns_to_ST = ["com_reg_ver_win_rate", "historical_existing_cnt","lead_desc_length"]
columns_to_NM = ["ver_win_rate_x", "ver_win_ratio_per_bu"]

# Apply Standardization to each column
for column in columns_to_ST:
    # Standardization
    df_train[column + '_standardized'] = scaler_standard.fit_transform(df_train[[column]])
    df_test[column + '_standardized'] = scaler_standard.fit_transform(df_test[[column]])
    # Drop the original column
    df_train.drop(columns=[column], inplace=True)
    df_test.drop(columns=[column], inplace=True)

# Apply Standardization to each column
for column in columns_to_NM:
    # Normalization
    df_train[column + '_normalized'] = scaler_minmax.fit_transform(df_train[[column]])
    df_test[column + '_normalized'] = scaler_minmax.fit_transform(df_test[[column]])
    # Drop the original column
    df_train.drop(columns=[column], inplace=True)
    df_test.drop(columns=[column], inplace=True)

In [52]:
'''
# business_area 날리고 2개 피처 추가
'''
def is_hospital(value):
    if value == "hospital & health care":
        return 1
    else:
        return 0

def is_power(value):
    if value == "power plant / renewable energy":
        return 1
    else:
        return 0

# "business_area" 열에서 함수를 적용하여 새로운 열 생성
df_train['is_hospital'] = df_train['business_area'].apply(is_hospital)
df_train['is_power'] = df_train['business_area'].apply(is_hospital)
df_test['is_hospital'] = df_test['business_area'].apply(is_hospital)
df_test['is_power'] = df_test['business_area'].apply(is_hospital)

In [53]:
'''
# expected_timeline 에서 특정 단어 필터링
'''
# 새로운 열을 추가하기 위한 함수 정의
def contains_budget(value):
    if 'budget' in str(value).lower():  # 대소문자 구분없이 'budget'이 포함되어 있는지 확인
        return 1
    else:
        return 0

# "expected_timeline" 열에서 함수를 적용하여 새로운 열 생성
df_train['contains_budget'] = df_train['expected_timeline'].apply(contains_budget)
df_test['contains_budget'] = df_test['expected_timeline'].apply(contains_budget)

# 새로운 열을 추가하기 위한 함수 정의
def contains_etc(value):
    if 'etc' in str(value).lower():  # 대소문자 구분없이 'budget'이 포함되어 있는지 확인
        return 1
    else:
        return 0

# "expected_timeline" 열에서 함수를 적용하여 새로운 열 생성
df_train['contains_etc'] = df_train['expected_timeline'].apply(contains_etc)
df_test['contains_etc'] = df_test['expected_timeline'].apply(contains_etc)

# 새로운 열을 추가하기 위한 함수 정의
def contains_hence(value):
    if 'hence' in str(value).lower():  # 대소문자 구분없이 'budget'이 포함되어 있는지 확인
        return 1
    else:
        return 0

# "expected_timeline" 열에서 함수를 적용하여 새로운 열 생성
df_train['contains_hence'] = df_train['expected_timeline'].apply(contains_hence)
df_test['contains_hence'] = df_test['expected_timeline'].apply(contains_hence)

In [54]:
# 새로운 열을 추가하기 위한 함수 정의
def contains_although(value):
    if 'although' in str(value).lower():  # 대소문자 구분없이 'budget'이 포함되어 있는지 확인
        return 1
    else:
        return 0

# "expected_timeline" 열에서 함수를 적용하여 새로운 열 생성
df_train['contains_although'] = df_train['expected_timeline'].apply(contains_although)
df_test['contains_although'] = df_test['expected_timeline'].apply(contains_although)

# 새로운 열을 추가하기 위한 함수 정의
def contains_more(value):
    if 'more' in str(value).lower():  # 대소문자 구분없이 'budget'이 포함되어 있는지 확인
        return 1
    else:
        return 0

# "expected_timeline" 열에서 함수를 적용하여 새로운 열 생성
df_train['contains_more'] = df_train['expected_timeline'].apply(contains_more)
df_test['contains_more'] = df_test['expected_timeline'].apply(contains_more)

# 새로운 열을 추가하기 위한 함수 정의
def contains_year(value):
    if 'year' in str(value).lower():  # 대소문자 구분없이 'budget'이 포함되어 있는지 확인
        return 1
    else:
        return 0

# "expected_timeline" 열에서 함수를 적용하여 새로운 열 생성
df_train['contains_year'] = df_train['expected_timeline'].apply(contains_year)
df_test['contains_year'] = df_test['expected_timeline'].apply(contains_year)

In [55]:
# Calculate the most frequent value (mode) for each column
modes = df_train.mode().iloc[0]

# Fill missing (NA) values with the mode
df_train.fillna(modes, inplace=True)

# Calculate the most frequent value (mode) for each column
modes = df_test.mode().iloc[0]

# Fill missing (NA) values with the mode
df_test.fillna(modes, inplace=True)

In [57]:
'''
df_train과 df_test에서 숫자 형식인거 다 str로 바꾸기 (일부제외)
'''

# 변환할 라벨들의 리스트
labels_to_convert = [
    "bant_submit",
    "com_reg_ver_win_rate_standardized",
    "historical_existing_cnt_standardized",
    "id_strategic_ver",
    "it_strategic_ver",
    "idit_strategic_ver",
    "lead_desc_length_standardized",
    "ver_cus",
    "ver_pro",
    "ver_win_rate_x_normalized",
    "ver_win_ratio_per_bu_normalized"
]

# df_test와 df_train에서 라벨들의 데이터 타입을 str로 변환
for label in labels_to_convert:
    if label in df_test.columns:
        df_test[label] = df_test[label].astype(str)
    if label in df_train.columns:
        df_train[label] = df_train[label].astype(str)

In [58]:
x_train, x_val, y_train, y_val = train_test_split(
    df_train.drop("is_converted", axis=1),
    df_train["is_converted"],
    test_size=0.2,
    shuffle=True,
    random_state=400,
)

In [59]:
cat_features_col = list(x_train.columns)

In [60]:
catboost_model = CatBoostClassifier(
    verbose=0,
    iterations=1000,
    depth=6,
    learning_rate=0.1,
    loss_function='CrossEntropy',
    one_hot_max_size=5,
    cat_features=cat_features_col
    )

In [61]:
catboost_model.fit(x_train, y_train)

<catboost.core.CatBoostClassifier at 0x7d7f51360700>

In [62]:
feature_importances = catboost_model.get_feature_importance()

# 특성 중요도 출력
for score, name in sorted(zip(feature_importances, x_train.columns), reverse=True):
    print(f"{name}: {score}")

lead_owner: 29.794078100019465
customer_idx: 23.006719398437852
response_corporate: 6.322731210629518
inquiry_type: 5.865759381240368
lead_desc_length_standardized: 4.540083167850406
product_category: 4.32242961919456
business_subarea: 3.1757043498576287
customer_type: 3.0184312504664184
customer_job: 2.570101807187274
com_reg_ver_win_rate_standardized: 2.2902167793996258
historical_existing_cnt_standardized: 2.0035104139229856
customer_position: 1.7665213829519146
product_subcategory: 1.3225426173830392
ver_win_ratio_per_bu_normalized: 1.3059201296254948
product_modelname: 1.2596673808630918
business_unit: 1.2061769402621862
ver_cus: 1.160299284137255
business_area: 1.086924098463695
bant_submit: 1.0639375125598267
expected_timeline: 0.9154603564095906
ver_win_rate_x_normalized: 0.8023750547170138
enterprise: 0.5361130929190054
ver_pro: 0.4635913904656987
is_power: 0.14600725485380864
contains_etc: 0.031694779770526794
contains_year: 0.01957884585436287
is_hospital: 0.0034244005574986