In [24]:
!pip install catboost

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [25]:
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    classification_report
)

In [41]:
df_train = pd.read_csv("train.csv") # 학습용 데이터
df_test = pd.read_csv("submission.csv") # 테스트 데이터(제출파일의 데이터)

In [42]:
'''
# customer_country, customer_country.1 날리고 response_orporate만 남기기
'''
drop_col = ['customer_country.1', 'customer_country', 'id_strategic_ver', 'it_strategic_ver', 'idit_strategic_ver']
df_train.drop(drop_col, axis=1, inplace=True)
df_test.drop(drop_col, axis=1, inplace=True)

In [43]:
label_columns = list(df_train.columns)

label_columns.remove("is_converted")

In [44]:
df_train['customer_type'] = df_train['customer_type'].replace('End-Customer', 'End Customer')
df_test['customer_type'] = df_test['customer_type'].replace('End-Customer', 'End Customer')

df_train['customer_type'] = df_train['customer_type'].replace('Specifier/ Influencer', 'Specifier / Influencer')
df_test['customer_type'] = df_test['customer_type'].replace('Specifier/ Influencer', 'Specifier / Influencer')

In [45]:
'''
# 수치형 변수 전체 Standardizaiton, Normalization 실행
'''
# Initialize scalers
scaler_standard = StandardScaler()
scaler_minmax = MinMaxScaler()
columns_to_ST = ["com_reg_ver_win_rate", "historical_existing_cnt","lead_desc_length"]
columns_to_NM = ["ver_win_rate_x", "ver_win_ratio_per_bu"]

# Apply Standardization to each column
for column in columns_to_ST:
    # Standardization
    df_train[column + '_standardized'] = scaler_standard.fit_transform(df_train[[column]])
    df_test[column + '_standardized'] = scaler_standard.fit_transform(df_test[[column]])
    # Drop the original column
    df_train.drop(columns=[column], inplace=True)
    df_test.drop(columns=[column], inplace=True)

# Apply Standardization to each column
for column in columns_to_NM:
    # Normalization
    df_train[column + '_normalized'] = scaler_minmax.fit_transform(df_train[[column]])
    df_test[column + '_normalized'] = scaler_minmax.fit_transform(df_test[[column]])
    # Drop the original column
    df_train.drop(columns=[column], inplace=True)
    df_test.drop(columns=[column], inplace=True)

In [46]:
'''
# business_area 날리고 2개 피처 추가
'''
def is_hospital(value):
    if value == "hospital & health care":
        return 1
    else:
        return 0

def is_power(value):
    if value == "power plant / renewable energy":
        return 1
    else:
        return 0

# "business_area" 열에서 함수를 적용하여 새로운 열 생성
df_train['is_hospital'] = df_train['business_area'].apply(is_hospital)
df_train['is_power'] = df_train['business_area'].apply(is_hospital)
df_test['is_hospital'] = df_test['business_area'].apply(is_hospital)
df_test['is_power'] = df_test['business_area'].apply(is_hospital)

In [47]:
'''
# expected_timeline 에서 특정 단어 필터링
'''
# 새로운 열을 추가하기 위한 함수 정의
def contains_budget(value):
    if 'budget' in str(value).lower():  # 대소문자 구분없이 'budget'이 포함되어 있는지 확인
        return 1
    else:
        return 0

# "expected_timeline" 열에서 함수를 적용하여 새로운 열 생성
df_train['contains_budget'] = df_train['expected_timeline'].apply(contains_budget)
df_test['contains_budget'] = df_test['expected_timeline'].apply(contains_budget)

# 새로운 열을 추가하기 위한 함수 정의
def contains_etc(value):
    if 'etc' in str(value).lower():  # 대소문자 구분없이 'budget'이 포함되어 있는지 확인
        return 1
    else:
        return 0

# "expected_timeline" 열에서 함수를 적용하여 새로운 열 생성
df_train['contains_etc'] = df_train['expected_timeline'].apply(contains_etc)
df_test['contains_etc'] = df_test['expected_timeline'].apply(contains_etc)

# 새로운 열을 추가하기 위한 함수 정의
def contains_hence(value):
    if 'hence' in str(value).lower():  # 대소문자 구분없이 'budget'이 포함되어 있는지 확인
        return 1
    else:
        return 0

# "expected_timeline" 열에서 함수를 적용하여 새로운 열 생성
df_train['contains_hence'] = df_train['expected_timeline'].apply(contains_hence)
df_test['contains_hence'] = df_test['expected_timeline'].apply(contains_hence)

# 새로운 열을 추가하기 위한 함수 정의
def contains_although(value):
    if 'although' in str(value).lower():  # 대소문자 구분없이 'budget'이 포함되어 있는지 확인
        return 1
    else:
        return 0

# "expected_timeline" 열에서 함수를 적용하여 새로운 열 생성
df_train['contains_although'] = df_train['expected_timeline'].apply(contains_although)
df_test['contains_although'] = df_test['expected_timeline'].apply(contains_although)

# 새로운 열을 추가하기 위한 함수 정의
def contains_more(value):
    if 'more' in str(value).lower():  # 대소문자 구분없이 'budget'이 포함되어 있는지 확인
        return 1
    else:
        return 0

# "expected_timeline" 열에서 함수를 적용하여 새로운 열 생성
df_train['contains_more'] = df_train['expected_timeline'].apply(contains_more)
df_test['contains_more'] = df_test['expected_timeline'].apply(contains_more)

# 새로운 열을 추가하기 위한 함수 정의
def contains_year(value):
    if 'year' in str(value).lower():  # 대소문자 구분없이 'budget'이 포함되어 있는지 확인
        return 1
    else:
        return 0

# "expected_timeline" 열에서 함수를 적용하여 새로운 열 생성
df_train['contains_year'] = df_train['expected_timeline'].apply(contains_year)
df_test['contains_year'] = df_test['expected_timeline'].apply(contains_year)

In [48]:
# Calculate the most frequent value (mode) for each column
modes = df_train.mode().iloc[0]

# Fill missing (NA) values with the mode
df_train.fillna(modes, inplace=True)

# Calculate the most frequent value (mode) for each column
modes = df_test.mode().iloc[0]

# Fill missing (NA) values with the mode
df_test.fillna(modes, inplace=True)

In [49]:
'''
df_train과 df_test에서 숫자 형식인거 다 str로 바꾸기 (일부제외)
'''

# 변환할 라벨들의 리스트
labels_to_convert = [
    "bant_submit",
    "com_reg_ver_win_rate_standardized",
    "historical_existing_cnt_standardized",
    "lead_desc_length_standardized",
    "ver_cus",
    "ver_pro",
    "ver_win_rate_x_normalized",
    "ver_win_ratio_per_bu_normalized"
]

# df_test와 df_train에서 라벨들의 데이터 타입을 str로 변환
for label in labels_to_convert:
    if label in df_test.columns:
        df_test[label] = df_test[label].astype(str)
    if label in df_train.columns:
        df_train[label] = df_train[label].astype(str)

df_train['is_converted'] = df_train['is_converted'].astype(int)
df_test['is_converted'] = df_test['is_converted'].astype(int)

In [50]:
x_train, x_val, y_train, y_val = train_test_split(
    df_train.drop("is_converted", axis=1),
    df_train["is_converted"],
    test_size=0.2,
    shuffle=True,
    random_state=400,
)

In [51]:
cat_features_col = list(x_train.columns)

In [52]:
catboost_model = CatBoostClassifier(
    verbose=0, 
    iterations=1000, 
    depth=6, 
    learning_rate=0.1, 
    loss_function='CrossEntropy', 
    one_hot_max_size=5,
    cat_features=cat_features_col
    )

In [53]:
catboost_model.fit(x_train, y_train)

<catboost.core.CatBoostClassifier at 0x7fb142504fa0>

In [54]:
# 모델을 사용하여 검증 데이터셋에 대한 예측 수행
y_val_pred = catboost_model.predict(x_val)
print (y_val_pred)

y_val_pred = np.array([True if pred == 'True' else False for pred in y_val_pred])

print("Classification Report:\n", classification_report(y_val, y_val_pred))
print("오차행렬:\n", confusion_matrix(y_val, y_val_pred, labels=[True, False]))
print("\n정확도: {:.4f}".format(accuracy_score(y_val, y_val_pred)))
print("정밀도: {:.4f}".format(precision_score(y_val, y_val_pred, labels=[True, False])))
print("재현율: {:.4f}".format(recall_score(y_val, y_val_pred)))
print("F1: {:.4f}".format(f1_score(y_val, y_val_pred, labels=[True, False])))

[0 0 0 ... 0 0 0]
Classification Report:
               precision    recall  f1-score   support

           0       0.92      1.00      0.96     10913
           1       0.00      0.00      0.00       947

    accuracy                           0.92     11860
   macro avg       0.46      0.50      0.48     11860
weighted avg       0.85      0.92      0.88     11860

오차행렬:
 [[    0   947]
 [    0 10913]]

정확도: 0.9202
정밀도: 0.0000
재현율: 0.0000
F1: 0.0000


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [55]:
x_test = df_test.drop(["is_converted", "id"], axis=1)

test_pred = catboost_model.predict(x_test)

df_sub = pd.read_csv("submission.csv")
df_sub["is_converted"] = test_pred
df_sub['is_converted'] = df_sub['is_converted'].astype(bool)
print(df_sub['is_converted'])

# 제출 파일 저장
df_sub.to_csv("submission.csv", index=False)

0        True
1        True
2        True
3       False
4        True
        ...  
5266    False
5267     True
5268    False
5269    False
5270     True
Name: is_converted, Length: 5271, dtype: bool
