In [54]:
!pip install catboost

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [55]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from catboost import CatBoostClassifier
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    classification_report
)

In [56]:
df_train = pd.read_csv("train.csv") # 학습용 데이터
df_test = pd.read_csv("submission.csv") # 테스트 데이터(제출파일의 데이터)

In [57]:
# 결측치 처리
text_columns = ["customer_type", "customer_job", "inquiry_type", "product_category",
                "product_subcategory", "product_modelname", "customer_position",
                "response_corporate", "expected_timeline", "business_area"]

df_train[text_columns] = df_train[text_columns].fillna('')
df_test[text_columns] = df_test[text_columns].fillna('')

# 자연어 데이터 통합
df_train['text_data'] = df_train[text_columns].astype(str).apply(lambda x: ' '.join(x), axis=1)
df_test['text_data'] = df_test[text_columns].astype(str).apply(lambda x: ' '.join(x), axis=1)

In [58]:
# 결측치 처리를 위한 Imputer를 사용하여 빈 문자열을 대체
imputer = SimpleImputer(strategy='constant', fill_value='missing')
df_train['text_data'] = imputer.fit_transform(df_train['text_data'].values.reshape(-1, 1)).ravel()
df_test['text_data'] = imputer.transform(df_test['text_data'].values.reshape(-1, 1)).ravel()

In [59]:
# TF-IDF 변환
tfidf_vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
tfidf_train = tfidf_vectorizer.fit_transform(df_train['text_data'])
tfidf_test = tfidf_vectorizer.transform(df_test['text_data'])

In [60]:
# TruncatedSVD를 사용하여 차원 축소
svd = TruncatedSVD(n_components=50, random_state=42)
svd_train = svd.fit_transform(tfidf_train)
svd_test = svd.transform(tfidf_test)

In [63]:
# 변환된 데이터를 데이터프레임에 추가
df_svd_train = pd.DataFrame(svd_train, columns=[f'svd_{i}' for i in range(svd_train.shape[1])])
df_svd_test = pd.DataFrame(svd_test, columns=[f'svd_{i}' for i in range(svd_test.shape[1])])

In [64]:
df_train = pd.concat([df_train, df_svd_train], axis=1)
df_test = pd.concat([df_test, df_svd_test], axis=1)

In [65]:
# 원래의 텍스트 열 및 중간 열들 삭제
df_train.drop(text_columns + ['text_data'], axis=1, inplace=True)
df_test.drop(text_columns + ['text_data'], axis=1, inplace=True)

In [66]:
# 수치형 변수 전체 Standardization, Normalization 실행
scaler_standard = StandardScaler()
scaler_minmax = MinMaxScaler()
columns_to_ST = ["com_reg_ver_win_rate", "historical_existing_cnt", "lead_desc_length"]
columns_to_NM = ["ver_win_rate_x", "ver_win_ratio_per_bu"]

In [67]:
# Apply Standardization to each column
for column in columns_to_ST:
    df_train[column + '_standardized'] = scaler_standard.fit_transform(df_train[[column]])
    df_test[column + '_standardized'] = scaler_standard.transform(df_test[[column]])
    df_train.drop(columns=[column], inplace=True)
    df_test.drop(columns=[column], inplace=True)

# Apply Normalization to each column
for column in columns_to_NM:
    df_train[column + '_normalized'] = scaler_minmax.fit_transform(df_train[[column]])
    df_test[column + '_normalized'] = scaler_minmax.transform(df_test[[column]])
    df_train.drop(columns=[column], inplace=True)
    df_test.drop(columns=[column], inplace=True)

In [68]:
# 훈련 데이터와 검증 데이터로 분할
x_train, x_val, y_train, y_val = train_test_split(
    df_train.drop("is_converted", axis=1),
    df_train["is_converted"],
    test_size=0.2,
    shuffle=True,
    random_state=400,
)

cat_features_col = list(x_train.columns)
cat_features_col.remove("bant_submit")
cat_features_col.remove("com_reg_ver_win_rate_standardized")
cat_features_col.remove("historical_existing_cnt_standardized")
cat_features_col.remove("id_strategic_ver")
cat_features_col.remove("it_strategic_ver")
cat_features_col.remove("idit_strategic_ver")
cat_features_col.remove("lead_desc_length_standardized")
cat_features_col.remove("ver_cus")
cat_features_col.remove("ver_pro")
cat_features_col.remove("ver_win_rate_x_normalized")
cat_features_col.remove("ver_win_ratio_per_bu_normalized")

for i in range(50):
  cat_features_col.remove("svd_"+str(i))

In [69]:
catboost_model = CatBoostClassifier(
    verbose=0,
    iterations=1000,
    depth=6,
    learning_rate=0.1,
    loss_function='Logloss',
    one_hot_max_size=5,
    cat_features=cat_features_col
)

In [70]:
# Calculate the most frequent value (mode) for each column
modes = x_train.mode().iloc[0]

# Fill missing (NA) values with the mode
x_train.fillna(modes, inplace=True)

In [71]:
catboost_model.fit(x_train, y_train)

<catboost.core.CatBoostClassifier at 0x7fb3ac6b5ff0>

In [72]:
# Calculate the most frequent value (mode) for each column
modes = x_val.mode().iloc[0]

# Fill missing (NA) values with the mode
x_val.fillna(modes, inplace=True)

In [73]:
# 검증 데이터에 대한 예측 및 성능 평가
y_val_pred = catboost_model.predict(x_val)
y_val_pred = np.array([True if pred == 'True' else False for pred in y_val_pred])

print("Classification Report:\n", classification_report(y_val, y_val_pred))
print("Confusion Matrix:\n", confusion_matrix(y_val, y_val_pred, labels=[True, False]))
print("\nAccuracy: {:.4f}".format(accuracy_score(y_val, y_val_pred)))
print("Precision: {:.4f}".format(precision_score(y_val, y_val_pred, labels=[True, False])))
print("Recall: {:.4f}".format(recall_score(y_val, y_val_pred)))
print("F1 Score: {:.4f}".format(f1_score(y_val, y_val_pred)))

Classification Report:
               precision    recall  f1-score   support

       False       0.98      0.99      0.99     10913
        True       0.92      0.79      0.85       947

    accuracy                           0.98     11860
   macro avg       0.95      0.89      0.92     11860
weighted avg       0.98      0.98      0.98     11860

Confusion Matrix:
 [[  750   197]
 [   67 10846]]

Accuracy: 0.9777
Precision: 0.9180
Recall: 0.7920
F1 Score: 0.8503


In [74]:
x_test = df_test.drop(["is_converted", "id"], axis=1)

# Calculate the most frequent value (mode) for each column
modes = x_test.mode().iloc[0]

# Fill missing (NA) values with the mode
x_test.fillna(modes, inplace=True)

test_pred = catboost_model.predict(x_test)

df_sub = pd.read_csv("submission.csv")
df_sub["is_converted"] = test_pred

# 제출 파일 저장
df_sub.to_csv("submission.csv", index=False)