### 1. 데이터 확인
### 필수 라이브러리

In [70]:
import pandas as pd
import numpy as np
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import seaborn as sns
from sklearn.impute import SimpleImputer
import scipy.stats as stats
import statsmodels.api as sm
import matplotlib.pyplot as plt
import re

### 데이터셋 읽어오기

In [71]:
df_train = pd.read_csv("train.csv")  # 학습용 데이터
df_test = pd.read_csv("submission.csv")  # 테스트 데이터(제출 파일의 데이터)

In [72]:
# 지역 코드에 따른 지역명 매핑
region_mapping = {
    "EU": ["LGEAG", "LGECZ", "LGEFS", "LGEDG", "LGEHS", "LGEMK", "LGEIS", "LGESC", "LGEEH", "LGEBN", "LGEWR", "LGEPL", "LGEMA", "LGEPT", "LGERO", "LGEES", "LGENO", "LGESW", "LGEUK"],
    "RC": ["LGEAK", "LGERM", "LGERI", "LGERA", "LGEUR", "LGELV"],
    "MA": ["LGEAS", "LGEEG", "LGELF", "LGESK", "LGEMC", "LGESA", "LGETU", "LGEOT", "LGEDF", "LGEGF", "LGEME", "LGEAF", "LEAO", "LGENI", "LGETK", "LGEAT", "LGESJ", "LGEEF", "LGEYK", "LGEIR"],
    "AP": ["LGEAP", "LGEQA", "LGETL", "LGECH", "LGEYT", "LGETR", "LGETA", "LGESY", "LGESH", "LGEQH", "LGEQD", "LGEPN", "LGENE", "LGEKS", "LGEHZ", "LGEHN", "LGEHK", "LGEIL", "LGEPH", "LGEVH", "LGEKR", "LGESL", "LGEIN", "LGETH", "LGEML", "LGETT", "LGEJP"],
    "NA": ["LGECI", "LGERS", "LGEMX", "LGEMS", "LGEMM", "LGEMR", "LGEUS", "LGEMU", "LGEAI"],
    "LA": ["LGEAG", "LGEBR", "LGECL", "LGEVZ", "LGECB", "LGEPS", "LGEPR", "LGESP", "LGEAR"],
    "OT": ["LGEEB", "LGELA", "LGEBT", "MA", "RC"]
}


def categorize_region(code):
    for region, codes in region_mapping.items():
        if code in codes:
            return region
    return "ETC"  

df_train['region'] = df_train['response_corporate'].apply(categorize_region)
df_test['region'] = df_test['response_corporate'].apply(categorize_region)

In [73]:
def extract_country(value):
    if not isinstance(value, str):
        value = str(value)
    match = re.search(r'\/([^\/]+)$', value)
    if match:
        return match.group(1)
    return None

df_train['customer_country'] = df_train['customer_country'].apply(extract_country)
df_test['customer_country'] = df_test['customer_country'].apply(extract_country)

### 2. 데이터 전처리

In [74]:
### 결측치 확인

In [75]:
print(df_train.isna().sum()/len(df_train))

bant_submit                0.000000
customer_country           0.070473
business_unit              0.000000
com_reg_ver_win_rate       0.754330
customer_idx               0.000000
customer_type              0.741345
enterprise                 0.000000
historical_existing_cnt    0.768023
id_strategic_ver           0.941921
it_strategic_ver           0.981096
idit_strategic_ver         0.923017
customer_job               0.315908
lead_desc_length           0.000000
inquiry_type               0.015869
product_category           0.326717
product_subcategory        0.844264
product_modelname          0.844365
customer_country.1         0.016560
customer_position          0.000000
response_corporate         0.000000
expected_timeline          0.520464
ver_cus                    0.000000
ver_pro                    0.000000
ver_win_rate_x             0.689421
ver_win_ratio_per_bu       0.741918
business_area              0.689421
business_subarea           0.906811
lead_owner                 0

In [76]:
[df_train.isna().sum()/len(df_train) > 0.6]

[bant_submit                False
 customer_country           False
 business_unit              False
 com_reg_ver_win_rate        True
 customer_idx               False
 customer_type               True
 enterprise                 False
 historical_existing_cnt     True
 id_strategic_ver            True
 it_strategic_ver            True
 idit_strategic_ver          True
 customer_job               False
 lead_desc_length           False
 inquiry_type               False
 product_category           False
 product_subcategory         True
 product_modelname           True
 customer_country.1         False
 customer_position          False
 response_corporate         False
 expected_timeline          False
 ver_cus                    False
 ver_pro                    False
 ver_win_rate_x              True
 ver_win_ratio_per_bu        True
 business_area               True
 business_subarea            True
 lead_owner                 False
 is_converted               False
 region       

In [77]:
df_train.columns[df_train.isna().sum()/len(df_train) > 0.6]

Index(['com_reg_ver_win_rate', 'customer_type', 'historical_existing_cnt',
       'id_strategic_ver', 'it_strategic_ver', 'idit_strategic_ver',
       'product_subcategory', 'product_modelname', 'ver_win_rate_x',
       'ver_win_ratio_per_bu', 'business_area', 'business_subarea'],
      dtype='object')

In [78]:
# 제거 칼럼 
del_cols = ['customer_country.1','it_strategic_ver', 'id_strategic_ver', 'idit_strategic_ver','product_subcategory', 'product_modelname', 'ver_win_rate_x',
       'ver_win_ratio_per_bu', 'business_area', 'business_subarea']
df_train.drop(del_cols, axis=1, inplace=True)
df_test.drop(del_cols, axis=1, inplace=True)

In [79]:
# target('is_converted')의 분포 확인
target_distribution = df_train['is_converted'].value_counts()
min_samples = target_distribution.min()

In [80]:
# 언더 샘플링

df_train_balanced = pd.concat([
    df_train[df_train['is_converted'] == True].sample(min_samples, random_state=42),
    df_train[df_train['is_converted'] == False].sample(min_samples, random_state=42)
], ignore_index=True)

#from imblearn.under_sampling import NeighbourhoodCleaningRule

# NeighbourhoodCleaningRule 객체 생성
#ncr = NeighbourhoodCleaningRule()

# 다운 샘플링할 데이터와 해당 데이터의 레이블을 설정
#X_train = df_train.drop('is_converted', axis=1)
#y_train = df_train['is_converted']

# NeighbourhoodCleaningRule을 사용하여 다운 샘플링 적용
#X_train_resampled, y_train_resampled = ncr.fit_resample(X_train, y_train)

# 샘플링된 데이터로 DataFrame 생성
#df_train_balanced = pd.DataFrame(X_train_resampled, columns=X_train.columns)
#df_train_balanced['is_converted'] = y_train_resampled


In [81]:
df_train_balanced['is_converted'].value_counts()

True     4850
False    4850
Name: is_converted, dtype: int64

In [82]:
df_train = df_train_balanced

In [83]:
# 수치형 및 범주형 칼럼 리스트를 정의
numeric_columns = df_train.select_dtypes(include=[np.number]).columns.tolist()
categorical_columns = df_train.select_dtypes(exclude=[np.number]).columns.tolist()

In [84]:
# 결측치 처리
numeric_imputer = SimpleImputer(strategy='mean')
categorical_imputer = SimpleImputer(strategy='most_frequent')

df_train[numeric_columns] = numeric_imputer.fit_transform(df_train[numeric_columns])
df_test[numeric_columns] = numeric_imputer.transform(df_test[numeric_columns])

df_train[categorical_columns] = categorical_imputer.fit_transform(df_train[categorical_columns])
df_test[categorical_columns] = categorical_imputer.transform(df_test[categorical_columns])

In [85]:
# 범주형 데이터 레이블 인코딩
def label_encoding(series: pd.Series) -> pd.Series:
    """범주형 데이터를 시리즈 형태로 받아 숫자형 데이터로 변환합니다."""
    my_dict = {}
    series = series.astype(str)
    for idx, value in enumerate(sorted(series.unique())):
        my_dict[value] = idx
    series = series.map(my_dict)
    return series

In [169]:
# 레이블 인코딩할 칼럼들
#label_columns = [
#    "customer_country",
#    "business_unit",
#    "enterprise",
#    "customer_job",
#    "inquiry_type",
#    "product_category",
#    "customer_position",
#    "response_corporate",
#    "expected_timeline",
#]

#df_all = pd.concat([train[label_columns], test[label_columns]])

#for col in label_columns:
#    test[col] = label_encoding(test[col])

In [86]:
for col in categorical_columns:
    df_train[col] = label_encoding(df_train[col])
    df_test[col] = label_encoding(df_test[col])

학습 데이터와 제출 데이터 분리

In [87]:
# 수치형 데이터 스케일링
scaler = StandardScaler()
df_train[numeric_columns] = scaler.fit_transform(df_train[numeric_columns])
df_test[numeric_columns] = scaler.transform(df_test[numeric_columns])

### 2-2. 학습, 검증 데이터 분리

In [88]:
x_train, x_val, y_train, y_val = train_test_split(
    df_train.drop("is_converted", axis=1),
    df_train["is_converted"],
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

In [89]:
# 오버 샘플링
#from imblearn.over_sampling import ADASYN
#adasyn = ADASYN(random_state=42)
#x_train_oversampled, y_train_oversampled = adasyn.fit_resample(x_train, y_train)
#print(len(x_train_oversampled), len(y_train_oversampled))
#print(y_train_oversampled.value_counts())
from imblearn.over_sampling import BorderlineSMOTE
smote = BorderlineSMOTE(random_state=42)
x_train_smote, y_train_smote = smote.fit_resample(x_train, y_train)
print(len(x_train_smote), len(y_train_smote))
print(y_train_smote.value_counts())


7766 7766
1    3883
0    3883
Name: is_converted, dtype: int64


### 3. 모델 학습
### 모델 정의

In [90]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from mlxtend.classifier import StackingClassifier

In [25]:
#model = DecisionTreeClassifier(class_weight='balanced')

In [91]:
# PyCaret을 사용하여 모델 튜닝, 블렌딩 및 예측 수행
from pycaret.classification import *

In [92]:
exp_clf = setup(data=df_train, target='is_converted', ignore_features=['customer_id'])

Unnamed: 0,Description,Value
0,Session id,255
1,Target,is_converted
2,Target type,Binary
3,Original data shape,"(9700, 20)"
4,Transformed data shape,"(9700, 20)"
5,Transformed train set shape,"(6790, 20)"
6,Transformed test set shape,"(2910, 20)"
7,Ignore features,1
8,Numeric features,19
9,Preprocess,True


In [94]:
# 모델 생성 및 비교
top4 = compare_models(sort='F1', n_select=4)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lightgbm,Light Gradient Boosting Machine,0.9206,0.9786,0.924,0.918,0.9208,0.8412,0.8417,0.203
xgboost,Extreme Gradient Boosting,0.9156,0.9767,0.9158,0.9158,0.9156,0.8312,0.8316,0.08
catboost,CatBoost Classifier,0.9144,0.9774,0.9081,0.9198,0.9138,0.8289,0.8292,1.838
rf,Random Forest Classifier,0.9091,0.9744,0.8999,0.917,0.9081,0.8183,0.8187,0.19
et,Extra Trees Classifier,0.8947,0.9616,0.8895,0.899,0.8941,0.7894,0.7897,0.17
gbc,Gradient Boosting Classifier,0.8954,0.9677,0.8781,0.9098,0.8934,0.7909,0.7917,0.244
dt,Decision Tree Classifier,0.8716,0.8716,0.8807,0.8652,0.8728,0.7432,0.7436,0.016
ada,Ada Boost Classifier,0.8658,0.9472,0.8451,0.8824,0.863,0.7317,0.7329,0.084
knn,K Neighbors Classifier,0.6979,0.7698,0.7167,0.6911,0.7035,0.3959,0.3963,0.023
qda,Quadratic Discriminant Analysis,0.7097,0.7983,0.6029,0.7672,0.6747,0.4194,0.4298,0.011


In [95]:
print(top4)

[LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=255, reg_alpha=0.0, reg_lambda=0.0, subsample=1.0,
               subsample_for_bin=200000, subsample_freq=0), XGBClassifier(base_score=None, booster='gbtree', callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device='cpu', early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=No

In [96]:
# 모델 튜닝
tuned_top4 = [tune_model(i) for i in top4]

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9219,0.9768,0.9294,0.9159,0.9226,0.8439,0.844
1,0.9264,0.9792,0.9353,0.9191,0.9271,0.8527,0.8529
2,0.9205,0.9834,0.9206,0.9206,0.9206,0.8409,0.8409
3,0.9102,0.9732,0.9059,0.9139,0.9099,0.8203,0.8204
4,0.9308,0.9819,0.9235,0.9373,0.9304,0.8616,0.8617
5,0.9499,0.9849,0.9676,0.9345,0.9507,0.8999,0.9004
6,0.8999,0.9696,0.9204,0.8839,0.9017,0.7997,0.8004
7,0.9131,0.9722,0.9263,0.9023,0.9141,0.8262,0.8265
8,0.9116,0.9768,0.9204,0.9043,0.9123,0.8233,0.8234
9,0.9087,0.9714,0.9027,0.9134,0.908,0.8174,0.8174


Processing:   0%|          | 0/7 [00:00<?, ?it/s]

Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.863,0.9735,0.9971,0.7865,0.8794,0.726,0.7536
1,0.8763,0.9764,0.9882,0.8077,0.8889,0.7525,0.7721
2,0.8822,0.9805,0.9941,0.8125,0.8942,0.7643,0.7842
3,0.8689,0.9674,0.9794,0.8024,0.8821,0.7378,0.7565
4,0.9043,0.9816,0.9794,0.8517,0.9111,0.8085,0.8178
5,0.8616,0.9821,0.9882,0.7882,0.877,0.7232,0.7476
6,0.8409,0.9649,0.9882,0.7631,0.8612,0.682,0.7137
7,0.8542,0.9735,0.9912,0.7778,0.8716,0.7085,0.7367
8,0.8527,0.9721,0.9882,0.7773,0.8701,0.7056,0.733
9,0.8837,0.9723,0.9764,0.8234,0.8934,0.7674,0.7809


Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.891,0.969,0.9029,0.8822,0.8924,0.782,0.7822
1,0.8969,0.9702,0.9059,0.8902,0.898,0.7938,0.7939
2,0.9205,0.9788,0.9176,0.9231,0.9204,0.8409,0.841
3,0.8807,0.9582,0.8618,0.896,0.8786,0.7614,0.762
4,0.9264,0.9756,0.9059,0.9448,0.9249,0.8527,0.8535
5,0.9249,0.9796,0.9469,0.9068,0.9264,0.8498,0.8506
6,0.8763,0.9611,0.8761,0.8761,0.8761,0.7526,0.7526
7,0.894,0.9701,0.8791,0.9058,0.8922,0.7879,0.7883
8,0.8969,0.9745,0.9145,0.8832,0.8986,0.7938,0.7943
9,0.8822,0.9639,0.8555,0.9034,0.8788,0.7643,0.7654


Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8704,0.9572,0.8676,0.8728,0.8702,0.7408,0.7408
1,0.8881,0.9635,0.8765,0.8976,0.8869,0.7761,0.7764
2,0.8778,0.9648,0.8559,0.8954,0.8752,0.7555,0.7563
3,0.8704,0.9501,0.8324,0.9013,0.8654,0.7408,0.743
4,0.8954,0.9646,0.8559,0.9297,0.8913,0.7909,0.7934
5,0.9057,0.9702,0.9056,0.9056,0.9056,0.8115,0.8115
6,0.8822,0.9491,0.8643,0.896,0.8799,0.7643,0.7648
7,0.8733,0.9575,0.8496,0.8916,0.8701,0.7467,0.7475
8,0.8925,0.9651,0.9086,0.88,0.894,0.785,0.7854
9,0.8719,0.9508,0.8201,0.9145,0.8647,0.7437,0.7477


Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


In [97]:
blender_top4 = blend_models(estimator_list=tuned_top4)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.9219,0.9814,0.9265,0.9184,0.9224,0.8439,0.8439
1,0.919,0.9807,0.9176,0.9204,0.919,0.838,0.838
2,0.9323,0.9852,0.9294,0.9349,0.9322,0.8645,0.8645
3,0.9043,0.9731,0.8971,0.9104,0.9037,0.8085,0.8086
4,0.9396,0.9826,0.9206,0.9572,0.9385,0.8792,0.8799
5,0.947,0.9888,0.9587,0.9366,0.9475,0.894,0.8942
6,0.9102,0.9743,0.9145,0.9064,0.9104,0.8203,0.8204
7,0.9249,0.98,0.9174,0.9311,0.9242,0.8498,0.8499
8,0.9131,0.9788,0.9351,0.8955,0.9149,0.8262,0.827
9,0.9057,0.9746,0.8879,0.9205,0.9039,0.8115,0.812


In [98]:
final_model = finalize_model(blender_top4)
prediction = predict_model(final_model, data=df_test.iloc[-100:])

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Voting Classifier,0.84,0.9408,0.7273,0.7742,0.75,0.6325,0.6332


In [99]:
from pycaret.utils import check_metric
check_metric(prediction['is_converted'], prediction['Label'], metric = 'F1')

ImportError: cannot import name 'check_metric' from 'pycaret.utils' (c:\Users\happy\AppData\Local\Programs\Python\Python310\lib\site-packages\pycaret\utils\__init__.py)

### 모델 학습

In [140]:
#model.fit(x_train.fillna(0), y_train)

### 모델 성능 보기

In [100]:
def get_clf_eval(y_test, y_pred=None):
    confusion = confusion_matrix(y_test, y_pred, labels=[True, False])
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, labels=[True, False])
    recall = recall_score(y_test, y_pred)
    F1 = f1_score(y_test, y_pred, labels=[True, False])

    print("오차행렬:\n", confusion)
    print("\n정확도: {:.4f}".format(accuracy))
    print("정밀도: {:.4f}".format(precision))
    print("재현율: {:.4f}".format(recall))
    print("F1: {:.4f}".format(F1))

In [101]:
pred = final_model.predict(x_val.fillna(0))
get_clf_eval(y_val, pred)

오차행렬:
 [[964   9]
 [ 12 955]]

정확도: 0.9892
정밀도: 0.9877
재현율: 0.9908
F1: 0.9892


# 4. 제출하기

## 테스트 데이터 예측

In [102]:
# 예측에 필요한 데이터 분리
x_test = df_test.drop(["is_converted", "id"], axis=1)

In [103]:
test_pred = final_model.predict(x_test.fillna(0))
sum(test_pred) # True로 예측된 개수

1475

## 제출파일 작성

In [32]:
# 제출 데이터 읽어오기 (df_test는 전처리된 데이터가 저장됨)
df_sub = pd.read_csv("submission.csv")
df_sub["is_converted"] = test_pred

# 제출 파일 저장
df_sub.to_csv("submission.csv", index=False)

### 우측 상단의 제출 버튼을 클릭해 결과를 확인하세요