# 라이브러리 호출

In [2]:
import pandas as pd
import random
import os
import numpy as np
from sklearn.tree import DecisionTreeClassifier

# 시드(seed) 고정

매번 고정된 결과를 얻기 위해서 사용합니다.  
시드를 고정하지 않는다면 같은 코드라도 매번 다른 결과가 나올 수 있습니다.  
항상 동일한 결과를 얻기 위해서 사용합니다.

In [3]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) # Seed 고정

# 데이터 불러오기
read_csv() 함수를 사용하여 데이터를 불러옵니다.

In [82]:
train = pd.read_csv('/content/train.csv')
test = pd.read_csv('/content/test.csv')

In [57]:
# train.csv 파일의 데이터를 확인합니다.
train.head(3)

Unnamed: 0,ID,가입일,음성사서함이용,주간통화시간,주간통화횟수,주간통화요금,저녁통화시간,저녁통화횟수,저녁통화요금,밤통화시간,밤통화횟수,밤통화요금,상담전화건수,전화해지여부
0,TRAIN_00000,329,0,99.2,93,27.3,268.8,68,28.92,262.9,328,32.89,2,0
1,TRAIN_00001,2,80,323.9,323,83.7,269.4,326,32.09,322.8,209,32.32,2,0
2,TRAIN_00002,93,28,282.4,323,34.2,207.0,322,32.82,280.8,328,8.28,0,0


# 피처엔지니어링


In [None]:
test

In [30]:
train['총통화시간'] = train['주간통화시간'] + train['저녁통화시간'] + train['밤통화시간']
train['총통화횟수'] = train['주간통화횟수'] + train['저녁통화횟수'] + train['밤통화횟수']
train['총통화요금'] = train['주간통화요금'] + train['저녁통화요금'] + train['밤통화요금']
train['평균통화시간'] = train['총통화시간'] / train['총통화횟수']
train['평균통화요금'] = train['총통화요금'] / train['총통화횟수']
train['주간통화사용'] = train['주간통화시간'] + train['주간통화횟수']
train['저녁통화사용'] = train['저녁통화시간'] + train['저녁통화횟수']
train['밤통화사용'] = train['밤통화시간'] + train['밤통화횟수']
train['평균통화사용'] = train['주간통화사용'] + train['저녁통화시간'] + train['밤통화사용']
train['상담빈도횟수'] =train['상담전화건수']/ train['가입일']

In [70]:
test['총통화시간'] = test['주간통화시간'] + test['저녁통화시간'] + test['밤통화시간']
test['총통화횟수'] = test['주간통화횟수'] + test['저녁통화횟수'] + test['밤통화횟수']
test['총통화요금'] = test['주간통화요금'] + test['저녁통화요금'] + test['밤통화요금']
test['평균통화시간'] = test['총통화시간'] / test['총통화횟수']
test['평균통화요금'] = test['총통화요금'] / test['총통화횟수']
test['주간통화사용'] = test['주간통화시간'] + test['주간통화횟수']
test['저녁통화사용'] = test['저녁통화시간'] + test['저녁통화횟수']
test['밤통화사용'] = test['밤통화시간'] + test['밤통화횟수']
test['평균통화사용'] = test['주간통화사용'] + test['저녁통화시간'] + test['밤통화사용']
test['상담빈도횟수'] =test['상담전화건수']/ test['가입일']


In [83]:
# 예측에 도움이 될 수 없는 ID 컬럼을 제거합니다.
# x_train은 독립변수이므로 종속변수를 제거합니다.
x_train = train.drop(['ID', '전화해지여부'], axis = 1)

# y_train은 종속변수의 값을 설정해줍니다.
y_train = train['전화해지여부']

# test 데이터에서 예측에 도움이 될 수 없는 ID 컬럼을 제거합니다.
test = test.drop('ID', axis = 1)

In [None]:
test

In [16]:
from sklearn.cluster import KMeans
CLUSTERS = [10,50,100,500,1000]


for n_clusters in CLUSTERS:
    # KMeans 클러스터링 모델을 생성합니다
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    
    # 모델을 훈련합니다
    kmeans.fit(x_train)
    
    # 각 데이터가 속한 클러스터 번호를 예측합니다
    cluster_labels = kmeans.predict(x_train)
    
    # 클러스터링 결과를 저장합니다
    x_train['cluster_'+str(n_clusters)] = cluster_labels




In [17]:
from sklearn.cluster import KMeans
CLUSTERS = [10,50,100,500,1000]


for n_clusters in CLUSTERS:
    # KMeans 클러스터링 모델을 생성합니다
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    
    # 모델을 훈련합니다
    kmeans.fit(test)
    
    # 각 데이터가 속한 클러스터 번호를 예측합니다
    cluster_labels = kmeans.predict(test)
    
    # 클러스터링 결과를 저장합니다
    test['cluster_'+str(n_clusters)] = cluster_labels



# 주성분 분석

In [None]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [None]:
temp = train.drop(['ID', '전화해지여부'], axis=1)
temp

In [None]:
train

In [None]:
temp_test = test.drop(['ID'], axis=1)
temp

In [None]:
scaler = StandardScaler()
temp = scaler.fit_transform(temp)
temp

In [None]:
scaler = StandardScaler()
temp_test = scaler.fit_transform(test)
temp

In [None]:
pca = PCA()
pca.fit(temp)

In [None]:
pca_test = PCA()
pca_test.fit(temp_test)

In [None]:
print(pca.explained_variance_)
print(pca.explained_variance_ratio_)

In [None]:
sum = 0
for i, ratio in enumerate(pca.explained_variance_ratio_):
    sum += ratio
    print(f'{sum} by PCA{i}')

In [None]:
sum = 0
for i, ratio in enumerate(pca_test.explained_variance_ratio_):
    sum += ratio
    print(f'{sum} by PCA{i}')

In [None]:
temp = pca.fit_transform(temp)
temp = pd.DataFrame(temp)
temp

In [None]:
temp_test = pca_test.fit_transform(temp_test)
temp_test = pd.DataFrame(temp_test)
temp_test

In [None]:
train = pd.concat([train, temp.iloc[:, 0:8]], axis=1)
train

In [None]:
train = train.drop(['ID', '전화해지여부'], axis=1)

In [None]:
train

In [None]:
test.columns

In [None]:
test = pd.concat([test, temp_test.iloc[:, 0:8]], axis=1)
test

In [None]:
train.columns = train.columns.astype(str)
test.columns = train.columns.astype(str)  # 모든 열 이름을 문자열로 변환

In [22]:
x_train

Unnamed: 0,가입일,음성사서함이용,주간통화시간,주간통화횟수,주간통화요금,저녁통화시간,저녁통화횟수,저녁통화요금,밤통화시간,밤통화횟수,...,주간통화사용,저녁통화사용,밤통화사용,평균통화사용,상담빈도횟수,cluster_10,cluster_50,cluster_100,cluster_500,cluster_1000
0,329,0,99.2,93,27.3,268.8,68,28.92,262.9,328,...,192.2,336.8,590.9,1051.9,0.006079,3,44,45,295,334
1,2,80,323.9,323,83.7,269.4,326,32.09,322.8,209,...,646.9,595.4,531.8,1448.1,1.000000,1,40,23,122,248
2,93,28,282.4,323,34.2,207.0,322,32.82,280.8,328,...,605.4,529.0,608.8,1421.2,0.000000,1,40,23,263,426
3,223,1,221.4,223,25.1,233.0,61,23.90,203.8,234,...,444.4,294.0,437.8,1115.2,0.000000,5,27,34,114,555
4,222,0,96.3,222,28.7,223.9,69,28.08,263.1,223,...,318.3,292.9,486.1,1028.3,0.036036,3,42,45,210,447
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30195,263,80,289.6,201,21.8,280.5,323,29.88,208.0,66,...,490.6,603.5,274.0,1045.1,0.007605,6,32,69,166,179
30196,283,81,210.7,280,90.5,284.1,202,32.80,287.8,203,...,490.7,486.1,490.8,1265.6,0.007067,8,49,70,2,801
30197,24,0,222.4,33,22.1,233.9,32,22.22,293.6,95,...,255.4,265.9,388.6,877.9,0.083333,0,3,92,92,586
30198,63,1,262.4,202,29.6,280.6,282,28.88,280.9,207,...,464.4,562.6,487.9,1232.9,0.031746,8,30,20,264,507


# 모델 정의  
의사결정나무 모델을 사용하기 위해 모델을 정의합니다.  
[의사결정나무 모델의 개념을 알고 싶다면 여기를 클릭해주세요.](https://dacon.io/competitions/open/235698/talkboard/403509?page=1&dtype=recent)

In [60]:
x_train

Unnamed: 0,가입일,음성사서함이용,주간통화시간,주간통화횟수,주간통화요금,저녁통화시간,저녁통화횟수,저녁통화요금,밤통화시간,밤통화횟수,...,총통화시간,총통화횟수,총통화요금,평균통화시간,평균통화요금,주간통화사용,저녁통화사용,밤통화사용,평균통화사용,상담빈도횟수
0,329,0,99.2,93,27.3,268.8,68,28.92,262.9,328,...,630.9,489,89.11,1.290184,0.182229,192.2,336.8,590.9,1051.9,0.006079
1,2,80,323.9,323,83.7,269.4,326,32.09,322.8,209,...,916.1,858,148.11,1.067716,0.172622,646.9,595.4,531.8,1448.1,1.000000
2,93,28,282.4,323,34.2,207.0,322,32.82,280.8,328,...,770.2,973,75.30,0.791572,0.077390,605.4,529.0,608.8,1421.2,0.000000
3,223,1,221.4,223,25.1,233.0,61,23.90,203.8,234,...,658.2,518,58.36,1.270656,0.112664,444.4,294.0,437.8,1115.2,0.000000
4,222,0,96.3,222,28.7,223.9,69,28.08,263.1,223,...,583.3,514,59.58,1.134825,0.115914,318.3,292.9,486.1,1028.3,0.036036
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30195,263,80,289.6,201,21.8,280.5,323,29.88,208.0,66,...,778.1,590,60.96,1.318814,0.103322,490.6,603.5,274.0,1045.1,0.007605
30196,283,81,210.7,280,90.5,284.1,202,32.80,287.8,203,...,782.6,685,129.58,1.142482,0.189168,490.7,486.1,490.8,1265.6,0.007067
30197,24,0,222.4,33,22.1,233.9,32,22.22,293.6,95,...,749.9,160,48.54,4.686875,0.303375,255.4,265.9,388.6,877.9,0.083333
30198,63,1,262.4,202,29.6,280.6,282,28.88,280.9,207,...,823.9,691,79.36,1.192330,0.114848,464.4,562.6,487.9,1232.9,0.031746


In [59]:
!pip install optuna

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting optuna
  Downloading optuna-3.1.0-py3-none-any.whl (365 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m365.3/365.3 KB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=1.5.0
  Downloading alembic-1.10.2-py3-none-any.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.2/212.2 KB[0m [31m24.4 MB/s[0m eta [36m0:00:00[0m
Collecting colorlog
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting cmaes>=0.9.1
  Downloading cmaes-0.9.1-py3-none-any.whl (21 kB)
Collecting Mako
  Downloading Mako-1.2.4-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.7/78.7 KB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Mako, colorlog, cmaes, alembic, optuna
Successfully installed Mako-1.2.4 alembic-1.10.2 cmaes-0.9.1 colorlog-6.7.0 optuna-3.1.0


In [64]:
!pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.1.1-cp39-none-manylinux1_x86_64.whl (76.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.6/76.6 MB[0m [31m19.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.1.1


In [None]:
from sklearn.ensemble import StackingClassifier
from sklearn.model_selection import KFold
import lightgbm as lgb
from catboost import CatBoostClassifier
import xgboost as xgb
import numpy as np
from imblearn.over_sampling import SMOTE

# 모델 파라미터 설정
lgb_params = {'max_depth': 10, 'learning_rate': 0.09}

cat_params = {'max_depth': 8, 'learning_rate': 0.06800749578049192, 'n_estimators': 967}

xgb_params = {'max_depth': 10, 'learning_rate': 0.0493607891381531}
smote = SMOTE(random_state=42)             

# 모델 객체 생성
lgb_model = lgb.LGBMClassifier(**lgb_params)
cat_model = CatBoostClassifier(**cat_params)
xgb_model = xgb.XGBClassifier(**xgb_params)

# 스태킹 분류기 객체 생성
estimators = [('lgb', lgb_model), ('cat', cat_model), ('xgb', xgb_model)]
stacking_model = StackingClassifier(estimators=estimators, final_estimator=lgb_model)


x_train_resampled, y_train_resampled = smote.fit_resample(x_train, y_train)


# k-fold 교차 검증을 수행하여 스태킹 모델 학습
kf = KFold(n_splits=50, shuffle=True, random_state=42)
meta_features = np.zeros((len(x_train_resampled), 3))  # 두번째 레이어 모델의 입력 feature를 저장할 배열
for train_idx, val_idx in kf.split(x_train_resampled):
    x_train_fold = x_train_resampled.iloc[train_idx]
    y_train_fold = y_train_resampled[train_idx]
    x_val_fold = x_train_resampled.iloc[val_idx]
    y_val_fold = y_train_resampled[val_idx]

    # train set을 이용하여 첫번째 레이어 모델 학습
    lgb_model.fit(x_train_fold, y_train_fold)
    cat_model.fit(x_train_fold, y_train_fold)
    xgb_model.fit(x_train_fold, y_train_fold)

    # validation set에 대한 첫번째 레이어 모델의 예측 확률 계산
    lgb_pred = lgb_model.predict_proba(x_val_fold)[:, 1]
    cat_pred = cat_model.predict_proba(x_val_fold)[:, 1]
    xgb_pred = xgb_model.predict_proba(x_val_fold)[:, 1]

    # 두번째 레이어 모델에 입력할 feature 생성
    meta_features[val_idx, 0] = lgb_pred
    meta_features[val_idx, 1] = cat_pred
    meta_features[val_idx, 2] = xgb_pred

# 두번째 레이어 모델 학습
stacking_model.fit(meta_features, y_train)



[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
802:	learn: 0.1133232	total: 11.1s	remaining: 2.27s
803:	learn: 0.1131777	total: 11.1s	remaining: 2.26s
804:	learn: 0.1130642	total: 11.2s	remaining: 2.24s
805:	learn: 0.1129616	total: 11.2s	remaining: 2.23s
806:	learn: 0.1128483	total: 11.2s	remaining: 2.22s
807:	learn: 0.1127513	total: 11.2s	remaining: 2.2s
808:	learn: 0.1126244	total: 11.2s	remaining: 2.19s
809:	learn: 0.1125469	total: 11.2s	remaining: 2.17s
810:	learn: 0.1124557	total: 11.2s	remaining: 2.16s
811:	learn: 0.1123374	total: 11.2s	remaining: 2.15s
812:	learn: 0.1121380	total: 11.3s	remaining: 2.13s
813:	learn: 0.1119307	total: 11.3s	remaining: 2.12s
814:	learn: 0.1117523	total: 11.3s	remaining: 2.1s
815:	learn: 0.1116988	total: 11.3s	remaining: 2.09s
816:	learn: 0.1115687	total: 11.3s	remaining: 2.08s
817:	learn: 0.1114816	total: 11.3s	remaining: 2.06s
818:	learn: 0.1113966	total: 11.3s	remaining: 2.05s
819:	learn: 0.1113117	total: 11.3s	remaining: 2.03s
820:	learn: 0.11

In [131]:
test_meta_features = np.column_stack((
    lgb_model.predict_proba(test)[:, 1],
    cat_model.predict_proba(test)[:, 1],
    xgb_model.predict_proba(test)[:, 1]
))

test_preds = stacking_model.predict_proba(test_meta_features)[:, 1]

array([0.30472982, 0.34788638, 0.13367295, ..., 0.20619911, 0.29537365,
       0.28701458])

In [152]:
# threshold 값을 설정합니다.
th = 0.4=3

# test_preds 값이 threshold(th) 이하면 0으로, 그 이상이면 1로 변환합니다.
test_preds_binary = np.where(test_preds <= th, 0, 1)

In [153]:
test_preds_binary

array([0, 0, 0, ..., 0, 0, 0])

#  파일 저장
예측 결과를 제출하기 위해서 예측값을 제출 파일에 할당하고 저장합니다.

In [154]:
# 제출 파일을 불러옵니다.
submit = pd.read_csv('./sample_submission.csv')
# 예측한 값을 '전화해지여부' 컬럼에 할당합니다.


In [155]:
# 예측한 값을 '전화해지여부' 컬럼에 할당합니다.
submit['전화해지여부'] = test_preds_binary
submit.head()

Unnamed: 0,ID,전화해지여부
0,TEST_00000,0
1,TEST_00001,0
2,TEST_00002,0
3,TEST_00003,1
4,TEST_00004,1


In [156]:
# 예측한 파일을 저장합니다.
submit.to_csv('./lgbm_fold25.csv', index = False)