<a href="https://colab.research.google.com/github/jh941213/dacon_call/blob/main/final_dacon_call.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 라이브러리 호출

In [None]:
import pandas as pd
import random
import os
import numpy as np
from sklearn.tree import DecisionTreeClassifier

# 시드(seed) 고정

매번 고정된 결과를 얻기 위해서 사용합니다.  
시드를 고정하지 않는다면 같은 코드라도 매번 다른 결과가 나올 수 있습니다.  
항상 동일한 결과를 얻기 위해서 사용합니다.

In [None]:
def seed_everything(seed):
    """Seed the random number sources used by this notebook.

    Exports ``seed`` as the ``PYTHONHASHSEED`` environment variable and seeds
    both Python's ``random`` module and NumPy's legacy global generator.

    Args:
        seed: Integer seed value applied to every source.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    # Both seeders take the same argument, so apply them in one pass.
    for apply_seed in (random.seed, np.random.seed):
        apply_seed(seed)


seed_everything(42)  # fix the seed for the whole notebook

# 데이터 불러오기
read_csv() 함수를 사용하여 데이터를 불러옵니다.

In [None]:
# Locations of the competition CSV files.
train_path, test_path = '/content/train.csv', '/content/test.csv'

# Read the training and test tables into DataFrames.
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

In [None]:
# Preview the first three rows loaded from train.csv.
train.head(3)

Unnamed: 0,ID,가입일,음성사서함이용,주간통화시간,주간통화횟수,주간통화요금,저녁통화시간,저녁통화횟수,저녁통화요금,밤통화시간,밤통화횟수,밤통화요금,상담전화건수,전화해지여부
0,TRAIN_00000,329,0,99.2,93,27.3,268.8,68,28.92,262.9,328,32.89,2,0
1,TRAIN_00001,2,80,323.9,323,83.7,269.4,326,32.09,322.8,209,32.32,2,0
2,TRAIN_00002,93,28,282.4,323,34.2,207.0,322,32.82,280.8,328,8.28,0,0


# 피처엔지니어링


In [None]:
# Features: drop the ID column (an identifier that does not help prediction)
# and the target column, leaving only the independent variables.
x_train = train.drop(columns=['ID', '전화해지여부'])

# Target: the dependent variable to predict.
y_train = train['전화해지여부']

# Drop the ID column from the test data as well.
# `test` is rebound to its own result, so errors='ignore' keeps this cell
# re-runnable: a second execution no longer raises KeyError for the missing 'ID'.
test = test.drop(columns='ID', errors='ignore')

# Optuna를 이용한 최적의 파라미터 찾기

In [None]:
!pip install optuna

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting optuna
  Downloading optuna-3.1.0-py3-none-any.whl (365 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m365.3/365.3 KB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=1.5.0
  Downloading alembic-1.10.2-py3-none-any.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.2/212.2 KB[0m [31m22.2 MB/s[0m eta [36m0:00:00[0m
Collecting colorlog
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting cmaes>=0.9.1
  Downloading cmaes-0.9.1-py3-none-any.whl (21 kB)
Collecting Mako
  Downloading Mako-1.2.4-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.7/78.7 KB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Mako, colorlog, cmaes, alembic, optuna
Successfully installed Mako-1.2.4 alembic-1.10.2 cmaes-0.9.1 colorlog-6.7.0 optuna-3.1.0


In [None]:
!pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.1.1-cp39-none-manylinux1_x86_64.whl (76.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.6/76.6 MB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.1.1


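# Stacking
1단계(base) 모델 — 아래 코드에서 실제로 학습되는 모델:
- 익스트라트리 (ExtraTrees)
- CatBoost
- XGBoost
- LightGBM (LGBM)

디시젼트리(DecisionTree)는 import만 되어 있고 아래 스태킹 코드에서는 사용되지 않습니다.

블렌더 모델 : LGBM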

In [None]:
from sklearn.ensemble import StackingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import KFold
import lightgbm as lgb
from catboost import CatBoostClassifier
import xgboost as xgb
import numpy as np
from sklearn.ensemble import ExtraTreesClassifier
from tqdm.notebook import tqdm

# Model hyper-parameters; every model is seeded with random_state=42.
lgb_params = {'max_depth': 10, 'learning_rate': 0.09, 'random_state': 42}
# verbose=0 silences CatBoost's per-iteration training log, which otherwise
# floods the cell output (thousands of lines per fold).
cat_params = {'max_depth': 8, 'learning_rate': 0.06800749578049192, 'n_estimators': 967,
              'random_state': 42, 'verbose': 0}
xgb_params = {'max_depth': 10, 'learning_rate': 0.0493607891381531, 'random_state': 42}
ex_params = {'n_estimators': 489, 'max_depth': 36, 'min_samples_split': 2, 'min_samples_leaf': 1,
             'random_state': 42}

# First-layer (base) model objects.
lgb_model = lgb.LGBMClassifier(**lgb_params)
cat_model = CatBoostClassifier(**cat_params)
xgb_model = xgb.XGBClassifier(**xgb_params)
ex_model = ExtraTreesClassifier(**ex_params)

# Second-layer model. The column order of the meta-feature matrix follows
# the order of this list: lgb, cat, xgb, ex.
estimators = [('lgb', lgb_model), ('cat', cat_model), ('xgb', xgb_model), ('ex', ex_model)]
# NOTE(review): this StackingClassifier is fitted on the out-of-fold meta-features
# below, so it trains its own clones of the four base estimators on those
# probability columns before fitting the final LGBM. That is one more stacking
# layer on top of the manual one — confirm this is intended rather than fitting a
# single LGBM blender directly on `meta_features`.
stacking_model = StackingClassifier(estimators=estimators, final_estimator=lgb_model)

# Out-of-fold predictions: each training row gets its meta-features from models
# that did not see that row during fitting.
kf = KFold(n_splits=25, shuffle=True, random_state=42)
meta_features = np.zeros((len(x_train), len(estimators)))  # one column per base model
# kf.split() is a generator without a length, so pass the fold count explicitly
# to get a real progress bar.
for train_idx, val_idx in tqdm(kf.split(x_train), total=kf.get_n_splits()):
    # KFold yields positional indices, so index positionally on both objects;
    # plain y_train[...] is label-based and only matches for a default RangeIndex.
    x_train_fold = x_train.iloc[train_idx]
    y_train_fold = y_train.iloc[train_idx]
    x_val_fold = x_train.iloc[val_idx]

    # Fit each base model on the fold's training part and store its predicted
    # probability (column 1 of predict_proba) for the held-out rows.
    for col, (_name, model) in enumerate(estimators):
        model.fit(x_train_fold, y_train_fold)
        meta_features[val_idx, col] = model.predict_proba(x_val_fold)[:, 1]

# The base models are reused later to build the test-set meta-features. Refit them
# on the full training set so they are not left in the state of the last fold,
# which was trained on only a subset of the rows.
for _name, model in estimators:
    model.fit(x_train, y_train)

# Train the second-layer model on the out-of-fold meta-features.
stacking_model.fit(meta_features, y_train)



0it [00:00, ?it/s]

[1;30;43m스트리밍 출력 내용이 길어서 마지막 5000줄이 삭제되었습니다.[0m
802:	learn: 0.0908019	total: 9.26s	remaining: 1.89s
803:	learn: 0.0907018	total: 9.27s	remaining: 1.88s
804:	learn: 0.0905874	total: 9.28s	remaining: 1.87s
805:	learn: 0.0905348	total: 9.29s	remaining: 1.86s
806:	learn: 0.0904944	total: 9.3s	remaining: 1.84s
807:	learn: 0.0904647	total: 9.31s	remaining: 1.83s
808:	learn: 0.0903739	total: 9.33s	remaining: 1.82s
809:	learn: 0.0902674	total: 9.34s	remaining: 1.81s
810:	learn: 0.0901852	total: 9.35s	remaining: 1.8s
811:	learn: 0.0901598	total: 9.36s	remaining: 1.79s
812:	learn: 0.0901343	total: 9.37s	remaining: 1.77s
813:	learn: 0.0899858	total: 9.38s	remaining: 1.76s
814:	learn: 0.0898849	total: 9.39s	remaining: 1.75s
815:	learn: 0.0897500	total: 9.4s	remaining: 1.74s
816:	learn: 0.0896252	total: 9.41s	remaining: 1.73s
817:	learn: 0.0895730	total: 9.42s	remaining: 1.72s
818:	learn: 0.0894913	total: 9.43s	remaining: 1.7s
819:	learn: 0.0894026	total: 9.44s	remaining: 1.69s
820:	learn: 0.0893

TypeError: ignored

In [None]:
# Build the test-set meta-feature matrix: one column per base model, in the
# order lgb, cat, xgb, ex, each taken from column 1 of that model's predict_proba.
base_models = (lgb_model, cat_model, xgb_model, ex_model)
test_meta_features = np.column_stack(
    [model.predict_proba(test)[:, 1] for model in base_models]
)

# Feed the meta-features to the stacking model and keep its column-1 probability.
test_preds = stacking_model.predict_proba(test_meta_features)[:, 1]

In [None]:
# Decision threshold applied to test_preds.
th = 0.3

# Values at or below th become 0; values strictly above th become 1.
# The `<=` test is negated instead of being flipped to `>` so that a NaN
# (for which every comparison is False) still maps to 1, as before.
test_preds_binary = (~(test_preds <= th)).astype(int)

In [None]:
# Display the thresholded 0/1 predictions.
test_preds_binary

array([0, 0, 0, ..., 0, 0, 0])

#  파일 저장
예측 결과를 제출하기 위해서 예측값을 제출 파일에 할당하고 저장합니다.

In [None]:
# Load the sample submission file.
submit = pd.read_csv('./sample_submission.csv')
# The predictions are written into the '전화해지여부' column in the next cell.


In [None]:
# Put the predicted labels into the '전화해지여부' column of the submission
# table, then preview the first rows.
submit = submit.assign(**{'전화해지여부': test_preds_binary})
submit.head()

Unnamed: 0,ID,전화해지여부
0,TEST_00000,0
1,TEST_00001,0
2,TEST_00002,0
3,TEST_00003,0
4,TEST_00004,0


In [None]:
# Save the filled-in submission table to ./dc.csv without the index column.
submit.to_csv('./dc.csv', index = False)