In [94]:
import pandas as pd
import numpy as np
import random
import os

SEED = 42


def seed_everything(seed: int = SEED):
    """Seed the global random number generators used by this notebook.

    Seeds Python's ``random`` module and NumPy's legacy global generator, and
    stores the seed in the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Seed value applied to every generator. Defaults to ``SEED``.
    """
    # NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
    # setting it here presumably only affects child processes - confirm.
    os.environ["PYTHONHASHSEED"] = str(seed)
    for reseed in (random.seed, np.random.seed):
        reseed(seed)


seed_everything()

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, StackingClassifier, BaggingClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifierCV
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

import catboost
from catboost import CatBoostClassifier, CatBoostRegressor

from sklearn.metrics import f1_score, roc_auc_score

import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides deprecation/convergence warnings from the
# models fitted below - consider narrowing the filter.
warnings.filterwarnings('ignore')

In [159]:
# Load the preprocessed train / test tables produced by an earlier step.
PREPROCESSED_DIR = 'preprocessed_csv'
train = pd.read_csv(f'{PREPROCESSED_DIR}/train_preprocessed.csv')
test = pd.read_csv(f'{PREPROCESSED_DIR}/test_preprocessed.csv')

In [160]:
# Replace missing values in the three win-rate columns with 0, in both frames.
win_rate_columns = ['com_reg_ver_win_rate', 'ver_win_rate_x', 'ver_win_ratio_per_bu']
for frame in (train, test):
    frame[win_rate_columns] = frame[win_rate_columns].replace(np.nan, 0)

In [161]:
def encode_categorical_variables(train, test):
    """Label-encode every object-dtype column of ``train`` and reuse the codes for ``test``.

    For each object column found in ``train`` a fresh ``LabelEncoder`` is
    fitted on the train values. Labels that occur only in ``test`` are appended
    to the encoder's ``classes_`` before ``test`` is transformed, so that
    ``transform`` accepts them.

    Both frames are modified in place and also returned.

    Args:
        train: DataFrame whose object-dtype columns decide what gets encoded.
        test: DataFrame containing the same columns.

    Returns:
        The ``(train, test)`` pair with the encoded columns.
    """
    categorical_columns = train.select_dtypes(include=['object']).columns.tolist()

    for col in categorical_columns:
        le = LabelEncoder().fit(train[col])
        train[col] = le.transform(train[col])

        # Collect test-only labels with a set lookup and extend classes_ once,
        # instead of scanning and re-allocating the array for every label.
        known_labels = set(le.classes_)
        unseen_labels = [label for label in np.unique(test[col]) if label not in known_labels]
        if unseen_labels:
            le.classes_ = np.append(le.classes_, unseen_labels)
        test[col] = le.transform(test[col])

    return train, test

# Encode the object-dtype columns; train and test are rebound to the returned frames.
train, test = encode_categorical_variables(train, test)

In [162]:
def encode_with_other_category(train, test, column_name):
    """Label-encode one column, mapping test labels unseen in train to ``'other'``.

    The column is cast to ``str`` in both frames, a ``LabelEncoder`` is fitted
    on the train values, and every test value that is not a train label is
    replaced by the string ``'other'`` before encoding.

    Both frames are modified in place and also returned.

    Args:
        train: DataFrame used to fit the encoder.
        test: DataFrame encoded with the train labels plus ``'other'``.
        column_name: Name of the column to encode in both frames.

    Returns:
        The ``(train, test)`` pair with ``column_name`` encoded.
    """
    # Cast to str so integer ids and string labels are handled uniformly.
    train[column_name] = train[column_name].astype(str)
    test[column_name] = test[column_name].astype(str)

    le = LabelEncoder()
    le.fit(train[column_name])

    # Labels absent from train collapse into 'other'. A set gives a constant-time
    # lookup per row instead of a linear scan of classes_.
    known_labels = set(le.classes_)
    test[column_name] = test[column_name].apply(
        lambda value: value if value in known_labels else 'other'
    )

    # Register the 'other' label, unless train already contains that value, so
    # classes_ never holds a duplicate entry.
    if 'other' not in known_labels:
        le.classes_ = np.append(le.classes_, 'other')

    # Apply the label encoding.
    train[column_name] = le.transform(train[column_name])
    test[column_name] = le.transform(test[column_name])

    return train, test

# Encode the id-like columns in turn; test values not present in train become 'other'.
for other_encoded_column in ("customer_idx", "lead_owner"):
    train, test = encode_with_other_category(train, test, other_encoded_column)

In [163]:
# Separate the target column from the features.
train_y = train['is_converted']
train_X = train.drop(columns='is_converted')

# Hold out 20% of the rows for validation, stratified on the target.
# random_state is pinned so the split does not depend on the global RNG state,
# i.e. on which cells were executed before this one.
train_X, valid_X, train_y, valid_y = train_test_split(
    train_X,
    train_y,
    train_size=0.8,
    shuffle=True,
    stratify=train_y,
    random_state=SEED,
)

# The test frame carries an is_converted column as well; remove it so the
# feature columns match. errors='ignore' keeps this cell safe to re-run.
test = test.drop(columns='is_converted', errors='ignore')

# Stacking

In [164]:
# Base learners for the stacking ensemble.
# The hyper-parameter values are reproduced unchanged; the long decimals
# presumably come from an automated search. Values that were written as
# int(<float>) are spelled as the resulting integer, with the raw value noted.

lgbm_params = dict(
    nthread=4,
    n_estimators=1000,
    learning_rate=0.02,
    max_depth=10,
    num_leaves=60,
    colsample_bytree=0.511,
    subsample=0.785,
    max_bin=208,
    reg_alpha=7.009,
    reg_lambda=6.579,
    min_child_weight=40,
    min_child_samples=91,
    silent=-1,
    verbose=-1,
    random_state=SEED,
)
lgbm = LGBMClassifier(**lgbm_params)

catboost_params = dict(
    iterations=10670,
    od_wait=1507,
    learning_rate=0.16098437506461638,
    reg_lambda=24.546834859312227,
    subsample=0.16549759534249936,
    random_strength=49.80933051421884,
    depth=6,
    min_data_in_leaf=14,
    leaf_estimation_iterations=14,
    bagging_temperature=0.39841620949747386,
    colsample_bylevel=0.6140733146459497,
    verbose=100,  # one progress line every 100 iterations
    task_type='CPU',
    random_state=SEED,
)
cbc = CatBoostClassifier(**catboost_params)

xgb_params = dict(
    colsample_bytree=0.8497054793943353,
    gamma=3.935081236491979,
    learning_rate=0.07509368737832518,
    max_depth=7,          # int(7.215329530514387)
    min_child_weight=1,   # int(1.7019371790152105)
    n_estimators=240,     # int(240.30115793246688)
    subsample=0.971358258121751,
    random_state=SEED,
)
xgb = XGBClassifier(**xgb_params)

rf_params = dict(
    max_depth=77,           # int(77.32983850657278)
    min_samples_leaf=2,     # int(2.9945638773827494)
    min_samples_split=3,    # int(3.126668709295796)
    n_estimators=224,       # int(224.4759742656425)
    random_state=SEED,
)
rf = RandomForestClassifier(**rf_params)

# The remaining two learners run with library defaults (bagging only gets the seed).
ridge = RidgeClassifierCV()
bagging = BaggingClassifier(random_state=SEED)

In [165]:
# Level-0 learners as (name, estimator) pairs; a logistic regression combines
# their outputs into the final prediction.
estimators = [
    ('lgbm', lgbm),
    ('cbc', cbc),
    ('xgb', xgb),
    ('rf', rf),
    ('ridge', ridge),
    ('bagging', bagging),
]
meta_learner = LogisticRegression()
stack = StackingClassifier(estimators=estimators, final_estimator=meta_learner, verbose=1)
stack.fit(train_X, train_y)

0:	learn: 0.5451832	total: 13.9ms	remaining: 2m 28s
100:	learn: 0.0080865	total: 1.56s	remaining: 2m 43s
200:	learn: 0.0056780	total: 2.84s	remaining: 2m 28s
300:	learn: 0.0049295	total: 4.08s	remaining: 2m 20s
400:	learn: 0.0044190	total: 5.18s	remaining: 2m 12s
500:	learn: 0.0040814	total: 6.99s	remaining: 2m 21s
600:	learn: 0.0037746	total: 8.15s	remaining: 2m 16s
700:	learn: 0.0035220	total: 9.28s	remaining: 2m 11s
800:	learn: 0.0032956	total: 10.4s	remaining: 2m 7s
900:	learn: 0.0031871	total: 11.4s	remaining: 2m 3s
1000:	learn: 0.0030325	total: 12.4s	remaining: 1m 59s
1100:	learn: 0.0029360	total: 13.4s	remaining: 1m 56s
1200:	learn: 0.0028020	total: 14.4s	remaining: 1m 53s
1300:	learn: 0.0027176	total: 15.4s	remaining: 1m 51s
1400:	learn: 0.0026386	total: 16.4s	remaining: 1m 48s
1500:	learn: 0.0025711	total: 17.5s	remaining: 1m 47s
1600:	learn: 0.0025189	total: 18.5s	remaining: 1m 44s
1700:	learn: 0.0024575	total: 19.5s	remaining: 1m 42s
1800:	learn: 0.0023701	total: 20.5s	remai

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.




[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   13.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


0:	learn: 0.5459911	total: 14.4ms	remaining: 2m 33s
100:	learn: 0.0077592	total: 1.56s	remaining: 2m 43s
200:	learn: 0.0060747	total: 2.88s	remaining: 2m 29s
300:	learn: 0.0049907	total: 4.05s	remaining: 2m 19s
400:	learn: 0.0043995	total: 5.23s	remaining: 2m 14s
500:	learn: 0.0038637	total: 6.48s	remaining: 2m 11s
600:	learn: 0.0035747	total: 7.73s	remaining: 2m 9s
700:	learn: 0.0031201	total: 8.95s	remaining: 2m 7s
800:	learn: 0.0029310	total: 10s	remaining: 2m 3s
900:	learn: 0.0028244	total: 11.1s	remaining: 1m 59s
1000:	learn: 0.0027232	total: 12.1s	remaining: 1m 57s
1100:	learn: 0.0026177	total: 13.2s	remaining: 1m 54s
1200:	learn: 0.0024927	total: 14.3s	remaining: 1m 52s
1300:	learn: 0.0024242	total: 15.4s	remaining: 1m 50s
1400:	learn: 0.0023007	total: 16.4s	remaining: 1m 48s
1500:	learn: 0.0022113	total: 17.5s	remaining: 1m 47s
1600:	learn: 0.0021573	total: 18.6s	remaining: 1m 45s
1700:	learn: 0.0021133	total: 19.7s	remaining: 1m 43s
1800:	learn: 0.0020475	total: 20.7s	remainin

4600:	learn: 0.0018766	total: 1m 14s	remaining: 1m 37s
4700:	learn: 0.0018719	total: 1m 15s	remaining: 1m 36s
4800:	learn: 0.0018656	total: 1m 17s	remaining: 1m 34s
4900:	learn: 0.0018536	total: 1m 18s	remaining: 1m 32s
5000:	learn: 0.0018438	total: 1m 20s	remaining: 1m 31s
5100:	learn: 0.0018219	total: 1m 21s	remaining: 1m 29s
5200:	learn: 0.0018131	total: 1m 23s	remaining: 1m 27s
5300:	learn: 0.0017952	total: 1m 25s	remaining: 1m 26s
5400:	learn: 0.0017881	total: 1m 27s	remaining: 1m 24s
5500:	learn: 0.0017798	total: 1m 28s	remaining: 1m 23s
5600:	learn: 0.0017659	total: 1m 30s	remaining: 1m 21s
5700:	learn: 0.0017549	total: 1m 31s	remaining: 1m 19s
5800:	learn: 0.0017462	total: 1m 33s	remaining: 1m 18s
5900:	learn: 0.0017277	total: 1m 34s	remaining: 1m 16s
6000:	learn: 0.0017248	total: 1m 36s	remaining: 1m 14s
6100:	learn: 0.0017198	total: 1m 37s	remaining: 1m 13s
6200:	learn: 0.0017132	total: 1m 39s	remaining: 1m 11s
6300:	learn: 0.0017085	total: 1m 41s	remaining: 1m 10s
6400:	lear

9100:	learn: 0.0012932	total: 1m 46s	remaining: 18.4s
9200:	learn: 0.0012910	total: 1m 48s	remaining: 17.3s
9300:	learn: 0.0012889	total: 1m 49s	remaining: 16.2s
9400:	learn: 0.0012876	total: 1m 51s	remaining: 15s
9500:	learn: 0.0012836	total: 1m 52s	remaining: 13.9s
9600:	learn: 0.0012767	total: 1m 53s	remaining: 12.7s
9700:	learn: 0.0012762	total: 1m 55s	remaining: 11.5s
9800:	learn: 0.0012739	total: 1m 56s	remaining: 10.3s
9900:	learn: 0.0012673	total: 1m 57s	remaining: 9.14s
10000:	learn: 0.0012617	total: 1m 58s	remaining: 7.96s
10100:	learn: 0.0012575	total: 2m	remaining: 6.78s
10200:	learn: 0.0012539	total: 2m 1s	remaining: 5.59s
10300:	learn: 0.0012450	total: 2m 2s	remaining: 4.39s
10400:	learn: 0.0012347	total: 2m 3s	remaining: 3.2s
10500:	learn: 0.0012320	total: 2m 4s	remaining: 2.01s
10600:	learn: 0.0012237	total: 2m 5s	remaining: 819ms
10669:	learn: 0.0012219	total: 2m 6s	remaining: 0us
0:	learn: 0.5460254	total: 28.7ms	remaining: 5m 5s
100:	learn: 0.0076273	total: 2.06s	rem

2800:	learn: 0.0018243	total: 32.4s	remaining: 1m 30s
2900:	learn: 0.0018050	total: 33.4s	remaining: 1m 29s
3000:	learn: 0.0017914	total: 34.6s	remaining: 1m 28s
3100:	learn: 0.0017753	total: 36s	remaining: 1m 27s
3200:	learn: 0.0017524	total: 37.3s	remaining: 1m 26s
3300:	learn: 0.0017321	total: 38.4s	remaining: 1m 25s
3400:	learn: 0.0017228	total: 39.5s	remaining: 1m 24s
3500:	learn: 0.0016973	total: 40.6s	remaining: 1m 23s
3600:	learn: 0.0016803	total: 41.7s	remaining: 1m 21s
3700:	learn: 0.0016638	total: 42.8s	remaining: 1m 20s
3800:	learn: 0.0016459	total: 44.3s	remaining: 1m 20s
3900:	learn: 0.0016431	total: 45.9s	remaining: 1m 19s
4000:	learn: 0.0016408	total: 47.5s	remaining: 1m 19s
4100:	learn: 0.0016370	total: 49.5s	remaining: 1m 19s
4200:	learn: 0.0016305	total: 51.5s	remaining: 1m 19s
4300:	learn: 0.0016239	total: 53.7s	remaining: 1m 19s
4400:	learn: 0.0016053	total: 56s	remaining: 1m 19s
4500:	learn: 0.0015946	total: 57.9s	remaining: 1m 19s
4600:	learn: 0.0015811	total: 59

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 12.2min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   37.3s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   58.9s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.6s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    8.0s finished


In [166]:
# Score the fitted stack on the held-out validation split.
pred = stack.predict(valid_X)
# scikit-learn metrics take (y_true, y_pred). F1 on binary labels is symmetric,
# so the value is unchanged, but the correct order keeps this line right if the
# metric is later swapped for precision or recall.
score = f1_score(valid_y, pred)
print(score)  # validation score

0.9824561403508771


In [169]:
# Predictions for the submission file.
# A row is flagged True when its positive-class probability reaches the threshold.
best_threshold = 0.00119
pred_probs = stack.predict_proba(test)[:, 1]
final_preds = np.greater_equal(pred_probs, best_threshold)
print(final_preds.mean())  # share of True predictions

0.35325365205843295


In [170]:
# Convert the predictions to True / False.
def convert_to_boolean_vector(input_vector):
    """Return a boolean NumPy array that is True where ``input_vector`` equals 1.

    Args:
        input_vector: Array-like of predictions (NumPy array, pandas Series,
            or a plain list / tuple).

    Returns:
        ``numpy.ndarray`` of dtype ``bool`` with one entry per input element.
    """
    # Convert first: comparing a plain list to 1 yields a single Python bool
    # rather than an element-wise result.
    return np.array(np.asarray(input_vector) == 1, dtype=bool)
    
# Boolean vector that is written to the submission's is_converted column below.
result_vector = convert_to_boolean_vector(final_preds)

In [171]:
# Fill the submission template with the predictions and write it out.
submission = pd.read_csv('submission.csv')
submission['is_converted'] = result_vector
# index=False: without it pandas adds the row index as an extra unnamed first
# column, so the written file no longer has the same columns as the template.
submission.to_csv('submission_csv/submission.csv', index=False)