In [49]:
import pandas as pd
import numpy as np
import random
import os

SEED = 42
def seed_everything(seed: int = SEED):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
seed_everything()

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

import catboost
from catboost import CatBoostClassifier, CatBoostRegressor

from sklearn.metrics import f1_score, roc_auc_score

import warnings
warnings.filterwarnings('ignore')

In [25]:
train = pd.read_csv('preprocessed_csv/train_preprocessed.csv')
test = pd.read_csv('preprocessed_csv/test_preprocessed.csv')

In [26]:
train[['com_reg_ver_win_rate', 'ver_win_rate_x', 'ver_win_ratio_per_bu']] = train[['com_reg_ver_win_rate', 'ver_win_rate_x', 'ver_win_ratio_per_bu']].replace(np.nan, 0)
test[['com_reg_ver_win_rate', 'ver_win_rate_x', 'ver_win_ratio_per_bu']] = test[['com_reg_ver_win_rate', 'ver_win_rate_x', 'ver_win_ratio_per_bu']].replace(np.nan, 0)

In [27]:
def encode_categorical_variables(train, test):
    categorical_columns = train.select_dtypes(include=['object']).columns.tolist()
    
    label_encoders = {}
    
    for col in categorical_columns:
        le = LabelEncoder()
        le = le.fit(train[col])
        train[col] = le.transform(train[col])
        
        for label in np.unique(test[col]):
            if label not in le.classes_:
                le.classes_ = np.append(le.classes_, label)
        test[col] = le.transform(test[col])
    
    return train, test

train, test = encode_categorical_variables(train, test)

In [28]:
def encode_with_other_category(train, test, column_name):
    le = LabelEncoder()
    train[column_name] = train[column_name].astype(str)  # int를 str로 변환
    le.fit(train[column_name])
    
    # 테스트 데이터에 대해 'other' 카테고리 처리
    # train에 없는 라벨을 기준으로 한다.
    test[column_name] = test[column_name].astype(str)  # int를 str로 변환
    test[column_name] = test[column_name].apply(lambda x: x if x in le.classes_ else 'other')
    
    # 'other' 라벨 추가
    le_classes = np.append(le.classes_, 'other')
    le.classes_ = le_classes
    
    # 라벨 인코딩 적용
    train[column_name] = le.transform(train[column_name])
    test[column_name] = le.transform(test[column_name])
    
    return train, test

train, test = encode_with_other_category(train, test, "customer_idx")
train, test = encode_with_other_category(train, test, "lead_owner")

In [29]:
train_y = train['is_converted']
train_X = train.drop(columns='is_converted', axis=1)

train_X, valid_X, train_y, valid_y = train_test_split(train_X, train_y, train_size=0.8, shuffle=True, stratify=train_y)
test = test.drop(columns='is_converted', axis=1)

# Voting

In [63]:
lgbm = LGBMClassifier(
        nthread=4,
        n_estimators=1000,
        learning_rate=0.02,
        max_depth=10,
        num_leaves=60,
        colsample_bytree=0.511,
        subsample=0.785,
        max_bin=208,
        reg_alpha=7.009,
        reg_lambda=6.579,
        min_child_weight=40,
        min_child_samples=91,
        silent=-1,
        verbose=-1,
        random_state=SEED)

cbc = CatBoostClassifier(
        iterations=10670, 
        od_wait=1507, 
        learning_rate=0.16098437506461638, 
        reg_lambda=24.546834859312227, 
        subsample=0.16549759534249936, 
        random_strength=49.80933051421884, 
        depth=6, 
        min_data_in_leaf=14, 
        leaf_estimation_iterations=14, 
        bagging_temperature=0.39841620949747386, 
        colsample_bylevel=0.6140733146459497,
        verbose=100, 
        task_type='CPU',
        random_state=SEED)

xgb = XGBClassifier(
        colsample_bytree=0.8497054793943353, 
        gamma=3.935081236491979, 
        learning_rate=0.07509368737832518, 
        max_depth=int(7.215329530514387),  
        min_child_weight=int(1.7019371790152105),  
        n_estimators=int(240.30115793246688),  
        subsample=0.971358258121751,
        random_state=SEED)

rf = RandomForestClassifier(
        max_depth=int(77.32983850657278),  
        min_samples_leaf=int(2.9945638773827494),
        min_samples_split=int(3.126668709295796),  
        n_estimators=int(224.4759742656425),
        random_state=SEED)

# lr = LogisticRegression(
#         C=0.41994536370093083, 
#         penalty='l2', 
#         l1_ratio=0.7209270556562788,
#         max_iter=1000,
#         solver='saga',
#         random_state=SEED)

bagging = BaggingClassifier(random_state=SEED)

models = [lgbm, cbc, xgb, rf, bagging]

In [64]:
# weight 찾기
model_list = ['lgbm', 'cbc', 'xgb', 'rf', 'bagging']
i = 0; model_f1 = {}
best_threshold= 0.0035

for model in models:  
    model.fit(train_X, train_y)
    pred = model.predict(valid_X)
    if model_list[i] == 'cbc':
        pred = np.array(pred) == 'True'
    globals()[f'preds_{model_list[i]}'] = pred

    score = f1_score(valid_y, pred)
    model_f1[model_list[i]] = score
    i += 1   

0:	learn: 0.5451832	total: 11.5ms	remaining: 2m 2s
100:	learn: 0.0080865	total: 1.38s	remaining: 2m 24s
200:	learn: 0.0056780	total: 2.56s	remaining: 2m 13s
300:	learn: 0.0049295	total: 3.83s	remaining: 2m 11s
400:	learn: 0.0044190	total: 5s	remaining: 2m 8s
500:	learn: 0.0040814	total: 6.19s	remaining: 2m 5s
600:	learn: 0.0037746	total: 7.48s	remaining: 2m 5s
700:	learn: 0.0035220	total: 8.65s	remaining: 2m 3s
800:	learn: 0.0032956	total: 9.79s	remaining: 2m
900:	learn: 0.0031871	total: 10.9s	remaining: 1m 58s
1000:	learn: 0.0030325	total: 12.1s	remaining: 1m 56s
1100:	learn: 0.0029360	total: 13.2s	remaining: 1m 54s
1200:	learn: 0.0028020	total: 14.4s	remaining: 1m 53s
1300:	learn: 0.0027176	total: 15.4s	remaining: 1m 51s
1400:	learn: 0.0026386	total: 16.6s	remaining: 1m 50s
1500:	learn: 0.0025711	total: 17.9s	remaining: 1m 49s
1600:	learn: 0.0025189	total: 19s	remaining: 1m 47s
1700:	learn: 0.0024575	total: 20.2s	remaining: 1m 46s
1800:	learn: 0.0023701	total: 21.3s	remaining: 1m 44s

In [66]:
print(globals()['preds_lgbm'])
print(model_f1)   # {'lgbm': 0.9773429454170957, 'cbc': 0.9804727646454265, 'xgb': 0.9820788530465949, 'rf': 0.9798657718120805, 'bagging': 0.9783281733746131}

[False False  True ... False False False]
{'lgbm': 0.9773429454170957, 'cbc': 0.9804727646454265, 'xgb': 0.9820788530465949, 'rf': 0.9798657718120805, 'bagging': 0.9783281733746131}


In [67]:
estimators = [('lgbm', lgbm), ('cbc', cbc), ('xgb', xgb), ('rf', rf), ('bagging', bagging)]
best_model = VotingClassifier(estimators, voting='soft', weights=[1, 2, 2, 1, 1]) # hard voting은 predict_proba 안 됨
best_model.fit(train_X, train_y)

0:	learn: 0.5451832	total: 12.7ms	remaining: 2m 15s
100:	learn: 0.0080865	total: 1.24s	remaining: 2m 9s
200:	learn: 0.0056780	total: 2.42s	remaining: 2m 5s
300:	learn: 0.0049295	total: 3.59s	remaining: 2m 3s
400:	learn: 0.0044190	total: 4.66s	remaining: 1m 59s
500:	learn: 0.0040814	total: 5.72s	remaining: 1m 56s
600:	learn: 0.0037746	total: 6.85s	remaining: 1m 54s
700:	learn: 0.0035220	total: 7.92s	remaining: 1m 52s
800:	learn: 0.0032956	total: 9.17s	remaining: 1m 53s
900:	learn: 0.0031871	total: 10.2s	remaining: 1m 50s
1000:	learn: 0.0030325	total: 11.2s	remaining: 1m 48s
1100:	learn: 0.0029360	total: 12.3s	remaining: 1m 47s
1200:	learn: 0.0028020	total: 13.3s	remaining: 1m 45s
1300:	learn: 0.0027176	total: 14.4s	remaining: 1m 43s
1400:	learn: 0.0026386	total: 15.5s	remaining: 1m 42s
1500:	learn: 0.0025711	total: 16.5s	remaining: 1m 40s
1600:	learn: 0.0025189	total: 17.5s	remaining: 1m 39s
1700:	learn: 0.0024575	total: 18.5s	remaining: 1m 37s
1800:	learn: 0.0023701	total: 19.6s	remain

In [68]:
pred = best_model.predict(valid_X)
score = f1_score(pred, valid_y) 
print(score) # 검증 스코어

0.9820420728578758


In [87]:
# 제출용 prediction
best_threshold = 0.00485
pred_probs = best_model.predict_proba(test)[:, 1]
final_preds = pred_probs >= best_threshold
print(sum(final_preds) / len(final_preds)) # True 비율

0.32821096566116487


In [88]:
# True, False로 수정
def convert_to_boolean_vector(input_vector):
    return np.array(input_vector == 1, dtype=bool)          
    
result_vector = convert_to_boolean_vector(final_preds)

In [89]:
submission = pd.read_csv('submission.csv')
submission['is_converted'] = result_vector
submission.to_csv('submission_csv/voting_submission.csv')