## 데이터 불러오기

In [None]:
# Mount Google Drive inside the Colab runtime; the CSV paths used below live under it.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
# 필요한 라이브러리 설치
!pip install pycaret
!pip install catboost
!pip install mlxtend

# 데이터 처리 및 모델링을 위한 라이브러리
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# 모델링 관련 라이브러리
import xgboost as xgb
import catboost as cb
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from mlxtend.classifier import StackingClassifier
from sklearn.linear_model import LogisticRegression

# 데이터 불균형 처리를 위한 라이브러리
from imblearn.over_sampling import SMOTE

# 성능 평가 지표
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_score, recall_score

# 하이퍼파라미터 튜닝을 위한 라이브러리
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold


In [None]:
# Load the training data and the submission file.
def _read_csv_with_fallback(path, encodings=('utf-8-sig', 'cp1252', 'ISO-8859-1')):
    """Read a CSV file, trying each encoding in order until one decodes.

    ISO-8859-1 maps every possible byte to a character, so it can never raise
    UnicodeDecodeError; trying it first (as before) made the fallback
    unreachable and turned UTF-8 text into mojibake such as 'SÃ£o Paulo'.
    It is therefore kept only as the last resort.

    Args:
        path: Location of the CSV file.
        encodings: Encoding names to try, in order of preference.

    Returns:
        The DataFrame produced by the first encoding that decodes the file.

    Raises:
        ValueError: If ``encodings`` is empty.
        UnicodeDecodeError: If none of the encodings can decode the file.
    """
    if not encodings:
        raise ValueError("at least one encoding is required")
    last_error = None
    for encoding in encodings:
        try:
            return pd.read_csv(path, encoding=encoding)
        except UnicodeDecodeError as err:
            last_error = err
    raise last_error


train = '/content/drive/MyDrive/lg/dongtrain.csv'
df_train = _read_csv_with_fallback(train)


sub = '/content/drive/MyDrive/lg/dongsub.csv'
df_test = _read_csv_with_fallback(sub)

# Second, independent copy of the submission file; it is kept untouched by the
# preprocessing and later receives the predictions.
sub1 = '/content/drive/MyDrive/lg/dongsub.csv'
df_sub = _read_csv_with_fallback(sub1)


## 전처리

In [None]:
# Stack train and test rows into one frame so both receive identical preprocessing.
# The test frame's 'id' column is dropped before stacking.
test_without_id = df_test.drop(columns=['id'])
combined_data = pd.concat(
    [df_train, test_without_id],
    ignore_index=True,
)


In [None]:
# Add a 0/1 flag: business unit is 'AS' and the business area is either
# 'corporate / office' or 'residential (home)'.
target_areas = ['corporate / office', 'residential (home)']
in_target_area = combined_data['business_area'].isin(target_areas)
is_as_unit = combined_data['business_unit'].eq('AS')
combined_data['as_strategic_ver'] = (in_target_area & is_as_unit).astype(int)

print(combined_data)

       bant_submit            customer_country business_unit  \
0             1.00    /Quezon City/Philippines            AS   
1             1.00          /PH-00/Philippines            AS   
2             1.00             /Kolkata /India            AS   
3             1.00          /Bhubaneswar/India            AS   
4             1.00            /Hyderabad/India            AS   
...            ...                         ...           ...   
64565         0.50          /SÃ£o Paulo/Brazil            AS   
64566         0.25  General /  / United States            IT   
64567         0.75      / OURO BRANCO / Brazil            AS   
64568         0.00                /  / Germany            IT   
64569         0.25           / Ongole  / India            AS   

       com_reg_ver_win_rate  customer_idx          customer_type  enterprise  \
0                  0.066667         32160           End-Customer  Enterprise   
1                  0.066667         23122           End-Customer  Enter

  print(combined_data)


In [None]:
# Fill missing win rates with a sentinel so they are not confused with a genuine 0.
MISSING_WIN_RATE = -999
win_rate = combined_data['com_reg_ver_win_rate']
combined_data['com_reg_ver_win_rate'] = win_rate.fillna(MISSING_WIN_RATE)

In [None]:
# Two complementary 0/1 flags: whether 'historical_existing_cnt' is present or missing.
history_known = combined_data['historical_existing_cnt'].notna()
combined_data['has_historical'] = history_known.astype(int)
combined_data['no_historical'] = (~history_known).astype(int)

       bant_submit            customer_country business_unit  \
0             1.00    /Quezon City/Philippines            AS   
1             1.00          /PH-00/Philippines            AS   
2             1.00             /Kolkata /India            AS   
3             1.00          /Bhubaneswar/India            AS   
4             1.00            /Hyderabad/India            AS   
...            ...                         ...           ...   
64565         0.50          /SÃ£o Paulo/Brazil            AS   
64566         0.25  General /  / United States            IT   
64567         0.75      / OURO BRANCO / Brazil            AS   
64568         0.00                /  / Germany            IT   
64569         0.25           / Ongole  / India            AS   

       com_reg_ver_win_rate  customer_idx          customer_type  enterprise  \
0                  0.066667         32160           End-Customer  Enterprise   
1                  0.066667         23122           End-Customer  Enter

  print(combined_data)


In [None]:
# Bucket the description length into five labelled ranges.
# pd.cut returns NaN for values outside the outermost edges, so a description
# longer than the previously hard-coded 1264 would silently lose its category
# (and end up with all-zero dummy columns). The last edge is therefore widened
# to the observed maximum when that exceeds 1264; for data that fits within
# 0..1264 the bins are exactly the original ones.
max_desc_length = combined_data['lead_desc_length'].max()
bins = [0, 252, 504, 756, 1008, max(1264, max_desc_length)]
labels = ['Very Short', 'Short', 'Medium', 'Long', 'Very Long']
combined_data['lead_desc_length_cat'] = pd.cut(
    combined_data['lead_desc_length'],
    bins=bins,
    labels=labels,
    include_lowest=True,  # keep a length of exactly 0 in the first bucket
)


In [None]:
# One-hot encode the length category and append the indicator columns.
lead_desc_length_dummies = pd.get_dummies(
    combined_data['lead_desc_length_cat'],
    prefix='desc_length',
)
combined_data = pd.concat(
    (combined_data, lead_desc_length_dummies),
    axis='columns',
)

In [None]:
# Label-encode every object-dtype column, keeping one fitted encoder per column.
object_cols = combined_data.select_dtypes(include=['object']).columns

label_encoders = {name: LabelEncoder() for name in object_cols}

for name, encoder in label_encoders.items():
    # Values are cast to str first, so missing values are encoded as a label too.
    as_text = combined_data[name].astype(str)
    combined_data[name] = encoder.fit_transform(as_text)

# Any remaining missing values (in the non-object columns) become 0.
combined_data = combined_data.fillna(0)

In [None]:
# Split combined_data back into its train and test parts by row position:
# the first n_train rows came from df_train, the rest from the test frame.
n_train = len(df_train)

df_train_prepared = combined_data.head(n_train)
df_test_prepared = combined_data.iloc[n_train:]

## 모델

In [None]:
# Feature columns fed to the models (order is preserved).
_base_features = [
    "com_reg_ver_win_rate",
    "customer_idx",
    "customer_type",
    "inquiry_type",
    "it_strategic_ver",
    "has_historical",
    "no_historical",
    "response_corporate",
    "as_strategic_ver",
    "lead_owner",
]
# One-hot columns created from the description-length category.
_desc_length_features = [
    f"desc_length_{label}"
    for label in ("Short", "Medium", "Long", "Very Long", "Very Short")
]
selected_features = _base_features + _desc_length_features

In [None]:
!pip install optuna
import optuna

Collecting optuna
  Downloading optuna-3.5.0-py3-none-any.whl (413 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m413.4/413.4 kB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.13.1-py3-none-any.whl (233 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.4/233.4 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting colorlog (from optuna)
  Downloading colorlog-6.8.2-py3-none-any.whl (11 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.2-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.7/78.7 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Mako, colorlog, alembic, optuna
Successfully installed Mako-1.3.2 alembic-1.13.1 colorlog-6.8.2 optuna-3.5.0


In [None]:
def objective(trial):
    """Score one hyperparameter candidate for the XGBoost + CatBoost ensemble.

    Samples XGBoost and CatBoost hyperparameters from ``trial``, combines the
    two classifiers in a soft-voting ensemble and evaluates it with 5-fold
    cross-validation on ``df_train_prepared[selected_features]`` against
    ``is_converted``.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        The mean F1 score over the cross-validation folds.
    """
    # Hyperparameter search ranges.
    xgb_params = {
        'max_depth': trial.suggest_int('xgb_max_depth', 3, 10),
        'learning_rate': trial.suggest_float('xgb_learning_rate', 0.01, 0.2),
        'n_estimators': trial.suggest_int('xgb_n_estimators', 100, 1000),
        'min_child_weight': trial.suggest_int('xgb_min_child_weight', 1, 10),
        'use_label_encoder': False,
        'eval_metric': 'logloss'
    }
    cb_params = {
        'depth': trial.suggest_int('cb_depth', 3, 10),
        'learning_rate': trial.suggest_float('cb_learning_rate', 0.01, 0.2),
        'iterations': trial.suggest_int('cb_iterations', 100, 1000),
        'auto_class_weights': 'Balanced',
        'verbose': False
    }

    # Model definition: soft voting averages the two predicted probabilities.
    clf1 = xgb.XGBClassifier(**xgb_params)
    clf2 = cb.CatBoostClassifier(**cb_params)
    voting_clf = VotingClassifier(estimators=[('xgb', clf1), ('cb', clf2)], voting='soft')

    # Cross-validation.
    scores = cross_val_score(
        voting_clf,
        df_train_prepared[selected_features],
        df_train_prepared['is_converted'],
        cv=5,
        scoring='f1',
        n_jobs=-1,
    )

    # Mean F1 score is the value Optuna maximises.
    return scores.mean()


# TPE is already Optuna's default sampler for this kind of study; passing it
# explicitly with a fixed seed makes the sequence of suggested hyperparameters
# repeatable, so the reported best parameters can be reproduced on a re-run.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=100)  # number of trials to run

best_params = study.best_trial.params
print("Best parameters:", best_params)

[I 2024-02-25 14:21:00,418] A new study created in memory with name: no-name-93874eb0-cf65-4e48-9342-2217ed6c6f90
[I 2024-02-25 14:21:41,725] Trial 0 finished with value: 0.692563565702179 and parameters: {'xgb_max_depth': 8, 'xgb_learning_rate': 0.1756276717243763, 'xgb_n_estimators': 443, 'xgb_min_child_weight': 6, 'cb_depth': 7, 'cb_learning_rate': 0.17957290487247815, 'cb_iterations': 714}. Best is trial 0 with value: 0.692563565702179.
[I 2024-02-25 14:21:59,701] Trial 1 finished with value: 0.6980296257234648 and parameters: {'xgb_max_depth': 4, 'xgb_learning_rate': 0.091996781096378, 'xgb_n_estimators': 854, 'xgb_min_child_weight': 4, 'cb_depth': 5, 'cb_learning_rate': 0.12252674744734088, 'cb_iterations': 314}. Best is trial 1 with value: 0.6980296257234648.
[I 2024-02-25 14:23:17,807] Trial 2 finished with value: 0.6869459657440309 and parameters: {'xgb_max_depth': 7, 'xgb_learning_rate': 0.1060915904454814, 'xgb_n_estimators': 154, 'xgb_min_child_weight': 3, 'cb_depth': 9, 'c

Best parameters: {'xgb_max_depth': 3, 'xgb_learning_rate': 0.18775526182290408, 'xgb_n_estimators': 358, 'xgb_min_child_weight': 4, 'cb_depth': 8, 'cb_learning_rate': 0.13860523505031813, 'cb_iterations': 382}


In [None]:
# Rebuild the ensemble from the best hyperparameters found by the search.
def _params_for(prefix):
    """Return the entries of best_params whose key starts with prefix, prefix removed."""
    return {
        key[len(prefix):]: value
        for key, value in best_params.items()
        if key.startswith(prefix)
    }


# 'xgb_*' entries map to max_depth, learning_rate, n_estimators, min_child_weight.
clf1_optimized = xgb.XGBClassifier(
    **_params_for('xgb_'),
    use_label_encoder=False,
    eval_metric='logloss',
)

# 'cb_*' entries map to depth, learning_rate, iterations.
clf2_optimized = cb.CatBoostClassifier(
    **_params_for('cb_'),
    auto_class_weights='Balanced',
    verbose=False,
)

# Soft-voting ensemble of the two tuned classifiers.
voting_clf_optimized = VotingClassifier(
    estimators=[('xgb', clf1_optimized), ('cb', clf2_optimized)],
    voting='soft',
)

# Evaluate the tuned ensemble with 5-fold cross-validation (F1).
X_train_selected = df_train_prepared[selected_features]
y_train_target = df_train_prepared['is_converted']
scores = cross_val_score(
    voting_clf_optimized,
    X_train_selected,
    y_train_target,
    cv=5,
    scoring='f1',
    n_jobs=-1,
)

print("교차 검증 F1 점수: ", scores)
print("평균 F1 점수: ", scores.mean())

교차 검증 F1 점수:  [0.86988077 0.63111819 0.5430303  0.93489861 0.55566219]
평균 F1 점수:  0.7069180112792277


In [None]:
# Balanced class weights over the full training set:
#   weight(cls) = n_samples / (n_samples_in_cls * n_classes)
y_train_labels = df_train_prepared['is_converted']
classes, class_positions, class_counts = np.unique(
    y_train_labels, return_inverse=True, return_counts=True
)
balanced_weights = len(y_train_labels) / (class_counts * len(classes))
class_weights = dict(zip(classes, balanced_weights.tolist()))

# Per-row weights: each row receives the weight of its own class.
sample_weights = balanced_weights[class_positions]

In [None]:
# Fit the final ensemble on the full training set without external sample weights.
# The hyperparameter search and the cross-validation both evaluated the ensemble
# with no sample weights, so this fits the same configuration that was scored.
# The CatBoost member is already built with auto_class_weights='Balanced';
# a balanced sample_weight passed through VotingClassifier.fit would reach it
# too and weight the classes twice.
voting_clf_optimized.fit(
    df_train_prepared[selected_features],
    df_train_prepared['is_converted'],
)


## 예측 파일 생성

In [None]:
# Predict on the prepared test rows.
x_test = df_test_prepared[selected_features]
test_pred = voting_clf_optimized.predict(x_test)

# Store the predictions in the untouched submission frame as booleans.
df_sub['is_converted'] = test_pred
df_sub['is_converted'] = df_sub['is_converted'].astype(bool)

# Write next to the input CSVs, using the same 'MyDrive' path form they were
# read from. The previous 'My Drive' spelling points at a different path name
# that is not otherwise used in this notebook, so a failure there would only
# surface after the whole search and training had finished.
file_path = '/content/drive/MyDrive/lg/sub0226(1).csv'
df_sub.to_csv(file_path, index=False)