## 데이터 불러오기

In [45]:
# Google Drive를 마운트
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [46]:
# 필요한 라이브러리 설치
!pip install pycaret
!pip install catboost
!pip install mlxtend

# 데이터 처리 및 모델링을 위한 라이브러리
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# 모델링 관련 라이브러리
import xgboost as xgb
import catboost as cb
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from mlxtend.classifier import StackingClassifier
from sklearn.linear_model import LogisticRegression

# 데이터 불균형 처리를 위한 라이브러리
from imblearn.over_sampling import SMOTE

# 성능 평가 지표
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_score, recall_score

# 하이퍼파라미터 튜닝을 위한 라이브러리
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold




In [47]:

train = '/content/drive/MyDrive/lg/rrtrain.csv'
try:
    df_train = pd.read_csv(train, encoding='ISO-8859-1')
except UnicodeDecodeError:
    df_train = pd.read_csv(train, encoding='cp1252')

sub = '/content/drive/MyDrive/lg/rrsub.csv'
try:
    df_test = pd.read_csv(sub, encoding='ISO-8859-1')
except UnicodeDecodeError:
    df_test = pd.read_csv(sub, encoding='cp1252')

sub1 = '/content/drive/MyDrive/lg/rrsub.csv'
try:
    df_sub = pd.read_csv(sub1, encoding='ISO-8859-1')
except UnicodeDecodeError:
    df_sub = pd.read_csv(sub1, encoding='cp1252')


## 전처리

In [48]:
# 훈련 및 테스트 데이터를 결합하여 동일한 데이터 전처리 적용
combined_data = pd.concat([df_train, df_test.drop(columns=['id'])], ignore_index=True)

In [49]:
# as_strategic_ver 열 추가
combined_data['as_strategic_ver'] = (
    ((combined_data['business_area'] == 'corporate / office') | (combined_data['business_area'] == 'hotel & accommodation')) &
    (combined_data['business_unit'] == 'AS')
).astype(int)

In [50]:
# com_reg_ver_win_rate 결측치를 -999로 채움
combined_data['com_reg_ver_win_rate'] = combined_data['com_reg_ver_win_rate'].fillna(-999)

In [51]:
# historical_existing_cnt 기록 유무에 따른 특성 생성
combined_data['has_historical'] = combined_data['historical_existing_cnt'].notnull().astype(int)
combined_data['no_historical'] = combined_data['historical_existing_cnt'].isnull().astype(int)

In [52]:
# lead_desc_length_cat를 5개 구간으로 나누고 원핫인코딩 적용
bins = [0, 252, 504, 756, 1008, 1264]
labels = ['Very Short', 'Short', 'Medium', 'Long', 'Very Long']
combined_data['lead_desc_length_cat'] = pd.cut(combined_data['lead_desc_length'], bins=bins, labels=labels, include_lowest=True)

lead_desc_length_dummies = pd.get_dummies(combined_data['lead_desc_length_cat'], prefix='desc_length')
combined_data = pd.concat([combined_data, lead_desc_length_dummies], axis=1)

In [53]:
# object 타입 특성에 대한 레이블 인코딩 적용
object_cols = combined_data.select_dtypes(include=['object']).columns
label_encoders = {}

for col in object_cols:
    le = LabelEncoder()
    combined_data[col] = le.fit_transform(combined_data[col].astype(str))
    label_encoders[col] = le

# 결측치를 0으로 채우기
combined_data = combined_data.fillna(0)

In [54]:
# 훈련 데이터와 테스트 데이터를 분리하기 위한 인덱스 설정
n_train = df_train.shape[0]
df_train_prepared = combined_data.iloc[:n_train, :]
df_test_prepared = combined_data.iloc[n_train:, :]

## 모델

In [55]:
# 모델 학습에 사용될 특성 선택
selected_features = [
    "com_reg_ver_win_rate", "customer_idx",
    "customer_type",
    "inquiry_type", "it_strategic_ver", "has_historical", "no_historical",
    "response_corporate","as_strategic_ver",
    "lead_owner",
    "desc_length_Short", "desc_length_Medium",  "desc_length_Long", "desc_length_Very Long", "desc_length_Very Short"
]

In [56]:
# 클래스 가중치 계산 및 적용
class_weights = {cls: len(df_train_prepared['is_converted']) / (len(np.where(df_train_prepared['is_converted'] == cls)[0]) * len(np.unique(df_train_prepared['is_converted']))) for cls in np.unique(df_train_prepared['is_converted'])}
sample_weights = np.array([class_weights[cls] for cls in df_train_prepared['is_converted']])

# 모델 정의 및 앙상블 모델 구성
clf1 = xgb.XGBClassifier(
    max_depth=4,
    learning_rate=0.03369170868676631,
    n_estimators=818,
    min_child_weight=3,
    use_label_encoder=False,
    eval_metric='logloss',
)

clf2 = cb.CatBoostClassifier(
    depth=7,
    learning_rate=0.06966166550130459,
    iterations=516,
    auto_class_weights='Balanced',
    verbose=0,
)

voting_clf = VotingClassifier(estimators=[('xgb', clf1), ('cb', clf2)], voting='soft')

# 교차 검증 수행
scores = cross_val_score(voting_clf, df_train_prepared[selected_features],
                         df_train_prepared['is_converted'], cv=5, scoring='f1')

print("교차 검증 F1 점수: ", scores)
print("평균 F1 점수: ", scores.mean())

교차 검증 F1 점수:  [0.87751938 0.62544031 0.55123675 0.92956243 0.55297863]
평균 F1 점수:  0.7073475004038701


In [57]:
# 최종 모델 학습
voting_clf.fit(df_train_prepared[selected_features], df_train_prepared['is_converted'], sample_weight=sample_weights)

## 예측 파일 생성

In [58]:
# 테스트 데이터에 대한 예측 수행
x_test = df_test_prepared[selected_features]
test_pred = voting_clf.predict(x_test)

# 예측 결과를 제출 파일 형식에 맞게 저장
df_sub['is_converted'] = test_pred
df_sub['is_converted'] = df_sub['is_converted'].astype(bool)

file_path = '/content/drive/My Drive/lg/sub0226.csv'
df_sub.to_csv(file_path, index=False)