# 라이브러리 설치 및 Import

In [None]:
# Mount Google Drive in the Colab runtime and change into the project folder.
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/My Drive/lg_aimers4


Mounted at /content/drive
/content/drive/My Drive/lg_aimers4


In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split

In [None]:
pip install catboost

Collecting catboost
  Downloading catboost-1.2.3-cp310-cp310-manylinux2014_x86_64.whl (98.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.5/98.5 MB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.3


# 데이터 분석

In [None]:
df_train = pd.read_csv("train.csv") # training data
df_test = pd.read_csv("submission.csv") # test data (the rows of the submission file)

In [None]:
# Number of rows in the training data.
len(df_train)

59299

In [None]:
# Print every DataFrame column name together with its dtype.
print(df_train.dtypes)

bant_submit                float64
customer_country            object
business_unit               object
com_reg_ver_win_rate       float64
customer_idx                 int64
customer_type               object
enterprise                  object
historical_existing_cnt    float64
id_strategic_ver           float64
it_strategic_ver           float64
idit_strategic_ver         float64
customer_job                object
lead_desc_length             int64
inquiry_type                object
product_category            object
product_subcategory         object
product_modelname           object
customer_country.1          object
customer_position           object
response_corporate          object
expected_timeline           object
ver_cus                      int64
ver_pro                      int64
ver_win_rate_x             float64
ver_win_ratio_per_bu       float64
business_area               object
business_subarea            object
lead_owner                   int64
is_converted        

# 데이터 처리

---
전처리 및 결측치 처리


In [None]:
# Encode the label as integers: True -> 1, everything else -> 0.
# A single vectorized comparison replaces the per-row `x == True` lambda.
df_train['is_converted'] = df_train['is_converted'].eq(True).astype(int)

In [None]:
# Missing customer_type becomes 'others'. The result is assigned back because
# fillna(inplace=True) on a selected column is chained assignment, which newer
# pandas versions warn about and which does not update df_train under Copy-on-Write.
df_train['customer_type'] = df_train['customer_type'].fillna('others')

In [None]:
# Missing customer_job becomes 'none'. The result is assigned back because
# fillna(inplace=True) on a selected column is chained assignment, which newer
# pandas versions warn about and which does not update df_train under Copy-on-Write.
df_train['customer_job'] = df_train['customer_job'].fillna('none')

In [None]:
# Missing inquiry_type becomes 'others'. The result is assigned back because
# fillna(inplace=True) on a selected column is chained assignment, which newer
# pandas versions warn about and which does not update df_train under Copy-on-Write.
df_train['inquiry_type'] = df_train['inquiry_type'].fillna('others')

In [None]:
# Store lead_owner as strings rather than integers.
df_train['lead_owner'] = df_train['lead_owner'].astype(str)

In [None]:
# Derived feature: does customer_country equal customer_country.1?
# Stored as the labels '일치' (match) / '불일치' (mismatch).
country_matches = df_train['customer_country'].eq(df_train['customer_country.1'])
df_train['same_country'] = country_matches.map({False: '불일치', True: '일치'})

In [None]:
# customer_country.1 has been folded into same_country, so remove it.
df_train = df_train.drop(columns=['customer_country.1'])

In [None]:
# Separate the target column from the explanatory variables.
train_y = df_train['is_converted']
train_x = df_train.drop(columns=['is_converted'])

In [None]:
# Mean of lead_desc_length over all training rows.
lead_desc_length_avg = train_x['lead_desc_length'].mean()

In [None]:
# Treat a missing historical_existing_cnt as 0. The result is assigned back
# because fillna(inplace=True) on a selected column is chained assignment, which
# newer pandas versions warn about and which does not update train_x under Copy-on-Write.
train_x['historical_existing_cnt'] = train_x['historical_existing_cnt'].fillna(0)

In [None]:
# Helper that builds the derived "engagement" feature for one row.

def calculate_engagement_weight(row, desc_length_avg=None):
    """Return the engagement weight of a single row.

    The weight is the sum of three independent bonuses:

    * 0.5 when ``bant_submit`` is below 0.5,
    * 1.5 when ``lead_desc_length`` is strictly greater than the average,
    * 1.0 when ``historical_existing_cnt`` equals 0.

    Args:
        row: Mapping-like row (e.g. a DataFrame row passed by ``apply(axis=1)``)
            with the keys ``bant_submit``, ``lead_desc_length`` and
            ``historical_existing_cnt``.
        desc_length_avg: Threshold for ``lead_desc_length``. When omitted, the
            notebook-level ``lead_desc_length_avg`` is used, which keeps the
            original call ``df.apply(calculate_engagement_weight, axis=1)``
            working unchanged.

    Returns:
        The summed weight (``0`` when no condition holds).
    """
    if desc_length_avg is None:
        # Backward-compatible default: fall back to the average computed
        # outside the function instead of recomputing it per row.
        desc_length_avg = lead_desc_length_avg

    weight = 0
    if row['bant_submit'] < 0.5:
        weight += 0.5
    # Strict comparison: a length exactly equal to the average gets no bonus.
    if row['lead_desc_length'] > desc_length_avg:
        weight += 1.5
    if row['historical_existing_cnt'] == 0:
        weight += 1.0
    return weight

In [None]:
# Compute the total engagement weight for every row.
train_x['engagement_weight_sum'] = train_x.apply(calculate_engagement_weight, axis=1)

가공변수를 위해 사용한 변수를 제거

In [None]:
# bant_submit was only needed to build engagement_weight_sum.
train_x = train_x.drop(columns=['bant_submit'])

In [None]:
# historical_existing_cnt was only needed to build engagement_weight_sum.
train_x = train_x.drop(columns=['historical_existing_cnt'])

In [None]:
# lead_desc_length was only needed to build engagement_weight_sum.
train_x = train_x.drop(columns=['lead_desc_length'])

In [None]:
# Preview the first five rows of the engineered feature table.
train_x.head(5)

Unnamed: 0,customer_country,business_unit,com_reg_ver_win_rate,customer_idx,customer_type,enterprise,id_strategic_ver,it_strategic_ver,idit_strategic_ver,customer_job,...,expected_timeline,ver_cus,ver_pro,ver_win_rate_x,ver_win_ratio_per_bu,business_area,business_subarea,lead_owner,same_country,engagement_weight_sum
0,/Quezon City/Philippines,AS,0.066667,32160,End-Customer,Enterprise,,,,purchasing,...,less than 3 months,1,0,0.003079,0.026846,corporate / office,Engineering,0,일치,1.0
1,/PH-00/Philippines,AS,0.066667,23122,End-Customer,Enterprise,,,,media and communication,...,less than 3 months,1,0,0.003079,0.026846,corporate / office,Advertising,1,일치,1.5
2,/Kolkata /India,AS,0.088889,1755,End-Customer,Enterprise,,,,engineering,...,less than 3 months,1,0,0.003079,0.026846,corporate / office,Construction,2,일치,0.0
3,/Bhubaneswar/India,AS,0.088889,4919,End-Customer,Enterprise,,,,entrepreneurship,...,less than 3 months,1,0,0.003079,0.026846,corporate / office,IT/Software,3,일치,1.0
4,/Hyderabad/India,AS,0.088889,17126,Specifier/ Influencer,Enterprise,,,,consulting,...,less than 3 months,0,0,0.003079,0.026846,corporate / office,,4,일치,2.5


In [None]:
# Put the label column back next to the engineered features and preview the result.
df_train = pd.concat((train_x, train_y), axis="columns")
df_train.head()

Unnamed: 0,customer_country,business_unit,com_reg_ver_win_rate,customer_idx,customer_type,enterprise,id_strategic_ver,it_strategic_ver,idit_strategic_ver,customer_job,...,ver_cus,ver_pro,ver_win_rate_x,ver_win_ratio_per_bu,business_area,business_subarea,lead_owner,same_country,engagement_weight_sum,is_converted
0,/Quezon City/Philippines,AS,0.066667,32160,End-Customer,Enterprise,,,,purchasing,...,1,0,0.003079,0.026846,corporate / office,Engineering,0,일치,1.0,1
1,/PH-00/Philippines,AS,0.066667,23122,End-Customer,Enterprise,,,,media and communication,...,1,0,0.003079,0.026846,corporate / office,Advertising,1,일치,1.5,1
2,/Kolkata /India,AS,0.088889,1755,End-Customer,Enterprise,,,,engineering,...,1,0,0.003079,0.026846,corporate / office,Construction,2,일치,0.0,1
3,/Bhubaneswar/India,AS,0.088889,4919,End-Customer,Enterprise,,,,entrepreneurship,...,1,0,0.003079,0.026846,corporate / office,IT/Software,3,일치,1.0,1
4,/Hyderabad/India,AS,0.088889,17126,Specifier/ Influencer,Enterprise,,,,consulting,...,0,0,0.003079,0.026846,corporate / office,,4,일치,2.5,1


train/val 데이터 분리

In [None]:
# Hold out 20% of the rows for validation (shuffled, fixed seed).
features = df_train.drop(columns="is_converted")
target = df_train["is_converted"]
x_train, x_val, y_train, y_val = train_test_split(
    features, target, test_size=0.2, shuffle=True, random_state=42
)

## CatBoostClassifier 사용하기

In [None]:
from catboost import CatBoostClassifier, Pool

학습 및 검증 세팅

---
범주형 데이터, 텍스트 데이터, 수치형 데이터 변수를 인덱스로 지정하기


In [None]:
# Positional indices of the text, categorical and NaN->0 numeric columns.
text_features = [0, 13, 16]
cat_features = [1, 4, 5, 9, 10, 11, 12, 14, 15, 21, 22, 23, 24]
nantozero = [6, 7, 8]

# One fill value per column: '' for text, 'none' for categorical, 0 for the
# selected numeric columns. Columns that are not listed are left untouched.
fill_values = {
    **dict.fromkeys(x_train.columns[text_features], ''),
    **dict.fromkeys(x_train.columns[cat_features], 'none'),
    **dict.fromkeys(x_train.columns[nantozero], 0),
}

# Fill through DataFrame.fillna and rebind the result. The previous
# `frame.iloc[:, idx].fillna(..., inplace=True)` mutated a temporary Series,
# which newer pandas versions warn about and which does not update the frame
# under Copy-on-Write.
x_train = x_train.fillna(fill_values)
x_val = x_val.fillna(fill_values)

# Training pool.
train_data = Pool(
    x_train,
    label=y_train,
    cat_features=cat_features,
    text_features=text_features,
)

# Validation pool.
eval_data = Pool(
    x_val,
    label=y_val,
    cat_features=cat_features,
    text_features=text_features,
)

In [None]:
# Model definition
model = CatBoostClassifier(
    iterations=2000,  # upper bound on the number of boosting iterations
    learning_rate=0.05,  # learning rate; the original note warns that a too-high value risks overfitting
    depth=6,  # tree depth; very deep trees can overfit
    l2_leaf_reg=3,  # L2 regularization coefficient, used against overfitting
    eval_metric='AUC',  # AUC is the metric reported on the evaluation set
    border_count=254,  # 254, which the original note describes as the default; may be tuned
    random_seed=42,  # fixed seed for reproducibility
    verbose=100,  # print the metric every 100 iterations
    use_best_model=True,  # keep the model that scored best on the validation set
    od_type='Iter',  # iteration-based overfitting detector
    od_wait=50,# stop when the metric has not improved for this many iterations
    class_weights= [1,11.5] # NOTE(review): the original note said 1:10 performed best, but the value here is 1:11.5 - confirm
)

# Train the model and validate on the evaluation pool
model.fit(train_data,
          eval_set= eval_data) # evaluation set used to monitor training



print(model.get_best_score())

0:	test: 0.8880520	best: 0.8880520 (0)	total: 1.1s	remaining: 36m 35s
100:	test: 0.9803481	best: 0.9803481 (100)	total: 1m 4s	remaining: 20m 21s
200:	test: 0.9821405	best: 0.9821405 (200)	total: 1m 52s	remaining: 16m 44s
300:	test: 0.9838407	best: 0.9838841 (298)	total: 2m 42s	remaining: 15m 18s
400:	test: 0.9844993	best: 0.9845127 (396)	total: 3m 31s	remaining: 14m 4s
500:	test: 0.9847410	best: 0.9847710 (496)	total: 4m 21s	remaining: 13m 1s
600:	test: 0.9849128	best: 0.9849260 (589)	total: 5m 9s	remaining: 12m 1s
700:	test: 0.9853017	best: 0.9853141 (697)	total: 5m 59s	remaining: 11m 6s
800:	test: 0.9854019	best: 0.9854517 (788)	total: 6m 47s	remaining: 10m 9s
900:	test: 0.9855377	best: 0.9855486 (894)	total: 7m 37s	remaining: 9m 17s
1000:	test: 0.9857002	best: 0.9857149 (983)	total: 8m 25s	remaining: 8m 24s
1100:	test: 0.9857604	best: 0.9857741 (1091)	total: 9m 13s	remaining: 7m 32s
1200:	test: 0.9858003	best: 0.9858122 (1185)	total: 10m 2s	remaining: 6m 40s
Stopped by overfitting d

## 검증 데이터로 평가지표 출력해보기

In [None]:
def get__clf_eval(y_test, y_pred=None):
    """Print the confusion matrix, accuracy, precision, recall and F1 score.

    Args:
        y_test: Ground-truth labels.
        y_pred: Predicted labels that are compared against ``y_test``.

    The confusion matrix is requested with the label order ``[True, False]``.
    Nothing is returned; the results are only printed.
    """
    label_order = [True, False]

    # Every metric is computed before anything is printed.
    matrix = confusion_matrix(y_test, y_pred, labels=label_order)  # (truth, prediction)
    report = [
        ("\n정확도: {:.4f}", accuracy_score(y_test, y_pred)),
        ("정밀도: {:.4f}", precision_score(y_test, y_pred, labels=label_order)),
        ("재현율: {:.4f}", recall_score(y_test, y_pred)),
        ("F1: {:.4f}", f1_score(y_test, y_pred, labels=label_order)),
    ]

    print("오차행렬:\n", matrix)
    for template, value in report:
        print(template.format(value))

In [None]:
# Predict on the validation features, turn the 1/0 output into booleans,
# and print the evaluation metrics.
pred = model.predict(x_val) == 1
get__clf_eval(y_val, pred)

오차행렬:
 [[  909    76]
 [  490 10385]]

정확도: 0.9523
정밀도: 0.6497
재현율: 0.9228
F1: 0.7626


# 학습한 모델 저장

In [None]:
pip install joblib



In [None]:
from joblib import dump

# Serialize the trained model to a file in the current working directory.
dump(model, 'LGaimers4기_Catboost.joblib')

['LGaimers4기_Catboost.joblib']

In [None]:
from joblib import load

# Load the saved model back from disk.
# NOTE: joblib files are pickle-based; only load files you created or trust.
model = load('LGaimers4기_Catboost.joblib')

## 추론

In [None]:
# Inference features: every column except is_converted and id.
x_test = df_test.drop(columns=["is_converted", "id"])

학습 데이터와 똑같이 데이터 처리

In [None]:
# Derived feature plus column removal: flag whether customer_country equals
# customer_country.1 as '일치' (match) / '불일치' (mismatch), then drop the
# now-redundant customer_country.1 column.
country_matches = x_test['customer_country'].eq(x_test['customer_country.1'])
x_test['same_country'] = country_matches.map({False: '불일치', True: '일치'})
x_test = x_test.drop(columns=['customer_country.1'])

In [None]:
# Threshold for the engagement feature: the lead_desc_length mean of the
# TRAINING data. Recomputing it from x_test would score the test rows against
# a different cutoff than the one the model was trained with, so the training
# column is read again from train.csv instead.
lead_desc_length_avg = pd.read_csv(
    "train.csv", usecols=["lead_desc_length"]
)["lead_desc_length"].mean()

In [None]:
# Treat a missing historical_existing_cnt as 0. The result is assigned back
# because fillna(inplace=True) on a selected column is chained assignment, which
# newer pandas versions warn about and which does not update x_test under Copy-on-Write.
x_test['historical_existing_cnt'] = x_test['historical_existing_cnt'].fillna(0)

In [None]:
# Compute the total engagement weight for every test row.
x_test['engagement_weight_sum'] = x_test.apply(calculate_engagement_weight, axis=1)

In [None]:
# bant_submit was only needed to build engagement_weight_sum.
x_test = x_test.drop(columns=['bant_submit'])

In [None]:
# lead_desc_length was only needed to build engagement_weight_sum.
x_test = x_test.drop(columns=['lead_desc_length'])

In [None]:
# historical_existing_cnt was only needed to build engagement_weight_sum.
x_test = x_test.drop(columns=['historical_existing_cnt'])

In [None]:
####### Preprocess exactly like the training data #######

# Positional indices of the NaN->0 numeric, text and categorical columns.
nantozero = [6, 7, 8]
text_features = [0, 13, 16]
cat_features = [1, 4, 5, 9, 10, 11, 12, 14, 15, 21, 22, 23, 24]

# The training cells filled missing customer_type / inquiry_type with 'others'
# and cast lead_owner to str before the generic fills; do the same here so the
# test rows carry the same category values the model was trained on.
x_test = x_test.fillna({'customer_type': 'others', 'inquiry_type': 'others'})
x_test['lead_owner'] = x_test['lead_owner'].astype(str)

# One fill value per column: 0 for the selected numeric columns, '' for text,
# 'none' for categorical. Columns that are not listed are left untouched.
fill_values = {
    **dict.fromkeys(x_test.columns[nantozero], 0),
    **dict.fromkeys(x_test.columns[text_features], ''),
    **dict.fromkeys(x_test.columns[cat_features], 'none'),
}

# Fill through DataFrame.fillna and rebind the result. The previous
# `x_test.iloc[:, idx].fillna(..., inplace=True)` mutated a temporary Series,
# which newer pandas versions warn about and which does not update the frame
# under Copy-on-Write.
x_test = x_test.fillna(fill_values)

# Test pool.
test_data = Pool(
    x_test,
    cat_features=cat_features,
    text_features=text_features,
)

예측

In [None]:
test_pred = model.predict(test_data)
test_pred.sum() # number of rows predicted as True

1840

In [None]:
# Convert the 1/0 predictions to True/False.
test_pred = test_pred == 1
test_pred

array([ True,  True,  True, ..., False, False,  True])

## 제출 파일 저장

In [None]:
# Reload the submission file, store the predictions in its is_converted
# column, and write the result back to the same path.
submission_path = "submission.csv"
df_sub = pd.read_csv(submission_path)
df_sub["is_converted"] = test_pred
df_sub.to_csv(submission_path, index=False)