# 영업 성공 여부 분류 경진대회

## 1. 데이터 확인

### 필수 라이브러리

In [1]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.3-cp310-cp310-macosx_11_0_universal2.whl (26.2 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.2/26.2 MB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0mm
Collecting plotly
  Downloading plotly-5.19.0-py3-none-any.whl (15.7 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.7/15.7 MB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m
Collecting graphviz
  Using cached graphviz-0.20.1-py3-none-any.whl (47 kB)
Collecting tenacity>=6.2.0
  Downloading tenacity-8.2.3-py3-none-any.whl (24 kB)
Installing collected packages: tenacity, graphviz, plotly, catboost
Successfully installed catboost-1.2.3 graphviz-0.20.1 plotly-5.19.0 tenacity-8.2.3


In [55]:
import pandas as pd
import numpy as np
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier

from sklearn.preprocessing import StandardScaler, MinMaxScaler

### 데이터 셋 읽어오기

In [56]:
# Load the training rows and the rows to be predicted.
# Note that the rows to predict are read from "submission.csv", the same file
# that is rewritten with the predictions at the end of the notebook.
train_path, test_path = "train.csv", "submission.csv"
df_train = pd.read_csv(train_path)
df_test = pd.read_csv(test_path)

In [57]:
df_train.sample(5)  # preview five random training rows to sanity-check the load

Unnamed: 0,bant_submit,customer_country,business_unit,com_reg_ver_win_rate,customer_idx,customer_type,enterprise,historical_existing_cnt,id_strategic_ver,it_strategic_ver,...,response_corporate,expected_timeline,ver_cus,ver_pro,ver_win_rate_x,ver_win_ratio_per_bu,business_area,business_subarea,lead_owner,is_converted
42849,0.25,Keangnam Landmark 72 //Vietnam,AS,,47466,,Enterprise,115.0,,,...,LGEVH,,0,0,,,,,643,False
7193,1.0,/Sri Ganganagar/India,ID,,47466,,Enterprise,4.0,,,...,LGEIL,less than 3 months,0,0,,,,,175,False
38678,0.5,/São Paulo/Brazil,AS,,24069,,Enterprise,,,,...,LGESP,,0,0,,,,,23,False
49788,0.5,//Spain,IT,,4475,,SMB,,,,...,LGEES,,0,0,,,,,788,False
29693,0.5,/Rio branco/Brazil,AS,0.003937,19715,End-Customer,SMB,,,,...,LGESP,,1,0,0.003079,0.026846,corporate / office,,680,False


In [58]:
df_test.sample(5)  # preview five random test rows to sanity-check the load

Unnamed: 0,id,bant_submit,customer_country,business_unit,com_reg_ver_win_rate,customer_idx,customer_type,enterprise,historical_existing_cnt,id_strategic_ver,...,response_corporate,expected_timeline,ver_cus,ver_pro,ver_win_rate_x,ver_win_ratio_per_bu,business_area,business_subarea,lead_owner,is_converted
2059,13005,0.75,/ México / Mexico,AS,,18453,Specifier/ Influencer,Enterprise,,,...,LGEMS,3 months ~ 6 months,0,0,,,,,42,False
4765,15881,0.5,/ caracas / Venezuela,AS,,21440,End Customer,SMB,0.0,,...,LGEPS,less than 3 months,0,0,,,,,654,False
2023,19819,0.0,/ / Brazil,ID,0.073248,45479,Channel Partner,SMB,143.0,,...,LGESP,,0,0,0.001183,0.04984,retail,Restaurant,317,True
807,12743,1.0,/ Gornji Milanovac / Serbia,AS,,25211,End Customer,Enterprise,,,...,LGEMK,less than 3 months,0,0,,,,,766,False
2550,20973,0.0,/ / Spain,ID,0.025,36246,End Customer,Enterprise,45.0,,...,LGEES,,0,0,1.3e-05,0.053571,transportation,Others,999,False


## 2. 데이터 전처리

In [59]:
"""
제거할 column
'customer_country',
'customer_country.1'
"""

# Both location columns are removed from the training and the test frame alike.
drop_columns = ["customer_country", "customer_country.1"]

for frame in (df_train, df_test):
    frame.drop(columns=drop_columns, inplace=True)

In [60]:
"""
Scaling for numeric columns

standard scaling columns
"com_reg_ver_win_rate", "historical_existing_cnt","lead_desc_length"

MinMax scaling columns
"ver_win_rate_x", "ver_win_ratio_per_bu"
"""

# Columns passed to scaling_data() with the 'standard' option (StandardScaler).
st_columns = ["com_reg_ver_win_rate", "historical_existing_cnt","lead_desc_length"]
# Columns passed to scaling_data() with the 'minmax' option (MinMaxScaler).
mm_columns = ["ver_win_rate_x", "ver_win_ratio_per_bu"]

def scaling_data(columns: list, scaling: str):
    """Replace raw columns by ``scaled_<name>`` versions in both data frames.

    Works in place on the module-level ``df_train`` and ``df_test``.  For each
    column the scaler is fitted on the training frame only and then applied
    unchanged to the test frame; afterwards the raw column is dropped from
    both frames.

    Args:
        columns: Names of the columns to scale.
        scaling: ``'standard'`` selects ``StandardScaler``; any other value
            selects ``MinMaxScaler``.
    """
    scaler = StandardScaler() if scaling == 'standard' else MinMaxScaler()

    for raw_name in columns:
        scaled_name = 'scaled_' + raw_name

        # Re-fit per column on train, then reuse those statistics for test.
        df_train[scaled_name] = scaler.fit_transform(df_train[[raw_name]])
        df_test[scaled_name] = scaler.transform(df_test[[raw_name]])

        # The unscaled column is not kept next to its scaled replacement.
        for frame in (df_train, df_test):
            frame.drop(columns=[raw_name], inplace=True)

# Standardise the first column group, min-max scale the second one,
# then list the columns that result.
for group, method in ((st_columns, 'standard'), (mm_columns, 'minmax')):
    scaling_data(group, method)

print(df_train.columns)

Index(['bant_submit', 'business_unit', 'customer_idx', 'customer_type',
       'enterprise', 'id_strategic_ver', 'it_strategic_ver',
       'idit_strategic_ver', 'customer_job', 'inquiry_type',
       'product_category', 'product_subcategory', 'product_modelname',
       'customer_position', 'response_corporate', 'expected_timeline',
       'ver_cus', 'ver_pro', 'business_area', 'business_subarea', 'lead_owner',
       'is_converted', 'scaled_com_reg_ver_win_rate',
       'scaled_historical_existing_cnt', 'scaled_lead_desc_length',
       'scaled_ver_win_rate_x', 'scaled_ver_win_ratio_per_bu'],
      dtype='object')


In [61]:
"""
preprocessing 'business_area' column
"""

def preprocessing_business_area(df, val):
    """Add a 0/1 indicator for rows whose ``business_area`` equals *val*.

    The indicator is written into *df* in place as a column named
    ``<val>_center``.  Missing entries never compare equal and get 0.

    Args:
        df: DataFrame that contains a ``business_area`` column.
        val: Exact business-area label to flag.
    """
    is_match = df['business_area'].eq(val)
    df[val + '_center'] = is_match.astype('int64')
    
# Build the two business-area indicator columns on each frame, then remove
# the raw 'business_area' column from that frame.
for df in (df_train, df_test):
    for val in ('hospital & health care', 'power plant / renewable energy'):
        preprocessing_business_area(df, val)
    df.drop(columns=['business_area'], inplace=True)

In [62]:
"""
preprocessing 'expected_timeline' column
"""

def preprocessing_exsiting_timeline(df, val):
    """Add a 0/1 flag telling whether ``expected_timeline`` mentions *val*.

    Each entry is converted with ``str()`` and lower-cased before a plain
    substring test, so the match is case-insensitive on the entry side.
    The flag is written into *df* in place as a column named ``is_<val>``.

    Args:
        df: DataFrame that contains an ``expected_timeline`` column.
        val: Lower-case keyword to look for.
    """
    lowered = (str(entry).lower() for entry in df['expected_timeline'])
    df['is_' + val] = [int(val in text) for text in lowered]

# Keywords looked up as substrings in 'expected_timeline'; each one becomes
# its own 'is_<keyword>' flag column.
vals = [
    'budget', 'etc', 'hence', 'system', 'closi',
    'any', 'although', 'more', 'year',
]

for df in (df_train, df_test):
    for val in vals:
        preprocessing_exsiting_timeline(df, val)
    # The free-text column is removed once all of its flags exist.
    df.drop(columns=['expected_timeline'], inplace=True)

In [63]:
# Show (rows, columns) of the training frame and then of the test frame.
print(df_train.shape, df_test.shape, sep="\n")

(59299, 36)
(5271, 37)


In [64]:
"""
impute missing values with each column's mode
"""

# Most frequent value of every training column.  `mode()` can return several
# rows when values tie; the first row is used.
train_mode = df_train.mode().iloc[0]
df_train.fillna(train_mode, inplace=True)

# Fill the test set with the TRAINING modes rather than modes computed from
# the test set itself.  This matches how the scalers are fitted on the
# training frame only, and guarantees that a missing value is replaced by the
# same category in both frames.  Columns that exist only in the test frame
# have no entry in `train_mode` and are left untouched by `fillna`.
# `test_mode` is kept as a name so that code referring to it keeps working.
test_mode = train_mode
df_test.fillna(test_mode, inplace=True)

In [65]:
"""
numeric columns convert to category columns
"""

# Numeric columns that are cast to strings so that they can be handed to the
# model as categorical features.
num_to_cat_columns = [
    'bant_submit',
    'scaled_com_reg_ver_win_rate',
    'scaled_historical_existing_cnt',
    'id_strategic_ver',
    'it_strategic_ver',
    'idit_strategic_ver',
    'scaled_lead_desc_length',
    'ver_cus',
    'ver_pro',
    'scaled_ver_win_rate_x',
    'scaled_ver_win_ratio_per_bu',
]

for frame in (df_train, df_test):
    for name in num_to_cat_columns:
        frame[name] = frame[name].astype(str)

### 2-2. 학습, 검증 데이터 분리

In [66]:
# Separate the target from the features, then hold out 20% of the rows
# (shuffled, fixed seed) as a validation split.
features = df_train.drop(columns="is_converted")
target = df_train["is_converted"]

x_train, x_val, y_train, y_val = train_test_split(
    features, target, test_size=0.2, shuffle=True, random_state=400
)

x_train.shape

(47439, 35)

## 3. 모델 학습

### 모델 정의 

In [67]:
# CatBoost settings collected in one place.  Every feature column is declared
# categorical via `cat_features`.
catboost_params = {
    "verbose": 0,
    "iterations": 1000,
    "depth": 6,
    "learning_rate": 0.1,
    "loss_function": 'Logloss',
    "one_hot_max_size": 5,
    "cat_features": x_train.columns.tolist(),
}

model = CatBoostClassifier(**catboost_params)

### 모델 학습

In [68]:
# Fit on the training split only; x_val / y_val are kept aside for evaluation.
model.fit(x_train, y_train)

<catboost.core.CatBoostClassifier at 0x147851510>

### 모델 성능 보기

In [69]:
def get_clf_eval(y_test, y_pred=None):
    """Print the confusion matrix and headline metrics of a binary classifier.

    ``True`` is the positive class for precision, recall and F1.

    Args:
        y_test: Ground-truth boolean labels.
        y_pred: Predicted boolean labels, aligned with ``y_test``.  The
            ``None`` default only exists for signature compatibility; a
            prediction vector must always be supplied.

    Raises:
        TypeError: If ``y_pred`` is omitted.
    """
    if y_pred is None:
        # Fail early with a clear message instead of an opaque sklearn error.
        raise TypeError("get_clf_eval() requires y_pred (the predicted labels)")

    # Rows and columns are ordered [True, False], so the top-left cell holds
    # the true positives.
    confusion = confusion_matrix(y_test, y_pred, labels=[True, False])
    accuracy = accuracy_score(y_test, y_pred)

    # With the default average="binary" the `labels` argument has no effect;
    # the positive class is selected by `pos_label`.  State it explicitly and
    # identically for all three scores.
    precision = precision_score(y_test, y_pred, pos_label=True)
    recall = recall_score(y_test, y_pred, pos_label=True)
    F1 = f1_score(y_test, y_pred, pos_label=True)

    print("오차행렬:\n", confusion)
    print("\n정확도: {:.4f}".format(accuracy))
    print("정밀도: {:.4f}".format(precision))
    print("재현율: {:.4f}".format(recall))
    print("F1: {:.4f}".format(F1))

In [70]:
# Predict on the held-out validation features and print the metrics.
pred = model.predict(x_val)
get_clf_eval(y_val, pred)

오차행렬:
 [[  752   195]
 [   63 10850]]

정확도: 0.9782
정밀도: 0.9227
재현율: 0.7941
F1: 0.8536


## 4. 제출하기

### 테스트 데이터 예측

In [71]:
# Feature matrix for prediction: the test frame without the 'is_converted'
# column and without the 'id' column.
non_feature_columns = ["is_converted", "id"]
x_test = df_test.drop(columns=non_feature_columns)

In [72]:
# Predict the label of every test row.
test_pred = model.predict(x_test)
sum(test_pred)  # number of rows predicted True

834

### 제출 파일 작성

In [73]:
# Re-read the submission file so that its columns are kept as they are on
# disk, then replace only 'is_converted' with the predictions.
df_sub = pd.read_csv("submission.csv")
df_sub["is_converted"] = test_pred

# Write the result back to the same path, overwriting submission.csv.
df_sub.to_csv("submission.csv", index=False)