# 영업 성공 여부 분류 경진대회

## 1. 데이터 확인

### 필수 라이브러리

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [2]:
import random
import os

def seed_everything(seed=42):
    """Seed the random number sources used by this notebook.

    Seeds Python's ``random`` module and NumPy's global generator, and
    stores the seed (as text) in the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Seed value applied to each of the above. Defaults to 42.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    # Both seeding functions take the same value, so apply them uniformly.
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


seed_everything()

### 데이터 셋 읽어오기

In [3]:
# Load the competition data from CSV files given by relative path.
df_train = pd.read_csv("train.csv") # training data
# NOTE(review): the last cell of this notebook writes its predictions to the
# same "submission.csv" path, so a re-run reads the previously written file —
# confirm that is intended.
df_test = pd.read_csv("submission.csv") # test data (the rows of the submission file)

In [4]:
df_train.head() # preview the first rows of the training data

Unnamed: 0,bant_submit,customer_country,business_unit,com_reg_ver_win_rate,customer_idx,customer_type,enterprise,historical_existing_cnt,id_strategic_ver,it_strategic_ver,...,response_corporate,expected_timeline,ver_cus,ver_pro,ver_win_rate_x,ver_win_ratio_per_bu,business_area,business_subarea,lead_owner,is_converted
0,1.0,/Quezon City/Philippines,AS,0.066667,32160,End-Customer,Enterprise,,,,...,LGEPH,less than 3 months,1,0,0.003079,0.026846,corporate / office,Engineering,0,True
1,1.0,/PH-00/Philippines,AS,0.066667,23122,End-Customer,Enterprise,12.0,,,...,LGEPH,less than 3 months,1,0,0.003079,0.026846,corporate / office,Advertising,1,True
2,1.0,/Kolkata /India,AS,0.088889,1755,End-Customer,Enterprise,144.0,,,...,LGEIL,less than 3 months,1,0,0.003079,0.026846,corporate / office,Construction,2,True
3,1.0,/Bhubaneswar/India,AS,0.088889,4919,End-Customer,Enterprise,,,,...,LGEIL,less than 3 months,1,0,0.003079,0.026846,corporate / office,IT/Software,3,True
4,1.0,/Hyderabad/India,AS,0.088889,17126,Specifier/ Influencer,Enterprise,,,,...,LGEIL,less than 3 months,0,0,0.003079,0.026846,corporate / office,,4,True


## 2. 데이터 전처리

### 레이블 인코딩

In [5]:
def label_encoding(series: pd.Series) -> pd.Series:
    """Replace each category in *series* with an integer code.

    Every value is first cast to ``str``. The distinct strings are then
    sorted and numbered from 0 in that order, so codes follow string
    ordering (for example ``"10"`` sorts before ``"2"``).

    Args:
        series: Column of categorical values.

    Returns:
        A series with the same index whose values are the integer codes.
    """
    as_text = series.astype(str)
    code_of = {label: code for code, label in enumerate(sorted(as_text.unique()))}
    return as_text.map(code_of)

In [6]:
# Columns to label-encode
label_columns = [
    "customer_country",
    "business_subarea",
    "business_area",
    "business_unit",
    "customer_type",
    "enterprise",
    "customer_job",
    "inquiry_type",
    "product_category",
    "product_subcategory",
    "product_modelname",
    "customer_country.1",
    "customer_position",
    "response_corporate",
    "expected_timeline",
]

# Stack the train rows on top of the test rows so that each column is encoded
# with a single code table covering the values of both data sets.
df_all = pd.concat([df_train[label_columns], df_test[label_columns]])

# Encode every listed column and swap the results in with one assign call.
df_all = df_all.assign(
    **{name: label_encoding(df_all[name]) for name in label_columns}
)

다시 학습 데이터와 제출 데이터를 분리합니다.

In [7]:
# df_all holds the train rows first, followed by the test rows, so the two
# parts are recovered positionally from the number of training rows.
n_train = len(df_train)
train_codes = df_all.iloc[:n_train]
test_codes = df_all.iloc[n_train:]

for name in label_columns:
    df_train[name] = train_codes[name]
    df_test[name] = test_codes[name]

### 학습, 검증 데이터 분리

In [8]:
# Features: every column except the target, with missing values set to 0.
X = df_train.drop(columns="is_converted").fillna(0)
# Target: the "is_converted" column.
y = df_train["is_converted"]

In [9]:
# Hold out 20% of the rows for validation; rows are shuffled with a fixed
# random_state of 42.
x_train, x_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=42
)

In [10]:
# Display the training features produced by the split above.
x_train

Unnamed: 0,bant_submit,customer_country,business_unit,com_reg_ver_win_rate,customer_idx,customer_type,enterprise,historical_existing_cnt,id_strategic_ver,it_strategic_ver,...,customer_position,response_corporate,expected_timeline,ver_cus,ver_pro,ver_win_rate_x,ver_win_ratio_per_bu,business_area,business_subarea,lead_owner
19499,0.25,165,0,0.000000,47466,9,0,19.0,0.0,0.0,...,74,21,271,0,0,0.000000,0.000000,6,86,115
5354,1.00,2749,2,0.000000,47466,33,0,4.0,0.0,0.0,...,21,21,8,0,0,0.000000,0.000000,6,86,175
35027,0.50,1869,2,0.000000,19504,33,0,0.0,0.0,0.0,...,74,50,267,0,0,0.000000,0.000000,6,86,194
22417,1.00,3109,0,0.048544,1944,33,0,0.0,0.0,0.0,...,21,18,246,0,0,0.000543,0.022634,11,86,68
48416,0.50,1927,3,0.000000,27783,33,1,0.0,0.0,0.0,...,74,12,271,0,0,0.000000,0.000000,6,86,980
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54343,1.00,11043,2,0.269231,7798,10,1,0.0,0.0,0.0,...,21,7,246,0,0,0.000060,0.131148,4,16,344
38158,0.50,3160,0,0.000000,5937,33,0,0.0,0.0,0.0,...,47,43,271,0,0,0.000000,0.000000,6,86,23
860,0.75,3388,2,0.057534,37179,33,1,0.0,1.0,0.0,...,21,21,246,0,0,0.003079,0.064566,0,86,174
15795,0.50,11717,0,0.000000,4372,33,1,0.0,0.0,0.0,...,64,14,271,0,0,0.000000,0.000000,6,86,62


In [11]:
# Display the validation features produced by the split above.
x_val

Unnamed: 0,bant_submit,customer_country,business_unit,com_reg_ver_win_rate,customer_idx,customer_type,enterprise,historical_existing_cnt,id_strategic_ver,it_strategic_ver,...,customer_position,response_corporate,expected_timeline,ver_cus,ver_pro,ver_win_rate_x,ver_win_ratio_per_bu,business_area,business_subarea,lead_owner
37178,0.25,2005,0,0.000000,21752,33,1,1.0,0.0,0.0,...,74,33,271,0,0,0.000000,0.000000,6,86,1
6990,1.00,6075,2,0.000000,27412,33,0,0.0,0.0,0.0,...,36,21,246,0,0,0.000000,0.000000,6,86,174
21789,1.00,2501,2,0.043103,36483,33,0,0.0,0.0,0.0,...,80,15,246,0,0,0.000543,0.064070,11,86,234
2152,1.00,7838,2,0.000000,25096,33,0,0.0,0.0,0.0,...,21,21,246,0,0,0.000000,0.000000,6,86,164
23825,1.00,2036,2,0.000000,25219,9,0,0.0,0.0,0.0,...,21,46,246,0,0,0.000060,0.131148,4,16,285
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8850,0.75,9788,2,0.000000,47466,33,0,4.0,0.0,0.0,...,21,21,8,0,0,0.000000,0.000000,6,86,164
35738,0.50,6108,0,0.000000,25096,33,0,0.0,0.0,0.0,...,21,21,271,0,0,0.000000,0.000000,6,86,48
26894,0.25,17351,2,0.124122,10630,9,1,0.0,1.0,0.0,...,64,43,271,1,0,0.000717,0.071345,5,74,487
57604,1.00,9183,0,0.004000,12473,29,1,1.0,0.0,0.0,...,36,43,267,0,0,0.001183,0.011583,10,86,97


## 3. 모델 학습

### 모델 정의 

In [12]:
# Decision-tree classifier; random_state is pinned to 42.
model = DecisionTreeClassifier(random_state=42)

### 모델 학습

In [13]:
# Fit on the training split only; the validation split is not used here.
model.fit(x_train, y_train)

### 모델 성능 보기

In [14]:
def get_clf_eval(y_test, y_pred=None):
    """Print the confusion matrix and summary scores for a set of predictions.

    Args:
        y_test: Ground-truth labels.
        y_pred: Predicted labels for the same samples. The ``None`` default
            is passed straight to the metric functions.

    Returns:
        None. All results are written to standard output.
    """
    # ``labels=[True, False]`` lists the True class first.
    # NOTE(review): per the scikit-learn docs, ``labels`` appears to be
    # consulted by precision/F1 only when ``average != 'binary'`` — confirm.
    matrix = confusion_matrix(y_test, y_pred, labels=[True, False])
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, labels=[True, False])
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, labels=[True, False])

    print("오차행렬:\n", matrix)

    # One (template, value) pair per reported score, in print order.
    report = (
        ("\n정확도: {:.4f}", acc),
        ("정밀도: {:.4f}", prec),
        ("재현율: {:.4f}", rec),
        ("F1: {:.4f}", f1),
    )
    for template, value in report:
        print(template.format(value))

In [15]:
# Predict on the validation split and print the evaluation metrics.
pred = model.predict(x_val)
get_clf_eval(y_val, pred)

오차행렬:
 [[  779   206]
 [  221 10654]]

정확도: 0.9640
정밀도: 0.7790
재현율: 0.7909
F1: 0.7849


## 4. 제출하기

### 테스트 데이터 예측

In [16]:
# Refit the same model on the full training data (X, y), which includes the
# rows that were held out for validation above.
model.fit(X, y)

In [17]:
# Build the test features: drop the target and the "id" column, then set
# missing values to 0.
x_test = df_test.drop(columns=["is_converted", "id"]).fillna(0)

In [18]:
# Predict the test rows with the refitted model.
test_pred = model.predict(x_test)
sum(test_pred) # number of rows predicted as True

1335

### 제출 파일 작성

In [25]:
# Read the submission data again (df_test now holds the preprocessed data)
df_sub = pd.read_csv("submission_original.csv")
# The predictions are assigned as-is.
# NOTE(review): assumes this file's rows are in the same order as df_test — confirm.
df_sub["is_converted"] = test_pred

# Save the submission file
df_sub.to_csv("submission.csv", index=False)

In [26]:
# Count how many rows were predicted for each label in the submission.
df_sub["is_converted"].value_counts()

False    3936
True     1335
Name: is_converted, dtype: int64

**우측 상단의 제출 버튼을 클릭해 결과를 확인하세요**