# 영업 성공 여부 분류 경진대회

## 1. 데이터 확인

### 필수 라이브러리

In [34]:
import pandas as pd
import numpy as np
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

import seaborn as sb
import matplotlib.pyplot as plt
import lightgbm as lgb

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

### 데이터 셋 읽어오기

In [35]:
df_train = pd.read_csv("train.csv") # 학습용 데이터
df_test = pd.read_csv("submission.csv") # 테스트 데이터(제출파일의 데이터)

In [36]:
df_train.head() # 학습용 데이터 살펴보기

Unnamed: 0,bant_submit,customer_country,business_unit,com_reg_ver_win_rate,customer_idx,customer_type,enterprise,historical_existing_cnt,id_strategic_ver,it_strategic_ver,...,response_corporate,expected_timeline,ver_cus,ver_pro,ver_win_rate_x,ver_win_ratio_per_bu,business_area,business_subarea,lead_owner,is_converted
0,1.0,/Quezon City/Philippines,AS,0.066667,32160,End-Customer,Enterprise,,,,...,LGEPH,less than 3 months,1,0,0.003079,0.026846,corporate / office,Engineering,0,True
1,1.0,/PH-00/Philippines,AS,0.066667,23122,End-Customer,Enterprise,12.0,,,...,LGEPH,less than 3 months,1,0,0.003079,0.026846,corporate / office,Advertising,1,True
2,1.0,/Kolkata /India,AS,0.088889,1755,End-Customer,Enterprise,144.0,,,...,LGEIL,less than 3 months,1,0,0.003079,0.026846,corporate / office,Construction,2,True
3,1.0,/Bhubaneswar/India,AS,0.088889,4919,End-Customer,Enterprise,,,,...,LGEIL,less than 3 months,1,0,0.003079,0.026846,corporate / office,IT/Software,3,True
4,1.0,/Hyderabad/India,AS,0.088889,17126,Specifier/ Influencer,Enterprise,,,,...,LGEIL,less than 3 months,0,0,0.003079,0.026846,corporate / office,,4,True


## 2. 데이터 전처리

### 레이블 인코딩

In [37]:
def label_encoding(series: pd.Series) -> pd.Series:
    """범주형 데이터를 시리즈 형태로 받아 숫자형 데이터로 변환합니다."""

    my_dict = {}

    # 모든 요소를 문자열로 변환
    series = series.astype(str)

    for idx, value in enumerate(sorted(series.unique())):
        my_dict[value] = idx
    series = series.map(my_dict)

    return series

In [38]:
# 레이블 인코딩할 칼럼들
label_columns = [
    "customer_country",
    "business_subarea",
    "business_area",
    "business_unit",
    "customer_type",
    "enterprise",
    "customer_job",
    "inquiry_type",
    "product_category",
    "product_subcategory",
    "product_modelname",
    "customer_country.1",
    "customer_position",
    "response_corporate",
    "expected_timeline",
]

df_all = pd.concat([df_train[label_columns], df_test[label_columns]])

for col in label_columns:
    df_all[col] = label_encoding(df_all[col])

다시 학습 데이터와 제출 데이터를 분리합니다.

In [39]:
for col in label_columns:  
    df_train[col] = df_all.iloc[: len(df_train)][col]
    df_test[col] = df_all.iloc[len(df_train) :][col]

In [40]:
df_train.head() # 학습용 데이터 살펴보기

Unnamed: 0,bant_submit,customer_country,business_unit,com_reg_ver_win_rate,customer_idx,customer_type,enterprise,historical_existing_cnt,id_strategic_ver,it_strategic_ver,...,response_corporate,expected_timeline,ver_cus,ver_pro,ver_win_rate_x,ver_win_ratio_per_bu,business_area,business_subarea,lead_owner,is_converted
0,1.0,9070,0,0.066667,32160,10,0,,,,...,33,246,1,0,0.003079,0.026846,0,28,0,True
1,1.0,8406,0,0.066667,23122,10,0,12.0,,,...,33,246,1,0,0.003079,0.026846,0,0,1,True
2,1.0,6535,0,0.088889,1755,10,0,144.0,,,...,21,246,1,0,0.003079,0.026846,0,17,2,True
3,1.0,3388,0,0.088889,4919,10,0,,,,...,21,246,1,0,0.003079,0.026846,0,44,3,True
4,1.0,5799,0,0.088889,17126,29,0,,,,...,21,246,0,0,0.003079,0.026846,0,86,4,True


### 2-2. 학습, 검증 데이터 분리

In [41]:
df_train = df_train.fillna(0)
X_train, X_val, Y_train, Y_val = train_test_split(
    df_train.drop("is_converted", axis=1),
    df_train["is_converted"],
    test_size=0.2,
    shuffle=True,
    random_state=400,
)
# print([x_train.shape[1]])
# print(y_train)

In [42]:
scaler = StandardScaler()

scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)


In [52]:
from lightgbm import LGBMClassifier

# Initialize a LightGBM Classifier with 'auc' as the evaluation metric
model = LGBMClassifier(metric='auc', boost_from_average=False)

# Fit the model on the training data
model.fit(X_train, Y_train)

# Make predictions on the training and validation sets
y_train = model.predict(X_train)
y_val = model.predict(X_val)
print(y_val)

[LightGBM] [Info] Number of positive: 3903, number of negative: 43536
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001556 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2324
[LightGBM] [Info] Number of data points in the train set: 47439, number of used features: 28
[False False False ... False False False]


In [53]:
import lightgbm as lgb

# Create LightGBM Datasets for training and validation
train_data = lgb.Dataset(X_train, label=Y_train)
test_data = lgb.Dataset(X_val, label=Y_val, reference=train_data)
# print(Y_train)
# Define hyperparameters and objective for LightGBM
params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
}

In [54]:
# Training a LightGBM Model
num_round = 100

# Train a LightGBM model using defined parameters, training data, and specified number of rounds
model = lgb.train(params, train_data,
                  num_round, valid_sets=[test_data])

[LightGBM] [Info] Number of positive: 3903, number of negative: 43536
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007582 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2324
[LightGBM] [Info] Number of data points in the train set: 47439, number of used features: 28
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.082274 -> initscore=-2.411843
[LightGBM] [Info] Start training from score -2.411843


In [55]:
# Calculating and Printing ROC-AUC Scores
from sklearn.metrics import roc_auc_score as ras
from sklearn.metrics import f1_score


print("Training F1 score: ", f1_score(Y_train, y_train))
print("Validation F1 score: ", f1_score(Y_val, y_val))
# Calculate and print the ROC-AUC score for the training and validation sets
print("Training ROC-AUC: ", ras(Y_train, y_train))
print("Validation ROC-AUC: ", ras(Y_val, y_val))
# print(y_val)

Training F1 score:  0.8389006342494715
Validation F1 score:  0.776595744680851
Training ROC-AUC:  0.8787644903801417
Validation ROC-AUC:  0.8428530111099489


## 4. 제출하기

### 테스트 데이터 예측

In [61]:
# 예측에 필요한 데이터 분리
x_test = df_test.drop(["is_converted", "id"], axis=1)

In [62]:
test_prob = model.predict(x_test.fillna(0))
# test_pred = (test_prob > 0.5).astype(int)
print(test_prob)
sum(test_prob) # True로 예측된 개수

[0.17984775 0.08210805 0.08210805 ... 0.03682683 0.12802815 0.03682683]


394.418145336162

### 제출 파일 작성

In [28]:
# 제출 데이터 읽어오기 (df_test는 전처리된 데이터가 저장됨)
df_sub = pd.read_csv("submission.csv")
df_sub["is_converted"] = test_prob
print(test_prob)
# 제출 파일 저장
df_sub.to_csv("submission.csv", index=False)

[False False False ... False False False]


PermissionError: [Errno 13] Permission denied: 'submission.csv'

**우측 상단의 제출 버튼을 클릭해 결과를 확인하세요**