# 영업 성공 여부 분류 경진대회

## 1. 데이터 확인

### 필수 라이브러리

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import (accuracy_score, confusion_matrix, f1_score, precision_score, recall_score, )
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier,RandomForestClassifier, VotingClassifier
from xgboost import XGBClassifier
import xgboost as xgb
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.svm import SVC


### 데이터 셋 읽어오기


In [2]:
# Load the training data and the submission file; the submission file's rows
# are used as the test data.
df_train, df_test = (
    pd.read_csv(csv_name) for csv_name in ("train.csv", "submission.csv")
)


### 데이터 셋 확인하기


In [3]:
# Training set size: 59299 customers, 28 columns + 1 (sales conversion outcome)
df_train.shape

(59299, 29)

In [7]:
# Total of the last column of the training frame.
# NOTE(review): presumably the last column is the is_converted label, so this
# counts the positive rows -- confirm against the column order.
last_column_values = df_train.iloc[:, -1]
sum_last_column = last_column_values.sum()
print("마지막 열의 값들의 합:", sum_last_column)

마지막 열의 값들의 합: 4850


In [6]:
# Inspect one example customer row (positional row 155)
df_train.iloc[155]

bant_submit                                               1.0
customer_country                      /Belo horizonte /Brazil
business_unit                                              AS
com_reg_ver_win_rate                                 0.003937
customer_idx                                            12478
customer_type                                    End-Customer
enterprise                                                SMB
historical_existing_cnt                                   NaN
id_strategic_ver                                          NaN
it_strategic_ver                                          NaN
idit_strategic_ver                                        NaN
customer_job                                          finance
lead_desc_length                                           18
inquiry_type               Quotation or purchase consultation
product_category                                  multi-split
product_subcategory                                       NaN
product_

In [5]:
# Replace every NaN in the training frame with the constant 0.
# This is not a fill from neighbouring rows; the backward-fill alternative is
# the disabled line below.

df_train = df_train.fillna(0)
# df_train = df_train.bfill()

In [6]:
# 범주형 데이터 삭제

# object_columns = df_train.select_dtypes(include=['object'])
# df_train = df_train.drop(columns=object_columns)


### 데이터 셋 분포


## 2. 데이터 전처리

### 레이블 인코딩

In [7]:
def label_encoding(series: pd.Series) -> pd.Series:
    """Map each distinct value of a categorical series to an integer code.

    Every element is first cast to ``str``. The code of a value is the
    position of its string form among the distinct strings sorted in
    ascending order, starting at 0.

    Args:
        series: Column to encode.

    Returns:
        A series with the same index as the input, holding the integer codes.
    """
    as_text = series.astype(str)

    # The sorted order of the string forms fixes which code each category gets.
    categories = sorted(as_text.unique())
    code_of = dict(zip(categories, range(len(categories))))

    return as_text.map(code_of)

In [8]:
# Categorical columns to label-encode.
label_columns = [
    "customer_country",
    "business_unit",
    "customer_country.1",
    "business_subarea",
    "business_area",
    "customer_type",
    "enterprise",
    "customer_job",
    "inquiry_type",
    "product_category",
    "product_subcategory",
    "product_modelname",
    "customer_position",
    "response_corporate",
    "expected_timeline",
]

# Encode train and test together so that a category receives the same integer
# code in both frames.
# fillna(0) is applied to the combined frame because df_train was zero-filled
# earlier while df_test was not: without it, label_encoding's str cast would
# turn a missing value into "0" for train rows but "nan" for test rows, giving
# the same "missing" category two different codes.
# The original row labels are kept (no ignore_index) because the following
# cell splits df_all by position and assigns the columns back by index.
df_all = pd.concat([df_train[label_columns], df_test[label_columns]]).fillna(0)

for col in label_columns:
    df_all[col] = label_encoding(df_all[col])

In [9]:
# The first len(df_train) rows of df_all came from the training frame and the
# remaining rows from the test frame; copy each encoded column back to its
# source frame (assignment aligns on the row index).
train_rows = len(df_train)
encoded_train = df_all.iloc[:train_rows]
encoded_test = df_all.iloc[train_rows:]

for col in label_columns:
    df_train[col] = encoded_train[col]
    df_test[col] = encoded_test[col]

다시 학습 데이터와 제출 데이터를 분리합니다.

### 2-2. 학습, 검증 데이터 분리

In [10]:
# Build the training and validation sets: hold out 20% of the rows, shuffled
# with a fixed seed so the split is reproducible.
features = df_train.drop(columns="is_converted")
target = df_train["is_converted"]

x_train, x_val, y_train, y_val = train_test_split(
    features,
    target,
    test_size=0.2,
    shuffle=True,
    random_state=400,
)

In [11]:
# label = df_train.pop("is_converted")
# train = df_train

In [12]:
# import torch
# import torch.nn as nn
# import torch.optim as optim
# import torch.nn.functional as F

In [13]:
# # 정규화 및 데이터 타입 텐서로 변경

# from sklearn.preprocessing import StandardScaler
# sc = StandardScaler()

# x_train = sc.fit_transform(x_train)
# x_train = torch.tensor(x_train, dtype=torch.float32)

# x_val = sc.transform(x_val)  # reuse the scaler fitted on x_train; do not refit on validation data
# x_val = torch.tensor(x_val, dtype=torch.float32)

# y_train = torch.tensor(y_train.values, dtype=torch.float32)
# y_val = torch.tensor(y_val, dtype=torch.float32)

# print("shape of x: {}\nshape of y: {}".format(x_train.shape,y_train.shape))

## 3. 모델 학습

### 모델 정의 

In [14]:
# # 기본 제공

# model = DecisionTreeClassifier()

# model.fit(x_train, y_train)



In [15]:
# model = SVC()

# # Train the SVM classifier
# model.fit(x_train, y_train)

In [16]:
# model = AdaBoostClassifier()

# model.fit(x_train, y_train)


In [17]:
# model = GradientBoostingClassifier(n_estimators=700, max_depth=5, random_state=0)
# model.fit(x_train, y_train)

In [18]:
# model = GradientBoostingClassifier(n_estimators=1000, max_depth=1, random_state=0)
# model.fit(train, label)

In [19]:
# model = xgb.XGBClassifier(n_estimators=500, max_depth=10)

# model.fit(x_train, y_train)

In [20]:
# model = SGDClassifier()

# model.fit(x_train, y_train)

In [21]:
# model = LGBMClassifier(n_estimators=500)
# evals = [(x_val, y_val)]
# model.fit(x_train, y_train, eval_metric='logloss', eval_set=evals)

In [22]:
# model = RandomForestClassifier()

# model.fit(x_train, y_train)

In [23]:
# lr = LogisticRegression(max_iter=2000)  # max_iter를 적절한 값으로 설정
# knn = KNeighborsClassifier(8)

# # 보팅 분류기 생성
# model = VotingClassifier(estimators=[('LR', lr), ('KNN', knn)], voting='soft')

# # 데이터에 모델 학습
# model.fit(x_train, y_train)

In [24]:
# # 기본적인 MLP

# from torch import nn
# from torch.nn import functional as F

# class Net(nn.Module):
#   def __init__(self,input_shape):
#     super(Net,self).__init__()
#     self.fc1 = nn.Linear(input_shape,32)
#     self.fc2 = nn.Linear(32,64)
#     self.fc5 = nn.Linear(64,16)
#     self.fc6 = nn.Linear(16,4)
#     self.fc7 = nn.Linear(4,1)
    
#   def forward(self,x):
#     x = torch.relu(self.fc1(x))
#     x = torch.relu(self.fc2(x))
#     x = torch.relu(self.fc5(x))
#     x = torch.relu(self.fc6(x))
#     x = torch.sigmoid(self.fc7(x))
#     return x

In [25]:
# #hyper parameters

# learning_rate = 0.01
# epochs = 1001

# # Model , Optimizer, Loss

# model = Net(input_shape=x_train.shape[1])
# # optimizer = torch.optim.RMSprop(model.parameters(), lr=learning_rate)
# optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
# loss_fn = nn.BCELoss()

In [26]:
# #forward loop
# losses = []

# for i in range(epochs):
        
#     #calculate output
#     output = model(x_train)
 
#     #calculate loss
#     loss = loss_fn(output,y_train.reshape(-1,1))
 
#     #accuracy
#     predicted = model(x_train)
    
#     #backprop
#     optimizer.zero_grad()
#     loss.backward()
#     optimizer.step()
    
#     if i%100 == 0:
#         losses.append(loss)
#         print("epoch {}\tloss : {}\t".format(i,loss))

### 모델 학습

### 모델 성능 보기

In [27]:
# def get_clf_eval(y_test, y_pred):
#     confusion = confusion_matrix(y_test, y_pred, labels=[1, 0])
#     accuracy = accuracy_score(y_test, y_pred)
#     precision = precision_score(y_test, y_pred, labels=[1, 0])
#     recall = recall_score(y_test, y_pred)
#     F1 = f1_score(y_test, y_pred, labels=[1, 0])

#     print("오차행렬:\n", confusion)
#     print("\n정확도: {:.4f}".format(accuracy))
#     print("정밀도: {:.4f}".format(precision))
#     print("재현율: {:.4f}".format(recall))
#     print("F1: {:.4f}".format(F1))

In [28]:
# pred_prob = model.predict(x_val)
# # preds = [1 if i > 0.5 else 0 for i in pred_prob]

# get_clf_eval(y_val, pred_prob)

In [29]:
# # model.eval()

# # with torch.no_grad():
# #     pred = model(x_val)

# # pred_labels = ((pred >= 0.5).float()).bool()

# # get_clf_eval(y_val, pred_labels)

# #########################################################

# pred_labels = model.predict(x_val)

# get_clf_eval(y_val, pred_labels)

## 4. 제출하기

### 테스트 데이터 예측

In [30]:
# Keep only the columns needed for prediction: drop the label column and the id column.
x_test = df_test.drop(columns=["is_converted", "id"])

In [31]:
# Replace any NaN left in the test features with 0, the same constant used for df_train.
x_test = x_test.fillna(0)
# x_test = torch.tensor(x_test.values, dtype=torch.float32)

In [32]:
# columns = x_test.select_dtypes(include=['object'])
# x_test = x_test.drop(columns=columns)

In [33]:
# Predict the label for every test row.
# NOTE(review): none of the cells visible in this notebook defines `model` --
# every candidate model cell is commented out. One of them has to be re-enabled
# and run before this cell, otherwise this line fails with NameError on a
# fresh kernel.
pred_label = model.predict(x_test)
# pred_label = [1 if i > 0.5 else 0 for i in pred_label]

sum(pred_label) # number of rows predicted as True

41

In [34]:
# model.eval()

# with torch.no_grad():
#     prediction = model(x_test)

# pred_label = ((prediction >= 0.5).float()).bool()

### 제출 파일 작성

In [35]:
# Re-read the submission file from disk (df_test now holds the preprocessed
# data) and attach the predictions as its is_converted column.
df_sub = pd.read_csv("submission.csv").assign(is_converted=pred_label)
print(pred_label)

# Save the submission file, overwriting submission.csv in place.
df_sub.to_csv("submission.csv", index=False)

[False False False ... False False False]


**우측 상단의 제출 버튼을 클릭해 결과를 확인하세요**