In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [7]:
# Directory that holds the input CSV files.
data_path = "/home/workspace/LGamiers/"
# Training data.
df_train = pd.read_csv(f"{data_path}train.csv")
# Rows read from submission.csv; used as the test frame in the cells below.
df_test = pd.read_csv(f"{data_path}submission.csv")

In [10]:
def label_encoding(series: pd.Series) -> pd.Series:
    """Convert a categorical Series into integer codes.

    Every element is first cast to ``str``. The distinct strings are then
    sorted with Python's default string ordering and numbered 0, 1, 2, ...
    in that order.

    Args:
        series: Values to encode.

    Returns:
        A Series with the same index as the input, where each element has
        been replaced by the integer code of its string form.
    """
    # Cast everything to text so the unique values can be sorted together.
    as_text = series.astype(str)

    # The code of each distinct string is its rank in sorted order.
    code_of = {text: rank for rank, text in enumerate(sorted(as_text.unique()))}

    # Swap each string for its code.
    return as_text.map(code_of)

In [17]:
# Columns that are label-encoded.
label_columns = [
    "customer_country",
    "business_subarea",
    "business_area",
    "business_unit",
    "customer_type",
    "enterprise",
    "customer_job",
    "inquiry_type",
    "product_category",
    "product_subcategory",
    "product_modelname",
    "customer_country.1",
    "customer_position",
    "response_corporate",
    "expected_timeline",
]

# Stack the train rows on top of the test rows so that each column is encoded
# with a single mapping shared by both frames.
df_all = pd.concat([df_train[label_columns], df_test[label_columns]])

# Replace every listed column with its integer codes.
df_all = df_all.assign(
    **{name: label_encoding(df_all[name]) for name in label_columns}
)

['Quotation or purchase consultation' 'Product Information'
 'Quotation or Purchase Consultation' 'Other'
 'Usage or technical consultation' 'Trainings' 'Services' 'Sales Inquiry'
 'Etc.' 'Technical Support' 'Usage or Technical Consultation'
 'Technical Consultation' 'Request for Partnership' nan 'sales'
 'technical' 'usage or technical consultation'
 'usage_or_technical_consultation' 'other'
 'quotation_or_purchase_consultation' 'other_' 'Request a Demo'
 'Request for Distributorship' 'Request for quotation or purchase'
 'Request for technical consulting' '(Select ID_Needs)' 'One Quick:Flex'
 'AIO' 'Needs' 'Purchase' 'technical_consultation' 'Customer Suggestions'
 'Event Inquiry' 'Others' 'OEM/ODM Request' 'Hospital TV' 'others'
 'i want to know the details about it' 'EDUCATIONAL EQUIPMENTS'
 'Digital platform' 'TV interactive' 'teach' 'Display Textbook and photos'
 'High inch 86 / 98 or 110' 'quotation_' 'Purchase or Quotation'
 'display product' 'first Info and pricing' 'Sales inqu

In [18]:
# The encoding was done on the train/test concatenation; split it back by
# position: the first len(df_train) rows are the train rows, the rest are the
# test rows.
n_train = len(df_train)
encoded_train = df_all.iloc[:n_train]
encoded_test = df_all.iloc[n_train:]

for col in label_columns:
    # Assign plain arrays so rows are matched by position. Assigning a Series
    # would align on index labels instead, which is only correct while both
    # frames still carry their default 0..n-1 index.
    df_train[col] = encoded_train[col].to_numpy()
    df_test[col] = encoded_test[col].to_numpy()

In [19]:
# Hold out 20% of the training rows for validation. The target column
# "is_converted" is removed from the features; the seed keeps the shuffled
# split repeatable.
x_train, x_val, y_train, y_val = train_test_split(
    df_train.drop(columns="is_converted"),
    df_train["is_converted"],
    test_size=0.2,
    random_state=400,
    shuffle=True,
)

In [20]:
# Fix the seed (same value as the train/validation split): the tree builder
# permutes candidate features at each split, so without random_state repeated
# runs can grow different trees and report different metrics.
model = DecisionTreeClassifier(random_state=400)
# Missing values are replaced with 0 before fitting.
model.fit(x_train.fillna(0), y_train)

In [21]:
def get_clf_eval(y_test, y_pred=None):
    """Print the confusion matrix and binary classification metrics.

    ``True`` is treated as the positive class for precision, recall and F1.

    Args:
        y_test: Ground-truth labels.
        y_pred: Predicted labels for the same samples. The ``None`` default is
            kept only so the signature stays unchanged; predictions are
            required.

    Raises:
        TypeError: If ``y_pred`` is not supplied.
    """
    if y_pred is None:
        # Fail here with a clear message instead of deep inside scikit-learn.
        raise TypeError("get_clf_eval() requires y_pred (the predicted labels)")

    # labels=[True, False] puts the positive class in the first row/column.
    confusion = confusion_matrix(y_test, y_pred, labels=[True, False])
    accuracy = accuracy_score(y_test, y_pred)
    # These scorers use average="binary" by default, where `labels` is ignored
    # and the scored class is chosen by `pos_label`; state it explicitly and
    # identically for all three metrics.
    precision = precision_score(y_test, y_pred, pos_label=True)
    recall = recall_score(y_test, y_pred, pos_label=True)
    F1 = f1_score(y_test, y_pred, pos_label=True)

    print("오차행렬:\n", confusion)
    print("\n정확도: {:.4f}".format(accuracy))
    print("정밀도: {:.4f}".format(precision))
    print("재현율: {:.4f}".format(recall))
    print("F1: {:.4f}".format(F1))

In [22]:
# Predict on the validation split (missing values filled with 0, as in the
# fit call) and print the evaluation metrics.
pred = model.predict(x_val.fillna(0))
get_clf_eval(y_test=y_val, y_pred=pred)

오차행렬:
 [[  734   213]
 [  248 10665]]

정확도: 0.9611
정밀도: 0.7475
재현율: 0.7751
F1: 0.7610
