In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [3]:
from google.colab import drive

# Mount Google Drive so the CSV files stored under MyDrive can be read.
drive.mount('/content/drive')

# Paths of the training CSV and of the CSV that predictions are made for.
train_file_path = '/content/drive/MyDrive/LGdata/train.csv'
test_file_path = '/content/drive/MyDrive/LGdata/submission.csv'

# Read both files, training set first, into DataFrames.
df_train, df_test = (
    pd.read_csv(csv_path) for csv_path in (train_file_path, test_file_path)
)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
# Preview the first rows of the training data. Ending the cell with the bare
# expression lets the notebook render the frame through its display machinery
# instead of forcing plain print() text.
df_train.head()

   bant_submit          customer_country business_unit  com_reg_ver_win_rate  \
0          1.0  /Quezon City/Philippines            AS              0.066667   
1          1.0        /PH-00/Philippines            AS              0.066667   
2          1.0           /Kolkata /India            AS              0.088889   
3          1.0        /Bhubaneswar/India            AS              0.088889   
4          1.0          /Hyderabad/India            AS              0.088889   

   customer_idx          customer_type  enterprise  historical_existing_cnt  \
0         32160           End-Customer  Enterprise                      NaN   
1         23122           End-Customer  Enterprise                     12.0   
2          1755           End-Customer  Enterprise                    144.0   
3          4919           End-Customer  Enterprise                      NaN   
4         17126  Specifier/ Influencer  Enterprise                      NaN   

   id_strategic_ver  it_strategic_ver  ...  

In [5]:
# Preview the first rows of the test data. Ending the cell with the bare
# expression lets the notebook render the frame through its display machinery
# instead of forcing plain print() text.
df_test.head()

      id  bant_submit                                   customer_country  \
0  19844         0.00                                        /  / Brazil   
1   9738         0.25  400 N State Of Franklin Rd  Cloud IT / Johnson...   
2   8491         1.00                                         /  / U.A.E   
3  19895         0.50                         / Madison  / United States   
4  10465         1.00                               / Sao Paulo / Brazil   

  business_unit  com_reg_ver_win_rate  customer_idx          customer_type  \
0            ID              0.073248         47466           End Customer   
1            IT                   NaN          5405           End Customer   
2            ID                   NaN         13597  Specifier/ Influencer   
3            ID              0.118644         17204                    NaN   
4            ID              0.074949          2329           End Customer   

   enterprise  historical_existing_cnt  id_strategic_ver  ...  \
0  Enterp

In [6]:
def label_encoding(series: pd.Series) -> pd.Series:
    """Replace every value of ``series`` with an integer code.

    Values are first cast to ``str``; the distinct strings are then sorted and
    numbered 0, 1, 2, ... in that (lexicographic) order. Because of the string
    cast, numbers are ordered as text (e.g. "10" sorts before "2").

    Args:
        series: Column to encode. It is not modified.

    Returns:
        A new Series with the same index as ``series`` holding the codes.
    """
    as_text = series.astype(str)

    # Sorting the distinct strings makes the code assignment deterministic.
    ordered_labels = sorted(as_text.unique())
    code_of = {label: code for code, label in enumerate(ordered_labels)}

    return as_text.map(code_of)

In [7]:
# Columns that are label-encoded (cast to str and mapped to integer codes).
label_columns = [
    "customer_country",
    "business_subarea",
    "business_area",
    "business_unit",
    "customer_type",
    "enterprise",
    "customer_job",
    "inquiry_type",
    "product_category",
    "product_subcategory",
    "product_modelname",
    "customer_country.1",
    "customer_position",
    "response_corporate",
    "expected_timeline",
]

# Stack the train rows on top of the test rows so that each column gets a
# single code table shared by both datasets. The original row indexes are
# kept as-is (no ignore_index).
df_all = pd.concat(
    [frame[label_columns] for frame in (df_train, df_test)]
)

# Encode one column at a time, overwriting the raw values in df_all.
for col in label_columns:
    encoded_column = label_encoding(df_all[col])
    df_all[col] = encoded_column

In [8]:
# Copy the jointly encoded columns back into the train and test frames.
# df_all was built by stacking df_train on top of df_test, so the first
# len(df_train) rows belong to train and the remaining rows to test.
n_train = len(df_train)

for col in label_columns:
    # .to_numpy() strips the index so the assignment is purely positional.
    # Assigning the Series directly would align on index labels, which only
    # gives the right rows while both frames carry a default 0..n-1 index and
    # otherwise silently produces NaN or misplaced values.
    df_train[col] = df_all[col].iloc[:n_train].to_numpy()
    df_test[col] = df_all[col].iloc[n_train:].to_numpy()

In [9]:
# Hold out 20% of the training rows for validation. "is_converted" is the
# target; every other column is used as a feature. The fixed random_state
# makes the shuffled split repeatable.
x_train, x_val, y_train, y_val = train_test_split(
    df_train.drop(columns="is_converted"),
    df_train["is_converted"],
    test_size=0.2,
    random_state=400,
    shuffle=True,
)

In [10]:
# Decision-tree classifier. scikit-learn permutes the features at every split,
# so without a seed the fitted tree (and the reported metrics) can change from
# run to run. The seed matches the one used for the train/validation split.
model = DecisionTreeClassifier(random_state=400)

In [11]:
# Fit on the training split, with missing feature values replaced by 0.
model.fit(x_train.fillna(value=0), y_train)

In [12]:
def get_clf_eval(y_test, y_pred=None):
    """Print the confusion matrix, accuracy, precision, recall and F1 score.

    The confusion matrix is computed with the class order ``[True, False]``,
    so its first row/column refers to ``True``. The printed captions are in
    Korean: confusion matrix, accuracy, precision, recall, F1.

    Args:
        y_test: Ground-truth labels.
        y_pred: Predicted labels. Defaults to ``None``, but the value is
            passed straight to the scikit-learn metric functions.

    Returns:
        None. The results are only printed.
    """
    class_order = [True, False]

    confusion = confusion_matrix(y_test, y_pred, labels=class_order)

    # All metrics are computed before anything is printed; each entry pairs
    # the output template with the value it reports.
    report_lines = (
        ("\n정확도: {:.4f}", accuracy_score(y_test, y_pred)),
        ("정밀도: {:.4f}", precision_score(y_test, y_pred, labels=class_order)),
        ("재현율: {:.4f}", recall_score(y_test, y_pred)),
        ("F1: {:.4f}", f1_score(y_test, y_pred, labels=class_order)),
    )

    print("오차행렬:\n", confusion)
    for template, score in report_lines:
        print(template.format(score))

In [13]:
# Predict on the held-out validation split (missing values filled with 0)
# and print the evaluation metrics.
pred = model.predict(x_val.fillna(value=0))
get_clf_eval(y_test=y_val, y_pred=pred)

오차행렬:
 [[  734   213]
 [  274 10639]]

정확도: 0.9589
정밀도: 0.7282
재현율: 0.7751
F1: 0.7509


In [14]:
# Test features: everything except the target column and the row identifier.
x_test = df_test.drop(columns=["is_converted", "id"])

In [15]:
# Predict on the test features, with missing values filled with 0.
test_pred = model.predict(x_test.fillna(0))

# Total of the predictions, i.e. the number of positive predictions assuming
# boolean labels. ndarray.sum() is vectorised, whereas the builtin sum()
# iterates the array one element at a time in Python.
int(test_pred.sum())

1185

In [17]:
# Re-read the submission CSV (same path that df_test was loaded from earlier)
# into a fresh frame, so the label-encoded columns written into df_test are
# not carried over.
test_file_path = '/content/drive/MyDrive/LGdata/submission.csv'
df_sub = pd.read_csv(test_file_path)
# Store the model's test-set predictions in the "is_converted" column.
df_sub["is_converted"] = test_pred