In [2]:
!pip install catboost

Defaulting to user installation because normal site-packages is not writeable
Collecting catboost
  Downloading catboost-1.2.2-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting plotly
  Downloading plotly-5.18.0-py3-none-any.whl (15.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.6/15.6 MB[0m [31m67.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting graphviz
  Downloading graphviz-0.20.1-py3-none-any.whl (47 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.0/47.0 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
Collecting tenacity>=6.2.0
  Downloading tenacity-8.2.3-py3-none-any.whl (24 kB)
Installing collected packages: tenacity, graphviz, plotly, catboost
Successfully installed catboost-1.2.2 graphviz-0.20.1 plotly-5.18.0 tenacity-8.2.3

[1m[[0m[34;49mnotice[0m[1;39;49m][

In [3]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    classification_report
)

In [4]:
df_train = pd.read_csv("train.csv") # 학습용 데이터
df_test = pd.read_csv("submission.csv") # 테스트 데이터(제출파일의 데이터)

In [5]:
def label_encoding(series: pd.Series) -> pd.Series:
    my_dict = {}

    series = series.astype(str)

    for idx, value in enumerate(sorted(series.unique())):
        my_dict[value] = idx
    series = series.map(my_dict)

    return series

In [6]:
label_columns = [
    "customer_country",
    "business_subarea",
    "business_area",
    "business_unit",
    "customer_type",
    "enterprise",
    "customer_job",
    "inquiry_type",
    "product_category",
    "product_subcategory",
    "product_modelname",
    "customer_country.1",
    "customer_position",
    "response_corporate",
    "expected_timeline",
]

In [7]:
df_train.drop(['id_strategic_ver', 'it_strategic_ver', 'idit_strategic_ver', 'com_reg_ver_win_rate'], axis=1, inplace=True)
df_test.drop(['id_strategic_ver', 'it_strategic_ver', 'idit_strategic_ver', 'com_reg_ver_win_rate'], axis=1, inplace=True)

In [8]:
df_all = pd.concat([df_train[label_columns], df_test[label_columns]])

for col in label_columns:
    df_all[col] = label_encoding(df_all[col])

for col in label_columns:
    df_train[col] = df_all.iloc[:len(df_train)][col]
    df_test[col] = df_all.iloc[len(df_train):][col]

In [9]:
def fill_country_from_response(row):
    if pd.isnull(row['customer_country']):
        return country_mapping.get(row['response_corporate'], row['customer_country'])
    else:
        return row['customer_country']

country_mapping = {
    'LGEIL': 'India',
    'LGESP': 'Spain',
    'LGEUS': 'United States',
    'LGEMS': 'Malaysia',
    'LGEPH': 'Philippines',
    'LGEGF': 'France',
    'LGECB': 'Cameroon',
    'LGEUK': 'United Kingdom',
    'LGESJ': 'Saudi Arabia',
    'LGECL': 'Chile',
    'LGEPS': 'Slovenia',
    'LGEIS': 'Iceland',
    'LGEPR': 'Puerto Rico',
    'LGEDG': 'Algeria',
    'LGEPL': 'Poland',
    'LGEEG': 'Egypt',
    'LGEVH': 'Venezuela',
    'LGEES': 'Spain',
    'LGETK': 'Turkey',
    'LGEAR': 'Argentina',
    'LGEKR': 'South Korea',
    'LGEHK': 'Hong Kong',
    'LGEAP': 'United Arab Emirates',
    'LGESL': 'Slovakia',
    'LGEMK': 'Malta',
    'LGEFS': 'French Southern and Antarctic Lands',
    'LGEAF': 'Afghanistan',
    'LGEIN': 'India',
    'LGELF': 'Luxembourg',
    'LGESA': 'Samoa',
    'LGECI': 'Ivory Coast',
    'LGETH': 'Thailand',
    'LGEEF': 'Eswatini',
    'LGEPT': 'Portugal',
    'LGEML': 'Maldives',
    'LGEBN': 'Brunei',
    'LGEYK': 'Yemen',
    'LGECH': 'Switzerland',
    'LGEHS': 'Australia',
    'LGETT': 'Trinidad and Tobago',
    'LGEJP': 'Japan',
    'LGEAS': 'Asian',
    'LGESW': 'Sweden',
    'LGEMC': 'Montenegro',
    'LGERO': 'Romania',
    'LGEEB': 'Brunei',
    'LGERA': 'Argentina',
    'LGEAG': 'Angola',
    'LGECZ': 'Czech Republic',
    'LGELA': 'Laos',
    'LGEIR': 'Ireland',
    'LGEBT': 'Bhutan',
    'LGEUR': 'European Union'
}

In [10]:
df_train['customer_country'] = df_train.apply(fill_country_from_response, axis=1)
df_train['inquiry_type'].fillna('request for partnership', inplace=True)
df_train['business_area'].fillna('Unknown', inplace=True)
df_train['expected_timeline'] = df_train['expected_timeline'].fillna('nan').apply(lambda x: str(x).split()[0].lower())
# df_train['customer_type'].fillna(pd.Series(np.random.choice(['Specifier/Influencer', 'Unknown'], p=[0.3, 0.7], size=len(df_train))), inplace=True)
df_train['customer_type'].fillna('Unknown', inplace=True)
df_train['customer_job'].fillna('Unknown', inplace=True)
# df_train['product_category'].fillna(pd.Series(np.random.choice(['interactive digital board', 'Unknown'], p=[0.5, 0.5], size=len(df_train))), inplace=True)
df_train['product_category'].fillna('Unknown', inplace=True)

In [11]:
x_train, x_val, y_train, y_val = train_test_split(
    df_train.drop("is_converted", axis=1),
    df_train["is_converted"],
    test_size=0.2,
    shuffle=True,
    random_state=400,
)

In [12]:
numeric_features = df_train.select_dtypes(include=['int64', 'float64']).columns
categorical_features = df_train.select_dtypes(include=['object']).columns

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [16]:
catboost_model = CatBoostClassifier(verbose=0, iterations=1000, depth=6, learning_rate=0.1, loss_function='Logloss')

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', catboost_model)
])

pipeline.fit(x_train, y_train)

y_val_pred = pipeline.predict(x_val)

In [18]:
y_val_pred = np.array([True if pred == 'True' else False for pred in y_val_pred])

print("Classification Report:\n", classification_report(y_val, y_val_pred))
print("오차행렬:\n", confusion_matrix(y_val, y_val_pred, labels=[True, False]))
print("\n정확도: {:.4f}".format(accuracy_score(y_val, y_val_pred)))
print("정밀도: {:.4f}".format(precision_score(y_val, y_val_pred, labels=[True, False])))
print("재현율: {:.4f}".format(recall_score(y_val, y_val_pred)))
print("F1: {:.4f}".format(f1_score(y_val, y_val_pred, labels=[True, False])))

Classification Report:
               precision    recall  f1-score   support

       False       0.98      0.99      0.99     10913
        True       0.91      0.75      0.82       947

    accuracy                           0.97     11860
   macro avg       0.95      0.87      0.90     11860
weighted avg       0.97      0.97      0.97     11860

오차행렬:
 [[  708   239]
 [   68 10845]]

정확도: 0.9741
정밀도: 0.9124
재현율: 0.7476
F1: 0.8218


In [19]:
x_test = df_test.drop(["is_converted", "id"], axis=1)

In [20]:
x_test['customer_country'] = df_train.apply(fill_country_from_response, axis=1)
x_test['inquiry_type'].fillna('request for partnership', inplace=True)
x_test['business_area'].fillna('Unknown', inplace=True)
x_test['expected_timeline'] = df_train['expected_timeline'].fillna('nan').apply(lambda x: str(x).split()[0].lower())
# x_test['customer_type'].fillna(pd.Series(np.random.choice(['Specifier/Influencer', 'Unknown'], p=[0.3, 0.7], size=len(x_test))), inplace=True)
x_test['customer_type'].fillna('Unknown', inplace=True)
x_test['customer_job'].fillna('Unknown', inplace=True)
# x_test['product_category'].fillna(pd.Series(np.random.choice(['interactive digital board', 'Unknown'], p=[0.5, 0.5], size=len(x_test))), inplace=True)
x_test['product_category'].fillna('Unknown', inplace=True)

In [23]:
test_pred = pipeline.predict(x_test)
test_pred = np.array([True if pred == 'True' else False for pred in test_pred])

sum(test_pred) # True로 예측된 개수

589

In [24]:
df_sub = pd.read_csv("submission.csv")
df_sub["is_converted"] = test_pred

# 제출 파일 저장
df_sub.to_csv("submission.csv", index=False)
