In [1]:
pip install catboost

Collecting catboost
  Downloading catboost-1.2.2-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.2


In [23]:
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    classification_report
)

In [3]:
from google.colab import drive
drive.mount('/content/drive')
train_file_path = '/content/drive/MyDrive/LGdata/train.csv'
test_file_path = '/content/drive/MyDrive/LGdata/submission.csv'
df_train = pd.read_csv(train_file_path)
df_test = pd.read_csv(test_file_path)

Mounted at /content/drive


In [5]:
def label_encoding(series: pd.Series) -> pd.Series:
    my_dict = {}

    series = series.astype(str)

    for idx, value in enumerate(sorted(series.unique())):
        my_dict[value] = idx
    series = series.map(my_dict)

    return series

In [6]:
label_columns = [
    "customer_country",
    "business_subarea",
    "business_area",
    "business_unit",
    "customer_type",
    "enterprise",
    "customer_job",
    "inquiry_type",
    "product_category",
    "product_subcategory",
    "product_modelname",
    "customer_country.1",
    "customer_position",
    "response_corporate",
    "expected_timeline",
]

In [7]:
df_train.drop(['id_strategic_ver', 'it_strategic_ver', 'idit_strategic_ver'], axis=1, inplace=True)
df_test.drop(['id_strategic_ver', 'it_strategic_ver', 'idit_strategic_ver'], axis=1, inplace=True)

In [8]:
df_all = pd.concat([df_train[label_columns], df_test[label_columns]])

for col in label_columns:
    df_all[col] = label_encoding(df_all[col])

for col in label_columns:
    df_train[col] = df_all.iloc[:len(df_train)][col]
    df_test[col] = df_all.iloc[len(df_train):][col]

In [18]:
def fill_country_from_response(row):
    if pd.isnull(row['customer_country']):
        return country_mapping.get(row['response_corporate'], row['customer_country'])
    else:
        return row['customer_country']

country_mapping = {
    'LGEIL': 'India',
    'LGESP': 'Spain',
    'LGEUS': 'United States',
    'LGEMS': 'Malaysia',
    'LGEPH': 'Philippines',
    'LGEGF': 'France',
    'LGECB': 'Cameroon',
    'LGEUK': 'United Kingdom',
    'LGESJ': 'Saudi Arabia',
    'LGECL': 'Chile',
    'LGEPS': 'Slovenia',
    'LGEIS': 'Iceland',
    'LGEPR': 'Puerto Rico',
    'LGEDG': 'Algeria',
    'LGEPL': 'Poland',
    'LGEEG': 'Egypt',
    'LGEVH': 'Venezuela',
    'LGEES': 'Spain',
    'LGETK': 'Turkey',
    'LGEAR': 'Argentina',
    'LGEKR': 'South Korea',
    'LGEHK': 'Hong Kong',
    'LGEAP': 'United Arab Emirates',
    'LGESL': 'Slovakia',
    'LGEMK': 'Malta',
    'LGEFS': 'French Southern and Antarctic Lands',
    'LGEAF': 'Afghanistan',
    'LGEIN': 'India',
    'LGELF': 'Luxembourg',
    'LGESA': 'Samoa',
    'LGECI': 'Ivory Coast',
    'LGETH': 'Thailand',
    'LGEEF': 'Eswatini',
    'LGEPT': 'Portugal',
    'LGEML': 'Maldives',
    'LGEBN': 'Brunei',
    'LGEYK': 'Yemen',
    'LGECH': 'Switzerland',
    'LGEHS': 'Australia',
    'LGETT': 'Trinidad and Tobago',
    'LGEJP': 'Japan',
    'LGEAS': 'Asian',
    'LGESW': 'Sweden',
    'LGEMC': 'Montenegro',
    'LGERO': 'Romania',
    'LGEEB': 'Brunei',
    'LGERA': 'Argentina',
    'LGEAG': 'Angola',
    'LGECZ': 'Czech Republic',
    'LGELA': 'Laos',
    'LGEIR': 'Ireland',
    'LGEBT': 'Bhutan',
    'LGEUR': 'European Union'
}

In [39]:
df_train['customer_country'] = df_train.apply(fill_country_from_response, axis=1)
df_train['inquiry_type'].fillna('request for partnership', inplace=True)
df_train['business_area'].fillna('Unknown', inplace=True)
df_train['expected_timeline'] = df_train['expected_timeline'].fillna('nan').apply(lambda x: str(x).split()[0].lower())
# df_train['customer_type'].fillna(pd.Series(np.random.choice(['Specifier/Influencer', 'Unknown'], p=[0.3, 0.7], size=len(df_train))), inplace=True)
df_train['customer_type'].fillna('Unknown', inplace=True)
df_train['customer_job'].fillna('Unknown', inplace=True)
# df_train['product_category'].fillna(pd.Series(np.random.choice(['interactive digital board', 'Unknown'], p=[0.5, 0.5], size=len(df_train))), inplace=True)
df_train['product_category'].fillna('Unknown', inplace=True)

In [49]:
x_train, x_val, y_train, y_val = train_test_split(
    df_train.drop("is_converted", axis=1),
    df_train["is_converted"],
    test_size=0.4,
    shuffle=True,
    random_state=400,
)

In [50]:
numeric_features = df_train.select_dtypes(include=['int64', 'float64']).columns
categorical_features = df_train.select_dtypes(include=['object']).columns

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

In [51]:
catboost_model = CatBoostClassifier(random_state=400, verbose=0)

pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', catboost_model)
])

pipeline.fit(x_train, y_train)

y_val_pred = pipeline.predict(x_val)

In [52]:
y_val_pred_bool = np.array([True if pred == 'True' else False for pred in y_val_pred])

print("Classification Report:\n", classification_report(y_val, y_val_pred_bool))
print("오차행렬:\n", confusion_matrix(y_val, y_val_pred_bool, labels=[True, False]))
print("\n정확도: {:.4f}".format(accuracy_score(y_val, y_val_pred_bool)))
print("정밀도: {:.4f}".format(precision_score(y_val, y_val_pred_bool, labels=[True, False])))
print("재현율: {:.4f}".format(recall_score(y_val, y_val_pred_bool)))
print("F1: {:.4f}".format(f1_score(y_val, y_val_pred_bool, labels=[True, False])))

Classification Report:
               precision    recall  f1-score   support

       False       0.98      0.99      0.98     21756
        True       0.93      0.72      0.81      1964

    accuracy                           0.97     23720
   macro avg       0.95      0.86      0.90     23720
weighted avg       0.97      0.97      0.97     23720

오차행렬:
 [[ 1410   554]
 [  109 21647]]

정확도: 0.9720
정밀도: 0.9282
재현율: 0.7179
F1: 0.8096


In [53]:
x_test = df_test.drop(["is_converted", "id"], axis=1)

In [54]:
x_test['customer_country'] = df_train.apply(fill_country_from_response, axis=1)
x_test['inquiry_type'].fillna('request for partnership', inplace=True)
x_test['business_area'].fillna('Unknown', inplace=True)
x_test['expected_timeline'] = df_train['expected_timeline'].fillna('nan').apply(lambda x: str(x).split()[0].lower())
# x_test['customer_type'].fillna(pd.Series(np.random.choice(['Specifier/Influencer', 'Unknown'], p=[0.3, 0.7], size=len(x_test))), inplace=True)
x_test['customer_type'].fillna('Unknown', inplace=True)
x_test['customer_job'].fillna('Unknown', inplace=True)
# x_test['product_category'].fillna(pd.Series(np.random.choice(['interactive digital board', 'Unknown'], p=[0.5, 0.5], size=len(x_test))), inplace=True)
x_test['product_category'].fillna('Unknown', inplace=True)

In [55]:
test_pred = pipeline.predict(x_test)

test_pred_bool = np.array([True if pred == 'True' else False for pred in test_pred])

sum_true_predictions = sum(test_pred_bool)
print("True로 예측된 개수:", sum_true_predictions)

True로 예측된 개수: 549
