In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    classification_report
)


In [None]:
# Mount Google Drive so the CSV files stored there are readable from this runtime.
from google.colab import drive

drive.mount('/content/drive')

# Locations of the training table and the submission (test) table on Drive.
train_file_path = '/content/drive/MyDrive/LGdata/train.csv'
test_file_path = '/content/drive/MyDrive/LGdata/submission_updated.csv'

# Read both tables in one pass, training file first.
df_train, df_test = (
    pd.read_csv(csv_path) for csv_path in (train_file_path, test_file_path)
)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
def label_encoding(series: pd.Series) -> pd.Series:
    """Replace each value of ``series`` with an integer code.

    Values are first converted to ``str``; the distinct strings are sorted
    and numbered 0, 1, 2, ... in that order, so codes follow the string
    (lexicographic) ordering of the values rather than any numeric ordering.

    Args:
        series: Column to encode.

    Returns:
        A Series with the same index as ``series`` holding the integer codes.
    """
    as_text = series.astype(str)
    # One code per distinct string, assigned in sorted order.
    code_of = {label: code for code, label in enumerate(sorted(as_text.unique()))}
    return as_text.map(code_of)

In [None]:
# Columns that are turned into integer codes by label_encoding.
label_columns = [
    "customer_country",
    "business_subarea",
    "business_area",
    "business_unit",
    "customer_type",
    "enterprise",
    "customer_job",
    "inquiry_type",
    "product_category",
    "product_subcategory",
    "product_modelname",
    "customer_country.1",
    "customer_position",
    "response_corporate",
    "expected_timeline",
]

# Stack the train rows on top of the test rows so that a value receives the
# same code in both tables.
df_all = pd.concat([df_train[label_columns], df_test[label_columns]])

# Overwrite every stacked column with its encoded version.
df_all = df_all.assign(
    **{name: label_encoding(df_all[name]) for name in label_columns}
)

In [None]:
# df_all holds the train rows first and the test rows after them, so a
# positional cut at len(df_train) separates the two parts again.
n_train = len(df_train)
train_codes = df_all.iloc[:n_train]
test_codes = df_all.iloc[n_train:]

# Replace the original columns with their encoded counterparts.
for col in label_columns:
    df_train[col] = train_codes[col]
    df_test[col] = test_codes[col]

In [None]:
# Separate the target column from the remaining columns of the training table.
train_features = df_train.drop(columns="is_converted")
train_target = df_train["is_converted"]

# Hold out 20% of the rows for validation; shuffled with a fixed seed so the
# split is repeatable.
x_train, x_val, y_train, y_val = train_test_split(
    train_features,
    train_target,
    test_size=0.2,
    shuffle=True,
    random_state=400,
)

In [None]:
# Pick the columns for each preprocessing branch by dtype of the training split.
numeric_features = x_train.select_dtypes(include=['int64', 'float64']).columns
categorical_features = x_train.select_dtypes(include=['object']).columns

# Numeric branch: fill missing values with the column mean, then standardise.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(numeric_steps)

# Categorical branch: fill missing values with the most frequent value, then
# one-hot encode; categories not seen during fit are ignored at transform time.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
categorical_transformer = Pipeline(categorical_steps)

# Columns matched by neither selection above are not listed here, so they fall
# under ColumnTransformer's default remainder handling.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

In [None]:
# Random forest classifier with a fixed seed, placed behind the shared preprocessor.
model = RandomForestClassifier(random_state=400)
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', model),
])

# Pipeline.fit returns the fitted pipeline, so fitting and predicting on the
# validation split can be chained.
y_val_pred = pipeline.fit(x_train, y_train).predict(x_val)  # F1_score : 0.8107
# Recorded experiment: pipeline.predict(x_val.fillna(0)) gave F1_score : 0.2725

In [None]:
# Validation-set report. The printed Korean labels mean, in order:
# confusion matrix, accuracy, precision, recall.
#
# Precision and F1 previously had `labels=[True, False]` placed outside the
# metric call, so it was handed to str.format (which silently ignores unused
# keyword arguments) instead of the metric. The stray argument is removed;
# the metrics keep their defaults, so the printed values are unchanged.
print("Classification Report:\n", classification_report(y_val, y_val_pred))
# labels=[True, False] orders the matrix with the True class in the first row/column.
print("오차행렬:\n", confusion_matrix(y_val, y_val_pred, labels=[True, False]))
print("\n정확도: {:.4f}".format(accuracy_score(y_val, y_val_pred)))
print("정밀도: {:.4f}".format(precision_score(y_val, y_val_pred)))
print("재현율: {:.4f}".format(recall_score(y_val, y_val_pred)))
print("F1: {:.4f}".format(f1_score(y_val, y_val_pred)))

Classification Report:
               precision    recall  f1-score   support

       False       0.97      1.00      0.99     10913
        True       0.96      0.70      0.81       947

    accuracy                           0.97     11860
   macro avg       0.97      0.85      0.90     11860
weighted avg       0.97      0.97      0.97     11860

오차행렬:
 [[  666   281]
 [   30 10883]]

정확도: 0.9738
정밀도: 0.9569
재현율: 0.7033
F1: 0.8107


In [None]:
# Test-set evaluation. 'id' is dropped together with the target so that it is
# not passed to the pipeline as a feature.
x_test = df_test.drop(['is_converted', 'id'], axis=1)
y_test = df_test['is_converted']

# Predict on the raw test features. The earlier `x_test.fillna(0)` removed the
# NaNs before the pipeline's imputers could fill them, so test rows were
# preprocessed differently from the training and validation rows. The same
# fillna(0) on the validation split was recorded as lowering F1 from 0.8107
# to 0.2725.
y_test_pred = pipeline.predict(x_test)

# The printed Korean labels mean, in order: confusion matrix, accuracy,
# precision, recall. `labels=[True, False]` used to be passed to str.format
# for precision and F1 (and silently ignored); that stray argument is removed.
# print("Classification Report:\n", classification_report(y_test, y_test_pred))
print("오차행렬:\n", confusion_matrix(y_test, y_test_pred, labels=[True, False]))
print("\n정확도: {:.4f}".format(accuracy_score(y_test, y_test_pred)))
print("정밀도: {:.4f}".format(precision_score(y_test, y_test_pred)))
print("재현율: {:.4f}".format(recall_score(y_test, y_test_pred)))
print("F1: {:.4f}".format(f1_score(y_test, y_test_pred)))


오차행렬:
 [[ 103 1138]
 [  28 4002]]

정확도: 0.7788
정밀도: 0.7863
재현율: 0.0830
F1: 0.1501
