### 1. 데이터 확인
### 필수 라이브러리

In [173]:
import pandas as pd
import numpy as np
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import seaborn as sns
from sklearn.impute import SimpleImputer
import scipy.stats as stats
import statsmodels.api as sm
import matplotlib.pyplot as plt
import re

### 데이터셋 읽어오기

In [174]:
df_train = pd.read_csv("train.csv")  # 학습용 데이터
df_test = pd.read_csv("submission.csv")  # 테스트 데이터(제출 파일의 데이터)

In [175]:
# 지역 코드에 따른 지역명 매핑
region_mapping = {
    "EU": ["LGEAG", "LGECZ", "LGEFS", "LGEDG", "LGEHS", "LGEMK", "LGEIS", "LGESC", "LGEEH", "LGEBN", "LGEWR", "LGEPL", "LGEMA", "LGEPT", "LGERO", "LGEES", "LGENO", "LGESW", "LGEUK"],
    "RC": ["LGEAK", "LGERM", "LGERI", "LGERA", "LGEUR", "LGELV"],
    "MA": ["LGEAS", "LGEEG", "LGELF", "LGESK", "LGEMC", "LGESA", "LGETU", "LGEOT", "LGEDF", "LGEGF", "LGEME", "LGEAF", "LEAO", "LGENI", "LGETK", "LGEAT", "LGESJ", "LGEEF", "LGEYK", "LGEIR"],
    "AP": ["LGEAP", "LGEQA", "LGETL", "LGECH", "LGEYT", "LGETR", "LGETA", "LGESY", "LGESH", "LGEQH", "LGEQD", "LGEPN", "LGENE", "LGEKS", "LGEHZ", "LGEHN", "LGEHK", "LGEIL", "LGEPH", "LGEVH", "LGEKR", "LGESL", "LGEIN", "LGETH", "LGEML", "LGETT", "LGEJP"],
    "NA": ["LGECI", "LGERS", "LGEMX", "LGEMS", "LGEMM", "LGEMR", "LGEUS", "LGEMU", "LGEAI"],
    "LA": ["LGEAG", "LGEBR", "LGECL", "LGEVZ", "LGECB", "LGEPS", "LGEPR", "LGESP", "LGEAR"],
    "OT": ["LGEEB", "LGELA", "LGEBT", "MA", "RC"]
}


def categorize_region(code):
    for region, codes in region_mapping.items():
        if code in codes:
            return region
    return "ETC"  

df_train['region'] = df_train['response_corporate'].apply(categorize_region)
df_test['region'] = df_test['response_corporate'].apply(categorize_region)

In [176]:
def extract_country(value):
    if not isinstance(value, str):
        value = str(value)
    match = re.search(r'\/([^\/]+)$', value)
    if match:
        return match.group(1)
    return None

df_train['customer_country'] = df_train['customer_country'].apply(extract_country)
df_test['customer_country'] = df_test['customer_country'].apply(extract_country)

### 2. 데이터 전처리

In [177]:
### 결측치 확인

In [178]:
print(df_train.isna().sum()/len(df_train))

bant_submit                0.000000
customer_country           0.070473
business_unit              0.000000
com_reg_ver_win_rate       0.754330
customer_idx               0.000000
customer_type              0.741345
enterprise                 0.000000
historical_existing_cnt    0.768023
id_strategic_ver           0.941921
it_strategic_ver           0.981096
idit_strategic_ver         0.923017
customer_job               0.315908
lead_desc_length           0.000000
inquiry_type               0.015869
product_category           0.326717
product_subcategory        0.844264
product_modelname          0.844365
customer_country.1         0.016560
customer_position          0.000000
response_corporate         0.000000
expected_timeline          0.520464
ver_cus                    0.000000
ver_pro                    0.000000
ver_win_rate_x             0.689421
ver_win_ratio_per_bu       0.741918
business_area              0.689421
business_subarea           0.906811
lead_owner                 0

In [179]:
[df_train.isna().sum()/len(df_train) > 0.6]

[bant_submit                False
 customer_country           False
 business_unit              False
 com_reg_ver_win_rate        True
 customer_idx               False
 customer_type               True
 enterprise                 False
 historical_existing_cnt     True
 id_strategic_ver            True
 it_strategic_ver            True
 idit_strategic_ver          True
 customer_job               False
 lead_desc_length           False
 inquiry_type               False
 product_category           False
 product_subcategory         True
 product_modelname           True
 customer_country.1         False
 customer_position          False
 response_corporate         False
 expected_timeline          False
 ver_cus                    False
 ver_pro                    False
 ver_win_rate_x              True
 ver_win_ratio_per_bu        True
 business_area               True
 business_subarea            True
 lead_owner                 False
 is_converted               False
 region       

In [180]:
df_train.columns[df_train.isna().sum()/len(df_train) > 0.6]

Index(['com_reg_ver_win_rate', 'customer_type', 'historical_existing_cnt',
       'id_strategic_ver', 'it_strategic_ver', 'idit_strategic_ver',
       'product_subcategory', 'product_modelname', 'ver_win_rate_x',
       'ver_win_ratio_per_bu', 'business_area', 'business_subarea'],
      dtype='object')

In [181]:
# 제거 칼럼 
del_cols = ['customer_country.1','it_strategic_ver', 'id_strategic_ver', 'idit_strategic_ver','product_subcategory', 'product_modelname', 'ver_win_rate_x',
       'ver_win_ratio_per_bu', 'business_area', 'business_subarea']
df_train.drop(del_cols, axis=1, inplace=True)
df_test.drop(del_cols, axis=1, inplace=True)

In [182]:
# target('is_converted')의 분포 확인
target_distribution = df_train['is_converted'].value_counts()
min_samples = target_distribution.min()

In [183]:
# 언더 샘플링

#from imblearn.under_sampling import OneSidedSelection

#oss = OneSidedSelection(random_state=42)
#X_resampled, y_resampled = oss.fit_resample(df_train.drop('is_converted', axis=1), df_train['is_converted'])
#df_train_balanced = pd.concat([X_resampled, y_resampled], axis=1)
df_train_balanced = pd.concat([
    df_train[df_train['is_converted'] == True].sample(min_samples, random_state=42),
    df_train[df_train['is_converted'] == False].sample(min_samples, random_state=42)
], ignore_index=True)

In [184]:
df_train_balanced['is_converted'].value_counts()

True     4850
False    4850
Name: is_converted, dtype: int64

In [185]:
df_train = df_train_balanced

In [186]:
# 수치형 및 범주형 칼럼 리스트를 정의
numeric_columns = df_train.select_dtypes(include=[np.number]).columns.tolist()
categorical_columns = df_train.select_dtypes(exclude=[np.number]).columns.tolist()

In [193]:
# 결측치 처리
numeric_imputer = SimpleImputer(strategy='mean')
categorical_imputer = SimpleImputer(strategy='most_frequent')

df_train[numeric_columns] = numeric_imputer.fit_transform(df_train[numeric_columns])
df_test[numeric_columns] = numeric_imputer.transform(df_test[numeric_columns])

df_train[categorical_columns] = categorical_imputer.fit_transform(df_train[categorical_columns])
df_test[categorical_columns] = categorical_imputer.transform(df_test[categorical_columns])

In [195]:
# customer_country안의 결측치 뒤의 값으로 대체
df_train['customer_country'].interpolate(inplace = True)
df_train.iloc[0:52299, 1:2]

Unnamed: 0,customer_country
0,62
1,93
2,91
3,173
4,93
...,...
9695,93
9696,80
9697,137
9698,91


In [196]:
def label_encoding(series: pd.Series) -> pd.Series:
    """범주형 데이터를 시리즈 형태로 받아 숫자형 데이터로 변환합니다."""

    my_dict = {}

    # 모든 요소를 문자열로 변환
    series = series.astype(str)

    for idx, value in enumerate(sorted(series.unique())):
        my_dict[value] = idx
    series = series.map(my_dict)

    return series

In [169]:
# 레이블 인코딩할 칼럼들
#label_columns = [
#    "customer_country",
#    "business_unit",
#    "enterprise",
#    "customer_job",
#    "inquiry_type",
#    "product_category",
#    "customer_position",
#    "response_corporate",
#    "expected_timeline",
#]

#df_all = pd.concat([train[label_columns], test[label_columns]])

#for col in label_columns:
#    test[col] = label_encoding(test[col])

학습 데이터와 제출 데이터 분리

In [197]:
for col in categorical_columns:  
    df_train[col] = label_encoding(df_train[col])
    df_test[col] = label_encoding(df_test[col])

### 2-2. 학습, 검증 데이터 분리

In [198]:
x_train, x_val, y_train, y_val = train_test_split(
    df_train.drop("is_converted", axis=1),
    df_train["is_converted"],
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

In [199]:
# 오버 샘플링
#from imblearn.over_sampling import ADASYN
#adasyn = ADASYN(random_state=42)
#x_train_oversampled, y_train_oversampled = adasyn.fit_resample(x_train, y_train)
#print(len(x_train_oversampled), len(y_train_oversampled))
#print(y_train_oversampled.value_counts())
from imblearn.over_sampling import BorderlineSMOTE
smote = BorderlineSMOTE(random_state=42)
x_train_smote, y_train_smote = smote.fit_resample(x_train, y_train)
print(len(x_train_smote), len(y_train_smote))
print(y_train_smote.value_counts())


7766 7766
1    3883
0    3883
Name: is_converted, dtype: int64


### 3. 모델 학습
### 모델 정의

In [200]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from mlxtend.classifier import StackingClassifier

In [201]:
models=[
    LogisticRegression(random_state=42),
    DecisionTreeClassifier(random_state=42),
    GradientBoostingClassifier(n_estimators=1000,learning_rate=0.05,random_state=42),
    RandomForestClassifier(n_estimators=1000,random_state=42),
    LGBMClassifier(n_estimators=1000,learning_rate=0.05,random_state=42),
    XGBClassifier(learning_rate=0.05,random_state=42,objective='binary:logistic',eval_metric='logloss'),
    CatBoostClassifier(random_seed=42),
    StackingClassifier(classifiers=(LogisticRegression(random_state=42),
                                    RandomForestClassifier(random_state=42),
                                    CatBoostClassifier(random_seed=42)),
                                    meta_classifier=CatBoostClassifier(random_seed=42))
]
model_names=[
    'LogisticRegression',
    'DecisionTreeClassifier',
    'GradientBoostingClassifier',
    'RandomForestClassifier',
    'LGBMClassifier',
    'XGBClassifier',
    'CatBoostClassifier',
    'StackingClassifier'
]

In [139]:
#model = DecisionTreeClassifier(class_weight='balanced')

In [202]:
# 모델 성능 테스트
def get_clf_eval(y_test, y_pred=None):
    confusion = confusion_matrix(y_test, y_pred, labels=[True, False])
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, labels=[True, False])
    recall = recall_score(y_test, y_pred)
    F1 = f1_score(y_test, y_pred, labels=[True, False])

    print("오차행렬:\n", confusion)
    print("\n정확도: {:.4f}".format(accuracy))
    print("정밀도: {:.4f}".format(precision))
    print("재현율: {:.4f}".format(recall))
    print("F1: {:.4f}".format(F1))
    return F1

In [203]:
# # test code
scores=[]
names=[]
ms=[]
for i,classifier in enumerate(models):
    print(model_names[i]+' start\n')
    model=classifier.fit(x_train, y_train)
    pred=model.predict(x_val)
    scores.append(get_clf_eval(y_val,pred))
    ms.append(model)
    names.append(model_names[i])
    
score=pd.DataFrame(data=names,columns=['Name'])
score['score']=scores
score

LogisticRegression start

오차행렬:
 [[555 418]
 [307 660]]

정확도: 0.6263
정밀도: 0.6439
재현율: 0.5704
F1: 0.6049
DecisionTreeClassifier start

오차행렬:
 [[891  82]
 [129 838]]

정확도: 0.8912
정밀도: 0.8735
재현율: 0.9157
F1: 0.8941
GradientBoostingClassifier start



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


오차행렬:
 [[915  58]
 [ 91 876]]

정확도: 0.9232
정밀도: 0.9095
재현율: 0.9404
F1: 0.9247
RandomForestClassifier start

오차행렬:
 [[907  66]
 [ 82 885]]

정확도: 0.9237
정밀도: 0.9171
재현율: 0.9322
F1: 0.9246
LGBMClassifier start

[LightGBM] [Info] Number of positive: 3877, number of negative: 3883
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000242 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1342
[LightGBM] [Info] Number of data points in the train set: 7760, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499613 -> initscore=-0.001546
[LightGBM] [Info] Start training from score -0.001546
오차행렬:
 [[920  53]
 [ 72 895]]

정확도: 0.9356
정밀도: 0.9274
재현율: 0.9455
F1: 0.9364
XGBClassifier start

오차행렬:
 [[898  75]
 [107 860]]

정확도: 0.9062
정밀도: 0.8935
재현율: 0.9229
F1: 0.9080
CatBoostClassifier start

Learning rate set to 0.024712
0:	l

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Learning rate set to 0.024712
0:	learn: 0.6720570	total: 3.27ms	remaining: 3.26s
1:	learn: 0.6544729	total: 6.38ms	remaining: 3.19s
2:	learn: 0.6370916	total: 9.37ms	remaining: 3.11s
3:	learn: 0.6172862	total: 12.9ms	remaining: 3.22s
4:	learn: 0.5993409	total: 16.5ms	remaining: 3.28s
5:	learn: 0.5816874	total: 19.4ms	remaining: 3.21s
6:	learn: 0.5668864	total: 22.5ms	remaining: 3.19s
7:	learn: 0.5444574	total: 25.5ms	remaining: 3.16s
8:	learn: 0.5320888	total: 29.1ms	remaining: 3.21s
9:	learn: 0.5174708	total: 32.9ms	remaining: 3.25s
10:	learn: 0.5077370	total: 36.3ms	remaining: 3.27s
11:	learn: 0.4967566	total: 39.3ms	remaining: 3.24s
12:	learn: 0.4892080	total: 42.7ms	remaining: 3.24s
13:	learn: 0.4783240	total: 47.2ms	remaining: 3.32s
14:	learn: 0.4708725	total: 50.3ms	remaining: 3.3s
15:	learn: 0.4642201	total: 53.3ms	remaining: 3.28s
16:	learn: 0.4522942	total: 56.4ms	remaining: 3.26s
17:	learn: 0.4461419	total: 60ms	remaining: 3.27s
18:	learn: 0.4430310	total: 63.1ms	remaining: 3

0.9295199182839632

In [208]:
best_model_name=score.sort_values(by=['score']).iloc[-1]['Name']
best_model=ms[model_names.index(best_model_name)]
print(best_model)

LGBMClassifier(learning_rate=0.05, n_estimators=1000, random_state=42)


In [209]:
model = LGBMClassifier(learning_rate=0.05, n_estimators=1000, random_state=42)

In [210]:
model.fit(x_train_smote, y_train_smote)

[LightGBM] [Info] Number of positive: 3883, number of negative: 3883
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000291 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1349
[LightGBM] [Info] Number of data points in the train set: 7766, number of used features: 19
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


### 모델 학습

In [140]:
#model.fit(x_train.fillna(0), y_train)

### 모델 성능 보기

In [211]:
def get_clf_eval(y_test, y_pred=None):
    confusion = confusion_matrix(y_test, y_pred, labels=[True, False])
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, labels=[True, False])
    recall = recall_score(y_test, y_pred)
    F1 = f1_score(y_test, y_pred, labels=[True, False])

    print("오차행렬:\n", confusion)
    print("\n정확도: {:.4f}".format(accuracy))
    print("정밀도: {:.4f}".format(precision))
    print("재현율: {:.4f}".format(recall))
    print("F1: {:.4f}".format(F1))

In [212]:
pred = model.predict(x_val.fillna(0))
get_clf_eval(y_val, pred)

오차행렬:
 [[925  48]
 [ 70 897]]

정확도: 0.9392
정밀도: 0.9296
재현율: 0.9507
F1: 0.9400


# 4. 제출하기

## 테스트 데이터 예측

In [213]:
# 예측에 필요한 데이터 분리
x_test = df_test.drop(["is_converted", "id"], axis=1)

In [214]:
test_pred = model.predict(x_test.fillna(0))
sum(test_pred) # True로 예측된 개수

1405

## 제출파일 작성

In [215]:
# 제출 데이터 읽어오기 (df_test는 전처리된 데이터가 저장됨)
df_sub = pd.read_csv("submission.csv")
df_sub["is_converted"] = test_pred

# 제출 파일 저장
df_sub.to_csv("submission.csv", index=False)

### 우측 상단의 제출 버튼을 클릭해 결과를 확인하세요