# 영업 성공 여부 분류 경진대회

## 1. 데이터 확인

### 필수 라이브러리

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier 
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE
import re
from sklearn.utils import shuffle

### 데이터 셋 읽어오기

In [2]:
# 데이터 로드
df_train = pd.read_csv("train.csv")  # 학습용 데이터
df_test = pd.read_csv("submission.csv")  # 테스트 데이터(제출 파일의 데이터)

In [3]:
print(df_train.describe())  # 수치형 데이터의 기술 통계 확인

        bant_submit  com_reg_ver_win_rate  customer_idx  \
count  59299.000000          14568.000000  59299.000000   
mean       0.634593              0.091685  27114.556333   
std        0.286066              0.150988  14653.911888   
min        0.000000              0.003788      2.000000   
25%        0.500000              0.019900  14913.000000   
50%        0.500000              0.049180  26774.000000   
75%        1.000000              0.074949  40368.500000   
max        1.000000              1.000000  47466.000000   

       historical_existing_cnt  id_strategic_ver  it_strategic_ver  \
count             13756.000000            3444.0            1121.0   
mean                 19.912184               1.0               1.0   
std                  44.697938               0.0               0.0   
min                   0.000000               1.0               1.0   
25%                   1.000000               1.0               1.0   
50%                   4.000000               1.0

In [4]:
df_train.iloc[30:60]

Unnamed: 0,bant_submit,customer_country,business_unit,com_reg_ver_win_rate,customer_idx,customer_type,enterprise,historical_existing_cnt,id_strategic_ver,it_strategic_ver,...,response_corporate,expected_timeline,ver_cus,ver_pro,ver_win_rate_x,ver_win_ratio_per_bu,business_area,business_subarea,lead_owner,is_converted
30,1.0,/Johannesburh/South Africa,AS,0.040816,42282,Specifier/ Influencer,SMB,,,,...,LGESA,less than 3 months,0,0,0.003079,0.026846,corporate / office,Construction,26,False
31,1.0,/Posey/United States,AS,,22295,Specifier/ Influencer,SMB,,,,...,LGEUS,3 months ~ 6 months,0,0,0.003079,0.026846,corporate / office,Agriculture,27,False
32,1.0,/Riyadh/Saudi Arabia,AS,0.040816,981,End-Customer,Enterprise,,,,...,LGESJ,3 months ~ 6 months,1,0,0.003079,0.026846,corporate / office,Pharmaceutical,13,False
33,1.0,/Bauan Batangas/Philippines,AS,0.066667,31698,Specifier/ Influencer,Enterprise,1.0,,,...,LGEPH,less than 3 months,0,0,0.003079,0.026846,corporate / office,Engineering,28,False
34,1.0,/Metro manila/Philippines,AS,0.066667,22987,Specifier/ Influencer,Enterprise,,,,...,LGEPH,less than 3 months,0,0,0.003079,0.026846,corporate / office,Construction,29,False
35,1.0,/Taguig/Philippines,AS,0.066667,46824,,Enterprise,,,,...,LGEPH,less than 3 months,0,0,0.003079,0.026846,corporate / office,,29,False
36,1.0,/Mysore/India,AS,0.088889,17727,Specifier/ Influencer,Enterprise,,,,...,LGEIL,less than 3 months,0,0,0.003079,0.026846,corporate / office,Construction,30,False
37,1.0,/Kolkata/India,AS,0.088889,41106,End-Customer,SMB,,,,...,LGEIL,less than 3 months,1,0,0.003079,0.026846,corporate / office,IT/Software,31,False
38,1.0,/Chennai/India,AS,0.088889,14937,End-Customer,Enterprise,,,,...,LGEIL,less than 3 months,1,0,0.003079,0.026846,corporate / office,Engineering,32,False
39,1.0,/Bangalore/India,AS,0.088889,10939,End-Customer,Enterprise,,,,...,LGEIL,less than 3 months,1,0,0.003079,0.026846,corporate / office,Others,18,False


In [5]:
# lead_desc_length 존재 여부
if 'lead_desc_length' in df_train.columns:
    df_train['lead_desc_length'] = np.where(df_train['lead_desc_length'] > 0, 'o', 'x')
if 'lead_desc_length' in df_test.columns:
    df_test['lead_desc_length'] = np.where(df_test['lead_desc_length'] > 0, 'o', 'x')

In [6]:
# 지역 코드에 따른 지역명 매핑
region_mapping = {
    "EU": ["LGEAG", "LGECZ", "LGEFS", "LGEDG", "LGEHS", "LGEMK", "LGEIS", "LGESC", "LGEEH", "LGEBN", "LGEWR", "LGEPL", "LGEMA", "LGEPT", "LGERO", "LGEES", "LGENO", "LGESW", "LGEUK"],
    "RC": ["LGEAK", "LGERM", "LGERI", "LGERA", "LGEUR", "LGELV"],
    "MA": ["LGEAS", "LGEEG", "LGELF", "LGESK", "LGEMC", "LGESA", "LGETU", "LGEOT", "LGEDF", "LGEGF", "LGEME", "LGEAF", "LEAO", "LGENI", "LGETK", "LGEAT", "LGESJ", "LGEEF", "LGEYK", "LGEIR"],
    "AP": ["LGEAP", "LGEQA", "LGETL", "LGECH", "LGEYT", "LGETR", "LGETA", "LGESY", "LGESH", "LGEQH", "LGEQD", "LGEPN", "LGENE", "LGEKS", "LGEHZ", "LGEHN", "LGEHK", "LGEIL", "LGEPH", "LGEVH", "LGEKR", "LGESL", "LGEIN", "LGETH", "LGEML", "LGETT", "LGEJP"],
    "NA": ["LGECI", "LGERS", "LGEMX", "LGEMS", "LGEMM", "LGEMR", "LGEUS", "LGEMU", "LGEAI"],
    "LA": ["LGEAG", "LGEBR", "LGECL", "LGEVZ", "LGECB", "LGEPS", "LGEPR", "LGESP", "LGEAR"],
    "OT": ["LGEEB", "LGELA", "LGEBT", "MA", "RC"]
}


def categorize_region(code):
    for region, codes in region_mapping.items():
        if code in codes:
            return region
    return "ETC"  

df_train['region'] = df_train['response_corporate'].apply(categorize_region)
df_test['region'] = df_test['response_corporate'].apply(categorize_region)

In [7]:
country_names = [
    'India', 'Hong Kong', 'United States', 'Brazil', 'Poland', 'Mexico', 'United Kingdom', 
    'Saudi Arabia', 'Philippines', 'Indonesia', 'Canada', 'Taiwan', 'Egypt', 'Oman', 'Germany', 
    'Portugal', 'Australia', 'Panama', 'Chile', 'Laos', 'Netherlands', 'Papua New Guinea', 
    'Switzerland', 'Argentina', 'Nigeria', 'Burkina Faso', 'Morocco', 'Guatemala', 'Qatar', 
    'Turkey', 'Thailand', 'France', 'Iraq', 'Hungary', 'Congo', 'Pakistan', 'Peru', 'Israel', 
    'United Arab Emirates', 'Jordan', 'Italy', 'Singapore', 'Spain', 'Mozambique', 'Greece', 
    'Paraguay', 'Malaysia', 'Romania', 'Bolivia', 'El Salvador', 'Gambia', 'Ghana', 'Ireland', 
    'Costa Rica', 'Dominican Republic', 'Myanmar', 'Ecuador', 'Togo', 'Viet Nam', 'Botswana', 
    'Jamaica', 'South Africa', 'Suriname', 'Venezuela', 'Kuwait', 'Cambodia', 'Mauritius', 
    'Kenya', 'Czech Republic', 'Montenegro', 'China', 'Anguilla', 'Cameroon', 'Belgium', 
    'Senegal', 'Sierra Leone', 'Malta', 'Zimbabwe', 'Bulgaria', 'Bahrain', 'Ivory Coast', 
    'Denmark', 'Namibia', 'Barbados', 'Angola', 'Croatia', 'Bangladesh', 'Uganda', 'Puerto Rico', 
    'Uruguay', 'Algeria', 'Mali', 'Bahamas', 'Sudan', 'Honduras', 'Benin', 'Latvia', 'Maldives', 
    'Nicaragua', 'Tunisia', 'Guyana', 'Gabon', 'Cyprus', 'Syria', 'Lebanon', 'Georgia', 'Libya', 
    'Bermuda', 'Sweden', 'Japan', 'Afghanistan', 'Austria', 'Zambia', 'Sri Lanka', 'Ethiopia', 
    'Fiji', 'Serbia', 'Yemen', 'Macedonia', 'Slovenia', 'Brunei', 'Azerbaijan', 'South Korea', 
    'Norway', 'Cayman Islands', 'Kazakhstan', 'Bosnia and Herzegovina', 'New Zealand'
]

def extract_country(value):
    # 문자열이 아니면 문자열로 변환
    if not isinstance(value, str):
        value = str(value)
    # 마지막 슬래쉬 내용 -> 나라 
    match = re.search(r'\/([^\/]+)$', value)
    if match:
        country = match.group(1).strip()
        # country_names 목록에 있는지 확인
        if country in country_names:
            return country
        else:
            return 'ETC'
    return 'ETC'

df_train['customer_country'] = df_train['customer_country'].apply(extract_country)
df_test['customer_country'] = df_test['customer_country'].apply(extract_country)

In [8]:
# business_area
replace_values = ['hospital & health care', 'factory', 'government department', 'public facility', 'transportation', 'power plant / renewable energy']

for value in replace_values:
    df_train['business_unit'] = df_train['business_unit'].replace(value, 'ETC')


for value in replace_values:
    df_test['business_unit'] = df_test['business_unit'].replace(value, 'ETC')


In [9]:
# 제거 칼럼 
# customer_country.1의 경우 customer_country와 동일하여 제거 
# it_strategic_ver, id_strategic_ver,idit_strategic_ver 중요도 하위 삭제  
del_cols = ['customer_country.1','it_strategic_ver', 'id_strategic_ver', 'idit_strategic_ver','product_subcategory', 'product_modelname', 'ver_win_rate_x',
       'ver_win_ratio_per_bu', 'business_area', 'business_subarea']
df_train.drop(del_cols, axis=1, inplace=True)
df_test.drop(del_cols, axis=1, inplace=True)

In [10]:
# 언더샘플링 전 타겟 변수 'is_converted'의 분포 확인 및 언더샘플링
target_distribution = df_train['is_converted'].value_counts()
min_samples = target_distribution.min()

In [11]:
df_train_balanced = pd.concat([
    df_train[df_train['is_converted'] == True].sample(min_samples, random_state=42),
    df_train[df_train['is_converted'] == False].sample(min_samples, random_state=42)
], ignore_index=True)

In [12]:
df_train_balanced['is_converted'].value_counts()

is_converted
True     4850
False    4850
Name: count, dtype: int64

In [13]:
df_train = df_train_balanced

In [14]:
df_train.iloc[30:180]

Unnamed: 0,bant_submit,customer_country,business_unit,com_reg_ver_win_rate,customer_idx,customer_type,enterprise,historical_existing_cnt,customer_job,lead_desc_length,inquiry_type,product_category,customer_position,response_corporate,expected_timeline,ver_cus,ver_pro,lead_owner,is_converted,region
30,1.00,Saudi Arabia,ID,,25096,,SMB,,engineering,o,Quotation or purchase consultation,hospital tv,c-level executive,LGESJ,less than 3 months,0,0,153,True,MA
31,0.75,India,AS,,25096,,Enterprise,,information technology,o,Sales Inquiry,multi-split,associate/analyst,LGEIL,3 months ~ 6 months,0,0,4,True,AP
32,0.50,ETC,ID,,25096,,Enterprise,,purchasing,o,Quotation or purchase consultation,hotel tv,none,LGEIL,they are having requirement of 40 displays. cu...,0,0,148,True,AP
33,0.25,Brazil,ID,0.124122,20253,,SMB,0.0,engineering,o,Quotation or Purchase Consultation,,none,LGESP,,0,0,487,True,LA
34,0.25,Canada,IT,,13269,,SMB,,,o,Quotation or Purchase Consultation,,none,LGECI,,0,0,410,True,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
175,1.00,Argentina,ID,0.042254,10510,End Customer,SMB,,arts and design,o,Quotation or Purchase Consultation,high brightness signage,other,LGEAR,less than 3 months,0,0,345,True,LA
176,1.00,Hong Kong,ID,,25096,,SMB,,arts and design,o,Quotation or purchase consultation,互動式顯示屏,ceo/founder,LGEHK,less than 3 months,0,0,195,True,AP
177,1.00,India,ID,,25096,,Enterprise,,information technology,o,Quotation or purchase consultation,video wall,manager,LGEIL,less than 3 months,0,0,220,True,AP
178,1.00,Brazil,AS,0.003937,43842,End-Customer,SMB,,purchasing,o,Quotation or purchase consultation,etc.,associate/analyst,LGESP,more than a year,1,0,682,True,LA


In [15]:
unique_country_count = len(df_train['customer_country'].unique())
print(f"Unique country count: {unique_country_count}")
df_train['customer_country'].unique()

Unique country count: 126


array(['ETC', 'India', 'Hong Kong', 'United States', 'Brazil', 'Poland',
       'Mexico', 'United Kingdom', 'Saudi Arabia', 'Philippines',
       'Indonesia', 'Canada', 'Taiwan', 'Egypt', 'Oman', 'Germany',
       'Portugal', 'Australia', 'Panama', 'Chile', 'Laos', 'Netherlands',
       'Papua New Guinea', 'Switzerland', 'Argentina', 'Nigeria',
       'Burkina Faso', 'Morocco', 'Guatemala', 'Qatar', 'Turkey',
       'Thailand', 'France', 'Iraq', 'Hungary', 'Congo', 'Pakistan',
       'Peru', 'Israel', 'Jordan', 'Italy', 'Singapore', 'Spain',
       'Mozambique', 'Greece', 'Paraguay', 'Malaysia', 'Romania',
       'Bolivia', 'El Salvador', 'Gambia', 'Ghana', 'Ireland',
       'Costa Rica', 'Dominican Republic', 'Myanmar', 'Ecuador', 'Togo',
       'Botswana', 'Jamaica', 'South Africa', 'Suriname', 'Venezuela',
       'Kuwait', 'Cambodia', 'Mauritius', 'Kenya', 'Montenegro', 'China',
       'Anguilla', 'Cameroon', 'Belgium', 'Senegal', 'Sierra Leone',
       'Malta', 'Zimbabwe', 'Bulgari

In [16]:
# # 'ver_cus', 'ver_pro' -> grant_weight' 
# grant = ['ver_cus', 'ver_pro']
# df_train['grant_weight'] = np.where(df_train['ver_cus'] > 0, 1, 0)
# df_train['grant_weight'] = np.where(df_train['ver_pro'] > 0, 1, df_train['grant_weight'])

# df_test['grant_weight'] = np.where(df_test['ver_cus'] > 0, 1, 0)
# df_test['grant_weight'] = np.where(df_test['ver_pro'] > 0, 1, df_test['grant_weight'])

# # 'ver_cus', 'ver_pro' 컬럼 삭제
# df_train.drop(grant, axis=1, inplace=True)
# df_test.drop(grant, axis=1, inplace=True)

In [17]:
# 수치형 및 범주형 칼럼 리스트를 정의
numeric_columns = df_train.select_dtypes(include=[np.number]).columns.tolist()
categorical_columns = df_train.select_dtypes(exclude=[np.number]).columns.tolist()

In [18]:
# 결측치 처리
numeric_imputer = SimpleImputer(strategy='mean')
categorical_imputer = SimpleImputer(strategy='most_frequent')

df_train[numeric_columns] = numeric_imputer.fit_transform(df_train[numeric_columns])
df_test[numeric_columns] = numeric_imputer.transform(df_test[numeric_columns])

df_train[categorical_columns] = categorical_imputer.fit_transform(df_train[categorical_columns])
df_test[categorical_columns] = categorical_imputer.transform(df_test[categorical_columns])

In [19]:
#결측치 처리 확인 
df_train.iloc[150:180]

Unnamed: 0,bant_submit,customer_country,business_unit,com_reg_ver_win_rate,customer_idx,customer_type,enterprise,historical_existing_cnt,customer_job,lead_desc_length,inquiry_type,product_category,customer_position,response_corporate,expected_timeline,ver_cus,ver_pro,lead_owner,is_converted,region
150,0.5,Burkina Faso,AS,0.172867,25096.0,End-Customer,Enterprise,20.661812,administrative,o,Sales Inquiry,single-split,manager,LGEAF,less than 3 months,0.0,0.0,5.0,True,MA
151,0.5,Hong Kong,ID,0.172867,25096.0,End-Customer,SMB,6.0,education,o,Quotation or purchase consultation,led signage,none,LGEHK,less than 3 months,0.0,0.0,469.0,True,AP
152,1.0,India,ID,0.172867,29454.0,End-Customer,Enterprise,20.661812,media and communication,o,Technical Consultation,interactive digital board,entry level,LGEIL,more than a year,0.0,0.0,166.0,True,AP
153,0.25,United States,IT,0.172867,6479.0,End Customer,SMB,0.0,engineering,o,Quotation or Purchase Consultation,vrf,none,LGEUS,less than 3 months,0.0,0.0,437.0,True,
154,0.75,Brazil,ID,0.069565,29838.0,End-Customer,Enterprise,20.661812,engineering,o,Quotation or Purchase Consultation,video wall signage,trainee,LGESP,less than 3 months,0.0,0.0,167.0,True,LA
155,0.5,Argentina,ID,0.069565,8060.0,End-Customer,SMB,20.661812,purchasing,o,Quotation or Purchase Consultation,standard signage,c-level executive,LGEAR,less than 3 months,0.0,0.0,345.0,True,LA
156,0.5,India,AS,0.172867,25096.0,End-Customer,Enterprise,20.661812,other,o,Sales Inquiry,multi-split,other,LGEIL,less than 3 months,0.0,0.0,89.0,True,AP
157,0.25,United States,IT,0.172867,16107.0,End Customer,SMB,20.661812,engineering,o,Others,vrf,none,LGEUS,less than 3 months,0.0,0.0,831.0,True,
158,0.75,India,ID,0.172867,25096.0,End-Customer,Enterprise,20.661812,purchasing,o,Quotation or purchase consultation,hotel tv,manager,LGEIL,less than 3 months,0.0,0.0,445.0,True,AP
159,1.0,Brazil,ID,0.172867,25096.0,End-Customer,SMB,20.661812,administrative,o,Quotation or Purchase Consultation,video wall signage,ceo/founder,LGESP,less than 3 months,0.0,0.0,167.0,True,LA


## 2. 데이터 전처리

### 레이블 인코딩

In [20]:
# 범주형 데이터 레이블 인코딩
def label_encoding(series: pd.Series) -> pd.Series:
    """범주형 데이터를 시리즈 형태로 받아 숫자형 데이터로 변환합니다."""
    my_dict = {}
    series = series.astype(str)
    for idx, value in enumerate(sorted(series.unique())):
        my_dict[value] = idx
    series = series.map(my_dict)
    return series

In [21]:
# 레이블 인코딩할 칼럼들
# label_columns = [
#     "customer_country",
#     "business_subarea",
#     "business_area",
#     "business_unit",
#     "customer_type",
#     "enterprise",
#     "customer_job",
#     "inquiry_type",
#     "product_category",
#     "product_subcategory",
#     "product_modelname",
#     "customer_country.1",
#     "customer_position",
#     "response_corporate",
#     "expected_timeline",
# ]

# df_all = pd.concat([df_train[label_columns], df_test[label_columns]])

# for col in label_columns:
#     df_all[col] = label_encoding(df_all[col])


다시 학습 데이터와 제출 데이터를 분리합니다.

In [22]:
for col in categorical_columns:
    df_train[col] = label_encoding(df_train[col])
    df_test[col] = label_encoding(df_test[col])

In [23]:
# 수치형 데이터 스케일링
scaler = StandardScaler()
df_train[numeric_columns] = scaler.fit_transform(df_train[numeric_columns])
df_test[numeric_columns] = scaler.transform(df_test[numeric_columns])

### 2-2. 학습, 검증 데이터 분리

In [24]:
# 특성과 레이블 분리 및 훈련 데이터셋 분할
x_train, x_val, y_train, y_val = train_test_split(
    df_train.drop("is_converted", axis=1),
    df_train["is_converted"],
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

In [25]:
# smote = SMOTE(random_state=42)
# x_train_smote, y_train_smote = smote.fit_resample(x_train, y_train)
# print("오버샘플링:" )
# print(len(x_train_smote), len(y_train_smote))
# print(y_train_smote.value_counts())

In [26]:
# 랜덤샘플링을 위해 데이터프레임 재구성
df_train_sampled = pd.concat([x_train, y_train], axis=1)

# 각 클래스별 샘플 수를 최소 샘플 수에 맞춤
min_samples = df_train_sampled['is_converted'].value_counts().min()
df_sampled = pd.concat([
    df_train_sampled[df_train_sampled['is_converted'] == True].sample(min_samples, random_state=42),
    df_train_sampled[df_train_sampled['is_converted'] == False].sample(min_samples, random_state=42)
], ignore_index=True)

df_sampled = shuffle(df_sampled, random_state=42)

# 특성과 레이블을 다시 분리
x_train_sampled = df_sampled.drop("is_converted", axis=1)
y_train_sampled = df_sampled["is_converted"]

## 3. 모델 학습

### 모델 정의 

In [27]:
# model = RandomForestClassifier(n_estimators=100, random_state=42)
model = LGBMClassifier(n_estimators=1000,learning_rate=0.05,random_state=42)

### 모델 학습

In [28]:
# model.fit(x_train_smote, y_train_smote)
model.fit(x_train_sampled, y_train_sampled)

[LightGBM] [Info] Number of positive: 3877, number of negative: 3877
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000724 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1067
[LightGBM] [Info] Number of data points in the train set: 7754, number of used features: 18
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


### 모델 성능 보기

In [29]:
# 모델 평가
def get_clf_eval(y_test, y_pred_probs):
    thresholds = np.linspace(0, 1, 100)
    best_f1 = 0
    best_threshold = 0.5
    
    for threshold in thresholds:
        y_pred = y_pred_probs >= threshold
        f1 = f1_score(y_test, y_pred)
        # 최고의 F1 점수와 임계값 찾기
        if f1 > best_f1:
            best_f1 = f1
            best_threshold = threshold
            
    # 최적의 임계값을 사용한 최종 예측
    y_pred_final = y_pred_probs >= best_threshold
    
    confusion = confusion_matrix(y_test, y_pred_final)
    accuracy = accuracy_score(y_test, y_pred_final)
    precision = precision_score(y_test, y_pred_final)
    recall = recall_score(y_test, y_pred_final)
    print("최적의 임계값:", best_threshold)
    print("오차행렬:\n", confusion)
    print("\n정확도: {:.4f}".format(accuracy))
    print("정밀도: {:.4f}".format(precision))
    print("재현율: {:.4f}".format(recall))
    print("F1: {:.4f}".format(best_f1))

In [30]:
y_pred_probs = model.predict_proba(x_val)[:, 1]
get_clf_eval(y_val, y_pred_probs)

최적의 임계값: 0.6767676767676768
오차행렬:
 [[913  54]
 [ 70 903]]

정확도: 0.9361
정밀도: 0.9436
재현율: 0.9281
F1: 0.9358


## 4. 제출하기

### 테스트 데이터 예측

In [31]:
# 예측에 필요한 데이터 분리
x_test = df_test.drop(["is_converted", "id"], axis=1)

In [32]:
test_pred = model.predict(x_test)
sum(test_pred) # True로 예측된 개수

2265

### 제출 파일 작성

In [33]:
# 제출 데이터 읽어오기 (df_test는 전처리된 데이터가 저장됨)
df_sub = pd.read_csv("submission.csv")
df_sub["is_converted"] = test_pred

# 제출 파일 저장
df_sub.to_csv("submission.csv", index=False)

**우측 상단의 제출 버튼을 클릭해 결과를 확인하세요**