<a href="https://www.kaggle.com/code/kaiyoo88/tutorial-dacon-starter-code?scriptVersionId=236774940" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [22]:
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error
import numpy as np

In [59]:
# Directory that holds the competition CSV files.
data_path = "/kaggle/input/dacon-company-success-prediction"


def _load_csv(file_name):
    """Read one CSV file located directly under ``data_path``."""
    return pd.read_csv(f"{data_path}/{file_name}")


train = _load_csv("train.csv")
test = _load_csv("test.csv")
sample_submission = _load_csv("sample_submission.csv")

In [53]:
# Show the first five rows of the training frame.
train.head(n=5)

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,ID,설립연도,국가,분야,투자단계,직원 수,인수여부,상장여부,고객수(백만명),총 투자금(억원),연매출(억원),SNS 팔로워 수(백만명),기업가치(백억원),성공확률
0,TRAIN_0000,2009,CT005,이커머스,Series A,4126.0,No,No,56.0,3365.0,4764.0,4.71,,0.3
1,TRAIN_0001,2023,CT006,핀테크,Seed,4167.0,Yes,No,80.0,4069.0,279.0,1.0,2500-3500,0.8
2,TRAIN_0002,2018,CT007,기술,Series A,3132.0,Yes,Yes,54.0,6453.0,12141.0,4.0,3500-4500,0.5
3,TRAIN_0003,2016,CT006,,Seed,3245.0,Yes,Yes,,665.0,10547.0,2.97,,0.7
4,TRAIN_0004,2020,CT002,에듀테크,Seed,1969.0,No,Yes,94.0,829.0,9810.0,1.0,1500-2500,0.1


In [60]:
# -------------------- 1. Drop the ID column --------------------
# The 'ID' column is removed from both frames; each assignment rebinds the
# name to the new frame returned by drop().
train = train.drop(columns='ID')
test = test.drop(columns='ID')

In [61]:
# --------------- 2. Preprocess "기업가치(백억원)" ---------------
def parse_value(val):
    """Convert one raw "기업가치(백억원)" entry into a float.

    Parameters
    ----------
    val : str, number, or missing value
        Either a plain number (``"1500"``, ``1500.0``), a ``"low-high"``
        range such as ``"2500-3500"``, or a missing value.

    Returns
    -------
    float
        The number itself, the midpoint of the range, or ``np.nan`` when the
        entry is missing or cannot be interpreted.
    """
    if pd.isna(val):
        return np.nan

    # Non-string input (e.g. the column was already read as numeric) has no
    # range syntax to parse; convert it directly.
    if not isinstance(val, str):
        try:
            return float(val)
        except (TypeError, ValueError):
            return np.nan

    text = val.strip()

    # Try the plain-number form first so that strings which merely contain a
    # minus sign ("-5", "1e-5") are not mistaken for a range.
    try:
        return float(text)
    except ValueError:
        pass

    # "low-high" range: return the midpoint of the two bounds.
    low, sep, high = text.partition('-')
    if sep:
        try:
            return (float(low) + float(high)) / 2
        except ValueError:
            return np.nan

    return np.nan

# Convert the "기업가치(백억원)" column of both frames with parse_value.
value_col = '기업가치(백억원)'
for frame in (train, test):
    frame[value_col] = frame[value_col].apply(parse_value)

In [62]:
# Ordinal encoding of the investment stage: the position in this list is the
# integer code (Seed -> 0 ... IPO -> 4).
stage_order = ['Seed', 'Series A', 'Series B', 'Series C', 'IPO']
investment_stage_mapping = {stage: rank for rank, stage in enumerate(stage_order)}

# Add the mapped column at the end of each frame and drop the original
# '투자단계' column. Stages missing from the mapping become NaN via Series.map.
train = (
    train
    .assign(**{'투자단계_mapped': train['투자단계'].map(investment_stage_mapping)})
    .drop(columns=['투자단계'])
)
test = (
    test
    .assign(**{'투자단계_mapped': test['투자단계'].map(investment_stage_mapping)})
    .drop(columns=['투자단계'])
)

In [63]:
# Reference year used to turn the founding year into an age in years.
CURRENT_YEAR = 2025


def _add_years_since_founding(frame):
    """Return ``frame`` with '설립연차' (CURRENT_YEAR - '설립연도') appended
    and the original '설립연도' column removed.

    Founding years that cannot be parsed as numbers are coerced to NaN.
    """
    founding_year = pd.to_numeric(frame['설립연도'], errors='coerce')
    with_age = frame.assign(**{'설립연차': CURRENT_YEAR - founding_year})
    return with_age.drop(columns=['설립연도'])


train = _add_years_since_founding(train)
test = _add_years_since_founding(test)

In [28]:
# Distinct values of the '국가' column.
# NOTE(review): the saved output below shows integer codes, which only exist
# after the label-encoding cell further down; on a fresh top-to-bottom run
# this cell would show the raw country strings instead.
country_values = train['국가'].unique()
country_values

array([4, 5, 6, 1, 7, 9, 0, 8, 2, 3])

In [64]:
# -------------------- 3. Column groups --------------------
# Columns that are label-encoded.
category_features = ['국가', '분야']

# Columns treated as numeric.
numeric_features = [
    '기업가치(백억원)',
    '설립연차',
    '투자단계_mapped',
    '직원 수',
    '고객수(백만명)',
    '총 투자금(억원)',
    '연매출(억원)',
    'SNS 팔로워 수(백만명)',
]

# Yes/No columns that are mapped to 1/0.
bool_features = ['인수여부', '상장여부']

# -------------------- 4. Missing-value overview --------------------
# Per-column count of missing entries in the training frame.
print("결측치 개수 (train set):", train.isnull().sum(), sep="\n")

결측치 개수 (train set):
국가                   0
분야                 857
직원 수               174
인수여부                 0
상장여부                 0
고객수(백만명)          1320
총 투자금(억원)            0
연매출(억원)              0
SNS 팔로워 수(백만명)       0
기업가치(백억원)         1813
성공확률                 0
투자단계_mapped          0
설립연차                 0
dtype: int64


In [65]:
# -------------------- 5. Encode categorical features --------------------
# One fitted LabelEncoder per categorical column, keyed by column name.
encoders = {}

for feature in category_features:
    # Missing categories become their own explicit 'Missing' level.
    train[feature] = train[feature].fillna('Missing')
    test[feature] = test[feature].fillna('Missing')

    # Fit on the labels of train and test together: LabelEncoder.transform
    # raises ValueError for any label it did not see during fit, so fitting on
    # train alone breaks as soon as test contains a new category. When test
    # has no new labels the learned classes (and thus the codes) are the same
    # as when fitting on train only.
    encoders[feature] = LabelEncoder()
    encoders[feature].fit(pd.concat([train[feature], test[feature]], ignore_index=True))
    train[feature] = encoders[feature].transform(train[feature])
    test[feature] = encoders[feature].transform(test[feature])

# -------------------- 6. Encode boolean features --------------------
# 'Yes' -> 1, 'No' -> 0; any other value becomes NaN via Series.map.
bool_map = {'Yes': 1, 'No': 0}

for feature in bool_features:
    train[feature] = train[feature].map(bool_map)
    test[feature] = test[feature].map(bool_map)

In [66]:
# -------------------- 7. Fill missing values (KNNImputer) --------------------
# The target is kept apart from the features so that it is never imputed.
y = train['성공확률']
X_train_full = train.drop(columns=['성공확률'])

# All feature columns take part in the imputation (5 nearest neighbours).
imputer = KNNImputer(n_neighbors=5)

# Fit on the training features, then wrap the returned array back into a
# DataFrame with the original column names.
train_filled = imputer.fit_transform(X_train_full)
X_train_full_imputed = pd.DataFrame(train_filled, columns=X_train_full.columns)

# The test frame is transformed with the imputer fitted on train.
test_filled = imputer.transform(test)
X_test_imputed = pd.DataFrame(test_filled, columns=test.columns)

In [85]:
# -------------------- 8. Model training and cross-validation --------------------
features = X_train_full_imputed.columns.tolist()

# Shuffled K-fold split with a fixed seed.
N_FOLDS = 5
kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=42)

# Earlier experiments in this cell used
#   RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1) and
#   LGBMRegressor(n_estimators=500, learning_rate=0.03, max_depth=7,
#                 random_state=42, n_jobs=-1);
# the current run uses XGBRegressor with the settings below.
xgb_params = dict(
    n_estimators=500,
    learning_rate=0.03,
    max_depth=15,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1,
)

models = []      # one fitted model per fold
cv_scores = []   # validation MAE per fold

for fold, (train_idx, valid_idx) in enumerate(kf.split(X_train_full_imputed)):
    print(f"Fold {fold+1}")

    # Positional split of features and target for this fold.
    X_tr, X_val = X_train_full_imputed.iloc[train_idx], X_train_full_imputed.iloc[valid_idx]
    y_tr, y_val = y.iloc[train_idx], y.iloc[valid_idx]

    model = XGBRegressor(**xgb_params)
    model.fit(X_tr, y_tr)

    # Score the fold on its held-out part.
    y_pred = model.predict(X_val)
    mae = mean_absolute_error(y_val, y_pred)

    models.append(model)
    cv_scores.append(mae)

    print(f"Fold {fold+1} MAE: {mae:.4f}")

print(f"\nAverage MAE across folds: {np.mean(cv_scores):.4f}")

Fold 1
Fold 1 MAE: 0.2033
Fold 2
Fold 2 MAE: 0.2024
Fold 3
Fold 3 MAE: 0.2020
Fold 4
Fold 4 MAE: 0.2014
Fold 5
Fold 5 MAE: 0.1988

Average MAE across folds: 0.2016


In [86]:
# -------------------- 9. Final test prediction (optional) --------------------
# Alternative: use a single fold model, e.g. models[-1].predict(X_test_imputed).

# Average of the fold models' predictions: each model contributes
# prediction / N_FOLDS to a float64 accumulator.
# NOTE(review): this is the mean only if `models` holds exactly N_FOLDS models.
test_preds = np.zeros(len(X_test_imputed))

for model in models:
    fold_pred = model.predict(X_test_imputed)
    test_preds += fold_pred / N_FOLDS

final_preds = test_preds

print("\n✅ Test 예측 완료 (KFold 모델 평균)", final_preds[:10], sep="\n")


✅ Test 예측 완료 (KFold 모델 평균)
[0.48622558 0.43746859 0.43987567 0.48630976 0.66038771 0.58702561
 0.4870545  0.64298956 0.58828261 0.55079674]


In [87]:
# Sanity check: one prediction per test row.
len(final_preds) == len(test)

True

In [88]:
# Put the averaged predictions into the '성공확률' column of the submission
# template and write it to CSV without the index column.
sample_submission = sample_submission.assign(**{'성공확률': final_preds})
sample_submission.to_csv('./baseline_submission_xgb_tuned.csv', index=False)

In [89]:
# Show the first five rows of the submission frame.
sample_submission.head(n=5)

Unnamed: 0,ID,성공확률
0,TEST_0000,0.486226
1,TEST_0001,0.437469
2,TEST_0002,0.439876
3,TEST_0003,0.48631
4,TEST_0004,0.660388
