In [12]:
%pip install pytorch-tabnet
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import KFold

import torch
from pytorch_tabnet.pretraining import TabNetPretrainer
from pytorch_tabnet.tab_model import TabNetRegressor

from sklearn.impute import KNNImputer


Note: you may need to restart the kernel to use updated packages.


In [13]:
# Load the training set, the test set and the submission template from the
# Kaggle input directory of the competition dataset.
train, test, sample_submission = (
    pd.read_csv(f"/kaggle/input/dacon2/{stem}.csv")
    for stem in ("train", "test", "sample_submission")
)

In [14]:
# Drop the 'ID' column so it is not picked up as a model feature later on.
# errors='ignore' keeps this cell re-runnable: without it a second execution
# raises KeyError because the column has already been removed.
train = train.drop(columns=['ID'], errors='ignore')
test = test.drop(columns=['ID'], errors='ignore')

In [15]:
# Cast the founding year ('설립연도') from int to object so that it is handled
# as a categorical feature instead of a numeric one.
train['설립연도'] = train['설립연도'].astype('object')
test['설립연도'] = test['설립연도'].astype('object')

category_features = ['설립연도','국가','분야','투자단계','기업가치(백억원)']
numeric_features = ['직원 수','고객수(백만명)','총 투자금(억원)','연매출(억원)','SNS 팔로워 수(백만명)']
bool_features = ['인수여부','상장여부']

# One fitted LabelEncoder per categorical feature, kept so the mapping can be
# inspected or reused later.
encoders = {}

# Encode the categorical features as integer codes.
for feature in category_features:
    # Missing values become their own explicit category.
    train[feature] = train[feature].fillna('Missing')
    test[feature] = test[feature].fillna('Missing')

    # Fit on the labels of BOTH frames. LabelEncoder.transform raises
    # ValueError for labels it has not seen during fit, so fitting on train
    # alone breaks as soon as test contains a new category (or a 'Missing'
    # fill value that never occurs in train). When test has no unseen labels
    # the learned classes, and therefore the codes, are the same as with a
    # train-only fit.
    # NOTE(review): a column that mixes numbers and the 'Missing' string is
    # still rejected by LabelEncoder; confirm '설립연도' has no missing values.
    encoders[feature] = LabelEncoder().fit(
        pd.concat([train[feature], test[feature]], ignore_index=True)
    )
    train[feature] = encoders[feature].transform(train[feature])
    test[feature] = encoders[feature].transform(test[feature])

# Map the boolean-like columns to 0/1 ('Yes' -> 1, 'No' -> 0).
# Any value that is not a key of bool_map becomes NaN.
bool_map = {'Yes': 1, 'No': 0}
for feature in bool_features:
    train[feature] = train[feature].map(bool_map)
    test[feature] = test[feature].map(bool_map)

# ---
# Numeric features: impute missing values with KNN.
train_numeric = train[numeric_features].copy()
test_numeric = test[numeric_features].copy()

# Standardise first: KNN is distance based, so unscaled columns with large
# magnitudes would dominate the neighbour search. The scaler is fitted on
# train only and then applied to test.
scaler = StandardScaler()
train_scaled = scaler.fit_transform(train_numeric)
test_scaled = scaler.transform(test_numeric)

# Fit the imputer on the scaled training data and apply it to both frames.
imputer = KNNImputer(n_neighbors=5)
train_imputed = imputer.fit_transform(train_scaled)
test_imputed = imputer.transform(test_scaled)

# Undo the scaling so the columns are written back in their original units.
train[numeric_features] = pd.DataFrame(scaler.inverse_transform(train_imputed),
                                       columns=numeric_features, index=train.index)
test[numeric_features] = pd.DataFrame(scaler.inverse_transform(test_imputed),
                                      columns=numeric_features, index=test.index)

# Model inputs: every column except the two boolean flags and the target
# ('성공확률').
features_without_investment = [col for col in train.columns if col not in ['인수여부', '상장여부', '성공확률']]

# TabNet needs the positions (cat_idxs) and cardinalities (cat_dims) of the
# categorical columns inside the feature matrix. Both lists follow the order
# of category_features so that they stay aligned with each other.
cat_idxs = [features_without_investment.index(col) for col in category_features if col in features_without_investment]
# Size each embedding from the encoder vocabulary (plus one spare slot, as
# before) so that every code the encoder can emit, for train or test, is a
# valid embedding index.
cat_dims = [len(encoders[col].classes_) + 1 for col in category_features if col in features_without_investment]



  train[feature] = train[feature].fillna('Missing')
  test[feature] = test[feature].fillna('Missing')


In [16]:
# Regression target ('성공확률') and feature matrix.
target = train['성공확률']
X = train[features_without_investment]
y = target

# K-fold cross-validation setup.
N_FOLDS = 5
kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=42)

models = []      # one fitted regressor per fold, reused later for ensembling
cv_scores = []   # best validation MAE reached in each fold

for fold, (train_idx, valid_idx) in enumerate(kf.split(X)):
    print(f"\n🔁 Fold {fold+1}/{N_FOLDS}")

    # TabNet expects 2-D targets, hence the reshape to a column vector.
    X_train = X.iloc[train_idx].values
    y_train = y.iloc[train_idx].values.reshape(-1, 1)

    X_valid = X.iloc[valid_idx].values
    y_valid = y.iloc[valid_idx].values.reshape(-1, 1)

    # Unsupervised pre-training on the training part of this fold.
    print("▶ Pretraining...")

    pretrainer = TabNetPretrainer(
        cat_idxs=cat_idxs,
        cat_dims=cat_dims,
        seed=42,
        verbose=0
    )

    pretrainer.fit(
        X_train=X_train,
        max_epochs=100,
        batch_size=512,
        virtual_batch_size=64
    )

    # Supervised fine-tuning, initialised from the pretrainer.
    print("▶ Fine-tuning...")
    model = TabNetRegressor(
        # Use exactly the same categorical layout as the pretrainer. The lists
        # were previously rebuilt here with a different ordering (column order
        # for the indices, category_features order for the sizes) and a
        # different size formula (max + 1), which could misalign indices and
        # sizes and did not match the network the weights are loaded from.
        cat_idxs=cat_idxs,
        cat_dims=cat_dims,
        seed=42,
        verbose=0,
        optimizer_fn=torch.optim.AdamW
    )
    model.fit(
        X_train=X_train, y_train=y_train,
        eval_set=[(X_valid, y_valid)],
        from_unsupervised=pretrainer,
        eval_metric=['mae'],
        max_epochs=100,
        patience=10
    )

    # Keep the fitted model in memory together with its best validation score.
    models.append(model)
    cv_scores.append(model.best_cost)

print("\n✅ 모든 fold 모델 학습 완료!")
# Report the cross-validation result that was collected above.
print(f"CV MAE: {np.mean(cv_scores):.5f} ± {np.std(cv_scores):.5f}")


🔁 Fold 1/5
▶ Pretraining...




▶ Fine-tuning...





Early stopping occurred at epoch 43 with best_epoch = 33 and best_val_0_mae = 0.2073

🔁 Fold 2/5
▶ Pretraining...




▶ Fine-tuning...





Early stopping occurred at epoch 29 with best_epoch = 19 and best_val_0_mae = 0.20348

🔁 Fold 3/5
▶ Pretraining...




▶ Fine-tuning...





Early stopping occurred at epoch 56 with best_epoch = 46 and best_val_0_mae = 0.20294

🔁 Fold 4/5
▶ Pretraining...




▶ Fine-tuning...





Early stopping occurred at epoch 27 with best_epoch = 17 and best_val_0_mae = 0.20662

🔁 Fold 5/5
▶ Pretraining...




▶ Fine-tuning...





Early stopping occurred at epoch 56 with best_epoch = 46 and best_val_0_mae = 0.2064

✅ 모든 fold 모델 학습 완료!




In [22]:
# Ensemble inference: score the test matrix with every fold model.
X_test = test[features_without_investment].values
predictions_list = []
for fold_no, model in enumerate(models, start=1):
    print(f"Predict with fold {fold_no}")
    preds = model.predict(X_test)
    predictions_list.append(preds)

# Final prediction: element-wise median over the fold predictions.
final_predictions = np.median(np.asarray(predictions_list), axis=0)

Predict with fold 1
Predict with fold 2
Predict with fold 3
Predict with fold 4
Predict with fold 5


In [23]:
# The fold predictions may be 2-D (n_samples, 1) because the models were
# trained on column-vector targets; flatten explicitly instead of relying on
# pandas to squeeze a 2-D array into a single column.
sample_submission['성공확률'] = np.ravel(final_predictions)
# Written as UTF-8 with BOM ('utf-8-sig'), without the index column.
sample_submission.to_csv('./baseline_submission.csv', index = False, encoding = 'utf-8-sig')