In [14]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold

import torch
from pytorch_tabnet.pretraining import TabNetPretrainer
from pytorch_tabnet.tab_model import TabNetRegressor

from sklearn.linear_model import LinearRegression

In [15]:
train = pd.read_csv('../data_files/train.csv')
test = pd.read_csv('../data_files/test.csv')
sample_submission = pd.read_csv('../data_files/sample_submission.csv')

In [16]:
train = train.drop(columns=['ID'], axis = 1)
test = test.drop(columns=['ID'], axis = 1)

In [17]:
# 백억단위 실수로 만들기

def convert_value(value):
    if isinstance(value, str):
        if '이상' in value:
            return 6000
        elif '-' in value:
            start, end = value.split('-')
            return (float(start) + float(end)) / 2
    return float(value)

# 1. train 데이터 가공
train = train.copy()
train['기업가치(백억원)'] = train['기업가치(백억원)'].apply(convert_value)

train_data = train.dropna(subset=['기업가치(백억원)'])
X = train_data[['총 투자금(억원)', '연매출(억원)', '직원 수']].copy()
X['설립연도'] = train_data['설립연도'].astype(int)
y = train_data['기업가치(백억원)']

model = LinearRegression()
model.fit(X, y)

# 2. 결측치 채우기
train_nan = train[train['기업가치(백억원)'].isna()].copy()
train_nan['직원 수'] = train_nan['직원 수'].fillna(train_data['직원 수'].mean())

X_test = train_nan[['총 투자금(억원)', '연매출(억원)', '직원 수']].copy()
X_test['설립연도'] = train_nan['설립연도'].astype(int)

y_pred = model.predict(X_test)
train.loc[train['기업가치(백억원)'].isna(), '기업가치(백억원)'] = y_pred

# 3. test 데이터도 동일하게
test = test.copy()
test['기업가치(백억원)'] = test['기업가치(백억원)'].apply(convert_value)

test_nan = test[test['기업가치(백억원)'].isna()].copy()
test_nan['직원 수'] = test_nan['직원 수'].fillna(train['직원 수'].mean())

X_test = test_nan[['총 투자금(억원)', '연매출(억원)', '직원 수']].copy()
X_test['설립연도'] = test_nan['설립연도'].astype(int)

y_pred = model.predict(X_test)

test.loc[test['기업가치(백억원)'].isna(), '기업가치(백억원)'] = y_pred

In [18]:
# train['기업가치(백억원)']

test['기업가치(백억원)']

0       2000.000000
1       4122.880997
2       6000.000000
3       2000.000000
4       4111.128008
           ...     
1750    2000.000000
1751    5250.000000
1752    2000.000000
1753    5250.000000
1754    3954.796522
Name: 기업가치(백억원), Length: 1755, dtype: float64

In [19]:
train['투자단계'].unique()


array(['Series A', 'Seed', 'Series C', 'Series B', 'IPO'], dtype=object)

In [20]:
stat = {'Series A':1, 'Seed':0, 'Series C':2, 'Series B':3, 'IPO':4}
train['투자단계'] = train['투자단계'].map(stat)
test['투자단계'] = test['투자단계'].map(stat)

In [21]:

# 범주형, 수치형, 불리언 feature 정의
category_features = ['국가', '분야']
numeric_features = ['투자단계','직원 수', '고객수(백만명)', '총 투자금(억원)', '연매출(억원)', 
                    'SNS 팔로워 수(백만명)', '기업가치(백억원)', '회사나이']
bool_features = ['인수여부', '상장여부']

encoders = {}

# 1. 회사나이(2025 - 설립연도) 계산
train['회사나이'] = 2025 - train['설립연도'].astype(float)
test['회사나이'] = 2025 - test['설립연도'].astype(float)

# 2. 불리언 값을 0과 1로 변환 ('Yes' → 1, 'No' → 0)
bool_map = {'Yes': 1, 'No': 0}
for feature in bool_features:
    train[feature] = train[feature].fillna('No').map(bool_map)
    test[feature] = test[feature].fillna('No').map(bool_map)

# 3. 범주형 데이터를 Label Encoding
for feature in category_features:
    encoders[feature] = LabelEncoder()
    train[feature] = train[feature].fillna('Missing')
    test[feature] = test[feature].fillna('Missing')
    train[feature] = encoders[feature].fit_transform(train[feature])
    test[feature] = encoders[feature].transform(test[feature])

# 4. 수치형 변수 결측치를 평균값으로 대체
for feature in numeric_features:
    mean_value = train[feature].mean()
    train[feature] = train[feature].fillna(mean_value)
    test[feature] = test[feature].fillna(mean_value)

# 5. TabNet용 범주형 변수 인덱스(cat_idxs) 및 차원(cat_dims) 설정
features = [col for col in train.columns if col != '성공확률']
cat_idxs = [features.index(col) for col in category_features]
cat_dims = [train[col].max() + 1 for col in category_features]


In [22]:
# 타겟 지정
target = train['성공확률']  
X = train[features]
y = target

# KFold 설정
N_FOLDS = 5
kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=42)

models = [] # 모델 저장 리스트
cv_scores = []

for fold, (train_idx, valid_idx) in enumerate(kf.split(X)):
    print(f"\n🔁 Fold {fold+1}/{N_FOLDS}")
    
    X_train = X.iloc[train_idx].values
    y_train = y.iloc[train_idx].values.reshape(-1, 1)
    
    X_valid = X.iloc[valid_idx].values
    y_valid = y.iloc[valid_idx].values.reshape(-1, 1)
    
    # 비지도 사전학습
    print("▶ Pretraining...")

    pretrainer = TabNetPretrainer(
        cat_idxs=cat_idxs,
        cat_dims=cat_dims,
        seed=42,
        verbose=0
    )

    pretrainer.fit(
        X_train=X_train,
        max_epochs=100,
        batch_size=512,
        virtual_batch_size=64
    )

    # 지도 학습 
    print("▶ Fine-tuning...")
    model = TabNetRegressor(
        cat_idxs=cat_idxs,
        cat_dims=cat_dims,
        seed=42,
        verbose=0,
        optimizer_fn=torch.optim.AdamW 
    )

    model.fit(
        X_train=X_train, y_train=y_train,
        eval_set=[(X_valid, y_valid)],
        from_unsupervised=pretrainer,
        eval_metric=['mae'],
        max_epochs=100,
        patience=10
    )

    # 모델을 메모리에 저장
    models.append(model)
    cv_scores.append(model.best_cost)

print("\n✅ 모든 fold 모델 학습 완료!")


🔁 Fold 1/5
▶ Pretraining...




▶ Fine-tuning...





Early stopping occurred at epoch 35 with best_epoch = 25 and best_val_0_mae = 0.20431





🔁 Fold 2/5
▶ Pretraining...




▶ Fine-tuning...





Early stopping occurred at epoch 25 with best_epoch = 15 and best_val_0_mae = 0.20466

🔁 Fold 3/5
▶ Pretraining...




▶ Fine-tuning...





Early stopping occurred at epoch 31 with best_epoch = 21 and best_val_0_mae = 0.20309

🔁 Fold 4/5
▶ Pretraining...




▶ Fine-tuning...





Early stopping occurred at epoch 56 with best_epoch = 46 and best_val_0_mae = 0.20275

🔁 Fold 5/5
▶ Pretraining...




▶ Fine-tuning...





Early stopping occurred at epoch 19 with best_epoch = 9 and best_val_0_mae = 0.20557

✅ 모든 fold 모델 학습 완료!




In [23]:
# 저장된 모델들로 예측
predictions_list = []

for fold, model in enumerate(models):
    print(f"Predict with fold {fold+1}")
    preds = model.predict(test[features].values)
    predictions_list.append(preds)

# 평균 예측
final_predictions = np.mean(predictions_list, axis=0)

Predict with fold 1


Predict with fold 2
Predict with fold 3
Predict with fold 4
Predict with fold 5


In [24]:
sample_submission['성공확률'] = final_predictions
sample_submission.to_csv('../data_files/submission11.csv', index = False, encoding = 'utf-8-sig')