In [34]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold

import torch
from pytorch_tabnet.pretraining import TabNetPretrainer
from pytorch_tabnet.tab_model import TabNetRegressor

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [35]:
# Read the competition files: training set, test set and the submission template
train, test, sample_submission = map(
    pd.read_csv,
    (
        'data_files/train.csv',
        'data_files/test.csv',
        'data_files/sample_submission.csv',
    ),
)

In [36]:
# The row identifier carries no signal, so remove it from both splits
train = train.drop(columns='ID')
test = test.drop(columns='ID')

In [37]:
train.isna().sum()
# Columns with missing values: 분야, 직원 수, 고객수(백만명), 기업가치(백억원)

설립연도                 0
국가                   0
분야                 857
투자단계                 0
직원 수               174
인수여부                 0
상장여부                 0
고객수(백만명)          1320
총 투자금(억원)            0
연매출(억원)              0
SNS 팔로워 수(백만명)       0
기업가치(백억원)         1220
성공확률                 0
dtype: int64

In [38]:
# 1. Parser that turns a raw '기업가치(백억원)' entry into a number
def convert_value(value):
    """Convert one raw valuation entry to a numeric value.

    Strings containing '이상' map to 6000, strings containing '-' map to the
    midpoint of the two numbers around the dash, and everything else is
    passed through ``float``.
    """
    if not isinstance(value, str):
        return float(value)
    if '이상' in value:
        return 6000
    if '-' in value:
        low, high = value.split('-')
        return (float(low) + float(high)) / 2
    return float(value)

# 2. Bucket a numeric '기업가치(백억원)' back into its range label
def categorize_value(value):
    """Return the valuation range label that a numeric value falls into.

    Values below 2500 get '1500-2500', then '2500-3500', '3500-4500' and
    '4500-6000' for the following upper bounds; anything that is not below
    6000 gets '6000이상'.
    """
    upper_bounds = (
        (2500, '1500-2500'),
        (3500, '2500-3500'),
        (4500, '3500-4500'),
        (6000, '4500-6000'),
    )
    for bound, label in upper_bounds:
        if value < bound:
            return label
    return '6000이상'

# 3. Work on a copy; `train` itself is only touched by the final assignment
value_col = '기업가치(백억원)'
train_copy = train.copy()

# 4. Convert the known valuation labels to numbers (NaN rows are skipped)
known_value_mask = train_copy[value_col].notna()
train_copy.loc[known_value_mask, value_col] = (
    train_copy.loc[known_value_mask, value_col].apply(convert_value)
)

# 5. Rows with a known valuation are the training data of the imputation model
train_data = train_copy[train_copy[value_col].notna()].copy()

# 6. Fill missing staff counts with the column mean
staff_mean = train_data['직원 수'].mean()
train_data['직원 수'] = train_data['직원 수'].fillna(staff_mean)

# 7. Fit a linear regression: valuation ~ investment, revenue, staff, founding year
features = ['총 투자금(억원)', '연매출(억원)', '직원 수', '설립연도']
X_train = train_data[features]
y_train = train_data[value_col].astype(float)
model = LinearRegression().fit(X_train, y_train)

# 8. Predict the valuation for the rows where it is missing
missing_value_mask = train_copy[value_col].isna()
test_data = train_copy[missing_value_mask].copy()
test_data['직원 수'] = test_data['직원 수'].fillna(train_data['직원 수'].mean())
X_test = test_data[features]
y_pred = model.predict(X_test)

# 9. Write the predictions into the gaps
train_copy.loc[missing_value_mask, value_col] = y_pred

# 10. Map every numeric valuation back to its range label
train_copy[value_col] = train_copy[value_col].astype(float).apply(categorize_value)

# Result: the valuation column of train_copy is now a complete string category
print(train_copy[value_col].value_counts())

train[value_col] = train_copy[value_col]

# Impute '기업가치(백억원)' on the test set with the regression fitted on train
# earlier in this cell. convert_value / categorize_value and `features` are the
# single definitions from earlier in this cell; identical copies used to be
# re-declared here, silently shadowing the originals and inviting them to drift.
test_copy = test.copy()

# 1. Convert the known valuation labels to numbers (NaN rows are skipped)
test_known_mask = test_copy['기업가치(백억원)'].notna()
test_copy.loc[test_known_mask, '기업가치(백억원)'] = \
    test_copy.loc[test_known_mask, '기업가치(백억원)'].apply(convert_value)

# 2. Rows whose valuation has to be predicted
test_missing_mask = test_copy['기업가치(백억원)'].isna()
test_nan = test_copy[test_missing_mask].copy()

# 3. Fill missing staff counts with the mean taken from train_data
test_nan['직원 수'] = test_nan['직원 수'].fillna(train_data['직원 수'].mean())

# 4. Use exactly the feature list (and order) the model was fitted with
X_test = test_nan[features]

# 5. Predict with the model trained on train
y_pred_test = model.predict(X_test)

# 6. Write the predictions into the gaps
test_copy.loc[test_missing_mask, '기업가치(백억원)'] = y_pred_test

# 7. Map every numeric valuation back to its range label
test_copy['기업가치(백억원)'] = test_copy['기업가치(백억원)'].astype(float).apply(categorize_value)

# Check the resulting category distribution
print(test_copy['기업가치(백억원)'].value_counts())

test['기업가치(백억원)'] = test_copy['기업가치(백억원)']


기업가치(백억원)
3500-4500    1848
4500-6000     679
2500-3500     635
1500-2500     621
6000이상        593
Name: count, dtype: int64
기업가치(백억원)
3500-4500    749
4500-6000    292
1500-2500    258
2500-3500    247
6000이상       209
Name: count, dtype: int64


In [39]:
# Fill missing 분야 in both splits with the most frequent category of train
most_common_field = train['분야'].mode()[0]
train['분야'] = train['분야'].fillna(most_common_field)
# Fill test's OWN column. Assigning train['분야'] here would overwrite the test
# categories with index-aligned values taken from the training rows.
test['분야'] = test['분야'].fillna(most_common_field)

# Mean staff count per 분야, taken from train before its own gaps are filled
staff_means_by_field = train.groupby('분야')['직원 수'].mean()

# Train: fill each missing staff count with the mean of its 분야 group
train['직원 수'] = train.groupby('분야')['직원 수'].transform(lambda x: x.fillna(x.mean()))

# Fallback for a 분야 that has no entry in staff_means_by_field. Computed once
# here (after train was filled) instead of once per missing test row.
overall_staff_mean = train['직원 수'].mean()


def fill_staff_by_field(row):
    """Return the row's staff count, imputing it when missing.

    A missing '직원 수' is replaced by the train mean of the row's '분야';
    if that 분야 is not present in ``staff_means_by_field`` the overall train
    mean is used. Non-missing values are returned unchanged.
    """
    if pd.isna(row['직원 수']):
        return staff_means_by_field.get(row['분야'], overall_staff_mean)
    return row['직원 수']


# Test: impute with the statistics learned from train only
test['직원 수'] = test.apply(fill_staff_by_field, axis=1)

# Impute 고객수(백만명) with a linear regression on revenue and total investment,
# fitted on the train rows where the customer count is known
customer_predictors = ['연매출(억원)', '총 투자금(억원)']

valid = train.dropna(subset=['고객수(백만명)'])
X_train = valid[customer_predictors]
y_train = valid['고객수(백만명)']
model = LinearRegression().fit(X_train, y_train)

# Train rows that need a prediction
train_customer_gap = train['고객수(백만명)'].isna()
missing = train[train_customer_gap]
X_missing = missing[customer_predictors]
y_pred = model.predict(X_missing)

# Write the predictions into the gaps
train.loc[train_customer_gap, '고객수(백만명)'] = y_pred

# Same procedure for test, reusing the model fitted on train
test_customer_gap = test['고객수(백만명)'].isna()
missing_test = test[test_customer_gap]
X_missing_test = missing_test[customer_predictors]
y_pred_test = model.predict(X_missing_test)

test.loc[test_customer_gap, '고객수(백만명)'] = y_pred_test

In [40]:
# Sanity check after imputation: count the remaining missing values per column
train.isna().sum()

설립연도              0
국가                0
분야                0
투자단계              0
직원 수              0
인수여부              0
상장여부              0
고객수(백만명)          0
총 투자금(억원)         0
연매출(억원)           0
SNS 팔로워 수(백만명)    0
기업가치(백억원)         0
성공확률              0
dtype: int64

In [41]:
# 설립연도 is listed in category_features further down in this cell, so it is
# cast to object here and label-encoded later like the other categoricals.
train['설립연도'] =train['설립연도'].astype('object')
test['설립연도'] =test['설립연도'].astype('object')

# Company age (회사나이) and the investment-to-staff ratio (투자_직원_비율) used to
# be computed inline here; both are now created in add_engineered_features.

def add_engineered_features(df, reference_year=2025):
    """Add ratio features and company age to ``df`` in place and return it.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain '연매출(억원)', '총 투자금(억원)', '고객수(백만명)', '직원 수',
        'SNS 팔로워 수(백만명)' and '설립연도' (castable to int).
    reference_year : int, default 2025
        Year from which the founding year is subtracted to get '회사나이'.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with six new columns appended in
        this order: '투자금_대비_연매출', '고객당_연매출', '직원당_연매출',
        '투자금_대비_SNS영향력', '회사나이', '투자_직원_비율'.
    """
    eps = 1e-6  # keeps the ratio denominators away from an exact zero
    revenue = df['연매출(억원)']
    investment = df['총 투자금(억원)']
    staff = df['직원 수']

    df['투자금_대비_연매출'] = revenue / (investment + eps)
    df['고객당_연매출'] = revenue / (df['고객수(백만명)'] + eps)
    df['직원당_연매출'] = revenue / (staff + eps)
    df['투자금_대비_SNS영향력'] = df['SNS 팔로워 수(백만명)'] / (investment + eps)
    df['회사나이'] = reference_year - df['설립연도'].astype(int)  # company age
    df['투자_직원_비율'] = investment / (staff + 1)  # investment per (staff + 1)
    return df

# Apply the feature engineering to both splits
train = add_engineered_features(train)
test = add_engineered_features(test)

category_features = ['설립연도', '국가', '분야', '투자단계', '기업가치(백억원)']
numeric_features = [
    '직원 수', '고객수(백만명)', '총 투자금(억원)', '연매출(억원)', 'SNS 팔로워 수(백만명)',
    '투자금_대비_연매출', '고객당_연매출', '직원당_연매출', '투자금_대비_SNS영향력', '회사나이', '투자_직원_비율',
]
bool_features = ['인수여부', '상장여부']

encoders = {}

# Boolean columns: 'Yes' -> 1, 'No' -> 0
bool_map = {'Yes': 1, 'No': 0}
for frame in (train, test):
    for col in bool_features:
        frame[col] = frame[col].map(bool_map)

# Categorical columns: label-encode with an encoder fitted on train only
for col in category_features:
    encoders[col] = LabelEncoder()
    train[col] = encoders[col].fit_transform(train[col].fillna('Missing'))
    test[col] = encoders[col].transform(test[col].fillna('Missing'))

# Numeric columns: replace any remaining gaps with the train mean
for col in numeric_features:
    train_mean = train[col].mean()
    for frame in (train, test):
        frame[col] = frame[col].fillna(train_mean)

# Positions (cat_idxs) and cardinalities (cat_dims) of the categorical columns for TabNet
features = [name for name in train.columns if name != '성공확률']
cat_idxs = list(map(features.index, category_features))
cat_dims = [train[name].max() + 1 for name in category_features]


  train[feature] = train[feature].fillna('Missing')
  test[feature] = test[feature].fillna('Missing')


Unnamed: 0,설립연도,국가,분야,투자단계,직원 수,인수여부,상장여부,고객수(백만명),총 투자금(억원),연매출(억원),SNS 팔로워 수(백만명),기업가치(백억원),성공확률,투자금_대비_연매출,고객당_연매출,직원당_연매출,투자금_대비_SNS영향력,회사나이,투자_직원_비율
0,8,4,6,2,4126.0,0,0,56.000000,3365.0,4764.0,4.71,2,0.3,1.415750,85.071427,1.154629,0.001400,16,0.815362
1,22,5,8,1,4167.0,1,0,80.000000,4069.0,279.0,1.00,1,0.8,0.068567,3.487500,0.066955,0.000246,2,0.976248
2,17,6,2,2,3132.0,1,1,54.000000,6453.0,12141.0,4.00,2,0.5,1.881450,224.833329,3.876437,0.000620,7,2.059687
3,15,5,4,1,3245.0,1,1,49.721402,665.0,10547.0,2.97,2,0.7,15.860150,212.121930,3.250231,0.004466,9,0.204868
4,19,1,5,1,1969.0,0,1,94.000000,829.0,9810.0,1.00,0,0.1,11.833534,104.361701,4.982224,0.001206,5,0.420812
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4371,20,5,7,2,4841.0,1,0,90.000000,4187.0,9394.0,4.00,0,0.8,2.243611,104.377777,1.940508,0.000955,4,0.864725
4372,19,2,4,3,555.0,0,1,37.000000,796.0,2969.0,3.00,4,0.4,3.729899,80.243241,5.349550,0.003769,5,1.431655
4373,22,3,4,4,506.0,0,1,49.175674,3314.0,4512.0,1.47,2,0.6,1.361497,91.752681,8.916996,0.000444,2,6.536489
4374,0,6,0,0,1438.0,0,0,53.000000,2395.0,3755.0,5.00,3,0.9,1.567850,70.849055,2.611266,0.002088,24,1.664350


In [42]:
# Target column and feature matrix (column order follows `features`)
target = train['성공확률']  
X = train[features]
y = target

# K-fold setup: shuffled splits with a fixed random_state
N_FOLDS = 5
kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=42)

models = [] # fitted regressor of every fold, reused later for test prediction
cv_scores = []  # model.best_cost per fold; presumably the best validation MAE -- confirm against pytorch-tabnet

for fold, (train_idx, valid_idx) in enumerate(kf.split(X)):
    print(f"\n🔁 Fold {fold+1}/{N_FOLDS}")
    
    # Targets are reshaped to a single 2-D column;
    # presumably required by TabNetRegressor -- confirm against its docs
    X_train = X.iloc[train_idx].values
    y_train = y.iloc[train_idx].values.reshape(-1, 1)
    
    X_valid = X.iloc[valid_idx].values
    y_valid = y.iloc[valid_idx].values.reshape(-1, 1)
    
    # Unsupervised pretraining on this fold's training rows only (no labels, no eval_set)
    print("▶ Pretraining...")

    pretrainer = TabNetPretrainer(
        cat_idxs=cat_idxs,
        cat_dims=cat_dims,
        seed=42,
        verbose=0
    )

    pretrainer.fit(
        X_train=X_train,
        max_epochs=100,
        batch_size=512,
        virtual_batch_size=64
    )

    # Supervised fine-tuning, initialised from the pretrainer via from_unsupervised
    print("▶ Fine-tuning...")
    model = TabNetRegressor(
        cat_idxs=cat_idxs,
        cat_dims=cat_dims,
        seed=42,
        verbose=0,
        optimizer_fn=torch.optim.AdamW 
    )

    # The held-out fold is the eval set; MAE is the eval metric and patience=10
    # governs early stopping
    model.fit(
        X_train=X_train, y_train=y_train,
        eval_set=[(X_valid, y_valid)],
        from_unsupervised=pretrainer,
        eval_metric=['mae'],
        max_epochs=100,
        patience=10
    )

    # Keep the fitted model in memory together with its best_cost
    models.append(model)
    cv_scores.append(model.best_cost)

print("\n✅ 모든 fold 모델 학습 완료!")


🔁 Fold 1/5
▶ Pretraining...




▶ Fine-tuning...





Early stopping occurred at epoch 42 with best_epoch = 32 and best_val_0_mae = 0.20662

🔁 Fold 2/5
▶ Pretraining...




▶ Fine-tuning...





Early stopping occurred at epoch 24 with best_epoch = 14 and best_val_0_mae = 0.20854

🔁 Fold 3/5
▶ Pretraining...




▶ Fine-tuning...





Early stopping occurred at epoch 31 with best_epoch = 21 and best_val_0_mae = 0.205

🔁 Fold 4/5
▶ Pretraining...




▶ Fine-tuning...





Early stopping occurred at epoch 39 with best_epoch = 29 and best_val_0_mae = 0.2025

🔁 Fold 5/5
▶ Pretraining...




▶ Fine-tuning...





Early stopping occurred at epoch 26 with best_epoch = 16 and best_val_0_mae = 0.20667

✅ 모든 fold 모델 학습 완료!




In [43]:
# Predict the test set with every stored fold model
predictions_list = []
test_matrix = test[features].values

for fold_no, fold_model in enumerate(models, start=1):
    print(f"Predict with fold {fold_no}")
    predictions_list.append(fold_model.predict(test_matrix))

# Ensemble: element-wise mean over the fold predictions
final_predictions = np.mean(predictions_list, axis=0)

Predict with fold 1
Predict with fold 2
Predict with fold 3
Predict with fold 4
Predict with fold 5


In [44]:
# The fold models were fitted on targets reshaped to (n, 1), so the averaged
# predictions presumably come back 2-D. Flatten explicitly so the column always
# receives a 1-D array instead of relying on pandas accepting an (n, 1) array.
sample_submission['성공확률'] = np.ravel(final_predictions)
sample_submission.to_csv('data_files/submission9.csv', index = False, encoding = 'utf-8-sig')