In [14]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold

import torch
from pytorch_tabnet.pretraining import TabNetPretrainer
from pytorch_tabnet.tab_model import TabNetRegressor

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [15]:
train = pd.read_csv('data_files/train.csv')
test = pd.read_csv('data_files/test.csv')
sample_submission = pd.read_csv('data_files/sample_submission.csv')

In [16]:
train = train.drop(columns=['ID'], axis = 1)
test = test.drop(columns=['ID'], axis = 1)

In [None]:
train['설립연도'] =train['설립연도'].astype('object')
test['설립연도'] =test['설립연도'].astype('object')

category_features = ['설립연도','국가','분야','투자단계','기업가치(백억원)']
numeric_features = ['직원 수','고객수(백만명)','총 투자금(억원)','연매출(억원)','SNS 팔로워 수(백만명)']
bool_features = ['인수여부','상장여부']

encoders = {}

# 불리언 값을 0과 1로 변환 ('Yes' → 1, 'No' → 0 으로 변환)
bool_map = {'Yes': 1, 'No': 0}

for feature in bool_features:
    train[feature] = train[feature].map(bool_map)
    test[feature] = test[feature].map(bool_map)

# # 범주형 데이터를 encoding
# for feature in category_features:
#     encoders[feature] = LabelEncoder()
#     train[feature] = train[feature].fillna('Missing')
#     test[feature] = test[feature].fillna('Missing')
#     train[feature] = encoders[feature].fit_transform(train[feature])
#     test[feature] = encoders[feature].transform(test[feature])

# # 수치형 변수 결측치를 평균값으로 대체
# for feature in numeric_features:
#     mean_value = train[feature].mean()
#     train[feature] = train[feature].fillna(mean_value)
#     test[feature] = test[feature].fillna(mean_value)

# # TabNet용 범주형 변수 인덱스(cat_idxs) 및 차원(cat_dims) 설정
# features = [col for col in train.columns if col != '성공확률']
# cat_idxs = [features.index(col) for col in category_features]
# cat_dims = [train[col].max() + 1 for col in category_features]

In [19]:
train_data = train.dropna(subset=['기업가치(백억원)'])
def convert_value(value):
    # '이상'이 포함되면 6000으로 처리
    if isinstance(value, str):
        if '이상' in value:
            return 6000  # 6000이상 -> 6000으로 처리
        elif '-' in value:  # 범위값 처리
            start, end = value.split('-')
            return (float(start) + float(end)) / 2  # 범위의 평균값 처리
    return float(value)  # 그 외 숫자값 그대로 반환

# '기업가치(백억원)'에 적용
train['기업가치(백억원)'] = train['기업가치(백억원)'].apply(convert_value)

X = train_data[['총 투자금(억원)', '연매출(억원)', '직원 수', '설립연도']]  # 필요한 입력값
y = train_data['기업가치(백억원)']  # 예측할 값

model = LinearRegression()
model.fit(X, y)

test_data = train[train['기업가치(백억원)'].isna()]
test_data['직원 수'] = test_data['직원 수'].fillna(train_data['직원 수'].mean())
X_test = test_data[['총 투자금(억원)', '연매출(억원)', '직원 수', '설립연도']]

y_pred = model.predict(X_test)

# 5. 예측한 값 채우기
train.loc[train['기업가치(백억원)'].isna(), '기업가치(백억원)'] = y_pred



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['직원 수'] = test_data['직원 수'].fillna(train_data['직원 수'].mean())


In [20]:
test_data = test.dropna(subset=['기업가치(백억원)'])

# '기업가치(백억원)'에 범위나 조건 처리 적용

def convert_value(value):
    # '이상'이 포함되면 6000으로 처리
    if isinstance(value, str):
        if '이상' in value:
            return 6000  # 6000이상 -> 6000으로 처리
        elif '-' in value:  # 범위값 처리
            start, end = value.split('-')
            return (float(start) + float(end)) / 2  # 범위의 평균값 처리
    return float(value)  # 그 외 숫자값 그대로 반환

test['기업가치(백억원)'] = test['기업가치(백억원)'].apply(convert_value)


# test 데이터에서 '기업가치(백억원)'이 NaN인 행 찾기
test_data = test[test['기업가치(백억원)'].isna()]

# '직원 수' NaN 값 채우기 (train의 평균값으로)
test_data['직원 수'] = test_data['직원 수'].fillna(train['직원 수'].mean())

# 예측을 위한 X_test
X_test = test_data[['총 투자금(억원)', '연매출(억원)', '직원 수', '설립연도']]

# 예측값 계산
y_pred = model.predict(X_test)

# 예측한 값 채우기
test.loc[test['기업가치(백억원)'].isna(), '기업가치(백억원)'] = y_pred

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data['직원 수'] = test_data['직원 수'].fillna(train['직원 수'].mean())


In [24]:
train['기업가치(백억원)']
test['기업가치(백억원)']

0       2000.000000
1       4122.880997
2       6000.000000
3       2000.000000
4       4111.128008
           ...     
1750    2000.000000
1751    5250.000000
1752    2000.000000
1753    5250.000000
1754    3954.796522
Name: 기업가치(백억원), Length: 1755, dtype: float64

In [25]:
def categorize_value(x):
    if 2500 <= x < 3500:
        return '2500-3500'
    elif 3500 <= x < 4500:
        return '3500-4500'
    elif 1500 <= x < 2500:
        return '1500-2500'
    elif 4500 <= x < 6000:
        return '4500-6000'
    elif x >= 6000:
        return '6000이상'

# 적용
train['기업가치(백억원)'] = train['기업가치(백억원)'].apply(categorize_value)
test['기업가치(백억원)'] = test['기업가치(백억원)'].apply(categorize_value)

In [26]:
 # 범주형 데이터를 encoding
for feature in category_features:
    encoders[feature] = LabelEncoder()
    train[feature] = train[feature].fillna('Missing')
    test[feature] = test[feature].fillna('Missing')
    train[feature] = encoders[feature].fit_transform(train[feature])
    test[feature] = encoders[feature].transform(test[feature])

# 수치형 변수 결측치를 평균값으로 대체
for feature in numeric_features:
    mean_value = train[feature].mean()
    train[feature] = train[feature].fillna(mean_value)
    test[feature] = test[feature].fillna(mean_value)

# TabNet용 범주형 변수 인덱스(cat_idxs) 및 차원(cat_dims) 설정
features = [col for col in train.columns if col != '성공확률']
cat_idxs = [features.index(col) for col in category_features]
cat_dims = [train[col].max() + 1 for col in category_features]

  train[feature] = train[feature].fillna('Missing')
  test[feature] = test[feature].fillna('Missing')


In [27]:
# 타겟 지정
target = train['성공확률']  
X = train[features]
y = target

# KFold 설정
N_FOLDS = 5
kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=42)

models = [] # 모델 저장 리스트
cv_scores = []

for fold, (train_idx, valid_idx) in enumerate(kf.split(X)):
    print(f"\n🔁 Fold {fold+1}/{N_FOLDS}")
    
    X_train = X.iloc[train_idx].values
    y_train = y.iloc[train_idx].values.reshape(-1, 1)
    
    X_valid = X.iloc[valid_idx].values
    y_valid = y.iloc[valid_idx].values.reshape(-1, 1)
    
    # 비지도 사전학습
    print("▶ Pretraining...")

    pretrainer = TabNetPretrainer(
        cat_idxs=cat_idxs,
        cat_dims=cat_dims,
        seed=42,
        verbose=0
    )

    pretrainer.fit(
        X_train=X_train,
        max_epochs=100,
        batch_size=512,
        virtual_batch_size=64
    )

    # 지도 학습 
    print("▶ Fine-tuning...")
    model = TabNetRegressor(
        cat_idxs=cat_idxs,
        cat_dims=cat_dims,
        seed=42,
        verbose=0,
        optimizer_fn=torch.optim.AdamW 
    )

    model.fit(
        X_train=X_train, y_train=y_train,
        eval_set=[(X_valid, y_valid)],
        from_unsupervised=pretrainer,
        eval_metric=['mae'],
        max_epochs=100,
        patience=10
    )

    # 모델을 메모리에 저장
    models.append(model)
    cv_scores.append(model.best_cost)

print("\n✅ 모든 fold 모델 학습 완료!")


🔁 Fold 1/5
▶ Pretraining...




▶ Fine-tuning...





Early stopping occurred at epoch 26 with best_epoch = 16 and best_val_0_mae = 0.20484

🔁 Fold 2/5
▶ Pretraining...




▶ Fine-tuning...





Early stopping occurred at epoch 33 with best_epoch = 23 and best_val_0_mae = 0.20731

🔁 Fold 3/5
▶ Pretraining...




▶ Fine-tuning...





Early stopping occurred at epoch 58 with best_epoch = 48 and best_val_0_mae = 0.20356

🔁 Fold 4/5
▶ Pretraining...




▶ Fine-tuning...





Early stopping occurred at epoch 71 with best_epoch = 61 and best_val_0_mae = 0.20185

🔁 Fold 5/5
▶ Pretraining...




▶ Fine-tuning...





Early stopping occurred at epoch 47 with best_epoch = 37 and best_val_0_mae = 0.20626

✅ 모든 fold 모델 학습 완료!




In [28]:
# 저장된 모델들로 예측
predictions_list = []

for fold, model in enumerate(models):
    print(f"Predict with fold {fold+1}")
    preds = model.predict(test[features].values)
    predictions_list.append(preds)

# 평균 예측
final_predictions = np.mean(predictions_list, axis=0)

Predict with fold 1
Predict with fold 2
Predict with fold 3
Predict with fold 4
Predict with fold 5


In [29]:
sample_submission['성공확률'] = final_predictions
sample_submission.to_csv('data_files/submission6.csv', index = False, encoding = 'utf-8-sig')