In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold

import torch
from pytorch_tabnet.pretraining import TabNetPretrainer
from pytorch_tabnet.tab_model import TabNetRegressor

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split


In [2]:
# Read the three competition files (train set, test set, submission template) in one pass.
train, test, sample_submission = map(
    pd.read_csv,
    (
        'data_files/train.csv',
        'data_files/test.csv',
        'data_files/sample_submission.csv',
    ),
)

In [3]:
# Remove the 'ID' column from both frames; it is not used as a model feature.
train, test = (frame.drop(columns=['ID']) for frame in (train, test))

In [None]:
# Store the founding year as object dtype so it is handled as a category, not a number.
for frame in (train, test):
    frame['설립연도'] = frame['설립연도'].astype('object')

# Column groups used by the rest of the notebook.
category_features = ['설립연도', '국가', '분야', '투자단계', '기업가치(백억원)']
numeric_features = ['직원 수', '고객수(백만명)', '총 투자금(억원)', '연매출(억원)', 'SNS 팔로워 수(백만명)']
bool_features = ['인수여부', '상장여부']

encoders = {}

# Encode the Yes/No flag columns as integers ('Yes' -> 1, 'No' -> 0).
# Values outside the mapping become NaN, as with any Series.map on a dict.
bool_map = {'Yes': 1, 'No': 0}

for feature in bool_features:
    for frame in (train, test):
        frame[feature] = frame[feature].map(bool_map)



In [None]:

# 1. Parser for raw '기업가치(백억원)' (company valuation) entries
def convert_value(value):
    """Convert a raw valuation entry into a number.

    - Non-string input is passed through ``float``.
    - A string containing '이상' ("or more") maps to the fixed value 6000.
    - A string of the form 'low-high' maps to the midpoint of the two bounds.
    - Any other string is parsed with ``float``.

    Raises ValueError when a string cannot be parsed (e.g. more than one '-').
    """
    if not isinstance(value, str):
        return float(value)
    if '이상' in value:
        return 6000
    if '-' not in value:
        return float(value)
    low, high = value.split('-')
    return (float(low) + float(high)) / 2

# 2. Bucketing of numeric '기업가치(백억원)' values back into category labels
def categorize_value(value):
    """Return the valuation bucket label for a numeric value.

    Buckets are checked in ascending order with an exclusive upper bound;
    anything not below 6000 (including NaN, which fails every comparison)
    falls into the open-ended top bucket. Values below 1500 are also placed
    in the lowest bucket.
    """
    upper_bounds = (
        (2500, '1500-2500'),
        (3500, '2500-3500'),
        (4500, '3500-4500'),
        (6000, '4500-6000'),
    )
    for bound, label in upper_bounds:
        if value < bound:
            return label
    return '6000이상'

# 3. Work on a copy so `train` itself is only modified once the result has been checked.
train_copy = train.copy()
value_col = '기업가치(백억원)'

# 4. Convert the known valuation entries to numbers (NaN rows are left untouched).
known_mask = train_copy[value_col].notna()
train_copy.loc[known_mask, value_col] = \
    train_copy.loc[known_mask, value_col].apply(convert_value)

# 5. Rows with a known valuation form the training data of the imputation model.
train_data = train_copy.dropna(subset=[value_col]).copy()

# 6. Fill missing staff counts with the column mean.
train_data['직원 수'] = train_data['직원 수'].fillna(train_data['직원 수'].mean())

# 7. Fit a linear regression that predicts the valuation from four columns.
features = ['총 투자금(억원)', '연매출(억원)', '직원 수', '설립연도']
X_train = train_data[features]
y_train = train_data[value_col].astype(float)

model = LinearRegression()
model.fit(X_train, y_train)

# 8. Rows whose valuation is missing are the ones to predict.
missing_mask = train_copy[value_col].isna()
test_data = train_copy[missing_mask].copy()
test_data['직원 수'] = test_data['직원 수'].fillna(train_data['직원 수'].mean())
X_test = test_data[features]

# 9. Write the predictions back. The guard matters when nothing is missing
#    (e.g. the cell is re-run after `train` was already filled): scikit-learn
#    raises ValueError when asked to predict on zero rows.
if missing_mask.any():
    y_pred = model.predict(X_test)
    train_copy.loc[missing_mask, value_col] = y_pred
else:
    y_pred = np.empty(0)

# 10. Convert the numeric values back into the category labels.
train_copy[value_col] = train_copy[value_col].astype(float).apply(categorize_value)

# Result: the valuation column of train_copy now holds a category label in every row.
print(train_copy[value_col].value_counts())



기업가치(백억원)
3500-4500    1848
4500-6000     679
2500-3500     635
1500-2500     621
6000이상        593
Name: count, dtype: int64


In [None]:
# Copy the fully populated valuation column from the working copy back into `train`.
train['기업가치(백억원)'] = train_copy['기업가치(백억원)']


In [27]:
# Work on a copy of the test set.
test_copy = test.copy()
value_col = '기업가치(백억원)'

# 1. `convert_value` and `categorize_value` are the functions defined in the
#    train imputation cell. They are deliberately not redefined here: this cell
#    already depends on that cell for `model` and `train_data`, and a second
#    copy of the same functions would silently shadow the first.

# 2. Convert the known valuation entries to numbers.
known_mask = test_copy[value_col].notna()
test_copy.loc[known_mask, value_col] = \
    test_copy.loc[known_mask, value_col].apply(convert_value)

# 3. Rows whose valuation has to be predicted.
missing_mask = test_copy[value_col].isna()
test_nan = test_copy[missing_mask].copy()

# 4. Fill missing staff counts with the mean taken from the train rows.
test_nan['직원 수'] = test_nan['직원 수'].fillna(train_data['직원 수'].mean())

# 5. Select the model inputs (same columns the model was fitted on).
X_test = test_nan[['총 투자금(억원)', '연매출(억원)', '직원 수', '설립연도']]

# 6-7. Predict with the model fitted on train and write the result back.
#      Skipped when nothing is missing, because scikit-learn raises ValueError
#      on zero-row input.
if missing_mask.any():
    y_pred_test = model.predict(X_test)
    test_copy.loc[missing_mask, value_col] = y_pred_test
else:
    y_pred_test = np.empty(0)

# 8. Convert the numeric values back into the category labels.
test_copy[value_col] = test_copy[value_col].astype(float).apply(categorize_value)

# Check the resulting distribution.
print(test_copy[value_col].value_counts())


기업가치(백억원)
3500-4500    749
4500-6000    292
1500-2500    258
2500-3500    247
6000이상       209
Name: count, dtype: int64


In [29]:
# Copy the fully populated valuation column from the working copy back into `test`.
test['기업가치(백억원)'] = test_copy['기업가치(백억원)']

In [34]:
# Fill missing '분야' (field) values with the most frequent value seen in train.
# The mode is taken from train only and then applied to both frames.
most_common_field = train['분야'].mode()[0]
train['분야'] = train['분야'].fillna(most_common_field)
# Fill test's own column. Assigning the train column here would overwrite every
# test row with the index-aligned train value.
test['분야'] = test['분야'].fillna(most_common_field)

In [35]:
# Mean staff count per field, computed on train before any filling.
staff_means_by_field = train.groupby('분야')['직원 수'].mean()

# Train: replace missing staff counts with the mean of the row's own field.
train['직원 수'] = train.groupby('분야')['직원 수'].transform(lambda x: x.fillna(x.mean()))

# Fallback for fields without a usable mean. Computed once here instead of
# once per missing row inside the row-wise apply below.
overall_staff_mean = train['직원 수'].mean()


def fill_staff_by_field(row):
    """Return the staff count for one row, imputing it when missing.

    A missing value is replaced by the train mean of the row's field. If the
    field is unknown in train, or its train mean is itself NaN (every train
    row of that field was missing), the overall train mean is used instead.
    Rows that already have a value are returned unchanged.
    """
    if pd.isna(row['직원 수']):
        field_mean = staff_means_by_field.get(row['분야'], overall_staff_mean)
        return overall_staff_mean if pd.isna(field_mean) else field_mean
    return row['직원 수']


test['직원 수'] = test.apply(fill_staff_by_field, axis=1)


In [36]:
# Impute missing '고객수(백만명)' (customer count) with a linear regression on
# revenue and total investment, fitted on the train rows where it is known.
# NOTE: this rebinds `model`, which up to here was the valuation regressor, so
# the valuation imputation cells must have run before this one.
valid = train.dropna(subset=['고객수(백만명)'])
X_train = valid[['연매출(억원)', '총 투자금(억원)']]
y_train = valid['고객수(백만명)']

model = LinearRegression()
model.fit(X_train, y_train)

# Train rows to fill.
train_missing_mask = train['고객수(백만명)'].isna()
missing = train[train_missing_mask]
X_missing = missing[['연매출(억원)', '총 투자금(억원)']]

# Predict only when there is something to fill: scikit-learn raises ValueError
# on zero-row input, which would otherwise break a second run of this cell.
if train_missing_mask.any():
    y_pred = model.predict(X_missing)
    train.loc[train_missing_mask, '고객수(백만명)'] = y_pred
else:
    y_pred = np.empty(0)

# Test rows to fill, using the same model fitted on train.
test_missing_mask = test['고객수(백만명)'].isna()
missing_test = test[test_missing_mask]
X_missing_test = missing_test[['연매출(억원)', '총 투자금(억원)']]

if test_missing_mask.any():
    y_pred_test = model.predict(X_missing_test)
    test.loc[test_missing_mask, '고객수(백만명)'] = y_pred_test
else:
    y_pred_test = np.empty(0)

In [None]:


# -------------------------
# 1. Label-encode the categorical columns
# -------------------------
category_features = ['설립연도', '국가', '분야', '투자단계', '기업가치(백억원)']

# One LabelEncoder per column, kept so the mapping can be inspected or inverted later.
label_encoders = {}

for col in category_features:
    le = LabelEncoder()
    # Fit on the values of both frames: LabelEncoder.transform raises ValueError
    # for a label it has not seen, so a category that occurs only in test would
    # abort the notebook. When every test category also occurs in train, the
    # resulting codes are the same as fitting on train alone.
    le.fit(pd.concat([train[col], test[col]], ignore_index=True))
    train[col] = le.transform(train[col])
    test[col] = le.transform(test[col])
    label_encoders[col] = le

# -------------------------
# 2. Feature preparation
# -------------------------
# Every column except the target is a feature.
features = [col for col in train.columns if col != '성공확률']

# Positions of the categorical features and the number of codes of each one.
# The size comes from the encoder's classes so it also covers codes that appear
# only in test.
cat_idxs = [features.index(col) for col in category_features]
cat_dims = [len(label_encoders[col].classes_) for col in category_features]

# Feature matrix and target.
X = train[features]
y = train['성공확률']

# -------------------------
# 3. KFold + pretraining + fine-tuning
# -------------------------
N_FOLDS = 5
kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=42)

models = []      # fitted regressor of each fold
cv_scores = []   # best validation cost of each fold

# Constructor arguments shared by the pretrainer and the regressor.
tabnet_common = dict(cat_idxs=cat_idxs, cat_dims=cat_dims, seed=42, verbose=0)

for fold, (train_idx, valid_idx) in enumerate(kf.split(X)):
    print(f"\n🔁 Fold {fold+1}/{N_FOLDS}")

    # Split into this fold's train/validation arrays; targets become column vectors.
    X_train, X_valid = X.iloc[train_idx].values, X.iloc[valid_idx].values
    y_train = y.iloc[train_idx].values.reshape(-1, 1)
    y_valid = y.iloc[valid_idx].values.reshape(-1, 1)

    # Unsupervised pretraining on the fold's training rows only.
    print("▶ Pretraining...")
    pretrainer = TabNetPretrainer(**tabnet_common)
    pretrainer.fit(
        X_train=X_train,
        max_epochs=100,
        batch_size=512,
        virtual_batch_size=64,
    )

    # Supervised fine-tuning, initialised from the pretrainer and monitored by
    # MAE on the validation rows with a patience of 10 epochs.
    print("▶ Fine-tuning...")
    model = TabNetRegressor(optimizer_fn=torch.optim.AdamW, **tabnet_common)
    model.fit(
        X_train=X_train,
        y_train=y_train,
        eval_set=[(X_valid, y_valid)],
        from_unsupervised=pretrainer,
        eval_metric=['mae'],
        max_epochs=100,
        patience=10,
    )

    # Keep the fitted model and its best validation cost.
    models.append(model)
    cv_scores.append(model.best_cost)

print("\n✅ 모든 fold 모델 학습 완료!")

# Mean of the per-fold best validation MAE.
print(f"🔥 평균 MAE: {np.mean(cv_scores):.4f}")



🔁 Fold 1/5
▶ Pretraining...




▶ Fine-tuning...





Early stopping occurred at epoch 32 with best_epoch = 22 and best_val_0_mae = 0.20864

🔁 Fold 2/5
▶ Pretraining...




▶ Fine-tuning...





Early stopping occurred at epoch 68 with best_epoch = 58 and best_val_0_mae = 0.20425

🔁 Fold 3/5
▶ Pretraining...




▶ Fine-tuning...





Early stopping occurred at epoch 35 with best_epoch = 25 and best_val_0_mae = 0.20348

🔁 Fold 4/5
▶ Pretraining...




▶ Fine-tuning...





Early stopping occurred at epoch 39 with best_epoch = 29 and best_val_0_mae = 0.20388

🔁 Fold 5/5
▶ Pretraining...




▶ Fine-tuning...





Early stopping occurred at epoch 66 with best_epoch = 56 and best_val_0_mae = 0.2066

✅ 모든 fold 모델 학습 완료!
🔥 평균 MAE: 0.2054




In [41]:
# Predict the test set with every stored fold model.
test_matrix = test[features].values
predictions_list = []

for fold, model in enumerate(models):
    print(f"Predict with fold {fold+1}")
    preds = model.predict(test_matrix)
    predictions_list.append(preds)

# Unweighted mean over the folds, clipped to the [0, 1] range.
final_predictions = np.clip(np.mean(predictions_list, axis=0), 0, 1)

# Write the submission file.
sample_submission['성공확률'] = final_predictions
sample_submission.to_csv('data_files/submission7.csv', index=False, encoding='utf-8-sig')


Predict with fold 1
Predict with fold 2
Predict with fold 3
Predict with fold 4
Predict with fold 5


In [42]:
# Predict the test set with every stored fold model and collect each fold's
# best validation MAE alongside.
test_matrix = test[features].values
predictions_list, fold_scores = [], []

for fold, model in enumerate(models):
    print(f"Predict with fold {fold+1}")
    preds = model.predict(test_matrix)
    predictions_list.append(preds)
    fold_scores.append(model.best_cost)

# Weight each fold by the inverse of its MAE (lower error -> larger weight).
# The 1e-6 term avoids a division by zero; weights are normalised to sum to 1.
fold_scores = np.array(fold_scores)
inverse_mae = 1 / (fold_scores + 1e-6)
fold_weights = inverse_mae / inverse_mae.sum()

# Weighted mean over the folds, clipped to the [0, 1] range.
final_predictions = np.clip(
    np.average(predictions_list, axis=0, weights=fold_weights), 0, 1
)

# Write the submission file.
sample_submission['성공확률'] = final_predictions
sample_submission.to_csv('data_files/submission8.csv', index=False, encoding='utf-8-sig')

Predict with fold 1
Predict with fold 2
Predict with fold 3
Predict with fold 4
Predict with fold 5
