데이터 로드

In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from sklearn.linear_model import Ridge
from xgboost import XGBRegressor
from sklearn.isotonic import IsotonicRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import mean_absolute_error
import optuna
import sklearn
import xgboost
# Load the competition files: training set, test set, and the submission template.
train, test, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data/train.csv',
        './data/test.csv',
        './data/sample_submission.csv',
    )
)

In [3]:
# Report the versions of the libraries this notebook runs against.
for _label, _module in (
    ("numpy:", np),
    ("pandas:", pd),
    ("scikit-learn:", sklearn),
    ("xgboost:", xgboost),
    ("optuna:", optuna),
):
    print(_label, _module.__version__)

numpy: 1.26.4
pandas: 2.3.0
scikit-learn: 1.7.1
xgboost: 3.0.3
optuna: 4.3.0


피쳐 엔지니어링

In [4]:
# Parse numeric columns that are stored as text
def convert_numeric_columns(df, columns=None):
    """Convert text-encoded numeric columns of ``df`` to float, in place.

    Thousands separators (",") are removed and the first number in the text
    is extracted. When that number is immediately followed by ``-<number>``
    the value is treated as a range and replaced by the mean of the two
    bounds (``"2500-3500"`` -> ``3000.0``). Text without digits and missing
    values become NaN.

    Columns that already have a numeric dtype are only cast to float. Sending
    them through the text parser would drop minus signs and misread
    scientific notation (``str(1e-05)`` is ``"1e-05"``, which parses as 1).

    Args:
        df: DataFrame holding the columns to convert. Modified in place.
        columns: Names of the columns to convert. Defaults to the three
            columns this notebook converts.

    Returns:
        The same ``df`` object that was passed in.

    Raises:
        KeyError: If a requested column is not present in ``df``.
    """
    if columns is None:
        columns = ['기업가치(백억원)', '총 투자금(억원)', '연매출(억원)']
    # Group 1: first number. Group 2: optional upper bound of a "low-high" range.
    number_or_range = r"(\d+(?:\.\d+)?)(?:-(\d+(?:\.\d+)?))?"
    for col in columns:
        if pd.api.types.is_numeric_dtype(df[col]):
            # Already numeric: nothing to parse, just normalise the dtype.
            df[col] = df[col].astype(float)
            continue
        bounds = (
            df[col]
            .astype(str)
            .str.replace(",", "", regex=False)
            .str.extract(number_or_range)
            .astype(float)
        )
        # Row-wise mean skips NaN, so a single number (no upper bound) is kept as is.
        df[col] = bounds.mean(axis=1)
    return df

train = convert_numeric_columns(train)
test = convert_numeric_columns(test)

# Missing-value imputation.
# Fill values are computed from the training set only and applied to both
# frames, so train and test go through exactly the same preprocessing and the
# result does not depend on which rows happen to be in the test file.
category_fill = train['분야'].mode()[0]
train['분야'] = train['분야'].fillna(category_fill)
test['분야'] = test['분야'].fillna(category_fill)
for col in ['직원 수', '고객수(백만명)', '기업가치(백억원)']:
    median_fill = train[col].median()
    train[col] = train[col].fillna(median_fill)
    test[col] = test[col].fillna(median_fill)

# Derived features
def create_features(df, reference_year=2025):
    """Add a company-age column and six ratio columns to ``df`` in place.

    Every ratio is ``numerator / (denominator + 1)``; the ``+ 1`` keeps the
    denominator non-zero when the raw value is 0.

    Args:
        df: DataFrame with the columns '설립연도', '총 투자금(억원)',
            '연매출(억원)', '직원 수', '고객수(백만명)' and '기업가치(백억원)'.
            Modified in place.
        reference_year: Year from which '설립연도' is subtracted to obtain
            '기업나이'. Defaults to 2025.

    Returns:
        The same ``df`` object, with the new columns appended in the order
        '기업나이', '1인당_투자금', '1인당_연매출', '고객당_연매출',
        '투자금_대비_연매출', '매출_대비_기업가치', '고객당_기업가치'.

    Raises:
        KeyError: If a required input column is missing.
    """
    df['기업나이'] = reference_year - df['설립연도']
    # (new column, numerator column, denominator column)
    ratio_specs = [
        ('1인당_투자금', '총 투자금(억원)', '직원 수'),
        ('1인당_연매출', '연매출(억원)', '직원 수'),
        ('고객당_연매출', '연매출(억원)', '고객수(백만명)'),
        ('투자금_대비_연매출', '총 투자금(억원)', '연매출(억원)'),
        ('매출_대비_기업가치', '기업가치(백억원)', '연매출(억원)'),
        ('고객당_기업가치', '기업가치(백억원)', '고객수(백만명)'),
    ]
    for new_col, numerator, denominator in ratio_specs:
        df[new_col] = df[numerator] / (df[denominator] + 1)
    return df

train = create_features(train)
test = create_features(test)

# Ordinal investment-stage level.
# This has to be derived from the raw stage labels BEFORE label encoding:
# once LabelEncoder has run, the column holds integer codes, none of which is
# a key of the mapping, so mapping afterwards produces NaN for every row.
# Stage labels that are not in the mapping still become NaN.
investment_order = {'Seed': 0, 'Series A': 1, 'Series B': 2, 'Series C': 3, 'IPO': 4}
train_stage_level = train['투자단계'].map(investment_order)
test_stage_level = test['투자단계'].map(investment_order)

# Label encoding: the encoder is fitted on train and reused for test, so a
# category that appears only in test raises an error in `transform`.
categorical_features = ['국가', '분야', '투자단계', '인수여부', '상장여부']
for col in categorical_features:
    le = LabelEncoder()
    train[col] = le.fit_transform(train[col].astype(str))
    test[col] = le.transform(test[col].astype(str))

# Flag variables: 1 where the label-encoded column equals 1, plus their sum.
# NOTE(review): which original label receives code 1 depends on the encoder's
# class ordering - confirm it is the "positive" label.
train['상장여부_flag'] = (train['상장여부'] == 1).astype(int)
train['인수여부_flag'] = (train['인수여부'] == 1).astype(int)
train['flag_sum'] = train[['상장여부_flag', '인수여부_flag']].sum(axis=1)
test['상장여부_flag'] = (test['상장여부'] == 1).astype(int)
test['인수여부_flag'] = (test['인수여부'] == 1).astype(int)
test['flag_sum'] = test[['상장여부_flag', '인수여부_flag']].sum(axis=1)

# Attach the stage level last so the column order stays flags -> stage level.
train['투자단계_level'] = train_stage_level
test['투자단계_level'] = test_stage_level

# Final modelling inputs: feature matrix, target vector, and test feature matrix.
X = train.drop(columns=['ID'])
y = X.pop('성공확률')
X_test = test.drop(columns=['ID'])

피쳐셀렉션

In [5]:
# Feature selection: fit a random forest on the full training set and let
# SelectFromModel filter features using the median importance as the threshold.
rf = RandomForestRegressor(n_estimators=300, random_state=42, n_jobs=-1).fit(X, y)
sfm = SelectFromModel(rf, prefit=True, threshold="median")
X_sel, X_test_sel = (sfm.transform(features) for features in (X, X_test))




모델 학습

In [6]:
# Model definition (tuned hyperparameters applied)
xgb_params = dict(
    n_estimators=389,
    learning_rate=0.04346710916403895,
    max_depth=30,
    min_child_weight=3,
    subsample=0.9278355340822033,
    colsample_bytree=0.7251569161402825,
    reg_alpha=0.00021194002224243433,
    reg_lambda=0.7341560851318312,
    random_state=42,
    n_jobs=-1,
)
rf_params = dict(
    n_estimators=688,
    max_depth=27,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features='sqrt',
    random_state=42,
    n_jobs=-1,
)
xgb_model = XGBRegressor(**xgb_params)
rf_model = RandomForestRegressor(**rf_params)

# Ensemble of the two regressors with no explicit weights.
voting_model = VotingRegressor(estimators=[('xgb', xgb_model), ('rf', rf_model)])

# Fit on the selected features
voting_model.fit(X_sel, y)

0,1,2
,estimators,"[('xgb', ...), ('rf', ...)]"
,weights,
,n_jobs,
,verbose,False

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.7251569161402825
,device,
,early_stopping_rounds,
,enable_categorical,False

0,1,2
,n_estimators,688
,criterion,'squared_error'
,max_depth,27
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


결과로 나온 확률 보정

In [8]:
# Prediction and calibration
from sklearn.model_selection import KFold, cross_val_predict

# In-sample predictions of the fitted ensemble. No longer used for
# calibration; the name is kept in case later cells read `y_pred`.
y_pred = voting_model.predict(X_sel)

# Fit the calibrator on out-of-fold predictions. Predictions made on the rows
# the ensemble was trained on are over-optimistic, so a calibrator fitted on
# them does not describe how the model behaves on unseen data such as the
# test set. cross_val_predict clones the estimator, so the already fitted
# `voting_model` is left untouched.
cv = KFold(n_splits=5, shuffle=True, random_state=42)
oof_pred = cross_val_predict(voting_model, X_sel, y, cv=cv)
ir = IsotonicRegression(out_of_bounds='clip')
ir.fit(oof_pred, y)

# Predict the test set with the full-data model, then apply the calibration map.
test_pred = voting_model.predict(X_test_sel)
test_pred_calibrated = ir.predict(test_pred)

# Submission
submission['성공확률'] = test_pred_calibrated
# NOTE(review): the file name looks misspelled ("fiinal_submissionn"); it is
# left unchanged in case something downstream expects this exact path.
submission.to_csv("./data/fiinal_submissionn.csv", index=False)