In [79]:
import pandas as pd

df = pd.read_csv('../../data/raw/train.csv')

In [80]:
df.drop(columns=['ID'],inplace=True)
df.head()
# 컬럼은 ID 포함해서 모두 13개

Unnamed: 0,설립연도,국가,분야,투자단계,직원 수,인수여부,상장여부,고객수(백만명),총 투자금(억원),연매출(억원),SNS 팔로워 수(백만명),기업가치(백억원),성공확률
0,2009,CT005,이커머스,Series A,4126.0,No,No,56.0,3365.0,4764.0,4.71,,0.3
1,2023,CT006,핀테크,Seed,4167.0,Yes,No,80.0,4069.0,279.0,1.0,2500-3500,0.8
2,2018,CT007,기술,Series A,3132.0,Yes,Yes,54.0,6453.0,12141.0,4.0,3500-4500,0.5
3,2016,CT006,,Seed,3245.0,Yes,Yes,,665.0,10547.0,2.97,,0.7
4,2020,CT002,에듀테크,Seed,1969.0,No,Yes,94.0,829.0,9810.0,1.0,1500-2500,0.1


In [81]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4376 entries, 0 to 4375
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   설립연도            4376 non-null   int64  
 1   국가              4376 non-null   object 
 2   분야              3519 non-null   object 
 3   투자단계            4376 non-null   object 
 4   직원 수            4202 non-null   float64
 5   인수여부            4376 non-null   object 
 6   상장여부            4376 non-null   object 
 7   고객수(백만명)        3056 non-null   float64
 8   총 투자금(억원)       4376 non-null   float64
 9   연매출(억원)         4376 non-null   float64
 10  SNS 팔로워 수(백만명)  4376 non-null   float64
 11  기업가치(백억원)       3156 non-null   object 
 12  성공확률            4376 non-null   float64
dtypes: float64(6), int64(1), object(6)
memory usage: 444.6+ KB


In [82]:
# 국가 : 원핫 인코딩
df = pd.get_dummies(df, columns=['국가'])

# 분야 : 결측치 -> 'Unknown' , one-hot encoding
#df['분여'].fillna('Unknown', inplace=True)
df = pd.get_dummies(df, columns=['분야'])

# 투자 단계 : 수동 매핑
investment_state_order = {
    'Seed' : 0,
    'Series A' : 1,
    'Series B' : 2,
    'Series C' : 3,
    'IPO' : 4
}
df['투자단계'] = df['투자단계'].map(investment_state_order)

# 인수 여부, 상장 여부 : one-hot encoding
df = pd.get_dummies(df, columns=['인수여부', '상장여부'])

# 기업 가치 : 수치 매핑
df['기업가치(백억원)'].fillna(0, inplace=True)
value_map = {
    '1500~2500' : 2000,
    '2500~3500' : 3000,
    '3500~4500' : 4000,
    '4500~6000' : 5250,
    '6000이상':6500
}
df['기업가치(백억원)'] = df['기업가치(백억원)'].map(value_map)

# 아래 오류는 내부적으로 Pandas는 df['기업가치(백억원)']가 원본이 아닐수 있다고 판단하여 경고를 보여줌
# inplace=True 를 써도 실제로 원본 df 가 바뀌지 않을 수 있다는 의미

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['기업가치(백억원)'].fillna(0, inplace=True)


In [83]:
df.head()


Unnamed: 0,설립연도,투자단계,직원 수,고객수(백만명),총 투자금(억원),연매출(억원),SNS 팔로워 수(백만명),기업가치(백억원),성공확률,국가_CT001,...,분야_에너지,분야_에듀테크,분야_이커머스,분야_푸드테크,분야_핀테크,분야_헬스케어,인수여부_No,인수여부_Yes,상장여부_No,상장여부_Yes
0,2009,1,4126.0,56.0,3365.0,4764.0,4.71,,0.3,False,...,False,False,True,False,False,False,True,False,True,False
1,2023,0,4167.0,80.0,4069.0,279.0,1.0,,0.8,False,...,False,False,False,False,True,False,False,True,True,False
2,2018,1,3132.0,54.0,6453.0,12141.0,4.0,,0.5,False,...,False,False,False,False,False,False,False,True,False,True
3,2016,0,3245.0,,665.0,10547.0,2.97,,0.7,False,...,False,False,False,False,False,False,False,True,False,True
4,2020,0,1969.0,94.0,829.0,9810.0,1.0,,0.1,False,...,False,True,False,False,False,False,True,False,False,True


In [90]:
import pandas as pd
import numpy as np
from itertools import product
from random import sample
from sklearn.impute import KNNImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
from tqdm import tqdm

# === 전처리 원본 데이터 보존 ===
df = pd.read_csv('../../data/raw/train.csv')
df_raw = df.copy()  # df는 원래 불러온 상태라고 가정
df_raw.drop(columns=['ID'], inplace=True)

# === 수치형/범주형 컬럼 정의 ===
num_col = ['설립연도','직원 수','고객수(백만명)','총 투자금(억원)','연매출(억원)','SNS 팔로워 수(백만명)']
ca_col = ['국가','분야','투자단계','인수여부','상장여부']

# === 문자열 매핑 테이블 ===
value_map = {
    '1500~2500': 2000,
    '2500~3500': 3000,
    '3500~4500': 4000,
    '4500~6000': 5250,
    '6000이상': 6500
}

investment_state_order = {
    'Seed': 0,
    'Series A': 1,
    'Series B': 2,
    'Series C': 3,
    'IPO': 4
}

# === 결측치 처리 전략 함수 ===

def filling_zero(df): return df.copy().fillna(0)
def filling_minus1(df): return df.copy().fillna(-1)
def filling_mean(df):
    df = df.copy()
    for col in df.select_dtypes(include='number'):
        df[col] = df[col].fillna(df[col].mean())
    return df
def filling_median(df):
    df = df.copy()
    for col in df.select_dtypes(include='number'):
        df[col] = df[col].fillna(df[col].median())
    return df
def filling_cluster_mean(df):
    df = df.copy()
    if 'cluster' not in df.columns:
        raise ValueError("'cluster' column is required for filling_cluster_mean")
    for col in df.select_dtypes(include='number'):
        if col == 'cluster': continue
        for cl in df['cluster'].unique():
            mean_val = df[df['cluster'] == cl][col].mean()
            df.loc[(df['cluster'] == cl) & (df[col].isnull()), col] = mean_val
    return df
def filling_knn(df, n_neighbors=5):
    df = df.copy()
    numeric_cols = df.select_dtypes(include='number').columns
    imputer = KNNImputer(n_neighbors=n_neighbors)
    df[numeric_cols] = imputer.fit_transform(df[numeric_cols])
    return df
def filling_mode(df):
    df = df.copy()
    for col in df.columns:
        if df[col].isnull().sum() > 0:
            df[col] = df[col].fillna(df[col].mode()[0])
    return df
def filling_forward(df): return df.ffill()
def filling_backward(df): return df.bfill()
def filling_interpolate(df):
    df = df.copy()
    numeric_cols = df.select_dtypes(include='number').columns
    df[numeric_cols] = df[numeric_cols].interpolate(method='linear', limit_direction='both')
    return df
def filling_random_sample(df):
    df = df.copy()
    for col in df.columns:
        if df[col].isnull().sum() > 0:
            non_null = df[col].dropna().values
            sampled = np.random.choice(non_null, size=df[col].isnull().sum(), replace=True)
            df.loc[df[col].isnull(), col] = sampled
    return df
def filling_constant(df, constant=-999): return df.copy().fillna(constant)

# === 전략 코드 매핑 ===
strategy_dict = {
    'A': filling_zero,
    'B': filling_mean,
    'C': filling_median,
    'D': filling_minus1,
    'E': filling_cluster_mean,
    'F': filling_knn,
    'G': filling_mode,
    'H': filling_forward,
    'I': filling_backward,
    'J': filling_interpolate,
    'K': filling_random_sample,
    'L': filling_constant,
}

# === 컬럼별 전략 적용 함수 ===
def apply_strategies_by_column(df, column_strategy_map):
    df_filled = df.copy()
    for col, strat_code in column_strategy_map.items():
        if col not in df_filled.columns:
            continue
        strategy_fn = strategy_dict.get(strat_code)
        if not strategy_fn:
            continue
        if strat_code == 'E' and 'cluster' not in df_filled.columns:
            continue
        temp = strategy_fn(df_filled[[col, 'cluster']] if strat_code == 'E' else df_filled[[col]])
        df_filled[col] = temp[col]
    return df_filled

# === 전략 조합 생성 ===
numeric_strategies = ['A', 'B', 'C', 'D', 'F', 'J', 'K', 'L']
categorical_strategies = ['G', 'H', 'I', 'K', 'L']

num_combinations = list(product(numeric_strategies, repeat=len(num_col)))
cat_combinations = list(product(categorical_strategies, repeat=len(ca_col)))

num_combinations_sample = sample(num_combinations, 20)
cat_combinations_sample = sample(cat_combinations, 20)

# === 실험 실행 ===
results = []
loop = tqdm(total=len(num_combinations_sample) * len(cat_combinations_sample), desc="Running Experiments")

for n_strats in num_combinations_sample:
    for c_strats in cat_combinations_sample:
        column_strategy_map = dict(zip(num_col + ca_col, list(n_strats) + list(c_strats)))

        df = df_raw.copy()

        # 문자열 전처리
        if '기업가치(백억원)' in df.columns:
            df['기업가치(백억원)'] = df['기업가치(백억원)'].map(value_map).fillna(0)
        if '투자단계' in df.columns:
            df['투자단계'] = df['투자단계'].map(investment_state_order)
        if '분야' in df.columns:
            df['분야'] = df['분야'].fillna('Unknown')

        # 결측치 처리
        df_filled = apply_strategies_by_column(df, column_strategy_map)

        # 인코딩
        df_encoded = pd.get_dummies(df_filled, columns=['국가', '분야', '인수여부', '상장여부'], drop_first=True)

        # 모델 학습
        if '성공확률' not in df_encoded.columns:
            continue  # 타겟이 없으면 건너뜀

        X = df_encoded.drop(columns=['성공확률'])
        y = df_encoded['성공확률']

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        model = RandomForestRegressor()
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        rmse = sqrt(mean_squared_error(y_test, y_pred))

        results.append({
            'num_strategies': n_strats,
            'cat_strategies': c_strats,
            'rmse': rmse
        })

        loop.update(1)

loop.close()

# === 결과 정리 ===
results_df = pd.DataFrame(results).sort_values('rmse')
print(results_df.head())  # 가장 좋은 조합 5개

Running Experiments:   1%|          | 3/400 [00:04<10:38,  1.61s/it]

KeyboardInterrupt: 