In [2]:
import pandas as pd
import numpy as np
from itertools import product
from random import sample
from sklearn.impute import KNNImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import MinMaxScaler
from math import sqrt
from tqdm import tqdm

# === 전처리 원본 데이터 보존 ===
df = pd.read_csv('../../data/raw/train.csv')
df_raw = df.copy()
df_raw.drop(columns=['ID'], inplace=True)

# === 수치형/범주형 컬럼 정의 ===
num_col = ['설립연도','직원 수','고객수(백만명)','총 투자금(억원)','연매출(억원)','SNS 팔로워 수(백만명)']
ca_col = ['국가','분야','투자단계','인수여부','상장여부']

# === 문자열 매핑 테이블 ===
value_map = {
    '1500~2500': 2000,
    '2500~3500': 3000,
    '3500~4500': 4000,
    '4500~6000': 5250,
    '6000이상': 6500
}

investment_state_order = {
    'Seed': 0,
    'Series A': 1,
    'Series B': 2,
    'Series C': 3,
    'IPO': 4
}

# === 결측치 처리 전략 함수 ===

def filling_zero(df): return df.copy().fillna(0)
def filling_minus1(df): return df.copy().fillna(-1)
def filling_mean(df):
    df = df.copy()
    for col in df.select_dtypes(include='number'):
        df[col] = df[col].fillna(df[col].mean())
    return df
def filling_median(df):
    df = df.copy()
    for col in df.select_dtypes(include='number'):
        df[col] = df[col].fillna(df[col].median())
    return df
def filling_cluster_mean(df):
    df = df.copy()
    if 'cluster' not in df.columns:
        raise ValueError("'cluster' column is required for filling_cluster_mean")
    for col in df.select_dtypes(include='number'):
        if col == 'cluster': continue
        for cl in df['cluster'].unique():
            mean_val = df[df['cluster'] == cl][col].mean()
            df.loc[(df['cluster'] == cl) & (df[col].isnull()), col] = mean_val
    return df
def filling_knn(df, n_neighbors=5):
    df = df.copy()
    numeric_cols = df.select_dtypes(include='number').columns
    imputer = KNNImputer(n_neighbors=n_neighbors)
    df[numeric_cols] = imputer.fit_transform(df[numeric_cols])
    return df
def filling_mode(df):
    df = df.copy()
    for col in df.columns:
        if df[col].isnull().sum() > 0:
            df[col] = df[col].fillna(df[col].mode()[0])
    return df
def filling_forward(df): return df.ffill()
def filling_backward(df): return df.bfill()
def filling_interpolate(df):
    df = df.copy()
    numeric_cols = df.select_dtypes(include='number').columns
    df[numeric_cols] = df[numeric_cols].interpolate(method='linear', limit_direction='both')
    return df
def filling_random_sample(df):
    df = df.copy()
    for col in df.columns:
        if df[col].isnull().sum() > 0:
            non_null = df[col].dropna().values
            sampled = np.random.choice(non_null, size=df[col].isnull().sum(), replace=True)
            df.loc[df[col].isnull(), col] = sampled
    return df
def filling_constant(df, constant=-999): return df.copy().fillna(constant)

# === 전략 코드 매핑 ===
strategy_dict = {
    'A': filling_zero,
    'B': filling_mean,
    'C': filling_median,
    'D': filling_minus1,
    'E': filling_cluster_mean,
    'F': filling_knn,
    'G': filling_mode,
    'H': filling_forward,
    'I': filling_backward,
    'J': filling_interpolate,
    'K': filling_random_sample,
    'L': filling_constant,
}

# === 컬럼별 전략 적용 함수 ===
def apply_strategies_by_column(df, column_strategy_map):
    df_filled = df.copy()
    for col, strat_code in column_strategy_map.items():
        if col not in df_filled.columns:
            continue
        if df_filled[col].isnull().sum() == 0:
            continue  # 결측치 없으면 건너뜀

        strategy_fn = strategy_dict.get(strat_code)
        if not strategy_fn:
            continue
        if strat_code == 'E' and 'cluster' not in df_filled.columns:
            continue
        temp = strategy_fn(df_filled[[col, 'cluster']] if strat_code == 'E' else df_filled[[col]])
        df_filled[col] = temp[col]
    return df_filled

# === 전략 조합 생성 ===
numeric_strategies = ['A', 'B', 'C', 'D', 'F', 'J', 'K', 'L']
categorical_strategies = ['G', 'H', 'I', 'K', 'L']

num_combinations = list(product(numeric_strategies, repeat=len(num_col)))
cat_combinations = list(product(categorical_strategies, repeat=len(ca_col)))

num_combinations_sample = sample(num_combinations, 500)
cat_combinations_sample = sample(cat_combinations, 500)

# === 실험 실행 ===
results = []
loop = tqdm(total=len(num_combinations_sample) * len(cat_combinations_sample), desc="Running Experiments")

for n_strats in num_combinations_sample:
    for c_strats in cat_combinations_sample:
        column_strategy_map = dict(zip(num_col + ca_col, list(n_strats) + list(c_strats)))

        df = df_raw.copy()

        # 문자열 전처리
        if '기업가치(백억원)' in df.columns:
            df['기업가치(백억원)'] = df['기업가치(백억원)'].map(value_map).fillna(0)
        if '투자단계' in df.columns:
            df['투자단계'] = df['투자단계'].map(investment_state_order)
        if '분야' in df.columns:
            df['분야'] = df['분야'].fillna('Unknown')

        # 결측치 처리
        df_filled = apply_strategies_by_column(df, column_strategy_map)

        # 인코딩
        df_encoded = pd.get_dummies(df_filled, columns=['국가', '분야', '인수여부', '상장여부'], drop_first=True)

        # 모델 학습
        if '성공확률' not in df_encoded.columns:
            continue  # 타겟이 없으면 건너뜀

        X = df_encoded.drop(columns=['성공확률'])
        y = df_encoded['성공확률']

        # 스케일링
        scaler = MinMaxScaler()
        num_features = [col for col in X.columns if col in num_col]  # 수치형 컬럼만 선택
        X[num_features] = scaler.fit_transform(X[num_features])

        # 타겟 로그 변환 (0은 로그가 안되므로 log1p 사용)
        y = y.apply(lambda x: np.log1p(x))

        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        model = LinearRegression()
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        MAE = sqrt(mean_absolute_error(y_test, y_pred))

        results.append({
            'num_strategies': n_strats,
            'cat_strategies': c_strats,
            'MAE': MAE
        })

        loop.update(1)

loop.close()

# === 결과 정리 ===
results_df = pd.DataFrame(results).sort_values('MAE')
print(results_df.head())  # 가장 좋은 조합 5개

Running Experiments:  18%|█▊        | 45974/250000 [34:00<1:19:15, 42.91it/s] Exception ignored in: <function _xla_gc_callback at 0x14774f560>
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.12/site-packages/jax/_src/lib/__init__.py", line 112, in _xla_gc_callback
    def _xla_gc_callback(*args):
    
KeyboardInterrupt: 


KeyboardInterrupt: 