# 00. 환경 설정 및 전처리 (Setup & Preprocessing)
# Environment Setup and Preprocessing Utilities

---

## 목차 / Table of Contents
1. 라이브러리 임포트 / Library Imports
2. 토이 데이터 생성 / Toy Data Generation
3. 결측치 처리 / Missing Value Handling
4. 이상치 탐지 및 처리 / Outlier Detection & Handling
5. 스케일링 / Scaling
6. 인코딩 / Encoding
7. 데이터 분할 / Train-Test Split
8. 불균형 데이터 처리 / Imbalanced Data Handling

---
## 1. 라이브러리 임포트 / Library Imports

In [None]:
# ============================================================
# 필수 라이브러리 임포트 / Essential Library Imports
# ============================================================
# 복사하여 사용 / Copy and use

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

# 시각화 설정 / Visualization settings
plt.rcParams['figure.figsize'] = (10, 6)
plt.rcParams['font.size'] = 12
sns.set_style('whitegrid')

# 랜덤 시드 고정 / Fix random seed for reproducibility
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

print("✅ 기본 라이브러리 로드 완료 / Basic libraries loaded")

In [None]:
# ============================================================
# 사이킷런 라이브러리 / Scikit-learn Libraries
# ============================================================

# 전처리 / Preprocessing
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer, KNNImputer

# 데이터 분할 / Data splitting
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold, TimeSeriesSplit

# 회귀 모델 / Regression models
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# 분류 모델 / Classification models
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC

# 평가 지표 / Evaluation metrics
from sklearn.metrics import (
    # 회귀 / Regression
    mean_absolute_error, mean_squared_error, r2_score,
    # 분류 / Classification
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report, roc_auc_score, roc_curve
)

print("✅ Scikit-learn 라이브러리 로드 완료 / Scikit-learn libraries loaded")

In [None]:
# ============================================================
# XGBoost / XGBoost
# ============================================================

try:
    import xgboost as xgb
    from xgboost import XGBClassifier, XGBRegressor
    print("✅ XGBoost 로드 완료 / XGBoost loaded")
except ImportError:
    print("⚠️ XGBoost 미설치. 설치: !pip install xgboost")
    print("⚠️ XGBoost not installed. Install: !pip install xgboost")

In [None]:
# ============================================================
# 불균형 데이터 처리 / Imbalanced Data Handling
# ============================================================

try:
    from imblearn.over_sampling import SMOTE, RandomOverSampler
    from imblearn.under_sampling import RandomUnderSampler
    from imblearn.combine import SMOTETomek
    print("✅ imbalanced-learn 로드 완료 / imbalanced-learn loaded")
except ImportError:
    print("⚠️ imbalanced-learn 미설치. 설치: !pip install imbalanced-learn")
    print("⚠️ imbalanced-learn not installed. Install: !pip install imbalanced-learn")

---
## 2. 토이 데이터 생성 / Toy Data Generation

시험에서 제공되는 데이터를 대신하여 연습용 토이 데이터를 생성합니다.
Generate toy data for practice instead of exam-provided data.

In [None]:
# ============================================================
# 회귀용 토이 데이터 생성 / Toy Data for Regression
# ============================================================

def create_regression_data(n_samples=1000, n_features=5, noise=0.1, random_state=42):
    """
    Build a synthetic linear-regression dataset with a few missing feature values.

    Features are standard-normal draws. The target is a random linear
    combination of the features plus Gaussian noise. About 2% of the feature
    cells are then replaced by NaN; the target column never contains NaN.

    Args:
        n_samples: Number of rows to generate.
        n_features: Number of feature columns (feature_0 .. feature_{n_features-1}).
        noise: Multiplier applied to the standard-normal noise added to the target.
        random_state: Seed for the generator used by this call.

    Returns:
        DataFrame holding the feature columns followed by a 'target' column.
    """
    # A private RandomState produces the same stream that
    # np.random.seed(random_state) would, without reseeding the caller's
    # global NumPy generator as a hidden side effect.
    rng = np.random.RandomState(random_state)

    # Draw order (features, coefficients, noise, mask) fixes the output for a seed.
    X = rng.randn(n_samples, n_features)
    coefficients = rng.randn(n_features)
    y = X @ coefficients + noise * rng.randn(n_samples)

    feature_names = [f'feature_{i}' for i in range(n_features)]
    df = pd.DataFrame(X, columns=feature_names)
    df['target'] = y

    # The mask spans every column, target included; the target is restored
    # right after so only features end up with missing values.
    mask = rng.random_sample(df.shape) < 0.02
    df = df.mask(mask)
    df['target'] = y

    return df

# Usage example: build the regression toy set, report its shape, preview the first rows
df_reg = create_regression_data(n_samples=1000, n_features=5)
print(f"회귀 데이터 shape: {df_reg.shape}")
print(f"Regression data shape: {df_reg.shape}")
df_reg.head()

In [None]:
# ============================================================
# 분류용 토이 데이터 생성 (불균형 포함) / Toy Data for Classification (Imbalanced)
# ============================================================

def create_classification_data(n_samples=1000, n_features=5, imbalance_ratio=0.1, random_state=42):
    """
    Build a synthetic, imbalanced binary-classification dataset.

    Negative rows are standard-normal draws; positive rows are standard-normal
    draws shifted by +1.5 on every feature. Rows are shuffled before being
    returned.

    Args:
        n_samples: Total number of rows.
        n_features: Number of feature columns (feature_0 .. feature_{n_features-1}).
        imbalance_ratio: Fraction of rows in the positive class (0.1 = 10%);
            the positive count is int(n_samples * imbalance_ratio).
        random_state: Seed for the generator used by this call.

    Returns:
        DataFrame holding the feature columns plus an integer 'target' column
        (0 = negative, 1 = positive).
    """
    # A private RandomState produces the same stream that
    # np.random.seed(random_state) would, without reseeding the caller's
    # global NumPy generator as a hidden side effect.
    rng = np.random.RandomState(random_state)

    n_positive = int(n_samples * imbalance_ratio)
    n_negative = n_samples - n_positive

    # Negatives are drawn before positives; the order fixes the output for a seed.
    X_neg = rng.randn(n_negative, n_features)
    X_pos = rng.randn(n_positive, n_features) + 1.5

    X = np.vstack([X_neg, X_pos])
    y = np.hstack([np.zeros(n_negative), np.ones(n_positive)])

    # Shuffle so the classes are not stored as two contiguous runs.
    shuffle_idx = rng.permutation(n_samples)
    X = X[shuffle_idx]
    y = y[shuffle_idx]

    feature_names = [f'feature_{i}' for i in range(n_features)]
    df = pd.DataFrame(X, columns=feature_names)
    df['target'] = y.astype(int)

    return df

# Usage example: build the imbalanced toy set (10% positives) and show the class counts
df_clf = create_classification_data(n_samples=1000, imbalance_ratio=0.1)
print(f"분류 데이터 shape: {df_clf.shape}")
print(f"Classification data shape: {df_clf.shape}")
print(f"\n클래스 분포 / Class distribution:")
print(df_clf['target'].value_counts())
df_clf.head()

In [None]:
# ============================================================
# 시계열용 토이 데이터 생성 / Toy Data for Time Series
# ============================================================

def create_timeseries_data(n_samples=500, freq='D', trend=0.01, seasonality=True, random_state=42):
    """
    Build a synthetic time series: level 100 + linear trend + optional seasonality + noise.

    Args:
        n_samples: Number of timestamps to generate.
        freq: pandas frequency string handed to pd.date_range (e.g. 'D' for daily).
        trend: Slope added per step.
        seasonality: If True, add a sine wave of amplitude 5 whose period is
            7 steps (a weekly pattern at daily frequency; the period is counted
            in steps and does not adapt to `freq`).
        random_state: Seed for the generator used by this call.

    Returns:
        DataFrame indexed by 'date' (starting 2023-01-01) with one 'value' column.
    """
    # A private RandomState produces the same stream that
    # np.random.seed(random_state) would, without reseeding the caller's
    # global NumPy generator as a hidden side effect.
    rng = np.random.RandomState(random_state)

    dates = pd.date_range(start='2023-01-01', periods=n_samples, freq=freq)

    # Step counter driving both the trend and the seasonal term.
    t = np.arange(n_samples)
    trend_component = trend * t

    if seasonality:
        seasonal_component = 5 * np.sin(2 * np.pi * t / 7)
    else:
        seasonal_component = 0

    # Gaussian noise with standard deviation 2.
    noise = rng.randn(n_samples) * 2

    values = 100 + trend_component + seasonal_component + noise

    df = pd.DataFrame({'date': dates, 'value': values})
    df.set_index('date', inplace=True)

    return df

# Usage example: one year of daily observations
df_ts = create_timeseries_data(n_samples=365)
print(f"시계열 데이터 shape: {df_ts.shape}")
print(f"Time series data shape: {df_ts.shape}")
df_ts.head()

In [None]:
# ============================================================
# 제조 데이터 (이상 탐지용) / Manufacturing Data (for Anomaly Detection)
# ============================================================

def create_manufacturing_data(n_samples=1000, n_sensors=10, anomaly_ratio=0.05, random_state=42):
    """
    Build synthetic sensor readings for anomaly-detection practice.

    Sensor i reads Normal(50 + 5*i, 5) for every row. On anomalous rows the
    first three sensors additionally receive a random offset in (-20, 20).
    Rows are shuffled and about 1% of the sensor cells are replaced by NaN.

    Args:
        n_samples: Total number of rows.
        n_sensors: Number of sensor columns (sensor_0 .. sensor_{n_sensors-1}).
        anomaly_ratio: Fraction of anomalous rows; the normal count is
            int(n_samples * (1 - anomaly_ratio)) and the rest are anomalies.
        random_state: Seed for the generator used by this call.

    Returns:
        DataFrame holding the sensor columns plus an integer 'label' column
        (0 = normal, 1 = anomaly). The label never contains NaN.
    """
    # A private RandomState produces the same stream that
    # np.random.seed(random_state) would, without reseeding the caller's
    # global NumPy generator as a hidden side effect.
    rng = np.random.RandomState(random_state)

    n_normal = int(n_samples * (1 - anomaly_ratio))
    n_anomaly = n_samples - n_normal

    sensor_std = 5

    # Normal rows: one column of draws per sensor, centred on that sensor's mean.
    normal_data = [
        rng.normal(50 + i * 5, sensor_std, n_normal)
        for i in range(n_sensors)
    ]
    X_normal = np.array(normal_data).T

    # Anomalous rows: same baseline, with the first three sensors perturbed.
    anomaly_data = []
    for i in range(n_sensors):
        values = rng.normal(50 + i * 5, sensor_std, n_anomaly)
        if i < 3:
            # Sign drawn first, magnitude second; the order fixes the output for a seed.
            signs = rng.choice([-20, 20], n_anomaly)
            values += signs * rng.random_sample(n_anomaly)
        anomaly_data.append(values)
    X_anomaly = np.array(anomaly_data).T

    X = np.vstack([X_normal, X_anomaly])
    y = np.hstack([np.zeros(n_normal), np.ones(n_anomaly)])

    # Shuffle so anomalies are not stored as one contiguous run at the end.
    shuffle_idx = rng.permutation(n_samples)
    X = X[shuffle_idx]
    y = y[shuffle_idx]

    sensor_names = [f'sensor_{i}' for i in range(n_sensors)]
    df = pd.DataFrame(X, columns=sensor_names)
    df['label'] = y.astype(int)

    # Blank out ~1% of the sensor cells only; the label column is left intact.
    mask = rng.random_sample((n_samples, n_sensors)) < 0.01
    df.iloc[:, :n_sensors] = df.iloc[:, :n_sensors].mask(mask)

    return df

# Usage example: 1000 readings with 5% anomalies; show the label counts
df_mfg = create_manufacturing_data(n_samples=1000, anomaly_ratio=0.05)
print(f"제조 데이터 shape: {df_mfg.shape}")
print(f"Manufacturing data shape: {df_mfg.shape}")
print(f"\n레이블 분포 / Label distribution:")
print(df_mfg['label'].value_counts())
df_mfg.head()

---
## 3. 결측치 처리 / Missing Value Handling

In [None]:
# ============================================================
# 결측치 확인 / Check Missing Values
# ============================================================

def check_missing(df):
    """
    Summarise missing values per column and print the overall total.

    Args:
        df: DataFrame to inspect.

    Returns:
        DataFrame indexed by column name with 'missing_count' and
        'missing_pct' (percentage of rows), limited to columns that have at
        least one missing value and sorted by count, largest first.
    """
    null_counts = df.isnull().sum()

    summary = pd.DataFrame({
        'missing_count': null_counts,
        'missing_pct': (null_counts / len(df)) * 100,
    })
    has_missing = summary['missing_count'] > 0
    summary = summary[has_missing].sort_values('missing_count', ascending=False)

    total_missing = null_counts.sum()
    print(f"전체 결측치 수: {total_missing}")
    print(f"Total missing values: {total_missing}")
    return summary

# Usage example: summarise the missing values in the regression toy data
check_missing(df_reg)

In [None]:
# ============================================================
# 결측치 처리 방법들 / Missing Value Imputation Methods
# ============================================================

# 방법 1: 평균/중앙값/최빈값 대체 / Method 1: Mean/Median/Mode imputation
def impute_simple(df, strategy='mean', columns=None):
    """
    Fill missing values with a per-column statistic using SimpleImputer.

    The imputer is fitted on the frame passed in, so the fill values are
    computed from exactly those rows.

    Args:
        df: Input DataFrame (left unmodified; a copy is returned).
        strategy: SimpleImputer strategy: 'mean', 'median' or 'most_frequent'.
        columns: Columns to impute; None selects every numeric column.

    Returns:
        Copy of `df` with the selected columns imputed.
    """
    result = df.copy()

    if columns is None:
        target_cols = result.select_dtypes(include=[np.number]).columns
    else:
        target_cols = columns

    filled = SimpleImputer(strategy=strategy).fit_transform(result[target_cols])
    result[target_cols] = filled

    return result

# 방법 2: KNN 대체 / Method 2: KNN imputation
def impute_knn(df, n_neighbors=5, columns=None):
    """
    Fill missing values with KNNImputer.

    Args:
        df: Input DataFrame (left unmodified; a copy is returned).
        n_neighbors: Number of neighbours passed to KNNImputer.
        columns: Columns to impute; None selects every numeric column.

    Returns:
        Copy of `df` with the selected columns imputed.
    """
    result = df.copy()

    if columns is None:
        target_cols = result.select_dtypes(include=[np.number]).columns
    else:
        target_cols = columns

    filled = KNNImputer(n_neighbors=n_neighbors).fit_transform(result[target_cols])
    result[target_cols] = filled

    return result

# 방법 3: 앞/뒤 값으로 채우기 (시계열용) / Method 3: Forward/Backward fill (for time series)
def impute_ffill_bfill(df, columns=None):
    """
    Fill gaps with the previous value, then fill any leading gaps with the next value.

    Forward fill runs first; backward fill only covers what forward fill could
    not reach (missing values at the start of a column).

    Args:
        df: Input DataFrame (left unmodified; a copy is returned).
        columns: Columns to fill; None selects every column.

    Returns:
        Copy of `df` with the selected columns filled.
    """
    result = df.copy()
    target_cols = result.columns if columns is None else columns

    forward_filled = result[target_cols].ffill()
    result[target_cols] = forward_filled.bfill()

    return result

# Usage example: count missing cells before and after mean imputation
print("원본 결측치 / Original missing:")
print(df_reg.isnull().sum().sum())

df_imputed = impute_simple(df_reg, strategy='mean')
print("\n대체 후 결측치 / After imputation:")
print(df_imputed.isnull().sum().sum())

---
## 4. 이상치 탐지 및 처리 / Outlier Detection & Handling

In [None]:
# ============================================================
# IQR 방식 이상치 탐지 / IQR-based Outlier Detection
# ============================================================

def detect_outliers_iqr(df, columns=None, k=1.5):
    """
    Flag values lying outside [Q1 - k*IQR, Q3 + k*IQR] in each column.

    Prints the number of flagged values per column and the number of rows
    containing at least one flagged value.

    Args:
        df: DataFrame to inspect.
        columns: Columns to check; None selects every numeric column.
        k: IQR multiplier defining the fences (1.5 by default).

    Returns:
        Boolean DataFrame with the same index as `df` and one column per
        checked column; True marks an outlier.
    """
    if columns is None:
        columns = df.select_dtypes(include=[np.number]).columns

    outlier_mask = pd.DataFrame(False, index=df.index, columns=columns)

    for col in columns:
        series = df[col]
        q1, q3 = series.quantile([0.25, 0.75])
        fence = k * (q3 - q1)

        # Strict comparisons: values sitting exactly on a fence are kept.
        outlier_mask[col] = series.lt(q1 - fence) | series.gt(q3 + fence)

    print("컬럼별 이상치 수 / Outliers per column:")
    print(outlier_mask.sum())
    flagged_rows = outlier_mask.any(axis=1).sum()
    print(f"\n전체 이상치 행 수: {flagged_rows}")

    return outlier_mask

# Usage example: IQR outlier mask for the regression toy data
outlier_mask = detect_outliers_iqr(df_reg)

In [None]:
# ============================================================
# Z-score 방식 이상치 탐지 / Z-score based Outlier Detection
# ============================================================

def detect_outliers_zscore(df, columns=None, threshold=3):
    """
    Flag values whose absolute z-score exceeds `threshold` in each column.

    Z-scores are computed with scipy.stats.zscore on the non-missing values of
    each column; rows with a missing value stay False. Prints the number of
    flagged values per column.

    Args:
        df: DataFrame to inspect.
        columns: Columns to check; None selects every numeric column.
        threshold: Absolute z-score above which a value is flagged (3 by default).

    Returns:
        Boolean DataFrame with the same index as `df` and one column per
        checked column; True marks an outlier.
    """
    if columns is None:
        columns = df.select_dtypes(include=[np.number]).columns

    outlier_mask = pd.DataFrame(False, index=df.index, columns=columns)

    for col in columns:
        observed = df[col].dropna()
        exceeds = np.abs(stats.zscore(observed)) > threshold
        # Write results back only at the positions that had a value.
        outlier_mask.loc[observed.index, col] = exceeds

    print("컬럼별 이상치 수 / Outliers per column:")
    print(outlier_mask.sum())

    return outlier_mask

# Usage example: z-score outlier mask for the regression toy data
outlier_mask_z = detect_outliers_zscore(df_reg)

In [None]:
# ============================================================
# 이상치 처리 방법 / Outlier Handling Methods
# ============================================================

def handle_outliers(df, columns=None, method='clip', k=1.5):
    """
    Treat IQR outliers by clipping them, blanking them, or dropping their rows.

    A value is an outlier when it lies outside [Q1 - k*IQR, Q3 + k*IQR] of its
    column. The input frame is never modified.

    Args:
        df: Input DataFrame.
        columns: Columns to process; None selects every numeric column.
        method: 'clip' replaces outliers with the nearest fence value,
            'nan' replaces them with NaN,
            'remove' drops every row holding at least one outlier (the mask
            comes from detect_outliers_iqr, which also prints its summary).
        k: IQR multiplier defining the fences.

    Returns:
        New DataFrame with the outliers handled.

    Raises:
        ValueError: If `method` is not one of 'clip', 'nan', 'remove'.
    """
    # Reject typos up front; previously an unknown method silently returned an
    # untouched copy, which looked like a successful clean-up.
    if method not in ('clip', 'nan', 'remove'):
        raise ValueError(f"Unknown method: {method}")

    df_copy = df.copy()

    if columns is None:
        columns = df_copy.select_dtypes(include=[np.number]).columns

    if method == 'remove':
        # Row removal works from the full mask, so no per-column fences are needed here.
        outlier_mask = detect_outliers_iqr(df, columns, k)
        return df_copy[~outlier_mask.any(axis=1)]

    for col in columns:
        Q1 = df_copy[col].quantile(0.25)
        Q3 = df_copy[col].quantile(0.75)
        IQR = Q3 - Q1

        lower_bound = Q1 - k * IQR
        upper_bound = Q3 + k * IQR

        if method == 'clip':
            df_copy[col] = df_copy[col].clip(lower_bound, upper_bound)
        else:  # method == 'nan'
            mask = (df_copy[col] < lower_bound) | (df_copy[col] > upper_bound)
            df_copy.loc[mask, col] = np.nan

    return df_copy

# Usage example: clipping changes values only, so the shape stays the same
print(f"원본 shape: {df_reg.shape}")
df_clipped = handle_outliers(df_reg, method='clip')
print(f"클리핑 후 shape: {df_clipped.shape}")

---
## 5. 스케일링 / Scaling

In [None]:
# ============================================================
# 스케일링 방법 비교 / Scaling Methods Comparison
# ============================================================

def scale_data(X_train, X_test, method='standard'):
    """
    Scale features with a scaler fitted on the training data only.

    Args:
        X_train: Training features; the scaler is fitted on these.
        X_test: Test features; only transformed, never used for fitting.
        method: 'standard' (StandardScaler), 'minmax' (MinMaxScaler) or
            'robust' (RobustScaler).

    Returns:
        Tuple (X_train_scaled, X_test_scaled, scaler). The fitted scaler is
        returned so further splits can be transformed the same way.

    Raises:
        ValueError: If `method` is not one of the supported names.

    Note:
        Fitting on the training split alone keeps test-set statistics out of
        the preprocessing step.
    """
    if method == 'standard':
        scaler_cls = StandardScaler
    elif method == 'minmax':
        scaler_cls = MinMaxScaler
    elif method == 'robust':
        scaler_cls = RobustScaler
    else:
        raise ValueError(f"Unknown method: {method}")

    scaler = scaler_cls()
    train_scaled = scaler.fit_transform(X_train)
    test_scaled = scaler.transform(X_test)

    return train_scaled, test_scaled, scaler

# Usage example
# Prepare data: impute, then separate features from the target
df_clean = impute_simple(df_reg, strategy='mean')
X = df_clean.drop('target', axis=1)
y = df_clean['target']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale: the scaler is fitted on X_train inside scale_data
X_train_scaled, X_test_scaled, scaler = scale_data(X_train, X_test, method='standard')

# Before scaling X_train is a DataFrame, so the statistics are averaged over columns;
# after scaling they are taken over the scaler's whole output at once.
print("스케일링 전 훈련 데이터 통계 / Before scaling:")
print(f"Mean: {X_train.mean().mean():.4f}, Std: {X_train.std().mean():.4f}")
print("\n스케일링 후 훈련 데이터 통계 / After scaling:")
print(f"Mean: {X_train_scaled.mean():.4f}, Std: {X_train_scaled.std():.4f}")

---
## 6. 인코딩 / Encoding

In [None]:
# ============================================================
# 범주형 변수 인코딩 / Categorical Variable Encoding
# ============================================================

# Build a small categorical test frame: two string columns repeated 20 times
# (100 rows) plus one numeric column drawn from the global NumPy RNG.
df_cat = pd.DataFrame({
    'color': ['red', 'blue', 'green', 'red', 'blue'] * 20,
    'size': ['S', 'M', 'L', 'M', 'S'] * 20,
    'value': np.random.randn(100)
})

print("원본 데이터 / Original data:")
print(df_cat.head())

In [None]:
# ============================================================
# Label Encoding (순서가 있는 범주형) / Label Encoding (ordinal)
# ============================================================

def encode_labels(df, columns):
    """
    Replace each listed column with integer codes from its own LabelEncoder.

    Args:
        df: Input DataFrame (left unmodified; a copy is returned).
        columns: Names of the columns to encode.

    Returns:
        Tuple (encoded_df, encoders) where `encoders` maps each column name to
        its fitted LabelEncoder, so codes can be mapped back via `classes_`.

    Note:
        NOTE(review): LabelEncoder is documented as numbering classes in sorted
        order, so the codes would follow alphabetical order rather than a
        domain order such as S < M < L. Confirm before relying on the codes as
        ordinal.
    """
    encoded = df.copy()
    fitted = {}

    for name in columns:
        fitted[name] = LabelEncoder()
        encoded[name] = fitted[name].fit_transform(encoded[name])

    return encoded, fitted

# Usage example: encode both categorical columns and show the class list for 'color'
df_encoded, encoders = encode_labels(df_cat, ['color', 'size'])
print("레이블 인코딩 후 / After label encoding:")
print(df_encoded.head())
print(f"\ncolor 매핑: {list(encoders['color'].classes_)}")

In [None]:
# ============================================================
# One-Hot Encoding (순서가 없는 범주형) / One-Hot Encoding (nominal)
# ============================================================

def encode_onehot(df, columns):
    """
    One-hot encode the listed columns with pandas.get_dummies.

    The first category of every encoded column is dropped (drop_first=True),
    so a column with k categories yields k-1 indicator columns.

    Args:
        df: Input DataFrame (left unmodified).
        columns: Names of the categorical columns to expand.

    Returns:
        New DataFrame with the listed columns replaced by indicator columns.
    """
    return pd.get_dummies(df.copy(), columns=columns, drop_first=True)

# Usage example: one-hot encode both categorical columns and compare column counts
df_onehot = encode_onehot(df_cat, ['color', 'size'])
print("원-핫 인코딩 후 / After one-hot encoding:")
print(df_onehot.head())
print(f"\n컬럼 수 변화: {len(df_cat.columns)} -> {len(df_onehot.columns)}")

---
## 7. 데이터 분할 / Train-Test Split

In [None]:
# ============================================================
# 기본 Train-Test Split / Basic Train-Test Split
# ============================================================

def split_data(df, target_col, test_size=0.2, val_size=0.1, random_state=42, stratify=False):
    """
    Randomly split a frame into train / validation / test feature and target sets.

    The hold-out share (test_size + val_size) is separated from the training
    data first. When val_size > 0 that hold-out is split again into validation
    and test parts. Prints the size and percentage of each part.

    Args:
        df: Input DataFrame.
        target_col: Name of the target column.
        test_size: Fraction of all rows for the test set.
        val_size: Fraction of all rows for the validation set; 0 disables it.
        random_state: Seed passed to both train_test_split calls.
        stratify: If True, stratify both splits on the target (for classification).

    Returns:
        (X_train, X_val, X_test, y_train, y_val, y_test); X_val and y_val are
        None when val_size is 0.
    """
    features = df.drop(target_col, axis=1)
    labels = df[target_col]

    holdout_size = test_size + val_size
    use_validation = val_size > 0

    # First cut: training rows versus everything held out.
    X_train, X_holdout, y_train, y_holdout = train_test_split(
        features,
        labels,
        test_size=holdout_size,
        random_state=random_state,
        stratify=labels if stratify else None,
    )

    if use_validation:
        # Second cut: the test share is expressed relative to the hold-out set.
        val_share = val_size / holdout_size
        X_val, X_test, y_val, y_test = train_test_split(
            X_holdout,
            y_holdout,
            test_size=(1 - val_share),
            random_state=random_state,
            stratify=y_holdout if stratify else None,
        )
    else:
        X_val = y_val = None
        X_test, y_test = X_holdout, y_holdout

    total_rows = len(features)
    print(f"Train: {len(X_train)} samples ({len(X_train)/total_rows*100:.1f}%)")
    if use_validation:
        print(f"Val:   {len(X_val)} samples ({len(X_val)/total_rows*100:.1f}%)")
    print(f"Test:  {len(X_test)} samples ({len(X_test)/total_rows*100:.1f}%)")

    return X_train, X_val, X_test, y_train, y_val, y_test

# Usage example: 70% train / 10% validation / 20% test on the imputed regression data
df_clean = impute_simple(df_reg, strategy='mean')
X_train, X_val, X_test, y_train, y_val, y_test = split_data(
    df_clean, 'target', test_size=0.2, val_size=0.1
)

In [None]:
# ============================================================
# 시계열 데이터 분할 / Time Series Split
# ============================================================

def split_timeseries(df, target_col, test_size=0.2, val_size=0.1):
    """
    Split a time-ordered frame into contiguous train / validation / test slices.

    Rows are never shuffled: train is the earliest slice, test the latest.
    Prints the size and first/last index label of each slice.

    Args:
        df: DataFrame whose rows are already in time order.
        target_col: Accepted for signature parity with split_data; it is not
            used, and the slices keep every column.
        test_size: Fraction of rows for the test slice (taken from the end).
        val_size: Fraction of rows for the validation slice; 0 disables it.

    Returns:
        (train, val, test) DataFrames; val is None when val_size is 0. With
        very few rows a slice can be empty.
    """
    def _describe(label, part):
        # Empty slices have no first/last label; report them instead of
        # failing with IndexError on part.index[0].
        if len(part) == 0:
            return f"{label} 0 samples (비어 있음 / empty)"
        return f"{label} {len(part)} samples (시작: {part.index[0]}, 끝: {part.index[-1]})"

    n = len(df)

    # Split points are truncated towards zero, so leftover rows go to the later slices.
    train_end = int(n * (1 - test_size - val_size))
    val_end = int(n * (1 - test_size))

    train = df.iloc[:train_end]
    val = df.iloc[train_end:val_end] if val_size > 0 else None
    test = df.iloc[val_end:]

    print(_describe("Train:", train))
    if val_size > 0:
        print(_describe("Val:  ", val))
    print(_describe("Test: ", test))

    return train, val, test

# Usage example: chronological 70/10/20 split of the time-series toy data
train_ts, val_ts, test_ts = split_timeseries(df_ts, 'value', test_size=0.2, val_size=0.1)

---
## 8. 불균형 데이터 처리 / Imbalanced Data Handling

In [None]:
# ============================================================
# 불균형 데이터 확인 / Check Data Imbalance
# ============================================================

def check_imbalance(y, plot=True):
    """
    Report the class distribution and the minority/majority count ratio.

    Args:
        y: Class labels (anything pd.Series accepts).
        plot: If True, also draw a bar chart of the class counts.

    Returns:
        Smallest class count divided by the largest class count
        (1.0 = perfectly balanced; values near 0 = heavily imbalanced).
    """
    counts = pd.Series(y).value_counts()
    ratio = counts.min() / counts.max()

    print("클래스 분포 / Class distribution:")
    print(counts)
    print(f"\n불균형 비율 (소수/다수): {ratio:.4f}")
    print(f"Imbalance ratio (minority/majority): {ratio:.4f}")

    if not plot:
        return ratio

    plt.figure(figsize=(6, 4))
    counts.plot(kind='bar')
    plt.title('Class Distribution / 클래스 분포')
    plt.xlabel('Class')
    plt.ylabel('Count')
    plt.tight_layout()
    plt.show()

    return ratio

# Usage example: class balance of the classification toy data (with bar chart)
check_imbalance(df_clf['target'])

In [None]:
# ============================================================
# 불균형 처리 방법들 / Imbalanced Data Handling Methods
# ============================================================

def handle_imbalance(X, y, method='smote', random_state=42):
    """
    Resample a dataset to counter class imbalance.

    Prints the class counts before and after resampling.

    Args:
        X: Features.
        y: Target labels.
        method: 'smote' (SMOTE), 'oversample' (RandomOverSampler),
            'undersample' (RandomUnderSampler) or 'smote_tomek' (SMOTETomek).
        random_state: Seed passed to the chosen sampler.

    Returns:
        Tuple (X_resampled, y_resampled).

    Raises:
        ValueError: If `method` is not one of the supported names.

    Warning:
        Apply this to training data only; resampling the test data would make
        evaluation unrepresentative of the real class distribution.
    """
    before = pd.Series(y).value_counts().to_dict()
    print(f"처리 전 / Before: {before}")

    if method == 'smote':
        sampler_cls = SMOTE
    elif method == 'oversample':
        sampler_cls = RandomOverSampler
    elif method == 'undersample':
        sampler_cls = RandomUnderSampler
    elif method == 'smote_tomek':
        sampler_cls = SMOTETomek
    else:
        raise ValueError(f"Unknown method: {method}")

    sampler = sampler_cls(random_state=random_state)
    X_resampled, y_resampled = sampler.fit_resample(X, y)

    after = pd.Series(y_resampled).value_counts().to_dict()
    print(f"처리 후 / After:  {after}")

    return X_resampled, y_resampled

# Usage example: stratified split first, then SMOTE on the training part only
X_clf = df_clf.drop('target', axis=1)
y_clf = df_clf['target']

X_train_clf, X_test_clf, y_train_clf, y_test_clf = train_test_split(
    X_clf, y_clf, test_size=0.2, random_state=42, stratify=y_clf
)

print("\n=== SMOTE 적용 ===")
X_train_resampled, y_train_resampled = handle_imbalance(X_train_clf, y_train_clf, method='smote')

In [None]:
# ============================================================
# class_weight 사용법 (모델 내부 가중치 조정)
# Using class_weight (internal model weighting)
# ============================================================

# Method 1: compute the class weights explicitly
from sklearn.utils.class_weight import compute_class_weight

classes = np.unique(y_train_clf)
weights = compute_class_weight('balanced', classes=classes, y=y_train_clf)
class_weight_dict = dict(zip(classes, weights))

print("계산된 클래스 가중치 / Computed class weights:")
print(class_weight_dict)

# Method 2: hand the weighting to the model itself
# XGBoost takes scale_pos_weight instead of class_weight (see the example below)

# RandomForest example
rf_weighted = RandomForestClassifier(
    class_weight='balanced',  # or pass class_weight_dict
    random_state=42
)

# XGBoost example
# scale_pos_weight = majority count / minority count
# (here class 0 is treated as the majority and class 1 as the minority)
neg_count = (y_train_clf == 0).sum()
pos_count = (y_train_clf == 1).sum()
scale_pos_weight = neg_count / pos_count

print(f"\nXGBoost scale_pos_weight: {scale_pos_weight:.2f}")

# XGBClassifier is only defined if the guarded xgboost import earlier in the
# notebook succeeded; otherwise this statement raises NameError.
# NOTE(review): use_label_encoder is reportedly deprecated/ignored in recent
# XGBoost releases — confirm against the installed version.
xgb_weighted = XGBClassifier(
    scale_pos_weight=scale_pos_weight,
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss'
)

---
## 전처리 파이프라인 요약 / Preprocessing Pipeline Summary

```python
# ============================================================
# Complete preprocessing pipeline example
# ============================================================

# 1. Load data
df = pd.read_csv('data.csv')

# 2. Check and handle missing values
# NOTE(review): imputing here, before the split in step 4, computes the fill
# values from all rows including the future test rows — consider imputing
# after the split to stay consistent with the fit-on-train-only rule.
check_missing(df)
df = impute_simple(df, strategy='mean')

# 3. Detect and handle outliers
detect_outliers_iqr(df)
df = handle_outliers(df, method='clip')

# 4. Split data
X_train, X_val, X_test, y_train, y_val, y_test = split_data(df, 'target')

# 5. Scaling
# X_val is not passed to scale_data; transform it with the returned scaler
# (scaler.transform(X_val)) before using it for validation.
X_train_scaled, X_test_scaled, scaler = scale_data(X_train, X_test, method='standard')

# 6. (Classification only) Handle imbalance — training data only
X_train_balanced, y_train_balanced = handle_imbalance(X_train_scaled, y_train, method='smote')

# 7. Train model (`model` is a placeholder; create an estimator before this line)
model.fit(X_train_balanced, y_train_balanced)
```

In [None]:
# Closing cell: confirm completion and point to the next notebook in the series
print("✅ 전처리 노트북 완료 / Preprocessing notebook complete!")
print("\n다음 노트북: 01_regression_models.ipynb")
print("Next notebook: 01_regression_models.ipynb")