# In [None]:
import pandas as pd
import numpy as np
import gc

# Numeric dtypes for which a '년월' column is parsed as YYYYMM.
_YEAR_MONTH_NUMERIC_DTYPES = ('int32', 'int64', 'float32', 'float64')


def _is_date_like(col):
    """Return True when the column name matches the date-naming heuristic."""
    lowered = col.lower()
    # NOTE(review): 'ym' is a plain substring match, so names such as
    # 'payment' also qualify -- confirm against the real column names.
    return '일자' in col or '년월' in col or 'date' in lowered or 'ym' in lowered


def _convert_date_column(df, col):
    """Convert one date-like column of ``df`` to datetime in place (best effort).

    A failure is reported on stdout and leaves the column unchanged, so one
    bad column does not abort the whole preprocessing run.
    """
    try:
        series = df[col]
        if '년월' in col and series.dtype in _YEAR_MONTH_NUMERIC_DTYPES:
            # NaN/inf cannot be cast to int; blank them out, truncate like
            # astype(int) would, and go through the nullable Int64 dtype so
            # missing entries become NaT instead of failing the whole column.
            whole = np.trunc(series.where(np.isfinite(series)))
            as_text = whole.astype('Int64').astype(str)
            df[col] = pd.to_datetime(as_text, format='%Y%m', errors='coerce')
        elif '일자' in col or 'date' in col.lower() or 'ym' in col.lower():
            # NOTE(review): without a format/unit, pd.to_datetime reads
            # numeric values as nanoseconds since the epoch. If these columns
            # hold YYYYMMDD integers they need an explicit format -- confirm.
            df[col] = pd.to_datetime(series, errors='coerce')
    except Exception as e:  # deliberate best-effort: report and continue
        print(f'  - {col} 변환 실패: {e}')


def performance_preprocess(parquet_path, save_path, var_guide_path='../정리/T성과정보_변수명세서.xlsx'):
    """Preprocess the performance-information table and save it as parquet.

    Steps, in order:
      1. Load the variable guide (Excel) and the raw data (parquet).
      2. Downcast float64 / int64 columns to the smallest fitting dtype.
      3. Drop every column whose guide '처리' cell contains '삭제'.
      4. Fill missing values: '기타' for categorical columns, the column
         mean for numeric columns (except 'ID' and date-like columns).
      5. Convert date-like columns (by name heuristic) to datetime.
      6. Write the result to ``save_path`` and print its head.

    Args:
        parquet_path: Path of the parquet file to load.
        save_path: Path of the parquet file to write.
        var_guide_path: Path of the variable-guide Excel file. Its
            '컬럼명' column (or, if absent, its first column) holds the data
            column names; '데이터 타입' and '변수 유형' classify each column;
            the optional '처리' column marks columns to delete.

    Raises:
        KeyError: If the guide has rows but lacks '데이터 타입' or '변수 유형'.
    """
    # [1] Load the guide and the data
    var_guide = pd.read_excel(var_guide_path)
    data = pd.read_parquet(parquet_path)
    print(f'[{parquet_path}] 데이터 shape:', data.shape)

    # [2] Preprocess
    for col in data.select_dtypes(include=['float64']).columns:
        data[col] = pd.to_numeric(data[col], downcast='float')
    for col in data.select_dtypes(include=['int64']).columns:
        data[col] = pd.to_numeric(data[col], downcast='integer')
    gc.collect()

    # Resolve the guide's name column once so that every guide-driven step
    # uses the same fallback (first column) when '컬럼명' is missing.
    if var_guide.empty:
        guide_names = []
    else:
        name_col = '컬럼명' if '컬럼명' in var_guide.columns else var_guide.columns[0]
        guide_names = var_guide[name_col].tolist()
    if '처리' in var_guide.columns:
        actions = var_guide['처리'].tolist()
    else:
        actions = [None] * len(guide_names)

    # Columns flagged for deletion; dict.fromkeys de-duplicates repeated
    # guide rows while keeping their order.
    present = set(data.columns)
    drop_cols = list(dict.fromkeys(
        name for name, action in zip(guide_names, actions)
        if name in present and pd.notnull(action) and '삭제' in str(action)
    ))
    if drop_cols:
        data.drop(columns=drop_cols, inplace=True)
        gc.collect()

    # Split the remaining guide columns into categorical and numeric.
    remaining = set(data.columns)
    cat_cols, num_cols = [], []
    if guide_names:
        rows = zip(guide_names, var_guide['데이터 타입'], var_guide['변수 유형'])
        for name, type_label, kind_label in rows:
            if name not in remaining:
                continue
            if str(type_label).startswith('object') or str(kind_label).startswith('범주'):
                cat_cols.append(name)
            elif name != 'ID':
                num_cols.append(name)
    cat_cols = list(dict.fromkeys(cat_cols))
    num_cols = list(dict.fromkeys(num_cols))

    date_cols = [col for col in data.columns if _is_date_like(col)]
    date_like = set(date_cols)

    if cat_cols:
        data[cat_cols] = data[cat_cols].fillna('기타')
    for col in num_cols:
        # The mean of date codes such as YYYYMM is not a meaningful date;
        # leave the gaps so they end up as NaT after conversion.
        if col in date_like:
            continue
        data[col] = data[col].fillna(data[col].mean())
    gc.collect()

    for col in date_cols:
        _convert_date_column(data, col)
        print(f'    > {col}: {data[col].dtype}')

    # [3] Save and show a sample
    data.to_parquet(save_path, index=False)
    print(f'[{save_path}] 저장 완료')
    print(data.head())

if __name__ == "__main__":
    # (input parquet, output parquet) pairs: the train split first, then test.
    jobs = [
        ('../정리/performance_train.parquet', '8.성과정보_train_전처리완.parquet'),
        ('../정리/performance_test.parquet', '8.성과정보_test_전처리완.parquet'),
    ]
    for source_file, target_file in jobs:
        performance_preprocess(parquet_path=source_file, save_path=target_file)