# In [None]:
# LightGBM
import os, random
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs, Descriptors
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import KFold
import lightgbm as lgb
import optuna

# Global experiment configuration shared by every stage of this script.
CFG = dict(
    SEED=42,                    # seed for the Python/NumPy RNGs, KFold and LightGBM
    # NOTE(review): 2048 is the more common Morgan fingerprint size -- confirm
    # that 2024 is intentional.
    NBITS=2024,                 # Morgan fingerprint length in bits
    N_SPLITS=5,                 # number of cross-validation folds
    LEARNING_RATE=0.01,         # LightGBM learning rate
    EARLY_STOPPING_ROUNDS=200,  # patience handed to lgb.early_stopping
    N_ESTIMATORS=1000,          # upper bound on boosting rounds
)

def get_lgbm_params(seed):
    """Return the keyword arguments used to build each LightGBM regressor.

    Args:
        seed: Value forwarded as ``random_state``.

    Returns:
        A new dict of LightGBM hyper-parameters; the learning rate is read
        from ``CFG['LEARNING_RATE']``, everything else is fixed here.
    """
    params = {
        'objective': 'regression',
        'learning_rate': CFG['LEARNING_RATE'],
        'num_leaves': 64,
        'feature_fraction': 0.9,
        'bagging_fraction': 0.8,
        'bagging_freq': 1,
        'min_child_samples': 20,
        'random_state': seed,
        'verbose': -1,
    }
    return params

def seed_everything(seed):
    """Seed the random number generators used by this script.

    Seeds Python's ``random`` module and NumPy's global RNG, and stores the
    seed (as a string) in the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Integer seed applied to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for apply_seed in (random.seed, np.random.seed):
        apply_seed(seed)

# Seed the RNGs once at module level, before any data is loaded or split.
seed_everything(CFG['SEED'])

# ========== Utils ==========
def IC50_to_pIC50(ic50_nM):
    """Convert IC50 values (nM) to pIC50 via ``9 - log10(IC50)``.

    Inputs are floored at ``1e-12`` before taking the logarithm, so zero or
    negative values map to a finite result instead of ``inf``/``nan``.

    Args:
        ic50_nM: Scalar or array-like of IC50 values.

    Returns:
        pIC50 value(s) with the same shape as the input.
    """
    floored = np.clip(ic50_nM, 1e-12, None)
    log_ic50 = np.log10(floored)
    return 9 - log_ic50
def pIC50_to_IC50(pIC50):
    """Convert pIC50 back to IC50 (nM): ``10 ** (9 - pIC50)``."""
    exponent = 9 - pIC50
    return 10 ** exponent

def bv_to_np(bitvect, nbits):
    """Copy an RDKit bit vector into a NumPy ``uint8`` array.

    Args:
        bitvect: Bit vector accepted by ``DataStructs.ConvertToNumpyArray``.
        nbits: Length of the zero-initialised array allocated as the target.

    Returns:
        The array after RDKit has written the bits into it.
    """
    target = np.zeros(nbits, dtype=np.uint8)
    DataStructs.ConvertToNumpyArray(bitvect, target)
    return target

# ========== Data ==========
def preprocess_data():
    """Load the ChEMBL and PubChem ASK1 tables and merge them into one frame.

    Reads ``./ChEMBL_ASK1(IC50).csv`` (semicolon separated) and
    ``./Pubchem_ASK1.csv`` from the working directory, keeps the SMILES and
    activity columns, coerces activities to numbers, drops rows with missing
    or non-positive activities, removes duplicate SMILES (first occurrence
    wins) and adds a ``pIC50`` column computed as ``9 - log10(ic50)``.

    Returns:
        DataFrame with the columns ``smiles``, ``ic50`` and ``pIC50``.
    """
    # Load data
    chembl = pd.read_csv("./ChEMBL_ASK1(IC50).csv", sep=';')
    pubchem = pd.read_csv("./Pubchem_ASK1.csv")

    chembl.columns = chembl.columns.str.strip().str.replace('"', '')

    # Filter on the measurement type BEFORE narrowing the columns: once only
    # 'Smiles' / 'Standard Value' are kept, 'Standard Type' no longer exists
    # and the filter could never run.
    if 'Standard Type' in chembl.columns and chembl['Standard Type'].nunique() > 1:
        chembl = chembl[chembl['Standard Type'] == 'IC50']

    chembl = chembl[['Smiles', 'Standard Value']].rename(
        columns={'Smiles': 'smiles', 'Standard Value': 'ic50'}
    )
    pubchem = pubchem[['SMILES', 'Activity_Value']].rename(
        columns={'SMILES': 'smiles', 'Activity_Value': 'ic50'}
    )

    # Non-numeric activity entries become NaN and are dropped below.
    chembl['ic50'] = pd.to_numeric(chembl['ic50'], errors='coerce')
    pubchem['ic50'] = pd.to_numeric(pubchem['ic50'], errors='coerce')

    df = pd.concat([chembl, pubchem], ignore_index=True)
    df = df.dropna(subset=['smiles', 'ic50'])
    df = df[df['ic50'] > 0]  # log10 below requires strictly positive values
    df = df.drop_duplicates(subset='smiles').reset_index(drop=True)

    df['pIC50'] = 9 - np.log10(df['ic50'])
    print(f"전처리 완료: 총 {len(df)}개 샘플")
    return df


# ========= 2) Fingerprint 생성기 =========
def fp_morgan(smiles, radius=2, nBits=CFG['NBITS']):
    """Compute the Morgan bit fingerprint of a SMILES string.

    Args:
        smiles: SMILES string to parse with RDKit.
        radius: Morgan radius.
        nBits: Fingerprint length in bits.

    Returns:
        The fingerprint converted by ``bv_to_np``, or ``None`` when RDKit
        cannot parse ``smiles``.
    """
    molecule = Chem.MolFromSmiles(smiles)
    if molecule is not None:
        bits = AllChem.GetMorganFingerprintAsBitVect(molecule, radius=radius, nBits=nBits)
        return bv_to_np(bits, nBits)
    return None

def smiles_to_morgan_fp(smiles, radius=2, nBits=CFG['NBITS']):
    """Compute the Morgan bit fingerprint of a SMILES string as a uint8 array.

    Args:
        smiles: SMILES string to parse with RDKit.
        radius: Morgan radius.
        nBits: Fingerprint length in bits.

    Returns:
        The fingerprint as a NumPy array, or ``None`` when RDKit cannot parse
        ``smiles``.
    """
    parsed = Chem.MolFromSmiles(smiles)
    if parsed is None:
        return None
    bit_vector = AllChem.GetMorganFingerprintAsBitVect(parsed, radius, nBits=nBits)
    # Same zero-array + ConvertToNumpyArray conversion, via the shared helper.
    return bv_to_np(bit_vector, nBits)

def calculate_rdkit_descriptors(smiles):
    """Evaluate every descriptor in ``Descriptors._descList`` for a SMILES.

    Args:
        smiles: SMILES string to parse with RDKit.

    Returns:
        1-D array with one value per descriptor, in ``_descList`` order; an
        all-NaN array of the same length when the SMILES cannot be parsed.
    """
    descriptor_table = Descriptors._descList
    mol = Chem.MolFromSmiles(smiles)
    if mol is not None:
        return np.array([compute(mol) for _name, compute in descriptor_table])
    return np.full((len(descriptor_table),), np.nan)

# Descriptor names taken from Descriptors._descList, in list order.
RD_DESC_NAMES = [entry[0] for entry in Descriptors._descList]

def calculate_rdkit_descriptions(smiles):
    """Evaluate all RDKit descriptors for a SMILES, tolerating failures.

    Unlike a plain evaluation loop, any exception raised by a descriptor
    function results in an all-NaN vector instead of propagating.

    Args:
        smiles: SMILES string to parse with RDKit.

    Returns:
        1-D float array with one value per entry of ``Descriptors._descList``;
        all NaN when the SMILES cannot be parsed or a descriptor raises.
    """
    n_descriptors = len(Descriptors._descList)

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return np.full((n_descriptors,), np.nan, dtype=float)

    try:
        vals = [func(mol) for _, func in Descriptors._descList]
    except Exception:
        # Deliberate best-effort: a molecule that breaks any descriptor is
        # reported as fully missing so downstream imputation can handle it.
        return np.full((n_descriptors,), np.nan, dtype=float)
    return np.array(vals, dtype=float)

# ========== Morgan + RDkit 결합 피처 생성 ==========
def build_X_morgan_plus_rdkit(df, radius=2, nBits=CFG['NBITS']):
    """Build the combined Morgan-fingerprint + RDKit-descriptor feature matrix.

    Rows for which ``fp_morgan`` returns ``None`` are dropped. Descriptors are
    median-imputed and standardised (both fitted on the rows passed in) and
    appended to the fingerprint bits.

    Args:
        df: Frame providing the ``smiles``, ``pIC50`` and ``ic50`` columns.
        radius: Morgan radius forwarded to ``fp_morgan``.
        nBits: Fingerprint length forwarded to ``fp_morgan``.

    Returns:
        Tuple ``(X, y_pic50, y_ic50, sub, imputer, scaler, desc_mean)`` where
        ``X`` is the float32 feature matrix, ``y_pic50`` / ``y_ic50`` are the
        float targets, ``sub`` is the copy of the retained rows, ``imputer``
        and ``scaler`` are the fitted transformers and ``desc_mean`` is the
        NaN-ignoring column mean of the raw descriptors.
    """
    # Morgan fingerprints; keep only the positions that produced one.
    fingerprints = [fp_morgan(smi, radius, nBits=nBits) for smi in df['smiles']]
    valid_rows = [row for row, fp in enumerate(fingerprints) if fp is not None]
    sub = df.iloc[valid_rows].copy()
    X_morgan = np.stack([fingerprints[row] for row in valid_rows])  # (N, nBits), uint8 0/1

    # RDKit descriptors (continuous values) for the retained rows.
    X_rd = np.vstack([calculate_rdkit_descriptors(smi) for smi in sub['smiles']])

    # Fill missing values with the median, then standardise.
    imputer = SimpleImputer(strategy='median')
    scaler = StandardScaler()
    X_rd_scaled = scaler.fit_transform(imputer.fit_transform(X_rd))

    # Concatenate both feature groups as float32.
    X = np.hstack([X_morgan.astype(np.float32), X_rd_scaled.astype(np.float32)])

    # Targets.
    y_pic50 = sub['pIC50'].values.astype(float)
    y_ic50 = sub['ic50'].values.astype(float)

    desc_mean = np.nanmean(X_rd, axis=0)

    return X, y_pic50, y_ic50, sub, imputer, scaler, desc_mean

def comp_metric_sklearn(y_true, y_pred):
    """Custom evaluation metric passed as ``eval_metric`` to ``LGBMRegressor.fit``.

    Both inputs are treated as pIC50 values; they are converted back to IC50
    so the combined score from ``comp_score`` can be computed.

    Args:
        y_true: True pIC50 values.
        y_pred: Predicted pIC50 values.

    Returns:
        Tuple ``('comp_score', score, True)``; the trailing ``True`` marks the
        metric as higher-is-better.
    """
    y_true_ic50 = pIC50_to_IC50(y_true)
    y_pred_ic50 = pIC50_to_IC50(y_pred)

    # `get_score` is not defined in this file; `comp_score` takes the same
    # arguments and returns (score, A, B, rmse), so use its first element.
    score = comp_score(y_true_ic50, y_pred_ic50, y_true, y_pred)[0]
    return 'comp_score', score, True


# ========= 4) 5-Fold CV(교차검증) =========
def cv_eval(X, y_pic50, y_ic50, seed=CFG['SEED'], n_splits=CFG['N_SPLITS']):
    """Run shuffled K-fold cross-validation and score the out-of-fold predictions.

    For each fold a LightGBM regressor is trained on pIC50 with early stopping
    on the held-out split, and its predictions fill the out-of-fold vector.
    The final score is computed once over all out-of-fold predictions.

    Args:
        X: Feature matrix indexable by integer index arrays.
        y_pic50: pIC50 targets aligned with ``X``.
        y_ic50: IC50 targets aligned with ``X``.
        seed: Seed used for both the fold split and the LightGBM models.
        n_splits: Number of folds.

    Returns:
        Dict with keys ``score``, ``A``, ``B`` and ``rmse_ic50`` as returned
        by ``comp_score``.
    """
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    oof_pred_pic50 = np.zeros_like(y_pic50, dtype=float)

    print(" 교차검증 시작")

    for fold, (tr_idx, va_idx) in enumerate(kf.split(X), 1):
        print(f"Fold {fold} 학습 중...")
        X_tr, X_va = X[tr_idx], X[va_idx]
        y_tr, y_va = y_pic50[tr_idx], y_pic50[va_idx]

        # Honour the `seed` argument; previously CFG['SEED'] was hard-coded
        # here, so a caller-supplied seed only affected the fold split.
        model = lgb.LGBMRegressor(**get_lgbm_params(seed=seed), n_estimators=CFG['N_ESTIMATORS'])
        model.fit(
            X_tr, y_tr,
            eval_set=[(X_va, y_va)],
            eval_metric=comp_metric_sklearn,
            callbacks=[lgb.early_stopping(stopping_rounds=CFG['EARLY_STOPPING_ROUNDS'], verbose=False)]
        )
        oof_pred_pic50[va_idx] = model.predict(X_va, num_iteration=model.best_iteration_)
        print(f"Fold {fold} 완료 - best_iter: {model.best_iteration_}")

    # Final score, computed on the out-of-fold predictions.
    y_pred_ic50 = pIC50_to_IC50(oof_pred_pic50)
    score, A, B, rmse = comp_score(y_ic50, y_pred_ic50, y_pic50, oof_pred_pic50)
    print("전체 OOF 평가 완료")
    return dict(score=score, A=A, B=B, rmse_ic50=rmse)

def comp_score(y_true_ic50, y_pred_ic50, y_true_pic50, y_pred_pic50):
    """Compute the combined score ``0.4 * A + 0.6 * B``.

    ``A`` is ``1 - min(NRMSE, 1)`` where NRMSE is the RMSE on the IC50 scale
    divided by the range of the true IC50 values; ``B`` is the R^2 on the
    pIC50 scale.

    Args:
        y_true_ic50: True IC50 values.
        y_pred_ic50: Predicted IC50 values.
        y_true_pic50: True pIC50 values.
        y_pred_pic50: Predicted pIC50 values.

    Returns:
        Tuple ``(score, A, B, rmse)``.
    """
    # mean_squared_error returns the MSE; take the square root so the value
    # really is an RMSE, in the same units as the range it is normalised by.
    rmse = np.sqrt(mean_squared_error(y_true_ic50, y_pred_ic50))
    nrmse = rmse / (np.max(y_true_ic50) - np.min(y_true_ic50))
    A = 1 - min(nrmse, 1)
    B = r2_score(y_true_pic50, y_pred_pic50)
    score = 0.4 * A + 0.6 * B
    return score, A, B, rmse

if __name__ == "__main__":
    # Script entry point: build the training features, report the
    # cross-validated score, then train one model per fold on the training
    # split of that fold and average their test predictions into a
    # submission file.
    df = preprocess_data()

    Xc, y_pic50c, y_ic50c, subc, imputer, scaler, desc_mean = build_X_morgan_plus_rdkit(
        df, radius=2, nBits=CFG['NBITS']
    )

    # Feature names: fingerprint bits first, then the descriptor columns.
    morgan_feature_names = [f"morgan_{i}" for i in range(CFG['NBITS'])]
    rdkit_feature_names = [f"rdkit_{i}" for i in range(Xc.shape[1] - CFG['NBITS'])]
    feature_names = morgan_feature_names + rdkit_feature_names
    Xc_df = pd.DataFrame(Xc, columns=feature_names)

    resc = cv_eval(Xc_df.values, y_pic50c, y_ic50c)
    print(f"[lgbm] Score={resc['score']:.4f} | A={resc['A']:.4f} | B={resc['B']:.4f} | RMSE(IC50)={resc['rmse_ic50']:.4f}")

    output_path = "result_jy/lgbm_submission.csv"
    print("\n 예측 결과를 제출 파일로 변환 중...")

    test_df = pd.read_csv("./test.csv")
    test_df['fingerprint'] = test_df['Smiles'].apply(smiles_to_morgan_fp)
    test_df['descriptors'] = test_df['Smiles'].apply(calculate_rdkit_descriptors)

    # Rows whose SMILES could not be parsed have a None fingerprint.
    valid_test_mask = test_df['fingerprint'].notna() & test_df['descriptors'].notna()

    fp_test_stack = np.stack(test_df.loc[valid_test_mask, 'fingerprint'].values)
    desc_test_stack = np.stack(test_df.loc[valid_test_mask, 'descriptors'].values).astype(float)

    # Apply the SAME fitted median imputer that was used for the training
    # features (previously the test set was filled with the column mean).
    # Non-finite values are mapped to NaN first so the imputer fills them
    # instead of rejecting the input.
    desc_test_stack = np.where(np.isfinite(desc_test_stack), desc_test_stack, np.nan)
    desc_test_scaled = scaler.transform(imputer.transform(desc_test_stack))

    # Match the float32 dtype of the training matrix.
    X_test = np.hstack([fp_test_stack.astype(np.float32), desc_test_scaled.astype(np.float32)])
    X_test_df = pd.DataFrame(X_test, columns=feature_names)

    test_preds = np.zeros(len(X_test_df))
    kf = KFold(n_splits=CFG['N_SPLITS'], shuffle=True, random_state=CFG['SEED'])

    # Average the predictions of one model per fold.
    for train_idx, _ in kf.split(Xc_df):
        model = lgb.LGBMRegressor(**get_lgbm_params(seed=CFG['SEED']), n_estimators=CFG['N_ESTIMATORS'])
        model.fit(Xc_df.iloc[train_idx], y_pic50c[train_idx])
        test_preds += model.predict(X_test_df) / CFG['N_SPLITS']

    ic50_preds = pIC50_to_IC50(test_preds)

    submission_df = pd.read_csv("./sample_submission.csv")

    pred_df = pd.DataFrame({
        'ID': test_df.loc[valid_test_mask, 'ID'].values,
        'ASK1_IC50_nM': ic50_preds
    })
    submission_df = submission_df[['ID']].merge(pred_df, on='ID', how='left')

    # IDs without a prediction (unparseable SMILES) fall back to the mean
    # training IC50. Assign the result back: an in-place fillna on a column
    # selection is not guaranteed to modify the parent frame.
    ic50_mean = df['ic50'].mean()
    submission_df['ASK1_IC50_nM'] = submission_df['ASK1_IC50_nM'].fillna(ic50_mean)

    # Make sure the output directory exists before writing.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    submission_df.to_csv(output_path, index=False)
    print(f"제출 파일 : {output_path}")
    
    