In [None]:
# Catboost
import os, random
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs, Descriptors
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import KFold
from catboost import CatBoostRegressor

# Experiment-wide settings, looked up by key throughout this file.
CFG = dict(
    SEED=42,                     # seed passed to the RNGs, KFold and CatBoost
    NBITS=2024,                  # default Morgan fingerprint length in bits
    N_SPLITS=5,                  # number of K-fold splits
    # NOTE(review): the three keys below are declared here but no code visible
    # in this file reads them -- the models use literal values instead.
    LEARNING_RATE=0.01,
    EARLY_STOPPING_ROUNDS=200,
    N_ESTIMATORS=1000,
)

def seed_everything(seed):
    """Make runs repeatable by seeding the global random number generators.

    Seeds Python's ``random`` module and NumPy's legacy global RNG with
    ``seed`` and stores ``str(seed)`` in the ``PYTHONHASHSEED`` environment
    variable.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

seed_everything(CFG['SEED'])

# ========== Utils ==========
def IC50_to_pIC50(ic50_nM):
    """Convert IC50 values to pIC50 using ``9 - log10(IC50)``.

    The input is floored at 1e-12 before the logarithm so that zero or
    negative values do not produce ``-inf``/``nan``. Works on scalars and
    array-likes (NumPy broadcasting).
    """
    floored = np.clip(ic50_nM, 1e-12, None)
    log_ic50 = np.log10(floored)
    return 9 - log_ic50
def pIC50_to_IC50(pIC50):
    """Convert pIC50 back to IC50 by evaluating ``10 ** (9 - pIC50)``."""
    exponent = 9 - pIC50
    return 10 ** exponent

def bv_to_np(bitvect, nbits):
    """Return the contents of an RDKit bit vector as a ``uint8`` NumPy array.

    A zero array of ``nbits`` entries is allocated and handed to
    ``DataStructs.ConvertToNumpyArray``, which fills it from ``bitvect``.
    """
    out = np.zeros(nbits, dtype=np.uint8)
    DataStructs.ConvertToNumpyArray(bitvect, out)
    return out

# ========== Data ==========
def preprocess_data():
    """Load the ChEMBL and PubChem ASK1 tables and merge them into one frame.

    Reads ``./ChEMBL_ASK1(IC50).csv`` (semicolon separated) and
    ``./Pubchem_ASK1.csv`` from the current working directory, keeps the
    SMILES and activity columns, coerces activities to numbers, and drops
    rows with missing values, non-positive activities, or a SMILES string
    that was already seen (the first occurrence wins, ChEMBL rows first).

    Returns:
        pandas.DataFrame with columns ``smiles``, ``ic50`` and ``pIC50``
        (``9 - log10(ic50)``; this presumes ``ic50`` is in nM -- confirm
        against the source exports).
    """
    # Load data
    chembl = pd.read_csv("./ChEMBL_ASK1(IC50).csv", sep=';')
    pubchem = pd.read_csv("./Pubchem_ASK1.csv")

    chembl.columns = chembl.columns.str.strip().str.replace('"', '')

    # The measurement-type filter has to run BEFORE the frame is reduced to
    # two columns; once 'Standard Type' has been dropped the check can never
    # be true and non-IC50 rows would silently stay in the training set.
    if 'Standard Type' in chembl.columns and chembl['Standard Type'].nunique() > 1:
        chembl = chembl[chembl['Standard Type'] == 'IC50']
    chembl = chembl[['Smiles', 'Standard Value']].rename(
        columns={'Smiles': 'smiles', 'Standard Value': 'ic50'})
    pubchem = pubchem[['SMILES', 'Activity_Value']].rename(
        columns={'SMILES': 'smiles', 'Activity_Value': 'ic50'})

    # Unparsable activity strings become NaN and are removed by dropna below.
    chembl['ic50'] = pd.to_numeric(chembl['ic50'], errors='coerce')
    pubchem['ic50'] = pd.to_numeric(pubchem['ic50'], errors='coerce')

    df = pd.concat([chembl, pubchem], ignore_index=True)
    df = df.dropna(subset=['smiles', 'ic50'])
    df = df[df['ic50'] > 0]  # log10 is undefined for non-positive values
    df = df.drop_duplicates(subset='smiles').reset_index(drop=True)

    df['pIC50'] = 9 - np.log10(df['ic50'])
    print(f"✅ 전처리 완료: 총 {len(df)}개 샘플")
    return df


# ========= 2) Fingerprint 생성기 =========
def fp_morgan(smiles, radius=2, nBits=CFG['NBITS']):
    """Compute the Morgan bit fingerprint of a SMILES string.

    Returns the fingerprint converted by ``bv_to_np``, or ``None`` when
    RDKit cannot parse ``smiles`` into a molecule.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is not None:
        bitvect = AllChem.GetMorganFingerprintAsBitVect(mol, radius=radius, nBits=nBits)
        return bv_to_np(bitvect, nBits)
    return None

def smiles_to_morgan_fp(smiles, radius=2, nBits=CFG['NBITS']):
    """Turn a SMILES string into a ``uint8`` Morgan fingerprint array.

    Returns ``None`` when RDKit cannot parse ``smiles``; otherwise a NumPy
    array filled from the bit vector via ``DataStructs.ConvertToNumpyArray``.
    """
    molecule = Chem.MolFromSmiles(smiles)
    if molecule is not None:
        bitvect = AllChem.GetMorganFingerprintAsBitVect(molecule, radius, nBits=nBits)
        bits = np.zeros(nBits, dtype=np.uint8)
        DataStructs.ConvertToNumpyArray(bitvect, bits)
        return bits
    return None

def calculate_rdkit_descriptors(smiles):
    """Evaluate every descriptor in ``Descriptors._descList`` for a SMILES string.

    Returns a 1-D NumPy array with one value per descriptor, in list order.
    When the SMILES cannot be parsed, an all-NaN array of the same length is
    returned (never ``None``).
    """
    descriptor_table = Descriptors._descList
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return np.full((len(descriptor_table),), np.nan)
    values = []
    for _name, compute in descriptor_table:
        values.append(compute(mol))
    return np.array(values)

def get_score(y_true_ic50, y_pred_ic50, y_true_pic50, y_pred_pic50):
    """Return the blended score ``0.4 * A + 0.6 * B``.

    * ``A = 1 - min(NRMSE, 1)`` where NRMSE is the RMSE of the IC50
      predictions divided by the range (max - min) of the true IC50 values.
    * ``B`` is the R^2 of the pIC50 predictions.

    Args:
        y_true_ic50: true IC50 values.
        y_pred_ic50: predicted IC50 values.
        y_true_pic50: true pIC50 values.
        y_pred_pic50: predicted pIC50 values.

    Returns:
        The blended score as a float.
    """
    # mean_squared_error returns the *mean squared* error; take the square
    # root so the quantity normalised below really is an RMSE.
    rmse = np.sqrt(mean_squared_error(y_true_ic50, y_pred_ic50))
    nrmse = rmse / (np.max(y_true_ic50) - np.min(y_true_ic50))
    A = 1 - min(nrmse, 1)
    B = r2_score(y_true_pic50, y_pred_pic50)
    score = 0.4 * A + 0.6 * B
    return score


# ========== Morgan + RDkit 결합 피처 생성 ==========
def build_X_morgan_plus_rdkit(df, radius=2, nBits=CFG['NBITS']):
    """Build the combined Morgan-fingerprint + RDKit-descriptor feature matrix.

    Rows whose SMILES cannot be parsed (``fp_morgan`` returns ``None``) are
    dropped. Descriptor columns are median-imputed and standardised, then
    appended to the fingerprint bits.

    Args:
        df: frame with ``smiles``, ``pIC50`` and ``ic50`` columns.
        radius: Morgan radius forwarded to ``fp_morgan``.
        nBits: fingerprint length forwarded to ``fp_morgan``.

    Returns:
        An 8-tuple ``(X, y_pic50, y_ic50, sub, imputer, scaler, desc_mean,
        X_morgan)``: the float32 feature matrix, both target vectors, the
        kept rows of ``df``, the fitted ``SimpleImputer`` and
        ``StandardScaler``, the per-descriptor NaN-aware mean of the raw
        descriptors, and the stacked fingerprint matrix.

    Raises:
        ValueError: if no SMILES in ``df`` could be parsed.
    """
    # Morgan FP
    morgan_list, keep_idx = [], []
    for i, smi in enumerate(df['smiles']):
        x = fp_morgan(smi, radius, nBits=nBits)
        if x is not None:
            morgan_list.append(x)
            keep_idx.append(i)
    if not morgan_list:
        # np.stack([]) would raise a ValueError that says nothing about the cause.
        raise ValueError("no parsable SMILES in df['smiles']; cannot build features")
    sub = df.iloc[keep_idx].copy()
    X_morgan = np.stack(morgan_list)

    # RDKit descriptors (continuous values)
    X_rd = np.vstack([calculate_rdkit_descriptors(smi) for smi in sub['smiles']]).astype(float)
    # Treat +/-inf like a missing value: the imputer fills NaN, whereas the
    # scikit-learn estimators reject infinite inputs outright.
    X_rd[~np.isfinite(X_rd)] = np.nan

    # Impute missing values, then standardise
    imputer = SimpleImputer(strategy='median')
    scaler = StandardScaler()
    X_rd_imputed = imputer.fit_transform(X_rd)
    X_rd_scaled = scaler.fit_transform(X_rd_imputed)

    # Concatenate fingerprint bits and scaled descriptors
    X = np.hstack([X_morgan.astype(np.float32), X_rd_scaled.astype(np.float32)])

    # Targets
    y_pic50 = sub['pIC50'].values.astype(float)
    y_ic50 = sub['ic50'].values.astype(float)

    desc_mean = np.nanmean(X_rd, axis=0)

    # Reuse the matrix stacked above instead of stacking the list a second time.
    return X, y_pic50, y_ic50, sub, imputer, scaler, desc_mean, X_morgan


# 연산 실패한 값에 대한 처리
def get_similar_ic50(smiles, train_smiles_list, train_fp_array, train_ic50_array, top_k=5, min_sim=0.3):
    """Estimate an IC50 from the most similar training molecules.

    The query's Morgan fingerprint (radius 2, same width as the training
    fingerprints) is compared to every training fingerprint with the
    Tanimoto coefficient. The IC50 values of the ``top_k`` most similar
    molecules whose similarity exceeds ``min_sim`` are averaged.

    Args:
        smiles: query SMILES string.
        train_smiles_list: unused; kept so existing callers keep working.
        train_fp_array: 2-D array-like of 0/1 fingerprint bits, one row per
            training molecule.
        train_ic50_array: IC50 value for each row of ``train_fp_array``.
        top_k: maximum number of neighbours to average.
        min_sim: similarity a neighbour must exceed to be used.

    Returns:
        The mean IC50 of the selected neighbours, or ``np.nan`` when the
        training set is empty, the SMILES cannot be parsed or fingerprinted,
        or no neighbour is similar enough.

    Raises:
        ValueError: if ``train_fp_array`` is non-empty but not 2-D.
    """
    train_fps = np.asarray(train_fp_array)
    train_ic50 = np.asarray(train_ic50_array, dtype=float)
    if train_fps.size == 0 or top_k <= 0:
        return np.nan
    if train_fps.ndim != 2:
        raise ValueError("train_fp_array must be 2-D (n_molecules, n_bits)")

    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return np.nan
    n_bits = train_fps.shape[1]
    try:
        bitvect = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=n_bits)
    except Exception:
        # Best-effort fallback path: any fingerprinting failure means "no estimate".
        return np.nan

    query = np.zeros((n_bits,), dtype=np.uint8)
    DataStructs.ConvertToNumpyArray(bitvect, query)

    # The training fingerprints are NumPy rows, not RDKit bit vectors, so the
    # Tanimoto coefficient |A & B| / |A | B| is computed directly on the bits.
    train_bits = train_fps.astype(bool)
    query_bits = query.astype(bool)
    common = np.logical_and(train_bits, query_bits).sum(axis=1)
    either = np.logical_or(train_bits, query_bits).sum(axis=1)
    sims = np.divide(
        common, either,
        out=np.zeros(train_bits.shape[0], dtype=float),
        where=either > 0,  # two all-zero fingerprints count as similarity 0
    )

    # Average the top_k most similar samples that clear the threshold.
    order = np.argsort(-sims, kind="stable")[:top_k]
    keep = order[sims[order] > min_sim]
    if keep.size == 0:
        return np.nan
    return np.mean(train_ic50[keep])


# ========= 4) 5-Fold CV(교차검증) =========
def cv_eval(X, y_pic50, y_ic50, seed=CFG['SEED'], n_splits=CFG['N_SPLITS'],
            iterations=CFG['N_ESTIMATORS'], learning_rate=CFG['LEARNING_RATE'],
            early_stopping_rounds=100):
    """Run shuffled K-fold cross-validation with CatBoost and score the OOF predictions.

    A fresh ``CatBoostRegressor`` (depth 6, RMSE loss) is fitted per fold on
    pIC50 targets; its predictions for the held-out rows are collected,
    converted to IC50 with ``pIC50_to_IC50`` and scored with ``comp_score``.

    Args:
        X: 2-D NumPy feature matrix (indexed with integer arrays).
        y_pic50: pIC50 targets aligned with ``X``.
        y_ic50: IC50 targets aligned with ``X``.
        seed: seed for the fold shuffling and for CatBoost.
        n_splits: number of folds.
        iterations: maximum number of boosting iterations per fold.
        learning_rate: CatBoost learning rate.
        early_stopping_rounds: early-stopping patience. The default of 100
            keeps the value this function has always used.
            NOTE(review): CFG['EARLY_STOPPING_ROUNDS'] is 200 -- decide which
            one is intended.

    Returns:
        dict with keys ``score``, ``A``, ``B`` and ``rmse_ic50``.
    """
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)
    oof_pred_pic50 = np.zeros_like(y_pic50, dtype=float)

    print(" 교차검증 시작")

    for fold, (tr_idx, va_idx) in enumerate(kf.split(X), 1):
        X_tr, X_va = X[tr_idx], X[va_idx]
        y_tr, y_va = y_pic50[tr_idx], y_pic50[va_idx]

        model = CatBoostRegressor(
            iterations=iterations,
            learning_rate=learning_rate,
            depth=6,
            loss_function='RMSE',
            eval_metric='RMSE',
            early_stopping_rounds=early_stopping_rounds,
            random_seed=seed,
            verbose=False
        )

        # NOTE(review): the fold that is scored is also the early-stopping
        # eval_set, so the reported OOF metrics are likely optimistic.
        model.fit(
            X_tr, y_tr,
            eval_set=(X_va, y_va)
        )

        oof_pred_pic50[va_idx] = model.predict(X_va)
        print(f"Fold {fold} 완료 - best_iter: {model.get_best_iteration()}")

    y_pred_ic50 = pIC50_to_IC50(oof_pred_pic50)
    score, A, B, rmse = comp_score(y_ic50, y_pred_ic50, y_pic50, oof_pred_pic50)
    print("전체 OOF 평가 완료")
    return dict(score=score, A=A, B=B, rmse_ic50=rmse)

def comp_score(y_true_ic50, y_pred_ic50, y_true_pic50, y_pred_pic50):
    """Compute the blended score and its components.

    * ``rmse``: root mean squared error of the IC50 predictions.
    * ``A = 1 - min(rmse / range(y_true_ic50), 1)``.
    * ``B``: R^2 of the pIC50 predictions.
    * ``score = 0.4 * A + 0.6 * B``.

    Returns:
        Tuple ``(score, A, B, rmse)``.
    """
    # mean_squared_error returns the MSE; callers report this value as
    # "RMSE(IC50)" and it is range-normalised below, so take the square root.
    rmse = np.sqrt(mean_squared_error(y_true_ic50, y_pred_ic50))
    nrmse = rmse / (np.max(y_true_ic50) - np.min(y_true_ic50))
    A = 1 - min(nrmse, 1)
    B = r2_score(y_true_pic50, y_pred_pic50)
    score = 0.4 * A + 0.6 * B
    return score, A, B, rmse

if __name__ == "__main__":
    # ---- 1) Training data -> feature matrix -> cross-validated score ----
    df = preprocess_data()

    Xc, y_pic50c, y_ic50c, subc, imputer, scaler, desc_mean, Xc_fp = build_X_morgan_plus_rdkit(
        df, radius=2, nBits=CFG['NBITS'])

    # Column names: fingerprint bits first, then the descriptor columns.
    morgan_feature_names = [f"morgan_{i}" for i in range(CFG['NBITS'])]
    rdkit_feature_names = [f"rdkit_{i}" for i in range(Xc.shape[1] - CFG['NBITS'])]
    feature_names = morgan_feature_names + rdkit_feature_names
    Xc_df = pd.DataFrame(Xc, columns=feature_names)

    resc = cv_eval(Xc_df.values, y_pic50c, y_ic50c)
    print(f"[catboost] Score={resc['score']:.4f} | A={resc['A']:.4f} | B={resc['B']:.4f} | RMSE(IC50)={resc['rmse_ic50']:.4f}")

    # ---- 2) Test features ----
    output_path = "result_jy/cat_submission.csv"
    print("\n 예측 결과를 제출 파일로 변환 중...")

    test_df = pd.read_csv("./test.csv")
    test_df['fingerprint'] = test_df['Smiles'].apply(smiles_to_morgan_fp)
    test_df['descriptors'] = test_df['Smiles'].apply(calculate_rdkit_descriptors)

    # Only the fingerprint can be None (unparsable SMILES); the descriptor
    # helper returns an all-NaN array instead.
    valid_test_mask = test_df['fingerprint'].apply(lambda x: x is not None) & test_df['descriptors'].apply(lambda x: x is not None)

    fp_test_stack = np.stack(test_df.loc[valid_test_mask, 'fingerprint'].values)
    desc_test_stack = np.stack(test_df.loc[valid_test_mask, 'descriptors'].values).astype(float)
    # Apply exactly the preprocessing fitted on the training descriptors
    # (median imputation, then scaling). Filling with the training mean here
    # would disagree with what the models saw during training.
    desc_test_stack[~np.isfinite(desc_test_stack)] = np.nan
    desc_test_stack = imputer.transform(desc_test_stack)
    desc_test_scaled = scaler.transform(desc_test_stack)
    X_test = np.hstack([fp_test_stack, desc_test_scaled])
    X_test_df = pd.DataFrame(X_test, columns=feature_names)

    # ---- 3) Fold ensemble: average the predictions of one model per fold ----
    test_preds = np.zeros(len(X_test_df))
    kf = KFold(n_splits=CFG['N_SPLITS'], shuffle=True, random_state=CFG['SEED'])

    for fold, (train_idx, val_idx) in enumerate(kf.split(Xc_df), 1):
        model = CatBoostRegressor(
            iterations=1000,
            learning_rate=0.01,
            depth=6,
            loss_function='RMSE',
            random_seed=CFG['SEED'],
            verbose=False
        )
        model.fit(Xc_df.iloc[train_idx], y_pic50c[train_idx])
        test_preds += model.predict(X_test_df) / CFG['N_SPLITS']

    ic50_preds = pIC50_to_IC50(test_preds)

    # ---- 4) Submission ----
    submission_df = pd.read_csv("./sample_submission.csv")

    pred_df = pd.DataFrame({
        'ID': test_df.loc[valid_test_mask, 'ID'].values,
        'ASK1_IC50_nM': ic50_preds
    })

    # Write the model predictions into the submission by ID. IDs without a
    # prediction become NaN and are filled by the fallback below.
    pred_by_id = dict(zip(pred_df['ID'], pred_df['ASK1_IC50_nM']))
    submission_df['ASK1_IC50_nM'] = submission_df['ID'].map(pred_by_id)

    failed_mask = submission_df['ASK1_IC50_nM'].isna()
    ic50_mean = df['ic50'].mean()  # last-resort value, computed once

    for idx in submission_df[failed_mask].index:
        test_id = submission_df.loc[idx, 'ID']
        matches = test_df.loc[test_df['ID'] == test_id, 'Smiles'].values

        fallback_ic50 = np.nan
        if len(matches) > 0:  # the ID may be missing from test.csv
            fallback_ic50 = get_similar_ic50(
                matches[0],
                subc['smiles'].tolist(),
                Xc_fp,
                y_ic50c
            )

        if np.isnan(fallback_ic50):
            fallback_ic50 = ic50_mean

        submission_df.loc[idx, 'ASK1_IC50_nM'] = fallback_ic50

    # Save (create the output directory first so to_csv cannot fail on it)
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    submission_df.to_csv(output_path, index=False)
    print(f"✅ 제출 파일 저장 완료: {output_path}")


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.6 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "c:\Users\user\anaconda3\envs\cat2\lib\runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "c:\Users\user\anaconda3\envs\cat2\lib\runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "c:\Users\user\anaconda3\envs\cat2\lib\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "c:\Users\user\anaconda3\envs\cat2\lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "c

AttributeError: _ARRAY_API not found


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.6 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "c:\Users\user\anaconda3\envs\cat2\lib\runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "c:\Users\user\anaconda3\envs\cat2\lib\runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "c:\Users\user\anaconda3\envs\cat2\lib\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "c:\Users\user\anaconda3\envs\cat2\lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "c

AttributeError: _ARRAY_API not found


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.6 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "c:\Users\user\anaconda3\envs\cat2\lib\runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "c:\Users\user\anaconda3\envs\cat2\lib\runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "c:\Users\user\anaconda3\envs\cat2\lib\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "c:\Users\user\anaconda3\envs\cat2\lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "c

AttributeError: _ARRAY_API not found


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.6 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "c:\Users\user\anaconda3\envs\cat2\lib\runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "c:\Users\user\anaconda3\envs\cat2\lib\runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "c:\Users\user\anaconda3\envs\cat2\lib\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "c:\Users\user\anaconda3\envs\cat2\lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "c

AttributeError: _ARRAY_API not found


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.6 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "c:\Users\user\anaconda3\envs\cat2\lib\runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "c:\Users\user\anaconda3\envs\cat2\lib\runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "c:\Users\user\anaconda3\envs\cat2\lib\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "c:\Users\user\anaconda3\envs\cat2\lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "c

AttributeError: _ARRAY_API not found


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.6 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "c:\Users\user\anaconda3\envs\cat2\lib\runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "c:\Users\user\anaconda3\envs\cat2\lib\runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "c:\Users\user\anaconda3\envs\cat2\lib\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "c:\Users\user\anaconda3\envs\cat2\lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "c

AttributeError: _ARRAY_API not found


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.6 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "c:\Users\user\anaconda3\envs\cat2\lib\runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "c:\Users\user\anaconda3\envs\cat2\lib\runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "c:\Users\user\anaconda3\envs\cat2\lib\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "c:\Users\user\anaconda3\envs\cat2\lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "c

AttributeError: _ARRAY_API not found


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.6 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "c:\Users\user\anaconda3\envs\cat2\lib\runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "c:\Users\user\anaconda3\envs\cat2\lib\runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "c:\Users\user\anaconda3\envs\cat2\lib\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "c:\Users\user\anaconda3\envs\cat2\lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "c

AttributeError: _ARRAY_API not found


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.6 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "c:\Users\user\anaconda3\envs\cat2\lib\runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "c:\Users\user\anaconda3\envs\cat2\lib\runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "c:\Users\user\anaconda3\envs\cat2\lib\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "c:\Users\user\anaconda3\envs\cat2\lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "c

AttributeError: _ARRAY_API not found


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.6 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "c:\Users\user\anaconda3\envs\cat2\lib\runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "c:\Users\user\anaconda3\envs\cat2\lib\runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "c:\Users\user\anaconda3\envs\cat2\lib\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "c:\Users\user\anaconda3\envs\cat2\lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "c

AttributeError: _ARRAY_API not found


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.2.6 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "c:\Users\user\anaconda3\envs\cat2\lib\runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "c:\Users\user\anaconda3\envs\cat2\lib\runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "c:\Users\user\anaconda3\envs\cat2\lib\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "c:\Users\user\anaconda3\envs\cat2\lib\site-packages\traitlets\config\application.py", line 1075, in launch_instance
    app.start()
  File "c

AttributeError: _ARRAY_API not found