In [2]:
pip install xgboost

Collecting xgboost
  Downloading xgboost-3.0.2-py3-none-macosx_12_0_arm64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.2-py3-none-macosx_12_0_arm64.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: xgboost
Successfully installed xgboost-3.0.2
Note: you may need to restart the kernel to use updated packages.


In [5]:
# 이미 설정하신 SEED 값 활용
SEED = 42

# 필수 라이브러리 import
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem, Descriptors
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import lightgbm as lgb
import xgboost as xgb
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

# 재현성 보장
np.random.seed(SEED)


In [18]:
def extract_molecular_features(smiles_list, verbose=True):
    """
    Morgan FP + 15개 RDKit descriptor 추출
    (FractionCSP3 철자 수정, 실패 시 0 반환)
    """
    features, failed = [], 0
    frac_csp3_fn = getattr(Descriptors, "FractionCSP3", None)

    for i, smi in enumerate(smiles_list):
        if verbose and i % 1000 == 0:
            print(f"처리 {i}/{len(smiles_list)}")

        mol = Chem.MolFromSmiles(smi)
        if mol is None:
            features.append(np.zeros(2048 + 15))
            failed += 1
            continue

        fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048)
        desc = [
            Descriptors.MolWt(mol),
            Descriptors.MolLogP(mol),
            Descriptors.NumHDonors(mol),
            Descriptors.NumHAcceptors(mol),
            Descriptors.TPSA(mol),
            Descriptors.NumRotatableBonds(mol),
            Descriptors.NumAromaticRings(mol),
            Descriptors.NumSaturatedRings(mol),
            Descriptors.NumAliphaticRings(mol),
            safe_call(frac_csp3_fn, mol) if frac_csp3_fn else 0.0,
            Descriptors.BertzCT(mol),
            Descriptors.BalabanJ(mol),
            Descriptors.HallKierAlpha(mol),
            Descriptors.HeavyAtomCount(mol),
            Descriptors.FpDensityMorgan2(mol),
        ]
        desc = [0 if pd.isna(x) else x for x in desc]
        features.append(np.concatenate([np.array(fp), np.array(desc)]))

    print(f"완료 – 실패 {failed}/{len(smiles_list)}")
    return np.array(features)


In [19]:
# 데이터 파일 로드
def load_and_preprocess_data():
    """
    학습 데이터와 테스트 데이터를 로드하고 전처리
    """
    # 학습 데이터 로드 (파일명은 실제 데이터에 맞게 수정)
    try:
        train_data = pd.read_csv('/Users/junu/Documents/Project/Jump_Team_Project/Data/ChEMBL_ASK1(IC50).csv', sep=';')
        print(f"학습 데이터 로드 완료: {train_data.shape}")
    except:
        print("ChEMBL_ASK1(IC50).csv 파일을 찾을 수 없습니다.")
        return None, None, None, None
    
    # 추가 데이터가 있다면 병합
    try:
        pubchem_data = pd.read_csv('/Users/junu/Documents/Project/Jump_Team_Project/Data/Pubchem_ASK1.csv')
        train_data = pd.concat([train_data, pubchem_data], ignore_index=True)
        print(f"PubChem 데이터 병합 완료: {train_data.shape}")
    except:
        print("PubChem 데이터 없음, 기본 데이터만 사용")
    
    # 테스트 데이터 로드
    try:
        test_data = pd.read_csv('/Users/junu/Documents/Project/Jump_Team_Project/Data/test.csv')
        print(f"테스트 데이터 로드 완료: {test_data.shape}")
    except:
        print("test.csv 파일을 찾을 수 없습니다.")
        return None, None, None, None
    
    # 데이터 정리
    train_data = train_data.dropna(subset=['Smiles'])
    test_data = test_data.dropna(subset=['Smiles'])
    
    # IC50 값 처리 (컬럼명이 다를 수 있으니 확인)
    if 'IC50_nM' in train_data.columns:
        ic50_col = 'IC50_nM'
    elif 'Value' in train_data.columns:
        ic50_col = 'Value'
    else:
        ic50_col = train_data.columns[-1]  # 마지막 컬럼 사용
    
    # 이상치 제거 (IC50 값)
    Q1 = train_data[ic50_col].quantile(0.25)
    Q3 = train_data[ic50_col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    train_data = train_data[(train_data[ic50_col] >= lower_bound) & 
                           (train_data[ic50_col] <= upper_bound)]
    
    print(f"이상치 제거 후 학습 데이터: {train_data.shape}")
    
    return train_data, test_data, ic50_col, train_data['Smiles'].tolist(), test_data['Smiles'].tolist()

# 데이터 로드 실행
train_data, test_data, ic50_col, train_smiles, test_smiles = load_and_preprocess_data()


학습 데이터 로드 완료: (824, 48)
PubChem 데이터 병합 완료: (24619, 81)
테스트 데이터 로드 완료: (127, 2)
이상치 제거 후 학습 데이터: (657, 81)


In [21]:
if train_data is not None:
    # 분자 특성 추출
    print("학습 데이터 특성 추출 중...")
    X_train = extract_molecular_features(train_smiles)
    
    print("테스트 데이터 특성 추출 중...")
    X_test = extract_molecular_features(test_smiles)
    
    # 타겟 변수 처리 (log 변환으로 분포 안정화)
    y_train = np.log1p(train_data[ic50_col].values)
    
    # 특성 스케일링
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    print(f"특성 추출 완료:")
    print(f"- 학습 특성: {X_train.shape}")
    print(f"- 테스트 특성: {X_test.shape}")
    print(f"- 타겟 분포: 평균={y_train.mean():.3f}, 표준편차={y_train.std():.3f}")


학습 데이터 특성 추출 중...
처리 0/657




NameError: name 'safe_call' is not defined

In [22]:
import lightgbm as lgb
print(lgb.__version__)

4.6.0


In [24]:
pip install --upgrade lightgbm

Note: you may need to restart the kernel to use updated packages.


In [25]:
pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-macosx_11_0_universal2.whl.metadata (1.4 kB)
Collecting graphviz (from catboost)
  Downloading graphviz-0.21-py3-none-any.whl.metadata (12 kB)
Downloading catboost-1.2.8-cp312-cp312-macosx_11_0_universal2.whl (27.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.8/27.8 MB[0m [31m18.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading graphviz-0.21-py3-none-any.whl (47 kB)
Installing collected packages: graphviz, catboost
Successfully installed catboost-1.2.8 graphviz-0.21
Note: you may need to restart the kernel to use updated packages.


In [26]:
pip install optuna

Collecting optuna
  Downloading optuna-4.4.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.4.0-py3-none-any.whl (395 kB)
Downloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.9.0 optuna-4.4.0
Note: you may need to restart the kernel to use updated packages.


In [1]:
pip install mordred

Collecting mordred
  Downloading mordred-1.2.0.tar.gz (128 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting networkx==2.* (from mordred)
  Downloading networkx-2.8.8-py3-none-any.whl.metadata (5.1 kB)
Downloading networkx-2.8.8-py3-none-any.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m23.6 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: mordred
  Building wheel for mordred (setup.py) ... [?25ldone
[?25h  Created wheel for mordred: filename=mordred-1.2.0-py3-none-any.whl size=176718 sha256=4e5dde1fe30b97de1e1cd1447f91e7df9019b0489289241dd0188e39cfd7071e
  Stored in directory: /Users/junu/Library/Caches/pip/wheels/e8/79/b8/f4f1dfbb736c2b8605cf5068cd633f4d2869defb89908aef93
Successfully built mordred
Installing collected packages: networkx, mordred
  Attempting uninstall: networkx
    Found existing installation: networkx 3.3
    Uninstalling networkx-3.3:
      Successfully uninstalled networkx-3

In [1]:
conda install -c conda-forge lightgbm xgboost catboost

Retrieving notices: done
Channels:
 - conda-forge
 - defaults
Platform: osx-arm64
Collecting package metadata (repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /opt/anaconda3

  added / updated specs:
    - catboost
    - lightgbm
    - xgboost


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    atk-1.0-2.38.0             |       hcb7b3dd_1         364 KB  conda-forge
    boost-cpp-1.84.0           |       hca5e981_3          16 KB  conda-forge
    ca-certificates-2025.7.9   |       hbd8a1cb_0         149 KB  conda-forge
    cairo-1.18.0               |       hd1e100b_0         877 KB  conda-forge
    catboost-1.2.7             |cpu_py312hc95783c_1         8.5 MB  conda-forge
    certifi-2025.7.9           |     pyhd8ed1ab_0         153 KB  conda-forge
    conda-24.11.3              |  py312h81bd7bf_0         1.1 MB  conda-forge
    font-ttf-dejav

In [2]:
pip install optuna shap rdkit-pypi mordred-community

[31mERROR: Could not find a version that satisfies the requirement rdkit-pypi (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for rdkit-pypi[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.
