In [1]:
# Mount Google Drive at /content/drive so files stored on Drive can be read from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Load data: extract the dataset archive stored on Drive into the local `data` directory
!unzip '/content/drive/MyDrive/data/신약개발데이터.zip' -d data

Archive:  /content/drive/MyDrive/data/신약개발데이터.zip
  inflating: data/sample_submission.csv  
  inflating: data/test.csv           
  inflating: data/train.csv          


### Import

In [4]:
!pip install rdkit

Collecting rdkit
  Downloading rdkit-2024.3.5-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.9 kB)
Downloading rdkit-2024.3.5-cp310-cp310-manylinux_2_28_x86_64.whl (33.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.1/33.1 MB[0m [31m48.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit
Successfully installed rdkit-2024.3.5


In [2]:
import pandas as pd
import numpy as np
import os
import random

from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [3]:
# Notebook-wide settings: NBITS is the fingerprint length, SEED the random seed.
CFG = dict(NBITS=2048, SEED=42)

In [4]:
def seed_everything(seed):
    """Fix every random seed used by the code in this notebook.

    Sets the ``PYTHONHASHSEED`` environment variable to ``str(seed)`` and
    seeds both Python's ``random`` module and NumPy's global generator.

    Args:
        seed: Seed value applied to all three sources.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
seed_everything(CFG['SEED']) # fix all seeds to the configured value

### DataLoad

In [5]:
# Convert SMILES data into molecular fingerprints
def smiles_to_fingerprint(smiles, radius=2, n_bits=None):
    """Convert a SMILES string into a Morgan fingerprint bit array.

    Args:
        smiles: SMILES representation of a molecule. Non-string values
            (for example NaN from an empty CSV cell) are treated like an
            unparsable SMILES instead of being passed to RDKit.
        radius: Morgan fingerprint radius. Defaults to 2.
        n_bits: Fingerprint length in bits. Defaults to ``CFG['NBITS']``.

    Returns:
        A 1-D integer NumPy array of length ``n_bits``. When the molecule
        cannot be parsed the array is all zeros.
    """
    if n_bits is None:
        n_bits = CFG['NBITS']

    mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    if mol is None:
        # Integer dtype, matching the parsed branch, so stacking a mix of
        # valid and invalid rows does not silently upcast the matrix to float.
        return np.zeros((n_bits,), dtype=int)

    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
    return np.array(fp, dtype=int)

In [6]:
# Load the ChEMBL training data
chembl_data = pd.read_csv('/content/data/train.csv')  # example file name
# chembl_data.head()

In [7]:
# chembl_data.describe(include='all')

### Data Pre-processing

In [8]:
# .copy() makes `train` an independent frame, so adding the Fingerprint column
# below no longer triggers pandas' SettingWithCopyWarning.
train = chembl_data[['Smiles', 'pIC50']].copy()
train['Fingerprint'] = train['Smiles'].apply(smiles_to_fingerprint)

# Stack the per-row fingerprint arrays into a single 2-D feature matrix.
train_x = np.stack(train['Fingerprint'].values)
train_y = train['pIC50'].values

# Split into training and validation sets (70% / 30%), seeded from the shared config.
train_x, val_x, train_y, val_y = train_test_split(
    train_x, train_y, test_size=0.3, random_state=CFG['SEED']
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['Fingerprint'] = train['Smiles'].apply(smiles_to_fingerprint)


### Train & Validation

In [35]:
!pip install catboost
!pip install lightgbm
!pip install xgboost

Collecting catboost
  Downloading catboost-1.2.5-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.5-cp310-cp310-manylinux2014_x86_64.whl (98.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 MB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.5
Collecting nvidia-nccl-cu12 (from xgboost)
  Downloading nvidia_nccl_cu12-2.22.3-py3-none-manylinux2014_x86_64.whl.metadata (1.8 kB)
Downloading nvidia_nccl_cu12-2.22.3-py3-none-manylinux2014_x86_64.whl (190.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.9/190.9 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: nvidia-nccl-cu12
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
torch 2.3.1+cu121 requires nvidia-cublas-cu12==12.1.

In [9]:
# 기본 선형 회귀
from sklearn.linear_model import LinearRegression
# 다항 회귀 (PolynomialFeatures는 다항 특성 생성을 위한 전처리)
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
# 릿지 회귀
from sklearn.linear_model import Ridge
# 라쏘 회귀
from sklearn.linear_model import Lasso
# 엘라스틱 넷
from sklearn.linear_model import ElasticNet
# 주성분 회귀 (PCR)
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
# Partial least squares regression (PLS)
from sklearn.cross_decomposition import PLSRegression
# 서포트 벡터 회귀
from sklearn.svm import SVR
# 결정 트리 회귀
from sklearn.tree import DecisionTreeRegressor
# 랜덤 포레스트 회귀
from sklearn.ensemble import RandomForestRegressor
# 부스팅 회귀 (예: Gradient Boosting Regressor)
from sklearn.ensemble import GradientBoostingRegressor
# 가우시안 프로세스 회귀
from sklearn.gaussian_process import GaussianProcessRegressor
# 베이지안 회귀
from sklearn.linear_model import BayesianRidge
# 배깅 (Bagging) 회귀 모델
from sklearn.ensemble import BaggingRegressor,AdaBoostRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import StackingRegressor

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [10]:
from sklearn.pipeline import Pipeline

In [11]:
# Linear-model baselines, each wrapped in a Pipeline.
pipeline_lr = Pipeline(steps=[('linear_regression', LinearRegression())])

# Degree-2 polynomial feature expansion followed by a linear regression.
pipeline_poly = Pipeline(steps=[
    ('polynomial_features', PolynomialFeatures(degree=2)),
    ('linear_regression', LinearRegression()),
])

# Regularised linear models.
pipeline_ridge = Pipeline(steps=[('ridge_regression', Ridge(alpha=1.0))])
pipeline_lasso = Pipeline(steps=[('lasso_regression', Lasso(alpha=1.0))])
pipeline_elasticnet = Pipeline(steps=[
    ('elasticnet_regression', ElasticNet(alpha=1.0, l1_ratio=0.5)),
])

# Principal component regression: PCA to two components, then a linear regression.
pipeline_pcr = Pipeline(steps=[
    ('pca', PCA(n_components=2)),
    ('linear_regression', LinearRegression()),
])
# PLSRegression is itself a regressor, so it is the pipeline's final (and only)
# step. Used as an intermediate step, its fit_transform(X, y) returns an
# (x_scores, y_scores) tuple, which a following LinearRegression cannot be fit on.
pipeline_pls = Pipeline([
    ('pls', PLSRegression(n_components=2))
])
# Remaining candidate regressors, each wrapped as a one-step Pipeline.
pipeline_svr = Pipeline(steps=[('svr', SVR(kernel='linear'))])
pipeline_dt = Pipeline(steps=[('decision_tree', DecisionTreeRegressor())])
pipeline_rf = Pipeline(steps=[('random_forest', RandomForestRegressor())])
pipeline_gb = Pipeline(steps=[('gradient_boosting', GradientBoostingRegressor())])
pipeline_gp = Pipeline(steps=[('gaussian_process', GaussianProcessRegressor())])
pipeline_br = Pipeline(steps=[('bayesian_ridge', BayesianRidge())])
pipeline_bagging = Pipeline(steps=[('bagging', BaggingRegressor())])
pipeline_ada = Pipeline(steps=[('adaboost', AdaBoostRegressor())])
pipeline_lgbm = Pipeline(steps=[('lightgbm', LGBMRegressor())])
pipeline_xgb = Pipeline(steps=[('xgboost', XGBRegressor())])
pipeline_cat = Pipeline(steps=[('catboost', CatBoostRegressor())])

# Stacking regression: combines several models, with a RandomForest as the meta-model.
estimators = [('xgb', pipeline_xgb), ('lgbm', pipeline_lgbm), ('cat', pipeline_cat)]
pipeline_stacking = Pipeline(steps=[
    ('stacking', StackingRegressor(
        estimators=estimators,
        final_estimator=RandomForestRegressor(),
    )),
])

In [12]:
# 모든 파이프라인을 리스트로 관리
# Registry of every candidate pipeline, kept as a list of (display name, pipeline)
# pairs in the order they were defined.
pipelines = list({
    'Linear Regression': pipeline_lr,
    'Polynomial Regression': pipeline_poly,
    'Ridge Regression': pipeline_ridge,
    'Lasso Regression': pipeline_lasso,
    'ElasticNet Regression': pipeline_elasticnet,
    'Principal Component Regression (PCR)': pipeline_pcr,
    'Partial Least Squares (PLS)': pipeline_pls,
    'Support Vector Regression (SVR)': pipeline_svr,
    'Decision Tree Regression': pipeline_dt,
    'Random Forest Regression': pipeline_rf,
    'Gradient Boosting Regression': pipeline_gb,
    'Gaussian Process Regression': pipeline_gp,
    'Bayesian Ridge Regression': pipeline_br,
    'Bagging Regression': pipeline_bagging,
    'AdaBoost Regression': pipeline_ada,
    'LightGBM Regression': pipeline_lgbm,
    'XGBoost Regression': pipeline_xgb,
    'Catboost Regression': pipeline_cat,
    'Stacking Regression': pipeline_stacking,
}.items())

In [45]:
from sklearn.model_selection import cross_val_score, cross_val_predict
def pIC50_to_IC50(pic50_values):
    """Convert pIC50 values to IC50 (nM).

    Computes ``10 ** (9 - pIC50)`` in floating point.

    Args:
        pic50_values: A scalar, NumPy array, pandas Series, list or tuple of
            pIC50 values.

    Returns:
        The IC50 value(s) in nM. Arrays and Series keep their type; lists and
        tuples are returned as a NumPy array.
    """
    if isinstance(pic50_values, (list, tuple)):
        pic50_values = np.asarray(pic50_values, dtype=float)
    # Float base and offset: with integer operands NumPy refuses negative
    # exponents, so an integer array containing a pIC50 above 9 would raise.
    return 10.0 ** (9.0 - pic50_values)

# Separate helper that scores a fitted model on held-out data
def evaluate_model(pipeline, val_y, features=None):
    """Return the RMSE of a fitted model, measured on the IC50 (nM) scale.

    Args:
        pipeline: Fitted estimator exposing ``predict``.
        val_y: True pIC50 targets for the rows being scored.
        features: Feature matrix to predict on. Defaults to the notebook-level
            ``val_x`` so existing two-argument calls keep working.

    Returns:
        Root mean squared error between the true and predicted targets after
        both are converted from pIC50 to IC50.
    """
    if features is None:
        features = val_x
    val_y_pred = pipeline.predict(features)
    mse = mean_squared_error(pIC50_to_IC50(val_y), pIC50_to_IC50(val_y_pred))
    return np.sqrt(mse)


In [41]:
# 파이프라인을 통해서 모든 머신러닝을 학습하고 평가 - 리소스 부족 문제 발생
# 파이프라인 재 구성
# 랜덤 포레스트 회귀
from sklearn.ensemble import VotingRegressor, StackingRegressor
# Every estimator below takes its seed from CFG['SEED'] so that changing the
# configured seed changes the whole notebook consistently.
pipeline_rfr = Pipeline([
    ('rfr', RandomForestRegressor(n_estimators=100, random_state=CFG['SEED']))
])

# Gradient Boosting regression
pipeline_gbr = Pipeline([
    ('gbr', GradientBoostingRegressor(n_estimators=100, random_state=CFG['SEED']))
])

# XGBoost regression
pipeline_xgb = Pipeline([
    ('xgb', XGBRegressor(n_estimators=100, random_state=CFG['SEED']))
])

# LightGBM regression
pipeline_lgb = Pipeline([
    ('lgb', LGBMRegressor(n_estimators=100, random_state=CFG['SEED']))
])

# CatBoost regression
pipeline_cb = Pipeline([
    ('cb', CatBoostRegressor(n_estimators=100, random_state=CFG['SEED'], verbose=0))
])

# Ensemble technique - Voting Regressor
voting_pipeline = Pipeline([
    ('voting', VotingRegressor(estimators=[
        ('rfr', RandomForestRegressor(n_estimators=100, random_state=CFG['SEED'])),
        ('gbr', GradientBoostingRegressor(n_estimators=100, random_state=CFG['SEED'])),
        ('xgb', XGBRegressor(n_estimators=100, random_state=CFG['SEED'])),
        ('lgb', LGBMRegressor(n_estimators=100, random_state=CFG['SEED']))
    ]))
])

# Ensemble technique - Stacking Regressor
stacking_estimators = [
    ('rfr', RandomForestRegressor(n_estimators=100, random_state=CFG['SEED'])),
    ('gbr', GradientBoostingRegressor(n_estimators=100, random_state=CFG['SEED'])),
    ('xgb', XGBRegressor(n_estimators=100, random_state=CFG['SEED']))
]

# Ridge is the meta-model fitted on the base estimators' predictions.
stacking_pipeline = Pipeline([
    ('stacking', StackingRegressor(
        estimators=stacking_estimators,
        final_estimator=Ridge()
    ))
])

# The ensemble pipelines as (display name, pipeline) pairs
ensemble_pipelines = [
    ('Voting Regressor', voting_pipeline),
    ('Stacking Regressor', stacking_pipeline)
]


In [46]:
from sklearn.model_selection import cross_val_score
# Fit each ensemble on the training split and record its validation RMSE by name.
result = dict()
for name, pipeline in ensemble_pipelines:
    try:
        pipeline.fit(train_x, train_y)
        rmse = evaluate_model(pipeline, val_y)
        print(f"{name} RMSE: {rmse:.4f}")
        result[name] = rmse
    except Exception as exc:
        # Report the failure and carry on with the next ensemble.
        print(f"{name} Error: {exc}")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006607 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1132
[LightGBM] [Info] Number of data points in the train set: 1366, number of used features: 566
[LightGBM] [Info] Start training from score 7.527196
Voting Regressor RMSE: 2192.3972
Stacking Regressor RMSE: 2156.7484


In [47]:
# Display the validation RMSE collected for each ensemble
result

{'Voting Regressor': 2192.3971947838304,
 'Stacking Regressor': 2156.748387458801}

In [48]:
# Fit the stacking pipeline on the training split before it is used for test predictions
stacking_pipeline.fit(train_x, train_y)

In [49]:
# Fingerprint the test molecules, predict pIC50 with the stacking pipeline and
# write the predictions, converted to IC50 (nM), to a submission file.
test = pd.read_csv('/content/data/test.csv').assign(
    Fingerprint=lambda frame: frame['Smiles'].apply(smiles_to_fingerprint)
)
test_x = np.stack(list(test['Fingerprint']))
print(test_x.shape)  # (113,2048)
test_y_pred = stacking_pipeline.predict(test_x)
submit = pd.read_csv('/content/data/sample_submission.csv').assign(
    IC50_nM=pIC50_to_IC50(test_y_pred)
)
submit.to_csv('./baseline_submit3.csv', index=False)

(113, 2048)




In [None]:
# RandomForestRegressor  튜닝
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
# Initialise the model to tune
model = RandomForestRegressor(random_state=CFG['SEED'])
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    # 1.0 (use all features) replaces the former 'auto' option, which current
    # scikit-learn releases reject for RandomForestRegressor.
    'max_features': [1.0, 'sqrt', 'log2'],
    'bootstrap': [True, False]
}


def _ic50_rmse(y_true, y_pred):
    """Return the RMSE between true and predicted targets on the IC50 (nM) scale.

    Both inputs are pIC50 values and are converted with ``pIC50_to_IC50``
    before the error is computed.
    """
    mse = mean_squared_error(pIC50_to_IC50(y_true), pIC50_to_IC50(y_pred))
    return np.sqrt(mse)


# Custom error scorer. The metric function has its own name so that
# `rmse_score` only ever refers to the scorer object; greater_is_better=False
# tells the search that lower values are better.
rmse_score = make_scorer(_ic50_rmse, greater_is_better=False)

grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring=rmse_score, n_jobs=-1)
grid_search.fit(train_x, train_y)


In [None]:
# Take the best estimator found by the grid search, predict pIC50 for the test
# molecules and write the predictions, converted to IC50 (nM), to a submission file.
model = grid_search.best_estimator_
test = pd.read_csv('/content/data/test.csv').assign(
    Fingerprint=lambda frame: frame['Smiles'].apply(smiles_to_fingerprint)
)
test_x = np.stack(list(test['Fingerprint']))
test_y_pred = model.predict(test_x)
submit = pd.read_csv('/content/data/sample_submission.csv').assign(
    IC50_nM=pIC50_to_IC50(test_y_pred)
)
submit.to_csv('./baseline_submit4.csv', index=False)

In [37]:
# Train the random forest model on the training split, seeded from the shared config
model = RandomForestRegressor(random_state=CFG['SEED'])
model.fit(train_x, train_y)

In [38]:
def pIC50_to_IC50(pic50_values):
    """Convert pIC50 values to IC50 (nM).

    Computes ``10 ** (9 - pIC50)`` in floating point.

    Args:
        pic50_values: A scalar, NumPy array, pandas Series, list or tuple of
            pIC50 values.

    Returns:
        The IC50 value(s) in nM. Arrays and Series keep their type; lists and
        tuples are returned as a NumPy array.
    """
    if isinstance(pic50_values, (list, tuple)):
        pic50_values = np.asarray(pic50_values, dtype=float)
    # Float base and offset: with integer operands NumPy refuses negative
    # exponents, so an integer array containing a pIC50 above 9 would raise.
    return 10.0 ** (9.0 - pic50_values)

# Evaluate the trained model on the validation split; the error is measured
# after converting both targets and predictions from pIC50 to IC50 (nM).
val_y_pred = model.predict(val_x)
mse = mean_squared_error(
    y_true=pIC50_to_IC50(val_y),
    y_pred=pIC50_to_IC50(val_y_pred),
)
rmse = np.sqrt(mse)

print(f'RMSE: {rmse}')

RMSE: 2169.5781089857264


### Inference

In [22]:
# Read the test set and attach one fingerprint array per molecule.
test = pd.read_csv('/content/data/test.csv').assign(
    Fingerprint=lambda frame: frame['Smiles'].apply(smiles_to_fingerprint)
)



In [27]:
# The Fingerprint column holds one array per test row (113 rows of 2048 values
# in the recorded run); stack them into a single 2-D matrix before predicting.
test_x = np.stack(list(test['Fingerprint']))
print(test_x.shape)  # (113,2048)
test_y_pred = model.predict(test_x)
print(test_y_pred.shape)

(113, 2048)
(113,)


### Submission

In [28]:
# Fill the sample submission's IC50_nM column with the predictions converted
# from pIC50 to IC50 (nM), then preview the first rows.
submit = pd.read_csv('/content/data/sample_submission.csv').assign(
    IC50_nM=pIC50_to_IC50(test_y_pred)
)
submit.head()

Unnamed: 0,ID,IC50_nM
0,TEST_000,181.961706
1,TEST_001,31.6422
2,TEST_002,10.780527
3,TEST_003,21.376667
4,TEST_004,25.312789


In [29]:
# Write the submission to a CSV file without the DataFrame index
submit.to_csv('./baseline_submit.csv', index=False)