<a href="https://colab.research.google.com/github/kimjaehwankimjaehwan/Dacon/blob/main/xgboost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
!pip install rdkit

Collecting rdkit
  Downloading rdkit-2024.3.5-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.9 kB)
Downloading rdkit-2024.3.5-cp310-cp310-manylinux_2_28_x86_64.whl (33.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.1/33.1 MB[0m [31m44.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit
Successfully installed rdkit-2024.3.5


In [31]:
import pandas as pd
import numpy as np
import os
import random

from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

CFG = {
    'NBITS':2048,
    'SEED':42,
}

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(CFG['SEED']) # Seed 고정

# SMILES 데이터를 분자 지문으로 변환
def smiles_to_fingerprint(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is not None:
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=CFG['NBITS'])
        return np.array(fp)
    else:
        return np.zeros((CFG['NBITS'],))

# 학습 ChEMBL 데이터 로드
chembl_data = pd.read_csv('train.csv')  # 예시 파일 이름
chembl_data.head()

train = chembl_data[['Smiles', 'pIC50']]
train['Fingerprint'] = train['Smiles'].apply(smiles_to_fingerprint)

train_x = np.stack(train['Fingerprint'].values)
train_y = train['pIC50'].values

# train_x, train_y 를 하나로 합치고 csv 파일로 저장
train_data = pd.DataFrame({'Fingerprint': list(train_x), 'pIC50': train_y})
train_data.to_csv('train_data.csv', index=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['Fingerprint'] = train['Smiles'].apply(smiles_to_fingerprint)


In [32]:
def pIC50_to_IC50(pic50_values):
    """Convert pIC50 values to IC50 (nM)."""
    return 10 ** (9 - pic50_values)


test = pd.read_csv('./test.csv')
test['Fingerprint'] = test['Smiles'].apply(smiles_to_fingerprint)

test_x = np.stack(test['Fingerprint'].values)

# test_x 데이터를 csv 파일로 저장
test_data = pd.DataFrame({'Fingerprint': list(test_x)})
test_data.to_csv('test_data.csv', index=False)

submit = pd.read_csv('./sample_submission.csv')




In [14]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# XGBoost 모델을 학습 및 평가하는 함수
def train_xgboost_model(train_x, train_y, val_x, val_y):
    # 데이터셋을 DMatrix 형태로 변환
    dtrain = xgb.DMatrix(train_x, label=train_y)
    dval = xgb.DMatrix(val_x, label=val_y)

    # 파라미터 설정
    params = {
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'seed': CFG['SEED'],
    }

    # 모델 학습
    evals = [(dtrain, 'train'), (dval, 'eval')]
    model = xgb.train(params, dtrain, num_boost_round=1000, evals=evals, early_stopping_rounds=50, verbose_eval=100)

    return model

# 데이터 분리 (이전과 동일)
train_x, val_x, train_y, val_y = train_test_split(train_x, train_y, test_size=0.3, random_state=42)

# XGBoost 모델 학습
xgb_model = train_xgboost_model(train_x, train_y, val_x, val_y)

# 검증 데이터에 대한 예측
val_y_pred = xgb_model.predict(xgb.DMatrix(val_x))

# RMSE 계산
mse = mean_squared_error(pIC50_to_IC50(val_y), pIC50_to_IC50(val_y_pred))
rmse = np.sqrt(mse)

print(f'XGBoost RMSE: {rmse}')

# 테스트 데이터 예측
test_y_pred = xgb_model.predict(xgb.DMatrix(test_x))

# 예측 결과를 제출 파일에 저장
submit['IC50_nM'] = pIC50_to_IC50(test_y_pred)
submit.to_csv('./xgboost_submit.csv', index=False)


[0]	train-rmse:0.89649	eval-rmse:0.96123
[85]	train-rmse:0.12591	eval-rmse:0.71288
XGBoost RMSE: 1503.2879516929443


In [15]:
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import xgboost as xgb
from sklearn.model_selection import train_test_split

# 데이터 분리
train_x, val_x, train_y, val_y = train_test_split(train_x, train_y, test_size=0.3, random_state=42)

# 랜덤 포레스트 모델 학습
rf_model = RandomForestRegressor(random_state=CFG['SEED'])
rf_model.fit(train_x, train_y)
rf_val_pred = rf_model.predict(val_x)

# XGBoost 모델 학습
dtrain = xgb.DMatrix(train_x, label=train_y)
dval = xgb.DMatrix(val_x, label=val_y)
params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'seed': CFG['SEED'],
}
xgb_model = xgb.train(params, dtrain, num_boost_round=1000, evals=[(dtrain, 'train'), (dval, 'eval')], early_stopping_rounds=50, verbose_eval=100)
xgb_val_pred = xgb_model.predict(dval)

# 앙상블: 두 모델의 예측 결과를 평균
ensemble_val_pred = (rf_val_pred + xgb_val_pred) / 2

# RMSE 계산
mse = mean_squared_error(pIC50_to_IC50(val_y), pIC50_to_IC50(ensemble_val_pred))
rmse = np.sqrt(mse)

print(f'Ensemble RMSE: {rmse}')

# 테스트 데이터 예측
rf_test_pred = rf_model.predict(test_x)
xgb_test_pred = xgb_model.predict(xgb.DMatrix(test_x))

# 앙상블: 테스트 데이터에 대한 예측도 평균
ensemble_test_pred = (rf_test_pred + xgb_test_pred) / 2

# 예측 결과를 제출 파일에 저장
submit['IC50_nM'] = pIC50_to_IC50(ensemble_test_pred)
submit.to_csv('./ensemble_submit.csv', index=False)


[0]	train-rmse:0.85403	eval-rmse:1.03867
[55]	train-rmse:0.13011	eval-rmse:0.95418
Ensemble RMSE: 5330.351358056268


In [23]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np

# 데이터 분리
train_x, val_x, train_y, val_y = train_test_split(train_x, train_y, test_size=0.3, random_state=42)

# 신경망 모델 생성
def create_model(input_dim):
    model = Sequential()
    model.add(Dense(512, input_dim=input_dim, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(256, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(1, activation='linear'))  # 회귀 모델이므로 마지막 층은 선형 활성화 함수 사용
    return model

# 모델 컴파일
model = create_model(train_x.shape[1])
model.compile(optimizer='adam', loss='mean_squared_error')

# 모델 학습
history = model.fit(train_x, train_y, validation_data=(val_x, val_y), epochs=100, batch_size=32, verbose=1)

# 검증 데이터 예측
val_y_pred = model.predict(val_x)
mse = mean_squared_error(pIC50_to_IC50(val_y), pIC50_to_IC50(val_y_pred))
rmse = np.sqrt(mse)

# 테스트 데이터 예측
test_y_pred = model.predict(test_x)

# 예측 결과를 제출 파일에 저장
submit['IC50_nM'] = pIC50_to_IC50(test_y_pred)
submit.to_csv('deep_learning_submit.csv', index=False)

rmse


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step - loss: 61.2119 - val_loss: 52.2471
Epoch 2/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 153ms/step - loss: 53.6103 - val_loss: 44.9623
Epoch 3/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 89ms/step - loss: 44.7563 - val_loss: 36.5733
Epoch 4/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 98ms/step - loss: 36.2728 - val_loss: 27.1026
Epoch 5/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 137ms/step - loss: 28.3845 - val_loss: 17.0925
Epoch 6/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 129ms/step - loss: 19.2752 - val_loss: 7.7223
Epoch 7/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 137ms/step - loss: 6.0084 - val_loss: 1.2895
Epoch 8/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 138ms/step - loss: 2.2226 - val_loss: 0.8514
Epoch 9/100
[1m1/1[0m [32m━━━━━━━━━━━━━



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 110ms/step




[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step  


13509.265661573403

In [24]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.metrics import mean_squared_error
import numpy as np

# 데이터 분리 (이미 train_x, train_y, val_x, val_y로 분리되어 있다고 가정)
# train_x, val_x, train_y, val_y = ...

# 신경망 모델 생성
def create_optimized_model(input_dim):
    model = Sequential()
    model.add(Dense(1024, input_dim=input_dim, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.4))

    model.add(Dense(512, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.4))

    model.add(Dense(256, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.4))

    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.3))

    model.add(Dense(1, activation='linear'))  # 회귀 모델이므로 마지막 층은 선형 활성화 함수 사용
    return model

# 모델 컴파일
model = create_optimized_model(train_x.shape[1])
model.compile(optimizer='adam', loss='mean_squared_error')

# 콜백 설정
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.0001)

# 모델 학습
history = model.fit(train_x, train_y,
                    validation_data=(val_x, val_y),
                    epochs=100,
                    batch_size=64,
                    callbacks=[early_stopping, reduce_lr],
                    verbose=1)

# 검증 데이터 예측
val_y_pred = model.predict(val_x)
mse = mean_squared_error(pIC50_to_IC50(val_y), pIC50_to_IC50(val_y_pred))
rmse = np.sqrt(mse)

# 테스트 데이터 예측
test_y_pred = model.predict(test_x)

# 예측 결과를 제출 파일에 저장
submit['IC50_nM'] = pIC50_to_IC50(test_y_pred)
submit.to_csv('optimized_deep_learning_submit.csv', index=False)

rmse


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4s/step - loss: 62.1079 - val_loss: 58.2188 - learning_rate: 0.0010
Epoch 2/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 341ms/step - loss: 64.1318 - val_loss: 57.9586 - learning_rate: 0.0010
Epoch 3/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 115ms/step - loss: 50.1812 - val_loss: 57.6599 - learning_rate: 0.0010
Epoch 4/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 96ms/step - loss: 58.3699 - val_loss: 57.1154 - learning_rate: 0.0010
Epoch 5/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 100ms/step - loss: 53.4583 - val_loss: 56.4170 - learning_rate: 0.0010
Epoch 6/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 105ms/step - loss: 48.1448 - val_loss: 55.5674 - learning_rate: 0.0010
Epoch 7/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 147ms/step - loss: 49.9186 - val_loss: 54.5514 - learning_ra

2102895.460111993

In [25]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization

def create_deeper_model(input_dim):
    model = Sequential()

    # 첫 번째 레이어
    model.add(Dense(1024, input_dim=input_dim, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))

    # 추가된 중간 레이어
    model.add(Dense(512, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))

    model.add(Dense(512, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))

    model.add(Dense(256, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.4))

    model.add(Dense(256, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.4))

    # 마지막 레이어
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.3))

    model.add(Dense(1, activation='linear'))  # 회귀 문제이므로 마지막 층은 선형 활성화 함수 사용

    return model

# 모델 생성 및 컴파일
model = create_deeper_model(train_x.shape[1])
model.compile(optimizer='adam', loss='mean_squared_error')

# 모델 학습
history = model.fit(train_x, train_y,
                    validation_data=(val_x, val_y),
                    epochs=100,
                    batch_size=64,
                    callbacks=[early_stopping, reduce_lr],
                    verbose=1)

# 검증 데이터에 대한 예측 및 RMSE 계산
val_y_pred = model.predict(val_x)
mse = mean_squared_error(pIC50_to_IC50(val_y), pIC50_to_IC50(val_y_pred))
rmse = np.sqrt(mse)

# 테스트 데이터 예측
test_y_pred = model.predict(test_x)

# 예측 결과를 제출 파일에 저장
submit['IC50_nM'] = pIC50_to_IC50(test_y_pred)
submit.to_csv('deeper_model_submit.csv', index=False)

rmse


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5s/step - loss: 47.8477 - val_loss: 56.9487 - learning_rate: 0.0010
Epoch 2/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 110ms/step - loss: 53.2271 - val_loss: 56.9462 - learning_rate: 0.0010
Epoch 3/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 125ms/step - loss: 51.7546 - val_loss: 56.9556 - learning_rate: 0.0010
Epoch 4/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 114ms/step - loss: 47.4488 - val_loss: 56.7692 - learning_rate: 0.0010
Epoch 5/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 96ms/step - loss: 42.2586 - val_loss: 56.3712 - learning_rate: 0.0010
Epoch 6/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 139ms/step - loss: 39.4624 - val_loss: 55.8339 - learning_rate: 0.0010
Epoch 7/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 162ms/step - loss: 48.5611 - val_loss: 55.1642 - learning_ra

779910399.3670728

In [26]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam

# 신경망 모델 생성
def create_model_with_adamw(input_dim):
    model = Sequential()

    # 첫 번째 레이어
    model.add(Dense(1024, input_dim=input_dim, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))

    # 중간 레이어
    model.add(Dense(512, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))

    model.add(Dense(256, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.4))

    # 마지막 레이어
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.3))

    model.add(Dense(1, activation='linear'))  # 회귀 문제이므로 마지막 층은 선형 활성화 함수 사용

    return model

# 모델 생성
model = create_model_with_adamw(train_x.shape[1])

# AdamW 옵티마이저 설정
# learning_rate와 weight_decay는 실험에 따라 조정 가능
optimizer = Adam(learning_rate=0.001, decay=0.01)

# 모델 컴파일
model.compile(optimizer=optimizer, loss='mean_squared_error')

# 콜백 설정
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.0001)

# 모델 학습
history = model.fit(train_x, train_y,
                    validation_data=(val_x, val_y),
                    epochs=100,
                    batch_size=64,
                    callbacks=[early_stopping, reduce_lr],
                    verbose=1)

# 검증 데이터 예측 및 RMSE 계산
val_y_pred = model.predict(val_x)
mse = mean_squared_error(pIC50_to_IC50(val_y), pIC50_to_IC50(val_y_pred))
rmse = np.sqrt(mse)

# 테스트 데이터 예측
test_y_pred = model.predict(test_x)

# 예측 결과를 제출 파일에 저장
submit['IC50_nM'] = pIC50_to_IC50(test_y_pred)
submit.to_csv('adamw_model_submit.csv', index=False)

rmse


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4s/step - loss: 71.5571 - val_loss: 58.7611 - learning_rate: 0.0010
Epoch 2/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 114ms/step - loss: 67.5287 - val_loss: 58.4562 - learning_rate: 0.0010
Epoch 3/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 135ms/step - loss: 66.2150 - val_loss: 57.9103 - learning_rate: 0.0010
Epoch 4/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 129ms/step - loss: 66.8992 - val_loss: 57.2845 - learning_rate: 0.0010
Epoch 5/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 132ms/step - loss: 54.5800 - val_loss: 56.6353 - learning_rate: 0.0010
Epoch 6/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 109ms/step - loss: 62.6661 - val_loss: 55.7728 - learning_rate: 0.0010
Epoch 7/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 142ms/step - loss: 61.2547 - val_loss: 54.8636 - learning_rate: 0.0010


267073.3557743491

In [27]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization

def create_deeper_model(input_dim):
    model = Sequential()

    # 첫 번째 레이어
    model.add(Dense(2048, input_dim=input_dim, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))

    # 중간 레이어들
    model.add(Dense(1024, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))

    model.add(Dense(1024, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))

    model.add(Dense(512, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.4))

    model.add(Dense(512, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.4))

    model.add(Dense(256, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.3))

    model.add(Dense(256, activation='relu'))
    model.add(BatchNormalization())
    model.add(Dropout(0.3))

    # 마지막 레이어
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.3))

    model.add(Dense(1, activation='linear'))  # 회귀 문제이므로 마지막 층은 선형 활성화 함수 사용

    return model

# 모델 생성
model = create_deeper_model(train_x.shape[1])

# AdamW 옵티마이저 사용
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001, decay=0.01)
model.compile(optimizer=optimizer, loss='mean_squared_error')

# 콜백 설정
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=5, min_lr=0.0001)

# 모델 학습
history = model.fit(train_x, train_y,
                    validation_data=(val_x, val_y),
                    epochs=100,
                    batch_size=64,
                    callbacks=[early_stopping, reduce_lr],
                    verbose=1)

# 검증 데이터 예측 및 RMSE 계산
val_y_pred = model.predict(val_x)
mse = mean_squared_error(pIC50_to_IC50(val_y), pIC50_to_IC50(val_y_pred))
rmse = np.sqrt(mse)

# 테스트 데이터 예측
test_y_pred = model.predict(test_x)

# 예측 결과를 제출 파일에 저장
submit['IC50_nM'] = pIC50_to_IC50(test_y_pred)
submit.to_csv('deeper_model_submit.csv', index=False)

rmse


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 7s/step - loss: 60.0056 - val_loss: 58.2984 - learning_rate: 0.0010
Epoch 2/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 205ms/step - loss: 67.8590 - val_loss: 58.0640 - learning_rate: 0.0010
Epoch 3/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 269ms/step - loss: 55.0601 - val_loss: 57.6279 - learning_rate: 0.0010
Epoch 4/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 305ms/step - loss: 51.6682 - val_loss: 57.1894 - learning_rate: 0.0010
Epoch 5/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 371ms/step - loss: 58.3816 - val_loss: 56.8814 - learning_rate: 0.0010
Epoch 6/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 265ms/step - loss: 53.9706 - val_loss: 56.5547 - learning_rate: 0.0010
Epoch 7/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 240ms/step - loss: 54.2328 - val_loss: 56.2225 - learning_rate: 0.0010


198518.3625490051

In [28]:
# AdamW 모델 설정 및 학습
optimizer_adamw = tf.keras.optimizers.Adam(learning_rate=0.001, decay=0.01)
model_adamw = create_deeper_model(train_x.shape[1])
model_adamw.compile(optimizer=optimizer_adamw, loss='mean_squared_error')

# 모델 학습
history_adamw = model_adamw.fit(train_x, train_y,
                                validation_data=(val_x, val_y),
                                epochs=100,
                                batch_size=64,
                                callbacks=[early_stopping, reduce_lr],
                                verbose=1)

# 검증 데이터 예측 및 RMSE 계산
val_y_pred_adamw = model_adamw.predict(val_x)
mse_adamw = mean_squared_error(pIC50_to_IC50(val_y), pIC50_to_IC50(val_y_pred_adamw))
rmse_adamw = np.sqrt(mse_adamw)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 12s/step - loss: 54.8828 - val_loss: 58.6891 - learning_rate: 0.0010
Epoch 2/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 161ms/step - loss: 58.1630 - val_loss: 58.3398 - learning_rate: 0.0010
Epoch 3/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 162ms/step - loss: 45.5529 - val_loss: 57.8976 - learning_rate: 0.0010
Epoch 4/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 160ms/step - loss: 56.8141 - val_loss: 57.4436 - learning_rate: 0.0010
Epoch 5/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 183ms/step - loss: 53.1980 - val_loss: 56.9999 - learning_rate: 0.0010
Epoch 6/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 280ms/step - loss: 50.3133 - val_loss: 56.4412 - learning_rate: 0.0010
Epoch 7/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 175ms/step - loss: 49.4860 - val_loss: 55.7361 - learning

In [29]:
# Nadam 모델 설정 및 학습
optimizer_nadam = tf.keras.optimizers.Nadam(learning_rate=0.001)
model_nadam = create_deeper_model(train_x.shape[1])
model_nadam.compile(optimizer=optimizer_nadam, loss='mean_squared_error')

# 모델 학습
history_nadam = model_nadam.fit(train_x, train_y,
                                validation_data=(val_x, val_y),
                                epochs=100,
                                batch_size=64,
                                callbacks=[early_stopping, reduce_lr],
                                verbose=1)

# 검증 데이터 예측 및 RMSE 계산
val_y_pred_nadam = model_nadam.predict(val_x)
mse_nadam = mean_squared_error(pIC50_to_IC50(val_y), pIC50_to_IC50(val_y_pred_nadam))
rmse_nadam = np.sqrt(mse_nadam)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 11s/step - loss: 103.1606 - val_loss: 58.9846 - learning_rate: 0.0010
Epoch 2/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 558ms/step - loss: 101.8751 - val_loss: 58.5420 - learning_rate: 0.0010
Epoch 3/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 356ms/step - loss: 96.1811 - val_loss: 58.3597 - learning_rate: 0.0010
Epoch 4/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 248ms/step - loss: 92.3239 - val_loss: 58.2245 - learning_rate: 0.0010
Epoch 5/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 299ms/step - loss: 85.1873 - val_loss: 58.1961 - learning_rate: 0.0010
Epoch 6/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 180ms/step - loss: 92.7776 - val_loss: 58.1067 - learning_rate: 0.0010
Epoch 7/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 325ms/step - loss: 90.4741 - val_loss: 57.9670 - learni

In [30]:
print(f'AdamW RMSE: {rmse_adamw}')
print(f'Nadam RMSE: {rmse_nadam}')


AdamW RMSE: 993861138.4429259
Nadam RMSE: 1040446159.5401345


In [33]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem

# SMILES 데이터를 분자 지문으로 변환하는 함수
def smiles_to_fingerprint(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is not None:
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048)
        return np.array(fp)
    else:
        return np.zeros((2048,))

# 원본 데이터 로드
original_train_data = pd.read_csv('train.csv')

# Fingerprint 생성
original_train_data['Fingerprint'] = original_train_data['Smiles'].apply(smiles_to_fingerprint)

# Fingerprint와 pIC50만 추출하여 저장
train_data = original_train_data[['Fingerprint', 'pIC50']]
train_data.to_csv('train_data_corrected.csv', index=False)




In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

# 랜덤 포레스트 하이퍼파라미터 그리드 설정
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# 랜덤 포레스트 모델 초기화
rf = RandomForestRegressor(random_state=42)

# GridSearchCV 초기화
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=2, scoring='neg_mean_squared_error')

# GridSearchCV 실행
grid_search.fit(train_x, train_y)

# 최적의 하이퍼파라미터와 그에 따른 성능 확인
best_params = grid_search.best_params_
best_score = grid_search.best_score_

best_params, np.sqrt(-best_score)


Fitting 3 folds for each of 108 candidates, totalling 324 fits


  pid = os.fork()
