<a href="https://colab.research.google.com/github/kimjaehwankimjaehwan/Dacon/blob/main/DeepLearning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install rdkit-pypi



In [None]:
# 제 2회 신약 개발 AI 경진대회 Baseline
import pandas as pd
import numpy as np
import os
import random

from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [None]:
CFG = {
    'NBITS':2048,
    'SEED':42,
}

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(CFG['SEED']) # Seed 고정

# SMILES 데이터를 분자 지문으로 변환
def smiles_to_fingerprint(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is not None:
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=CFG['NBITS'])
        return np.array(fp)
    else:
        return np.zeros((CFG['NBITS'],))


In [None]:
# 학습 ChEMBL 데이터 로드
chembl_data = pd.read_csv('/content/drive/MyDrive/데이콘/train.csv')  # 예시 파일 이름
chembl_data.head()

train = chembl_data[['Smiles', 'pIC50']]
train['Fingerprint'] = train['Smiles'].apply(smiles_to_fingerprint)

train_x = np.stack(train['Fingerprint'].values)
train_y = train['pIC50'].values


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['Fingerprint'] = train['Smiles'].apply(smiles_to_fingerprint)


In [None]:
# 학습 및 검증 데이터 분리
train_x, val_x, train_y, val_y = train_test_split(train_x, train_y, test_size=0.3, random_state=42)




In [None]:
train_x.shape , val_x.shape, train_y.shape, val_y.shape

((1366, 2048), (586, 2048), (1366,), (586,))

1366

In [None]:
import seaborn as sns
import tensorflow as tf
from tensorflow.keras import layers, models

In [None]:
dense1 = tf.keras.layers.Dense(200, activation='relu', input_shape = (2048,))
dense2 = tf.keras.layers.Dense(1024, activation='relu')
drop1 = tf.keras.layers.Dropout(0.01)
dense3 = tf.keras.layers.Dense(512, activation='relu')
dense4 = tf.keras.layers.Dense(64, activation='relu')
dense5 = tf.keras.layers.Dense(11, activation='softmax')
model = tf.keras.Sequential([dense1, dense2,drop1, dense3, dense4, dense5])
model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [None]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
check_cb = tf.keras.callbacks.ModelCheckpoint('best_model.keras', save_best_only=True)
history = model.fit(train_x, train_y, epochs=50, validation_data=(val_x, val_y), callbacks=[check_cb])

Epoch 1/50
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 213ms/step - accuracy: 0.0147 - loss: 1.5687 - val_accuracy: 0.0137 - val_loss: 1.0890
Epoch 2/50
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 6ms/step - accuracy: 0.0106 - loss: 0.9576 - val_accuracy: 0.0154 - val_loss: 1.0107
Epoch 3/50
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.0232 - loss: 0.7520 - val_accuracy: 0.0154 - val_loss: 1.0034
Epoch 4/50
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.0342 - loss: 0.5432 - val_accuracy: 0.0188 - val_loss: 1.0903
Epoch 5/50
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.0306 - loss: 0.4030 - val_accuracy: 0.0119 - val_loss: 1.2726
Epoch 6/50
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.0413 - loss: 0.3020 - val_accuracy: 0.0171 - val_loss: 1.4361
Epoch 7/50
[1m43/43[0m [32m━━━━━━

In [None]:
# test = pd.read_csv('/content/drive/MyDrive/데이콘/test.csv')
# test['Fingerprint'] = test['Smiles'].apply(smiles_to_fingerprint)

# test_x = np.stack(test['Fingerprint'].values)

# test_y = model.predict(test_x)

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 


In [None]:
def pIC50_to_IC50(pic50_values):
    """Convert pIC50 values to IC50 (nM)."""
    return 10 ** (9 - pic50_values)

test = pd.read_csv('/content/drive/MyDrive/데이콘/test.csv')
test['Fingerprint'] = test['Smiles'].apply(smiles_to_fingerprint)

test_x = np.stack(test['Fingerprint'].values)

test_y = model.predict(test_x)

# Ensure test_y is a NumPy array with the correct shape
test_y = np.argmax(test_y, axis=1)

model.evaluate(test_x, test_y)

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 1.0000 - loss: 0.0319 


[0.03120177611708641, 1.0]

In [None]:
# model.evaluate(test_x, test_y)

In [None]:
history.history.keys()

dict_keys(['accuracy', 'loss', 'val_accuracy', 'val_loss'])

In [None]:
check_cb = tf.keras.callbacks.ModelCheckpoint('best_model2.keras', save_best_only=True)
history = model.fit(train_x, train_y, epochs=200, validation_data=(val_x, val_y), callbacks=[check_cb])

Epoch 1/200
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.0294 - loss: 0.0751 - val_accuracy: 0.0205 - val_loss: 2.5961
Epoch 2/200
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.0389 - loss: 0.0261 - val_accuracy: 0.0188 - val_loss: 2.9905
Epoch 3/200
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.0406 - loss: 0.0317 - val_accuracy: 0.0154 - val_loss: 3.1241
Epoch 4/200
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.0363 - loss: 0.0246 - val_accuracy: 0.0171 - val_loss: 3.1193
Epoch 5/200
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.0429 - loss: 0.0232 - val_accuracy: 0.0171 - val_loss: 3.1637
Epoch 6/200
[1m43/43[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.0433 - loss: 0.0224 - val_accuracy: 0.0188 - val_loss: 3.3364
Epoch 7/200
[1m43/43[0m [32m━━━

In [None]:
# # Validation 데이터로부터의 학습 모델 평가
# val_y_pred = model.predict(val_x)
# mse = mean_squared_error(pIC50_to_IC50(val_y), pIC50_to_IC50(val_y_pred))
# rmse = np.sqrt(mse)

# print(f'RMSE: {rmse}')



In [None]:


submit = pd.read_csv('/content/drive/MyDrive/데이콘/sample_submission.csv')
submit['IC50_nM'] = pIC50_to_IC50(test_y)
submit.head()

submit.to_csv('./baseline_submit(deeplearning4).csv', index=False)