In [10]:
import pandas as pd
import numpy as np
import os
import random

from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.model_selection import KFold, train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


**seed 고정**

seed 고정 이유:
재현성을 보장하기 위해/ 동일한 데이터와 동일한 코드로 항상 동일한 결과를 얻음. 

In [12]:
CFG = {
    'NBITS':2048,
    'SEED':42,
    'N_ESTIMATORS' :200,

}

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    return f"Seed {seed} has been set."
seed_everything(CFG['SEED'])

# SMILES 데이터를 분자 지문으로 변환
def smiles_to_fingerprint(Smiles):
    mol = Chem.MolFromSmiles(Smiles)
    if mol is not None:
        fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=CFG['NBITS'])
        return np.array(fp)
    else:
        return np.zeros((CFG['NBITS'],))
    


# 학습 ChEMBL 데이터 로드
chembl_data = pd.read_csv('./open/train.csv')  # 예시 파일 이름
chembl_data.head()

train = chembl_data[['Smiles', 'pIC50']]
train['Fingerprint'] = train['Smiles'].apply(smiles_to_fingerprint)
train_x = np.stack(train['Fingerprint'].values)
train_y = train['pIC50'].values


# 학습 및 검증 데이터 분리
train_x, val_x, train_y, val_y = train_test_split(train_x, train_y, test_size=0.25, random_state=42)

# 랜덤 포레스트 모델 학습
model = RandomForestRegressor(max_depth=None, min_samples_leaf=1, min_samples_split=5, n_estimators=100, random_state=42)
model.fit(train_x, train_y)


def pIC50_to_IC50(pic50_values):
    """Convert pIC50 values to IC50 (nM)."""
    return 10 ** (9 - pic50_values)

# Validation 데이터로부터의 학습 모델 평가
val_y_pred = model.predict(val_x)

# IC50 단위로 변환된 값
val_y_ic50 = pIC50_to_IC50(val_y)
val_y_pred_ic50 = pIC50_to_IC50(val_y_pred)

mse = mean_squared_error(val_y_ic50, val_y_pred_ic50)
rmse = np.sqrt(mse)
mae = mean_absolute_error(val_y_ic50, val_y_pred_ic50)
r2 = r2_score(val_y_ic50, val_y_pred_ic50)

print(f'RMSE: {rmse}')
print(f'MAE: {mae}')
print(f'R² Score: {r2}')

#파일 저장하기

test = pd.read_csv('./open/test.csv')
test['Fingerprint'] = test['Smiles'].apply(smiles_to_fingerprint)

test_x = np.stack(test['Fingerprint'].values)
test_y_pred = model.predict(test_x)

submit = pd.read_csv('./open/sample_submission.csv')
submit['IC50_nM'] = pIC50_to_IC50(test_y_pred)
submit.head()

submit.to_csv('./open/submit_file.csv', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['Fingerprint'] = train['Smiles'].apply(smiles_to_fingerprint)


RMSE: 1779.585927547353
MAE: 514.1858517109845
R² Score: 0.16630680210327797


In [4]:
import deepchem as dc
import pandas as pd
from rdkit import Chem
from sklearn.ensemble import RandomForestRegressor

import numpy as np
import os
import random

from rdkit.Chem import AllChem
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


In [14]:



# 데이터 로드
chembl_data = pd.read_csv('./open/train.csv')

# SMILES와 pIC50 추출
chembl_data = chembl_data[['Smiles', 'pIC50']]

# DeepChem의 featurizer를 이용해 SMILES 데이터를 피처로 변환
featurizer = dc.feat.CircularFingerprint(size=2048)  # Morgan Fingerprint 사용
features = featurizer.featurize(chembl_data['Smiles'])

# DeepChem의 Dataset 형식으로 변환
dataset = dc.data.NumpyDataset(X=features, y=chembl_data['pIC50'].values)


# RandomForestRegressor를 DeepChem의 SklearnModel 래퍼로 감싸기
rf_model = dc.models.SklearnModel(RandomForestRegressor(n_estimators=200))

# 모델 학습
rf_model.fit(dataset)

# 모델 평가
metric = dc.metrics.Metric(dc.metrics.r2_score)
score = rf_model.evaluate(dataset, [metric])
# print(f"R^2 Score: {score['r2_score']}")

# Validation set 평가
val_data = pd.read_csv('./open/test.csv')
val_features = featurizer.featurize(val_data['Smiles'])
val_dataset = dc.data.NumpyDataset(X=val_features)

# 예측 수행
predictions = rf_model.predict(val_dataset)
print(predictions)

# IC50 값으로 변환 및 저장
def pIC50_to_IC50(pic50_values):
    return 10 ** (9 - pic50_values)

val_y_ic50 = pIC50_to_IC50(val_y)
val_y_pred_ic50 = pIC50_to_IC50(val_y_pred)

mse = mean_squared_error(val_y_ic50, val_y_pred_ic50)
rmse = np.sqrt(mse)
mae = mean_absolute_error(val_y_ic50, val_y_pred_ic50)
r2 = r2_score(val_y_ic50, val_y_pred_ic50)

print(f'RMSE: {rmse}')
print(f'MAE: {mae}')
print(f'R² Score: {r2}')





submit = pd.DataFrame({'SMILES': val_data['Smiles'], 'IC50_nM': predicted_ic50})
submit.to_csv('./open/submit_file.csv', index=False)



R^2 Score: 0.9547830068802042
[6.53590417 7.56868333 7.844055   7.6322     7.5886     7.90321333
 7.615205   7.63235    7.54309333 6.84411548 6.84471548 7.73174333
 7.5914     6.54315    7.5478     7.48505    7.60533333 7.59830333
 7.79888833 7.75123333 7.838155   7.5958     7.147175   7.94406333
 7.837005   7.64125    7.76336333 7.63196667 7.84062833 7.803655
 7.88973833 7.70244833 7.683805   7.21837833 7.25275833 7.79508833
 7.32732167 7.34355    7.73183    7.537425   7.849205   7.72040833
 7.73645833 7.838155   7.520825   8.26883    7.06931548 6.98056548
 7.019425   7.77123833 7.397175   7.79450833 7.4046     7.86461333
 7.5941     7.81027833 7.0486625  7.10465    7.63161667 7.52401667
 7.0332375  7.922855   7.81677833 7.60243333 7.60291667 7.71039833
 7.5344     7.55695    7.65328833 7.66253333 7.792505   7.05545
 7.90108333 7.24035    7.71048833 7.43390333 7.645      7.81005833
 7.825505   7.66056667 7.6928     7.620375   7.63886667 7.92666333
 7.4253     7.50918333 7.64238833 7.5

In [9]:
import deepchem as dc
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np
from deepchem.models import MPNNModel

# Set random seed for reproducibility
seed = 42
np.random.seed(seed)

# Load the dataset
data_path = './open/train.csv'
chembl_data = pd.read_csv(data_path)

# Prepare features and targets
featurizer = dc.feat.MolGraphConvFeaturizer()
X = featurizer.featurize(chembl_data['Smiles'])
y = chembl_data['pIC50'].values

# Split the dataset into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=seed)

# Create DeepChem NumpyDatasets for training and validation
train_dataset = dc.data.NumpyDataset(X_train, y_train)
val_dataset = dc.data.NumpyDataset(X_val, y_val)

# Define the MPNN Model
model = MPNNModel(
    n_tasks=1,  # Single regression task
    mode='regression', 
    number_of_molecules=1, 
    n_atom_feat=75, 
    n_pair_feat=14, 
    n_hidden=128, 
    T=5, 
    M=3, 
    batch_size=32, 
    learning_rate=1e-3,
    use_queue=True,
    random_seed=seed
)

# Train the model
model.fit(train_dataset, nb_epoch=100)

# Validate the model
y_pred = model.predict(val_dataset)

# Evaluate model performance
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_val, y_pred)

print(f'Validation RMSE: {rmse}')
print(f'Validation R²: {r2}')

# Load the test set and predict IC50 values
test_data = pd.read_csv('./data/test.csv')
X_test = featurizer.featurize(test_data['Smiles'])
test_dataset = dc.data.NumpyDataset(X_test)
test_pred = model.predict(test_dataset)

# Save the predictions
submit = pd.DataFrame({'SMILES': test_data['Smiles'], 'Predicted_IC50': test_pred.flatten()})
submit.to_csv('./submission.csv', index=False)


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Index'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Index'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Index'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Index'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'g

2024-08-21 15:34:08.671292: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2024-08-21 15:34:08.673768: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2024-08-21 15:34:08.687020: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Index'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Index'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Index'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module 'gast' has no attribute 'Index'


AttributeError: module 'tensorflow.keras.optimizers' has no attribute 'legacy'