In [1]:
import pandas as pd
import numpy as np
from rdkit import Chem
from sklearn.model_selection import train_test_split

In [2]:
# Load dataset
data = pd.read_excel('All_metabolite_antibiotics_and_non.xlsx')  # Ganti dengan nama file Anda

# Pisahkan features dan target
X = data['Smiles']
y = data['Class']  # Atau kolom target lainnya

In [3]:
def rdkit_descriptor(smiles):
    mol = Chem.MolFromSmiles(smiles)
    calculator = MoleculeDescriptors.MolecularDescriptorCalculator([x[0] for x in Descriptors._descList])
    return calculator.CalcDescriptors(mol)

In [4]:
from rdkit.Chem import AllChem
def morgan_fingerprint(smiles, radius=2, n_bits=2048):
    mol = Chem.MolFromSmiles(smiles)
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=radius, nBits=n_bits)
    return np.array(fp)

In [5]:
morgan_fingerprint(X[1])

array([0, 1, 0, ..., 0, 0, 0])

In [6]:
from rdkit.Chem import MACCSkeys

def maccs_keys(smiles):
    mol = Chem.MolFromSmiles(smiles)
    fp = MACCSkeys.GenMACCSKeys(mol)
    return np.array(fp)

In [7]:
maccs_keys(X[1])

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0,
       1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0])

In [8]:
#!pip install pubchempy
#!pip install xgboost

In [9]:
from pubchempy import get_compounds, Compound

def pubchem_fingerprint(smiles):
    compound = get_compounds(smiles, 'smiles')[0]
    return compound.cactvs_fingerprint

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

rf_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

rf_grid = GridSearchCV(RandomForestClassifier(), rf_params, cv=5, n_jobs=-1, verbose=1)

from xgboost import XGBClassifier

xgb_params = {
    'n_estimators': [100, 200, 300],
    'max_depth': [3, 6, 9],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

xgb_grid = GridSearchCV(XGBClassifier(use_label_encoder=False, eval_metric='logloss'), 
                       xgb_params, cv=5, n_jobs=-1, verbose=1)

from sklearn.svm import SVC

svm_params = {
    'C': [0.1, 1, 10, 100],
    'gamma': ['scale', 'auto', 0.001, 0.01, 0.1],
    'kernel': ['rbf', 'poly', 'sigmoid']
}

svm_grid = GridSearchCV(SVC(probability=True), svm_params, cv=5, n_jobs=-1, verbose=1)

from sklearn.neural_network import MLPClassifier

nn_params = {
    'hidden_layer_sizes': [(50,), (100,), (50,50), (100,50)],
    'activation': ['relu', 'tanh', 'logistic'],
    'alpha': [0.0001, 0.001, 0.01],
    'learning_rate': ['constant', 'adaptive']
}

nn_grid = GridSearchCV(MLPClassifier(max_iter=1000), nn_params, cv=5, n_jobs=-1, verbose=1)




In [12]:
from joblib import dump
import os
import numpy as np
from sklearn.model_selection import train_test_split

# Buat folder untuk menyimpan model
os.makedirs('saved_models', exist_ok=True)

# Dictionary untuk menyimpan descriptor functions
descriptor_funcs = {
    'morgan': morgan_fingerprint,
    'maccs': maccs_keys,
}

# Dictionary untuk menyimpan model grids
model_grids = {
    'random_forest': rf_grid,
    'svm': svm_grid,
    'neural_network': nn_grid
}

# Loop melalui semua kombinasi descriptor dan model
for desc_name, desc_func in descriptor_funcs.items():
    print(f"\n🔬 Processing descriptor: {desc_name}")
    
    # Ekstrak features
    try:
        X_features = np.array([desc_func(smile) for smile in X])
    except Exception as e:
        print(f"❌ Error processing {desc_name}: {e}")
        continue
    
    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X_features, y, test_size=0.2, random_state=42
    )
    
    for model_name, model_grid in model_grids.items():
        print(f"\n⚙️ Training {model_name} with {desc_name} descriptor...")
        
        try:
            # Latih model dengan grid search
            model_grid.fit(X_train, y_train)
            
            # Ambil model terbaik
            best_model = model_grid.best_estimator_
            
            # Simpan model
            filename = f"saved_models/{desc_name}_{model_name}_best.joblib"
            dump(best_model, filename, compress=3)
            
            # Simpan parameter terbaik
            best_params = model_grid.best_params_
            with open(f"saved_models/{desc_name}_{model_name}_best_params.txt", 'w') as f:
                f.write(str(best_params))
            
            print(f"✅ Best parameters for {model_name} with {desc_name}: {best_params}")
            print(f"✅ Model saved to {filename}")
        
        except Exception as e:
            print(f"❌ Error training {model_name} with {desc_name}: {e}")



🔬 Processing descriptor: morgan

⚙️ Training random_forest with morgan descriptor...
Fitting 5 folds for each of 216 candidates, totalling 1080 fits




✅ Best parameters for random_forest with morgan: {'bootstrap': False, 'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}
✅ Model saved to saved_models/morgan_random_forest_best.joblib

⚙️ Training svm with morgan descriptor...
Fitting 5 folds for each of 60 candidates, totalling 300 fits




✅ Best parameters for svm with morgan: {'C': 10, 'gamma': 'scale', 'kernel': 'sigmoid'}
✅ Model saved to saved_models/morgan_svm_best.joblib

⚙️ Training neural_network with morgan descriptor...
Fitting 5 folds for each of 72 candidates, totalling 360 fits




✅ Best parameters for neural_network with morgan: {'activation': 'logistic', 'alpha': 0.0001, 'hidden_layer_sizes': (100, 50), 'learning_rate': 'adaptive'}
✅ Model saved to saved_models/morgan_neural_network_best.joblib

🔬 Processing descriptor: maccs

⚙️ Training random_forest with maccs descriptor...
Fitting 5 folds for each of 216 candidates, totalling 1080 fits




✅ Best parameters for random_forest with maccs: {'bootstrap': False, 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 100}
✅ Model saved to saved_models/maccs_random_forest_best.joblib

⚙️ Training svm with maccs descriptor...
Fitting 5 folds for each of 60 candidates, totalling 300 fits




✅ Best parameters for svm with maccs: {'C': 100, 'gamma': 0.001, 'kernel': 'rbf'}
✅ Model saved to saved_models/maccs_svm_best.joblib

⚙️ Training neural_network with maccs descriptor...
Fitting 5 folds for each of 72 candidates, totalling 360 fits




✅ Best parameters for neural_network with maccs: {'activation': 'logistic', 'alpha': 0.01, 'hidden_layer_sizes': (50,), 'learning_rate': 'adaptive'}
✅ Model saved to saved_models/maccs_neural_network_best.joblib
