In [1]:
pip install rdkit

Collecting rdkit
  Downloading rdkit-2025.9.1-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (4.1 kB)
Downloading rdkit-2025.9.1-cp312-cp312-manylinux_2_28_x86_64.whl (36.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m36.2/36.2 MB[0m [31m41.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit
Successfully installed rdkit-2025.9.1


In [37]:
# Cell 1: Install and Import
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from rdkit import Chem
from rdkit.Chem import Descriptors
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, roc_auc_score
import pickle
import warnings
warnings.filterwarnings('ignore')

In [38]:
# Cell 2: Load Data
file_path = r'/content/bixuchenggong11 (1).CSV'
df = pd.read_csv(file_path)
print(f"Original dataset: {len(df)} rows")
print(df.tail())
df["smiles"].dropna()

Original dataset: 305 rows
     num name  activity smiles
300  NaN   85       0.0    NaN
301  NaN   86       0.0    NaN
302  NaN  103       NaN    NaN
303  NaN  104       NaN    NaN
304  NaN  105       NaN    NaN


Unnamed: 0,smiles
0,ClC1=CC=C([C@H]2C3=CC=CC=C3[C@@H](NC)CC2)C=C1Cl
1,ClC1=CC=C([C@H]2C3=CC=CC=C3[C@@H](N(C)C)CC2)C=...
2,ClC1=CC=C([C@H]2C3=CC=CC=C3[C@@H](NCC)CC2)C=C1Cl
3,ClC1=CC=C([C@H]2C3=CC=CC=C3[C@@H](NCCC)CC2)C=C1Cl
4,ClC1=CC=C([C@H]2C3=CC=CC=C3[C@@H](NCCCCC)CC2)C...
...,...
263,O=C1c2ccccc2[C@H](c2ccc(Cl)c(Cl)c2)C[C@H]1O
264,CN[C@@H]1CC[C@H](c2ccc(Cl)c(Cl)c2)c2ccccc21
265,CN[C@H]1CC[C@@H](c2ccc(Cl)c(Cl)c2)c2ccccc21
266,CN[C@H]1CC[C@H](c2ccc(Cl)c(Cl)c2)c2ccccc21


In [39]:
# Cell 3: Validate and Clean SMILES
def is_valid_smiles(smiles):
    if pd.isna(smiles) or not isinstance(smiles, str):
        return False
    try:
        mol = Chem.MolFromSmiles(smiles)
        return mol is not None
    except:
        return False

df['valid'] = df['smiles'].apply(is_valid_smiles)
print(f"\nValid SMILES: {df['valid'].sum()}")
print(f"Invalid SMILES: {(~df['valid']).sum()}")

df = df[df['valid']].drop('valid', axis=1).reset_index(drop=True)
print(f"After cleaning: {len(df)} rows")


Valid SMILES: 268
Invalid SMILES: 37
After cleaning: 268 rows


In [40]:
# Cell 4: Extract Features from SMILES
def get_descriptors(smiles):
    mol = Chem.MolFromSmiles(smiles)
    return {
        'MolWt': Descriptors.MolWt(mol),
        'LogP': Descriptors.MolLogP(mol),
        'NumHDonors': Descriptors.NumHDonors(mol),
        'NumHAcceptors': Descriptors.NumHAcceptors(mol),
        'NumRotatableBonds': Descriptors.NumRotatableBonds(mol),
        'TPSA': Descriptors.TPSA(mol),
        'NumAromaticRings': Descriptors.NumAromaticRings(mol),
        'NumAliphaticRings': Descriptors.NumAliphaticRings(mol),
        'NumHeavyAtoms': Descriptors.HeavyAtomCount(mol),
        'RingCount': Descriptors.RingCount(mol),
        'FractionCsp3': Descriptors.FractionCSP3(mol),
        'MolMR': Descriptors.MolMR(mol),
        'BertzCT': Descriptors.BertzCT(mol),
        'Chi0v': Descriptors.Chi0v(mol),
        'Chi1v': Descriptors.Chi1v(mol),
        'Kappa1': Descriptors.Kappa1(mol),
        'Kappa2': Descriptors.Kappa2(mol),
        'Kappa3': Descriptors.Kappa3(mol),
        'NumSaturatedRings': Descriptors.NumSaturatedRings(mol),
        'MaxPartialCharge': Descriptors.MaxPartialCharge(mol),
        'MinPartialCharge': Descriptors.MinPartialCharge(mol)
    }

print("\nExtracting features...")
descriptors_list = [get_descriptors(smiles) for smiles in df['smiles']]
df_features = pd.DataFrame(descriptors_list)
df_features['activity'] = df['activity'].values

print(f"Features extracted: {df_features.shape[1]-1} descriptors")
df_features.head()


Extracting features...
Features extracted: 21 descriptors


Unnamed: 0,MolWt,LogP,NumHDonors,NumHAcceptors,NumRotatableBonds,TPSA,NumAromaticRings,NumAliphaticRings,NumHeavyAtoms,RingCount,...,BertzCT,Chi0v,Chi1v,Kappa1,Kappa2,Kappa3,NumSaturatedRings,MaxPartialCharge,MinPartialCharge,activity
0,306.236,5.1796,1,1,2,12.03,2,1,20,3,...,624.137111,12.878153,7.670475,13.940059,5.767689,2.460404,0,0.059477,-0.312979,1.0
1,320.263,5.5218,0,1,2,3.24,2,1,21,3,...,651.416628,13.825366,8.034426,14.898156,5.99955,2.829184,0,0.059477,-0.302387,0.0
2,320.263,5.5697,1,1,3,12.03,2,1,21,3,...,638.865495,13.58526,8.231135,14.898156,6.393119,2.962366,0,0.059477,-0.310206,0.0
3,334.29,5.9598,1,1,4,12.03,2,1,22,3,...,653.633485,14.292366,8.731135,15.859752,7.037704,3.172233,0,0.059477,-0.309942,0.0
4,362.344,6.74,1,1,6,12.03,2,1,24,3,...,683.289004,15.70658,9.731135,17.791826,8.378995,3.96149,0,0.059477,-0.309932,0.0


In [20]:
# Cell 5: Split X and y
X = df_features.drop('activity', axis=1)
y = df_features['activity']

print(f"\nX shape: {X.shape}")
print(f"y shape: {y.shape}")
print(f"y distribution:\n{y.value_counts()}")


X shape: (268, 21)
y shape: (268,)
y distribution:
activity
1.0    137
0.0    131
Name: count, dtype: int64


In [21]:
# Cell 6: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print(f"\nTrain size: {X_train.shape}")
print(f"Test size: {X_test.shape}")


Train size: (214, 21)
Test size: (54, 21)


In [22]:
# Cell 7: Scale Data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
print("\nData scaled successfully")


Data scaled successfully


In [26]:
# Cell 8: Train Models
models = {
    'Random Forest': RandomForestClassifier(random_state=42),
    'XGBoost': XGBClassifier(random_state=42, eval_metric='logloss'),
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'SVC': SVC(probability=True, random_state=42),
    'GradientBoostingClassifier': GradientBoostingClassifier(random_state=42)
}

results = {}
print("\n" + "="*70)
print("TRAINING MODELS")
print("="*70)

for name, model in models.items():
    print(f"\nTraining {name}...")
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)
    y_proba = model.predict_proba(X_test_scaled)[:, 1]

    results[name] = {
        'model': model,
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred, average='weighted'),
        'recall': recall_score(y_test, y_pred, average='weighted'),
        'f1': f1_score(y_test, y_pred, average='weighted'),
        'auc': roc_auc_score(y_test, y_proba),
        'y_proba': y_proba
    }

    print(f"Accuracy:  {results[name]['accuracy']:.4f}")
    print(f"Precision: {results[name]['precision']:.4f}")
    print(f"Recall:    {results[name]['recall']:.4f}")
    print(f"F1-Score:  {results[name]['f1']:.4f}")
    print(f"AUC-ROC:   {results[name]['auc']:.4f}")


TRAINING MODELS

Training Random Forest...
Accuracy:  0.7037
Precision: 0.7040
Recall:    0.7037
F1-Score:  0.7029
AUC-ROC:   0.7898

Training XGBoost...
Accuracy:  0.6852
Precision: 0.6862
Recall:    0.6852
F1-Score:  0.6836
AUC-ROC:   0.8125

Training Logistic Regression...
Accuracy:  0.6667
Precision: 0.6667
Recall:    0.6667
F1-Score:  0.6667
AUC-ROC:   0.8159

Training SVC...
Accuracy:  0.6852
Precision: 0.6862
Recall:    0.6852
F1-Score:  0.6836
AUC-ROC:   0.8091

Training GradientBoostingClassifier...
Accuracy:  0.7222
Precision: 0.7221
Recall:    0.7222
F1-Score:  0.7219
AUC-ROC:   0.7720


In [31]:
# Cell 10: Hyperparameter Tuning
param_grids = {
    'Random Forest': {
        'n_estimators': [100, 200, 300],
        'max_depth': [10, 20, 30, None],
        'min_samples_split': [2, 5, 10]
    },
    'XGBoost': {
        'n_estimators': [100, 200, 300],
        'max_depth': [3, 5, 7],
        'learning_rate': [0.01, 0.1, 0.3]
    },
    'Logistic Regression': {
        'C': [0.01, 0.1, 1, 10, 100],
        'penalty': ['l2']
    },
    'SVC': {
        'C': [0.1, 1, 10],
        'kernel': ['rbf', 'linear'],
        'gamma': ['scale', 'auto']
    },
    'GradientBoostingClassifier': {
      'n_estimators': [100, 200, 300],
      'max_depth': [3, 5, 7],
      'learning_rate': [0.01, 0.1, 0.3],
      'subsample': [0.8, 1.0]
      }
}

print("\n" + "="*70)
print("HYPERPARAMETER TUNING")
print("="*70)

tuned_results = {}

for name, model in models.items():
    print(f"\nTuning {name}...")
    grid = GridSearchCV(model, param_grids[name], cv=5, scoring='accuracy', n_jobs=-1)
    grid.fit(X_train_scaled, y_train)

    best_model = grid.best_estimator_
    y_pred = best_model.predict(X_test_scaled)
    y_proba = best_model.predict_proba(X_test_scaled)[:, 1]

    tuned_results[name] = {
        'model': best_model,
        'params': grid.best_params_,
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred, average='weighted'),
        'recall': recall_score(y_test, y_pred, average='weighted'),
        'f1': f1_score(y_test, y_pred, average='weighted'),
        'auc': roc_auc_score(y_test, y_proba)
    }

    print(f"Best params: {grid.best_params_}")
    print(f"Accuracy:  {tuned_results[name]['accuracy']:.4f}")
    print(f"Precision: {tuned_results[name]['precision']:.4f}")
    print(f"Recall:    {tuned_results[name]['recall']:.4f}")
    print(f"F1-Score:  {tuned_results[name]['f1']:.4f}")
    print(f"AUC-ROC:   {tuned_results[name]['auc']:.4f}")


HYPERPARAMETER TUNING

Tuning Random Forest...
Best params: {'max_depth': 10, 'min_samples_split': 10, 'n_estimators': 100}
Accuracy:  0.6852
Precision: 0.6862
Recall:    0.6852
F1-Score:  0.6836
AUC-ROC:   0.8077

Tuning XGBoost...
Best params: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 100}
Accuracy:  0.6852
Precision: 0.6850
Recall:    0.6852
F1-Score:  0.6849
AUC-ROC:   0.8084

Tuning Logistic Regression...
Best params: {'C': 0.1, 'penalty': 'l2'}
Accuracy:  0.6481
Precision: 0.6479
Recall:    0.6481
F1-Score:  0.6478
AUC-ROC:   0.7995

Tuning SVC...
Best params: {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}
Accuracy:  0.6667
Precision: 0.6667
Recall:    0.6667
F1-Score:  0.6667
AUC-ROC:   0.7953

Tuning GradientBoostingClassifier...
Best params: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.8}
Accuracy:  0.7593
Precision: 0.7599
Recall:    0.7593
F1-Score:  0.7593
AUC-ROC:   0.7830


In [32]:
# Cell 11: Best Model Summary
print("\n" + "="*70)
print("BEST MODEL FOR EACH ALGORITHM (After Tuning)")
print("="*70)

for name in models.keys():
    print(f"\n{name}:")
    print(f"  Best Parameters: {tuned_results[name]['params']}")
    print(f"  Accuracy:  {tuned_results[name]['accuracy']:.4f}")
    print(f"  Precision: {tuned_results[name]['precision']:.4f}")
    print(f"  Recall:    {tuned_results[name]['recall']:.4f}")
    print(f"  F1-Score:  {tuned_results[name]['f1']:.4f}")
    print(f"  AUC-ROC:   {tuned_results[name]['auc']:.4f}")


BEST MODEL FOR EACH ALGORITHM (After Tuning)

Random Forest:
  Best Parameters: {'max_depth': 10, 'min_samples_split': 10, 'n_estimators': 100}
  Accuracy:  0.6852
  Precision: 0.6862
  Recall:    0.6852
  F1-Score:  0.6836
  AUC-ROC:   0.8077

XGBoost:
  Best Parameters: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 100}
  Accuracy:  0.6852
  Precision: 0.6850
  Recall:    0.6852
  F1-Score:  0.6849
  AUC-ROC:   0.8084

Logistic Regression:
  Best Parameters: {'C': 0.1, 'penalty': 'l2'}
  Accuracy:  0.6481
  Precision: 0.6479
  Recall:    0.6481
  F1-Score:  0.6478
  AUC-ROC:   0.7995

SVC:
  Best Parameters: {'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}
  Accuracy:  0.6667
  Precision: 0.6667
  Recall:    0.6667
  F1-Score:  0.6667
  AUC-ROC:   0.7953

GradientBoostingClassifier:
  Best Parameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.8}
  Accuracy:  0.7593
  Precision: 0.7599
  Recall:    0.7593
  F1-Score:  0.7593
  AUC-ROC:   0.7

In [33]:
# Cell 12: Find Overall Best Model
best_name = max(tuned_results.keys(), key=lambda x: tuned_results[x]['accuracy'])
best_model = tuned_results[best_name]['model']

print("\n" + "="*70)
print("OVERALL BEST MODEL")
print("="*70)
print(f"\nModel: {best_name}")
print(f"Accuracy: {tuned_results[best_name]['accuracy']:.4f}")
print(f"Best Parameters: {tuned_results[best_name]['params']}")


OVERALL BEST MODEL

Model: GradientBoostingClassifier
Accuracy: 0.7593
Best Parameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.8}


In [None]:
# Cell 13: Save Best Model
with open('best_model.pkl', 'wb') as f:
    pickle.dump(best_model, f)
print("\nBest model saved as 'best_model.pkl'")

with open('feature_columns.pkl', 'wb') as f:
    pickle.dump(X.columns.tolist(), f)
print("Feature columns saved as 'feature_columns.pkl'")

print("\n" + "="*70)
print("DONE!")
print("="*70)
print(f"\nFiles created:")
print("  1. best_model.pkl")
print("  2. scaler.pkl")
print("  3. feature_columns.pkl")
print("  4. roc_curves.png")


Best model saved as 'best_model.pkl'
Scaler saved as 'scaler.pkl'
Feature columns saved as 'feature_columns.pkl'

DONE!

Files created:
  1. best_model.pkl
  2. scaler.pkl
  3. feature_columns.pkl
  4. roc_curves.png
