In [None]:
# Import shared Lipinski and ADMET property functions
from tools.chem_utils import lipinski_filter, calc_admet_properties
from rdkit import Chem
from rdkit.Chem import Descriptors
import pandas as pd

# Example: run the Lipinski compliance check on one molecule (aspirin)
smiles = 'CC(=O)OC1=CC=CC=C1C(=O)O'
is_compliant = lipinski_filter(smiles)  # keep the result in a name so it can be inspected later
print('Lipinski compliant:', is_compliant)

Lipinski compliant: True


## Importance of Property-Based Filtering
- Filtering by molecular properties helps identify drug-like molecules.
- Lipinski's Rule of Five is widely used to assess oral bioavailability.
- Property-based filtering reduces the number of candidates for experimental testing.

## Batch Property Calculation and Filtering
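The following cell computes molecular properties and applies Lipinski's Rule of Five to a batch of ligands loaded from a CSV file (e.g., `ligands_for_docking.csv`, exported from Chapter 1). The CSV must contain `name` and `smiles` columns; the annotated table is written to `ligands_annotated.csv`.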

In [None]:
import pandas as pd
from tools.chem_utils import lipinski_filter, calc_admet_properties

# Read the ligand table; the loop below uses its 'name' and 'smiles' columns.
ligands = pd.read_csv('ligands_for_docking.csv')

# Property keys copied from the calc_admet_properties result into each record.
property_keys = ('MW', 'LogP', 'HBD', 'HBA')

results = []
for _, ligand in ligands.iterrows():
    ligand_smiles = ligand['smiles']
    props = calc_admet_properties(ligand_smiles)
    passes_lipinski = lipinski_filter(ligand_smiles)

    # Build the record in the same column order as the output table:
    # name, smiles, the four properties, then the Lipinski flag.
    record = {'name': ligand['name'], 'smiles': ligand_smiles}
    for key in property_keys:
        # A falsy props result yields None for every property column.
        record[key] = props[key] if props else None
    record['Lipinski'] = passes_lipinski
    results.append(record)

# Assemble the annotated table, persist it, and display it as the cell output.
df = pd.DataFrame(results)
df.to_csv('ligands_annotated.csv', index=False)
df

Unnamed: 0,name,smiles,MW,LogP,HBD,HBA,Lipinski
0,CHEMBL273030,c1ccc2cc3c(NCCc4c[nH]cn4)nnc(NCCc4c[nH]cn4)c3c...,398.474,3.5384,4,6,True
1,CHEMBL272808,Clc1nnc(NCCc2c[nH]cn2)c2cc3ccccc3cc12,323.787,3.8141,2,4,True
2,CHEMBL272641,c1ccc2cc3c(NCCCn4ccnc4)nnc(NCCCn4ccnc4)c3cc2c1,426.528,4.1804,2,8,True
3,CHEMBL405899,Clc1nnc(NCCCn2ccnc2)c2cc3ccccc3cc12,337.814,4.1351,1,5,True
4,CHEMBL1672028,c1ccc2cc3c(Nc4cc[nH]n4)nnc(Nc4cc[nH]n4)c3cc2c1,342.366,3.7164,4,6,True


# Machine Learning for Drug Discovery

The following cells demonstrate how to use scikit-learn and RDKit to build a simple regression model for molecular properties, as covered in Chapter 5.

In [10]:
# !pip install scikit-learn rdkit-pypi pandas
from rdkit import Chem
from rdkit.Chem import Descriptors
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

## Prepare Data and Features
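Let's create a small dataset of SMILES strings and their molecular weights for regression. The only input feature is the molecular weight computed by RDKit, so this is a toy example meant to show the workflow rather than a useful model.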

In [11]:
# Toy dataset: three SMILES strings paired with the value to regress.
# NOTE(review): the third target (318.86) differs from the 319.880 MW that the
# notebook's filtering output prints for the same SMILES - possible typo; confirm.
data = {
    'smiles': [
        'CCO',
        'CC(=O)OC1=CC=CC=C1C(=O)O',
        'CCN(CC)CCCC(C)NC1=C2C=CC(=CC2=NC=C1)Cl',
    ],
    'property': [46.07, 180.16, 318.86],
}

# Parse each SMILES into an RDKit molecule, then derive its molecular weight.
df = (
    pd.DataFrame(data)
    .assign(mol=lambda frame: frame['smiles'].apply(Chem.MolFromSmiles))
    .assign(mw=lambda frame: frame['mol'].apply(Descriptors.MolWt))
)

# Single-feature design matrix (computed MW) and the regression target.
X, y = df[['mw']], df['property']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

## Train and Evaluate a Random Forest Model
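Let's train a Random Forest regressor and evaluate its performance with the mean squared error (MSE) on the held-out test split. With only three molecules in the dataset, the resulting error is illustrative only.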

In [12]:
# Fit a Random Forest on the training split and report the test-set MSE.
# random_state is fixed (the same seed already used for train_test_split) so
# that the forest's internal randomness - and therefore the printed MSE and
# every downstream prediction - is identical on each Restart & Run All.
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)
preds = model.predict(X_test)
print("MSE:", mean_squared_error(y_test, preds))

MSE: 30385.021969000016


In [None]:
# Example: Filter new molecules by model prediction and Lipinski's rule
from tools.chem_utils import lipinski_filter, calc_admet_properties

new_data = {'smiles': ['CCO', 'CC(=O)OC1=CC=CC=C1C(=O)O', 'CCN(CC)CCCC(C)NC1=C2C=CC(=CC2=NC=C1)Cl']}
new_df = pd.DataFrame(new_data)

# Calculate properties and Lipinski compliance
new_df['lipinski'] = new_df['smiles'].apply(lipinski_filter)
new_df['props'] = new_df['smiles'].apply(calc_admet_properties)

# Unpack the property results into flat columns; a falsy props value
# yields None so the row is kept but marked as missing.
for column, key in (('mw', 'MW'), ('logp', 'LogP'), ('hbd', 'HBD'), ('hba', 'HBA')):
    new_df[column] = new_df['props'].apply(lambda x, key=key: x[key] if x else None)

# Predict property using the trained model.
# Only rows that actually have a molecular weight are sent to the model:
# a missing 'mw' would otherwise make model.predict raise on the NaN input.
# Rows without a prediction stay NaN and therefore fail the threshold test below.
has_mw = new_df['mw'].notna()
new_df['predicted_property'] = float('nan')
if has_mw.any():
    new_df.loc[has_mw, 'predicted_property'] = model.predict(
        new_df.loc[has_mw, ['mw']].astype(float)
    )

# Combine filtering: Lipinski-compliant and predicted property > threshold
threshold = 100
filtered = new_df[(new_df['lipinski']) & (new_df['predicted_property'] > threshold)]
print(filtered[['smiles', 'mw', 'logp', 'hbd', 'hba', 'lipinski', 'predicted_property']])

                                   smiles       mw    logp  hbd  hba  \
0                                     CCO   46.069 -0.0014    1    1   
1                CC(=O)OC1=CC=CC=C1C(=O)O  180.159  1.3101    1    3   
2  CCN(CC)CCCC(C)NC1=C2C=CC(=CC2=NC=C1)Cl  319.880  4.8106    1    3   

   lipinski  predicted_property  
0      True             220.383  
1      True             220.383  
2      True             288.346  


## Link to Book Chapter
For more details and explanations, see [Chapter 5: Machine Learning for Drug Discovery](../chapters/chapter5-ml-for-drug-discovery.qmd) in the book.

### References and Further Reading
- [RDKit](https://www.rdkit.org) [@landrum_rdkit]
- Lipinski, C. A. et al. (2001) [@lipinski2001]