# Using a Fingerprint in a Model

This notebook demonstrates fitting a random-forest regression model that predicts measured solubility (log mol/L) from RDKit molecular fingerprints, using the ESOL dataset.

In [1]:
import pandas as pd

from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs

In [None]:
# Read the ESOL supplemental table and preview its first rows.
esol_csv_path = "./ESOL/ESOL_supplemental.csv"
df = pd.read_csv(esol_csv_path)
df.head()

Unnamed: 0,Compound ID,measured log(solubility:mol/L),ESOL predicted log(solubility:mol/L),SMILES
0,"1,1,1,2-Tetrachloroethane",-2.18,-2.794,ClCC(Cl)(Cl)Cl
1,"1,1,1-Trichloroethane",-2.0,-2.232,CC(Cl)(Cl)Cl
2,"1,1,2,2-Tetrachloroethane",-1.74,-2.549,ClC(Cl)C(Cl)Cl
3,"1,1,2-Trichloroethane",-1.48,-1.961,ClCC(Cl)Cl
4,"1,1,2-Trichlorotrifluoroethane",-3.04,-3.077,FC(F)(Cl)C(F)(Cl)Cl


In [6]:
# Parse every SMILES string into an RDKit molecule object, one per row.
smiles_column = df["SMILES"]
df["Mol"] = smiles_column.map(Chem.MolFromSmiles)

In [10]:
# Compute an RDKit fingerprint (AllChem.RDKFingerprint) for each parsed molecule.
molecules = df["Mol"]
df["fingerprint"] = molecules.map(AllChem.RDKFingerprint)

In [14]:
# Preview the frame again; it now also carries the "Mol" and "fingerprint" columns.
df.head()

Unnamed: 0,Compound ID,measured log(solubility:mol/L),ESOL predicted log(solubility:mol/L),SMILES,Mol,fingerprint
0,"1,1,1,2-Tetrachloroethane",-2.18,-2.794,ClCC(Cl)(Cl)Cl,<rdkit.Chem.rdchem.Mol object at 0x7f9ffc877f40>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,"1,1,1-Trichloroethane",-2.0,-2.232,CC(Cl)(Cl)Cl,<rdkit.Chem.rdchem.Mol object at 0x7f9ffc876ce0>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,"1,1,2,2-Tetrachloroethane",-1.74,-2.549,ClC(Cl)C(Cl)Cl,<rdkit.Chem.rdchem.Mol object at 0x7f9ffc8773e0>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,"1,1,2-Trichloroethane",-1.48,-1.961,ClCC(Cl)Cl,<rdkit.Chem.rdchem.Mol object at 0x7f9ffc877b50>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,"1,1,2-Trichlorotrifluoroethane",-3.04,-3.077,FC(F)(Cl)C(F)(Cl)Cl,<rdkit.Chem.rdchem.Mol object at 0x7f9ffc877840>,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [35]:
import numpy as np

from sklearn.metrics import mean_absolute_error, r2_score   # ← add r2_score


def _fingerprint_to_array(fingerprint):
    """Allocate a 2048-element int8 array and fill it from one RDKit fingerprint."""
    bits = np.zeros(2048, dtype=np.int8)
    DataStructs.ConvertToNumpyArray(fingerprint, bits)
    return bits


# One NumPy bit vector per dataframe row, in row order.
numpy_arrays = [_fingerprint_to_array(fp) for fp in df["fingerprint"]]


In [36]:
# Inspect the first converted fingerprint as a quick sanity check.
numpy_arrays[0]

array([0, 0, 0, ..., 0, 1, 0], shape=(2048,), dtype=int8)

In [37]:
# Feature matrix: one fingerprint bit vector per molecule.
X = numpy_arrays

# Target: measured log-solubility, selected with single brackets so it is a
# 1-D Series. scikit-learn regressors expect y of shape (n_samples,); a
# one-column DataFrame (double brackets) triggers a DataConversionWarning
# when the model is fitted.
Y = df["measured log(solubility:mol/L)"]

In [38]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor          # for creating a random forest regression model

# Hold out 20 % of the molecules for testing; the fixed seed keeps the split
# reproducible across runs.
X_tr, X_te, y_tr, y_te = train_test_split(
    X, Y, test_size=0.20, random_state=42, shuffle=True
)

# --------------------------------------------------------------------------- #
# Random-forest regressor                                                     #
# --------------------------------------------------------------------------- #
rf = RandomForestRegressor(
    n_estimators=300,
    max_depth=None,
    n_jobs=-1,          # use all available cores
    random_state=42,
)
# Flatten the target to shape (n_samples,) so fitting does not emit the
# column-vector DataConversionWarning; works for both a Series and a
# one-column DataFrame.
rf.fit(X_tr, y_tr.to_numpy().ravel())

# --------------------------------------------------------------------------- #
# Evaluation: MAE and R²                                                      #
# --------------------------------------------------------------------------- #
pred = rf.predict(X_te)
mae = mean_absolute_error(y_te, pred)
r2 = r2_score(y_te, pred)

# The target is log(solubility in mol/L), so the MAE is in log units (not nm).
print(f"MAE = {mae:.1f} log(mol/L)   R² = {r2:.2f}   (test n = {len(y_te)})")


  return fit_method(estimator, *args, **kwargs)


MAE = 0.7 nm   R² = 0.82   (test n = 229)
