# This notebook prepares the data used in examples_wisp.ipynb.

Dataset used in the example notebooks:      
***https://www.kaggle.com/datasets/shivanshuman/quantum-machine-8-aka-qm8***  

Source:
***https://arxiv.org/abs/1703.00564***

In [2]:
from rdkit import Chem

import pandas as pd

import os
import re
import pickle

from ase.io import read

import numpy as np

from WISP.ml_helper import get_features

from openbabel import pybel
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import WhiteKernel, RBF, Matern, RationalQuadratic

from dscribe.descriptors import MBTR

from openbabel import openbabel
openbabel.obErrorLog.SetOutputLevel(3)


### Functions that will be used to generate .xyz files and split up the original QM8 dataset.

In [3]:
def gen_xyz_pybel(df, name_col, smiles_col,  out_dir):
    """
    Generates .xyz files from SMILES strings and saves them to a specified directory.

    For every row the SMILES string is parsed with pybel, hydrogens are added,
    3D coordinates are generated with the UFF force field and the molecule is
    written with ``save_xyz`` as ``<name>.xyz``. A row for which any of these
    steps raises is skipped; the indices of all skipped rows are printed at the end.

    Parameters:
        df (pandas.DataFrame): DataFrame containing the SMILES strings and the column used for naming the .xyz files.
        name_col (str): Name of the column containing the names that should be used for the .xyz files.
        smiles_col (str): Name of the column containing the SMILES strings.
        out_dir (str): Path to the directory where the .xyz files will be saved.
            The directory is created if it does not exist yet.
    """

    # Make sure the target directory exists before any file is written into it.
    # An empty string means "current working directory" for os.path.join, so skip it.
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    idx_failed = []
    for idx, row in df.iterrows():
        smiles = row[smiles_col]
        print(f'Converting Smiles: {smiles}')
        name = row[name_col]

        try:
            # Parsing and hydrogen addition are inside the try block as well, so a
            # single unparsable (or missing) SMILES entry does not abort the batch.
            mol = pybel.readstring("smi", smiles)
            mol.addh()
            print('Starting conversion')
            mol.make3D(forcefield="uff")
            # if more optimization steps are necessary
            #mol.localopt(forcefield='uff')
            xyz_name = f'{name}.xyz'
            save_xyz(mol, xyz_name, out_dir)
            print(f'Conversion for Smiles {smiles} finished')
        except Exception as e:
            # Deliberately broad: whatever goes wrong for one molecule is logged
            # and the loop carries on with the next row.
            print(f"Conversion failed for SMILES={smiles!r} (idx: {idx}): {e}")
            idx_failed.append(idx)

    print('Completed the conversion to .xyz-files.')
    print(f'Indices of molecules with failed conversion:\n{idx_failed}')


def save_xyz(inp_mol, name_col, output_path):
    """
    Saves a molecule object to a .xyz file.

    Parameters:
        inp_mol (pybel.Molecule): pybel.Molecule (wrapper for openbabel.OBMol) object to be written as a .xyz file.
        name_col (str): File name of the .xyz file (including the extension). Ideally something that can be matched later to a row in a dataframe.
        output_path (str): Directory in which the new .xyz file should be saved.
            It is created if it does not exist yet.
    """

    # An empty output_path means "current working directory" for os.path.join;
    # os.makedirs('') would raise, so only create non-empty paths.
    if output_path:
        os.makedirs(output_path, exist_ok=True)

    out_path = os.path.join(output_path, name_col)
    # overwrite=True: without it pybel refuses to replace an existing file, which
    # makes every molecule fail when the notebook is run a second time.
    inp_mol.write("xyz", out_path, overwrite=True)


def gen_clean_name(df, raw_col_name, clean_col_name):
    """
    Replaces the special characters of every string inside raw_col_name and saves the cleaned string in clean_col_name.

    Each raw name is stripped of surrounding whitespace and lower-cased; every run
    of characters other than ``0-9`` and ``a-z`` is replaced by a single
    underscore, and leading/trailing underscores are removed.

    The column is written onto ``df`` itself (in place) and assigned by position,
    so rows that share an index label each keep their own cleaned name.

    Parameters:
        df (pandas.DataFrame): DataFrame containing the raw_col_name column.
        raw_col_name (str): Name of the column containing the raw names with special characters.
        clean_col_name (str): Name of the column for saving the newly generated names.

    Returns:
        df (pandas.DataFrame): The same DataFrame object with the new column containing the cleaned names.

    Raises:
        AttributeError: If a value in raw_col_name is not a string (e.g. NaN).
    """

    def _clean(rawname):
        # One pass is enough: the greedy character class already collapses any run
        # of special characters (underscores included) into a single '_'.
        s = rawname.strip().lower()
        s = re.sub(r'[^0-9a-z]+', '_', s)
        return s.strip('_')

    # Positional assignment of the whole column instead of per-label df.loc writes:
    # label-based writes hit every row with the same index label, and they never
    # create the column when the frame is empty.
    df[clean_col_name] = [_clean(rawname) for rawname in df[raw_col_name]]
    return df

#### In the following cell a subset of the QM8 dataset is created, to limit the computing time.  

Therefore, only 12.5% of the original QM8 dataset is retained using the pandas *sample* method (`random_state=42` makes the sampling reproducible).  
Afterwards an identifier column is created to name the .xyz files that will be created later in this notebook.

In [5]:
# Load the full QM8 table
df = pd.read_csv('./data_files/qm8.csv')

# Draw a reproducible 12.5% sample (fixed `random_state=42`) and continue on a
# copy, so the sampled frame itself stays unmodified
df_sampled = df.sample(frac=0.125, random_state=42)
df_id_col = df_sampled.copy()

# Identifier column: the row's original index label, stored as a string
df_id_col['mol_id'] = [f"{label}" for label in df_id_col.index]

# Persist the subset (index included) as a .csv file
df_id_col.to_csv('./data_files/qm8_subset.csv')

# Display the resulting dataframe
df_id_col

Unnamed: 0,smiles,E1-CC2,E2-CC2,f1-CC2,f2-CC2,E1-PBE0,E2-PBE0,f1-PBE0,f2-PBE0,E1-PBE0.1,E2-PBE0.1,f1-PBE0.1,f2-PBE0.1,E1-CAM,E2-CAM,f1-CAM,f2-CAM,mol_id
19900,[H]C1([H])C2([H])C([H])([H])C3([H])C4([H])C1([...,0.268963,0.270580,8.294710e-03,0.005270,0.278046,0.293753,0.003283,0.007678,0.278046,0.293753,0.003283,0.007678,0.272772,0.276531,0.0051,0.0066,19900
19467,[H]C([H])([H])C1([H])OC2([H])C3([H])C([H])([H]...,0.254242,0.262824,9.326290e-03,0.008601,0.271510,0.277585,0.002396,0.001899,0.271510,0.277585,0.002396,0.001899,0.266907,0.268965,0.0099,0.0034,19467
13787,[H]OC([H])(C([H])([H])[H])C12OC1([H])C([H])([H...,0.266831,0.280019,1.391910e-03,0.025544,0.278261,0.288864,0.000343,0.013774,0.278261,0.288864,0.000343,0.013774,0.272970,0.285781,0.0048,0.0127,13787
8094,[H]OC1([H])C([H])([H])C12OC(=O)C2([H])[H],0.227932,0.270654,5.977400e-04,0.092588,0.220892,0.263470,0.001079,0.038702,0.220892,0.263470,0.001079,0.038702,0.220059,0.271188,0.0006,0.0010,8094
17960,[H]C1=C2C([H])([H])OC2([H])C(=O)C1([H])[H],0.157602,0.201839,1.945700e-04,0.020764,0.152441,0.198269,0.000251,0.001670,0.152441,0.198269,0.000251,0.001670,0.154726,0.212878,0.0001,0.0061,17960
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3268,[H]OC([H])([H])C1([H])OC2([H])C([H])([H])C12[H],0.255623,0.275348,4.305030e-03,0.013750,0.268542,0.285748,0.001410,0.010069,0.268542,0.285748,0.001410,0.010069,0.262291,0.278538,0.0041,0.0100,3268
11799,[H]C([H])([H])C1(C2(C([H])([H])[H])C([H])([H])...,0.288495,0.293907,1.383104e-02,0.001494,0.304986,0.305876,0.002014,0.004446,0.304986,0.305876,0.002014,0.004446,0.297206,0.299797,0.0055,0.0093,11799
10904,[H]C(=O)C1([H])OC(=O)N([H])C1([H])[H],0.159391,0.245802,3.090000e-05,0.002251,0.154084,0.202687,0.000032,0.002005,0.154084,0.202687,0.000032,0.002005,0.154432,0.231818,0.0000,0.0021,10904
11474,[H]C([H])([H])C([H])([H])C12OC([H])([H])C([H])...,0.257015,0.266735,3.009810e-03,0.000772,0.272807,0.287922,0.000796,0.000402,0.272807,0.287922,0.000796,0.000402,0.270979,0.282000,0.0013,0.0002,11474


#### Generating the .xyz files from the SMILES-strings using the `gen_xyz_pybel` function.

In [None]:
# Reload the subset that was written to disk
df_xyz_conv = pd.read_csv('./data_files/qm8_subset.csv')

# One .xyz file per row: named after 'mol_id', built from the 'smiles' column
gen_xyz_pybel(
    df=df_xyz_conv,
    name_col='mol_id',
    smiles_col='smiles',
    out_dir='./data_files/subset_xyz_qm8/',
)

Converting Smiles: [H]C1([H])C2([H])C([H])([H])C3([H])C4([H])C1([H])C2([H])C34[H]
Starting conversion
Conversion for Smiles [H]C1([H])C2([H])C([H])([H])C3([H])C4([H])C1([H])C2([H])C34[H] finished
Converting Smiles: [H]C([H])([H])C1([H])OC2([H])C3([H])C([H])([H])C3([H])C12[H]
Starting conversion
Conversion for Smiles [H]C([H])([H])C1([H])OC2([H])C3([H])C([H])([H])C3([H])C12[H] finished
Converting Smiles: [H]OC([H])(C([H])([H])[H])C12OC1([H])C([H])([H])C2([H])[H]
Starting conversion
Conversion for Smiles [H]OC([H])(C([H])([H])[H])C12OC1([H])C([H])([H])C2([H])[H] finished
Converting Smiles: [H]OC1([H])C([H])([H])C12OC(=O)C2([H])[H]
Starting conversion
Conversion for Smiles [H]OC1([H])C([H])([H])C12OC(=O)C2([H])[H] finished
Converting Smiles: [H]C1=C2C([H])([H])OC2([H])C(=O)C1([H])[H]
Starting conversion
Conversion for Smiles [H]C1=C2C([H])([H])OC2([H])C(=O)C1([H])[H] finished
Converting Smiles: [H]OC1([H])C([H])([H])C([H])([H])C2(C([H])([H])[H])N([H])C12[H]
Starting conversion
Conversion 

## Extracting the min/max number of atoms per molecule and the unique atomic species inside the QM8 subset.

In [8]:
xyz_path = './data_files/subset_xyz_qm8/'
# os.listdir returns entries in arbitrary order; sorting keeps the order of
# `all_molecules` reproducible between runs and machines
xyz_files = sorted(f for f in os.listdir(xyz_path) if f.endswith('.xyz'))


# Getting the maximum and minimum number of atoms over all molecules inside the QM8 subset
# Files that cannot be read are reported and skipped
all_molecules = []
for xyz in xyz_files:
    file_path = os.path.join(xyz_path, xyz)
    try:
        xyz_mol = read(file_path)
    except Exception as e:
        print(f"Molecule {xyz} could not be read properly: {e}")
        continue
    if xyz_mol is not None:
        all_molecules.append(xyz_mol)

# Fail with a clear message instead of numpy's "zero-size array" error
if not all_molecules:
    raise ValueError(f'No readable .xyz files found in {xyz_path!r}.')

# Atom counts are taken from the .xyz files as written, i.e. every atom listed
# in a file is counted
n_atoms_per_molecule = [mol.get_global_number_of_atoms() for mol in all_molecules]
value_n_atoms_max = np.max(n_atoms_per_molecule)
value_n_atoms_min = np.min(n_atoms_per_molecule)
print(f'Maximum ({value_n_atoms_max}) and minimum ({value_n_atoms_min}) number of atoms in the QM8 subset.')


# Getting the unique atomic species from the QM8 subset
# 'H' is seeded by hand -- presumably because RDKit's default SMILES parsing does
# not keep the explicit hydrogens as separate atoms (TODO confirm).
# A set literal is used on purpose: set('H') only works because the string has a
# single character and would split a symbol such as 'Cl' into {'C', 'l'}.
unique_atoms = {'H'}
for smi in df_id_col['smiles']:
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        print(f"WARNING: could not parse SMILES {smi}")
        continue
    unique_atoms.update(atom.GetSymbol() for atom in mol.GetAtoms())

print("Unique atomic species found:", unique_atoms)

Maximum (26) and minimum (5) number of atoms in the QM8 subset.
Unique atomic species found: {'O', 'F', 'C', 'N', 'H'}
