# ECFP4 featurisation

In [33]:
from rdkit.Chem import PandasTools, MolFromSmiles, AllChem
import pandas as pd

In [34]:
# Load dataframe from path
df = pd.read_csv("/Users/sethhowes/Desktop/FS-Tox/data/external/toxcast.csv")

In [36]:
# Add molecule object column to df
PandasTools.AddMoleculeColumnToFrame(df,"smiles", "molecule", includeFingerprints=True)

[15:10:25] Explicit valence for atom # 0 F, 2, is greater than permitted
[15:10:26] Explicit valence for atom # 2 Cl, 2, is greater than permitted
[15:10:26] Explicit valence for atom # 0 Cl, 2, is greater than permitted
[15:10:26] Explicit valence for atom # 3 Si, 8, is greater than permitted
[15:10:26] Explicit valence for atom # 3 Si, 8, is greater than permitted
[15:10:26] SMILES Parse Error: syntax error while parsing: FAIL
[15:10:26] SMILES Parse Error: Failed parsing SMILES 'FAIL' for input: 'FAIL'
[15:10:27] SMILES Parse Error: syntax error while parsing: FAIL
[15:10:27] SMILES Parse Error: Failed parsing SMILES 'FAIL' for input: 'FAIL'
[15:10:27] SMILES Parse Error: syntax error while parsing: FAIL
[15:10:27] SMILES Parse Error: Failed parsing SMILES 'FAIL' for input: 'FAIL'
[15:10:27] SMILES Parse Error: syntax error while parsing: FAIL
[15:10:27] SMILES Parse Error: Failed parsing SMILES 'FAIL' for input: 'FAIL'
[15:10:27] SMILES Parse Error: syntax error while parsing: FAIL

In [43]:
print(df.isna().sum())

smiles                                 0
ACEA_T47D_80hr_Negative             6850
ACEA_T47D_80hr_Positive             6850
APR_HepG2_CellCycleArrest_24h_dn    7548
APR_HepG2_CellCycleArrest_24h_up    7548
                                    ... 
Tanguay_ZF_120hpf_SWIM_up           7561
Tanguay_ZF_120hpf_TRUN_up           7561
Tanguay_ZF_120hpf_TR_up             7561
Tanguay_ZF_120hpf_YSE_up            7561
molecule                               5
Length: 619, dtype: int64


In [40]:
df[df['molecule'].isna()]

Unnamed: 0,smiles,ACEA_T47D_80hr_Negative,ACEA_T47D_80hr_Positive,APR_HepG2_CellCycleArrest_24h_dn,APR_HepG2_CellCycleArrest_24h_up,APR_HepG2_CellCycleArrest_72h_dn,APR_HepG2_CellLoss_24h_dn,APR_HepG2_CellLoss_72h_dn,APR_HepG2_MicrotubuleCSK_24h_dn,APR_HepG2_MicrotubuleCSK_24h_up,...,Tanguay_ZF_120hpf_PE_up,Tanguay_ZF_120hpf_PFIN_up,Tanguay_ZF_120hpf_PIG_up,Tanguay_ZF_120hpf_SNOU_up,Tanguay_ZF_120hpf_SOMI_up,Tanguay_ZF_120hpf_SWIM_up,Tanguay_ZF_120hpf_TRUN_up,Tanguay_ZF_120hpf_TR_up,Tanguay_ZF_120hpf_YSE_up,molecule
1039,[F-][B+3]([F-])([F-])[F-].CC[N+]1(C)CCCC1,,,,,,,,,,...,,,,,,,,,,
1789,[NH4+].[NH4+].[Cl-][Pt++]([Cl-])([Cl-])[Cl-],,,,,,,,,,...,,,,,,,,,,
1881,[Cl-][Pt]1([Cl-])[NH2+]CC[NH2+]1,,,,,,,,,,...,,,,,,,,,,
2456,[Na+].[Na+].F[Si--](F)(F)(F)(F)F,0.0,0.0,,,,,,,,...,,,,,,,,,,
2463,[NH4+].[NH4+].F[Si--](F)(F)(F)(F)F,,,,,,,,,,...,,,,,,,,,,
4175,FAIL,,,,,,,,,,...,,,,,,,,,,
4781,FAIL,,,,,,,,,,...,,,,,,,,,,
5098,FAIL,0.0,1.0,,,,,,,,...,,,,,,,,,,
5993,FAIL,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
6160,FAIL,,,,,,,,,,...,,,,,,,,,,


There are approx 15 observations where the SMILE is marked as FAIL.

I need to remove these.

In [41]:
# Remove rows where molecule value is FAIL
df.drop(df[df['smiles'] == "FAIL"].index, inplace=True)

The remaining error messages indicate there are 5 molecules for which the valence is incorrect (e.g. one molecule has a fluorine atom bonded to two other atoms, which is greater than its expected valence of 1).

I manually searched the PubChem database for each of the remaining SMILES. The first three molecules in the table above returned no match, with the latter two matching Sodium Hexafluorosilicate and Ammonium Hexafluorosilicate respectively. Considering how only two of the 8252 different molecules were identifiable manually, I think it is best to just discard these as part of the pipeline.

In [42]:
# Remove variables with NA for molecule
df.drop(df[df['molecule'] == None].index, inplace=True)

In [21]:
def mol_to_ecfp4(mol):
    if mol is None:  # If RDKit couldn't parse the SMILES string
        return None
    else:
        ecfp4 = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=1024)
        return list(map(int, ecfp4.ToBitString()))  # Convert the BitVector to a Python list of ints

# Apply the function to your DataFrame
df['ECFP4'] = df['molecule'].apply(mol_to_ecfp4)

In [22]:
df['ECFP4']

0       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
1       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
2       [0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...
3       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
4       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
                              ...                        
8592    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
8593    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
8594    [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
8595    [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
8596    [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
Name: ECFP4, Length: 8582, dtype: object

# Refactor code

I have combined the creation of a molecule object, then the derivation of the ECFP4 fingerprint into a single function.

In [32]:
# Function to convert SMILES to ECFP4
def smiles_to_ecfp4(smiles_string):
    mol = Chem.MolFromSmiles(smiles_string)
    if mol is None:  # If RDKit couldn't parse the SMILES string
        return None
    else:
        ecfp4 = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=1024)
        return list(map(int, ecfp4.ToBitString()))  # Convert the BitVector to a Python list of ints

# Apply the function to dataframe
df['ECFP4'] = df['smiles'].apply(smiles_to_ecfp4)

# Remove variables with NA for molecule
df.drop(df[df['ECFP4'] == None].index, inplace=True)

[15:09:57] Explicit valence for atom # 0 F, 2, is greater than permitted
[15:09:57] Explicit valence for atom # 2 Cl, 2, is greater than permitted
[15:09:57] Explicit valence for atom # 0 Cl, 2, is greater than permitted
[15:09:57] Explicit valence for atom # 3 Si, 8, is greater than permitted
[15:09:57] Explicit valence for atom # 3 Si, 8, is greater than permitted
[15:09:58] SMILES Parse Error: syntax error while parsing: FAIL
[15:09:58] SMILES Parse Error: Failed parsing SMILES 'FAIL' for input: 'FAIL'
[15:09:58] SMILES Parse Error: syntax error while parsing: FAIL
[15:09:58] SMILES Parse Error: Failed parsing SMILES 'FAIL' for input: 'FAIL'
[15:09:58] SMILES Parse Error: syntax error while parsing: FAIL
[15:09:58] SMILES Parse Error: Failed parsing SMILES 'FAIL' for input: 'FAIL'
[15:09:58] SMILES Parse Error: syntax error while parsing: FAIL
[15:09:58] SMILES Parse Error: Failed parsing SMILES 'FAIL' for input: 'FAIL'
[15:09:58] SMILES Parse Error: syntax error while parsing: FAIL