# Program to get RDKit features and add them to the main dataset

Version: 0.0.0

## Importing libraries

In [1]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import Descriptors
from tqdm import tqdm
import json
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*') 

## Initialising function to get features

In [2]:
def rdkitDescriptorsDict(inchi):
    """ 
    Function to fetch values for every available RDKit descriptor for a given InChI string.
    
    Input:
    inchi (str): InChI string for a molecule
    
    Output:
    descDict (dict): Dictionary with descriptor names as keys and descriptor values as values"""

    descDict = {}
    mol = Chem.MolFromInchi(inchi, sanitize=False, removeHs=False) #Converting InChI to mol object
    try:
        if mol is None: #If mol object could not be generated
            print(f"Could not generate mol object from InChI: {inchi}")
            raise ValueError("Could not generate mol object from InChI")
        
        for descName, descFn in Descriptors.descList: #Iterating over all available descriptors
            descDict[descName] = descFn(mol)

    except (ValueError, AttributeError) as e: #If any error occurs, insert nan
        for descName, descFn in Descriptors.descList:
            descDict[descName] = np.nan

    return descDict

In [3]:
vers = "0.5.0"

df = pd.read_csv("../Data/Combined/0.5.0-CleanedMaster.csv")
df.dropna(subset="InChI", inplace=True)
print(df.shape)

inchiList = np.unique(df["InChI"].tolist())

try: 
    descDF = pd.read_csv(f"../Data/Combined/LargeFiles/ExtractedDescriptors/{vers}-RDKitDescriptors.csv")
    print("Descriptors found!")
except:
    print("Fetching descriptors.")
    compoundDict = {}; errors = [] #Initalising 
    for inchi in tqdm(inchiList): #Iterating over all InChI strings
        try:
            compoundDict[inchi] = rdkitDescriptorsDict(inchi) #Fetching descriptor values
        except:
            compoundDict[inchi] = np.nan
            errors.append(inchi)
            print(f"Total errors: {len(errors)}")

    print(len(errors))
    # for i in range(len(errors)): #Removing InChI strings that could not be converted to mol object
    #     del compoundDict[errors[i]]

    descDF = pd.DataFrame(compoundDict).T #Saving as csv
    descDF.reset_index(inplace=True)
    descDF.rename(columns={"index": "InChI"}, inplace=True)
    descDF.to_csv(f"../Data/Combined/LargeFiles/ExtractedDescriptors/{vers}-RDKitDescriptors.csv", index=False)
    print(descDF.shape)

(21291, 9)
Fetching descriptors.


100%|██████████| 18006/18006 [01:47<00:00, 167.61it/s]


Total errors: 1
Total errors: 2
Total errors: 3
3
(18006, 211)


In [4]:
print(df.shape)
dfInChI = df["InChI"].tolist()
rowList = []

for inchi in tqdm(dfInChI):
    try:
        rowList.append(descDF.loc[descDF["InChI"] == inchi].to_dict(orient="records")[0])
    except:
        rowList.append(np.nan)

output = pd.DataFrame.from_dict(rowList)
del output["InChI"] #remove duplicate col
output = pd.concat([df, output], axis=1)

output.to_csv(f"../Data/Combined/{vers}-Master+RDKit.csv", index=False)

(21291, 9)


100%|██████████| 21291/21291 [02:42<00:00, 131.16it/s]
