# Data enrichment

Now, let's obtain the chemical descriptors. For this, the [RDKit](https://github.com/rdkit/rdkit) collection of cheminformatics tools will be used.

In [1]:
# Importing libraries
import os
from rdkit import Chem
from rdkit.Chem import Descriptors
import pandas as pd

In [2]:
def rdkit_descriptors():
    '''
    Collect the molecular descriptor functions exposed by the RDKit
    ``Descriptors`` module.

    Returns
    -------
    dict
        Mapping of descriptor name to the callable stored under that name in
        ``Descriptors``, ordered alphabetically by name.
    '''

    # Public names in the Descriptors namespace that are helpers rather than
    # single per-molecule descriptors. The private helpers of the original
    # list are kept for documentation; they are also covered by the
    # underscore rule below.
    excluded_names = {
        '_FingerprintDensity',
        '_isCallable',
        '_runDoctests',
        '_setupDescriptors',
        'setupAUTOCorrDescriptors',
        '_ChargeDescriptors',
        # Present in newer RDKit releases: computes every descriptor at once
        # and returns a dict, so it must not be treated as one descriptor.
        'CalcMolDescriptors',
    }

    methods = {}
    for name in sorted(dir(Descriptors)):
        # Underscore-prefixed names are private helpers whose set changes
        # between RDKit versions; skipping all of them keeps the function
        # working without having to extend the list for every release.
        if name.startswith('_') or name in excluded_names:
            continue

        candidate = getattr(Descriptors, name)
        # The check is on the type's *name*, not isinstance(): any object
        # whose type is called "function" qualifies.
        # NOTE(review): presumably this is what lets compiled RDKit functions
        # through as well as pure-Python ones - confirm before tightening.
        if type(candidate).__name__ == "function":
            methods[name] = candidate

    return methods

In [3]:
def descriptors_for_chemical(SMILES):
    '''
    Compute every available RDKit descriptor for a single molecule.

    Parameters
    ----------
    SMILES : str
        SMILES representation of the molecule.

    Returns
    -------
    dict or None
        ``{descriptor_name: [value]}`` with one single-element list per
        descriptor, so the dict can be passed straight to ``pd.DataFrame``.
        The element is ``None`` when the descriptor raised
        ``ZeroDivisionError`` for this molecule.
        ``None`` is returned instead of a dict when ``SMILES`` is not a
        string or when ``Chem.MolFromSmiles`` returns ``None`` for it.
    '''

    # A non-string (for example the float NaN pandas uses for a missing
    # value) cannot be a SMILES, so it is reported like an unparseable one
    # instead of being handed to RDKit.
    if not isinstance(SMILES, str):
        return None

    # Molecule from SMILES
    molecule = Chem.MolFromSmiles(SMILES)
    if molecule is None:
        return None

    # Molecular descriptors
    descriptors = {}
    for descriptor_name, descriptor_func in rdkit_descriptors().items():
        try:
            value = descriptor_func(molecule)
        except ZeroDivisionError:
            # Descriptor is undefined for this molecule; keep the column but
            # leave the value empty.
            value = None
        # Always a one-element list, including for failures, so every value
        # has the same shape.
        descriptors[descriptor_name] = [value]

    return descriptors

In [4]:
def information_for_set_of_chems(
                                 col_id,
                                 df_chems,
                                 smiles_col='smiles'
                                 ):
    '''
    Attach molecular descriptors to every chemical of a dataframe.

    Parameters
    ----------
    col_id : str
        Name of the column that identifies a chemical; it is used as the
        merge key between the descriptors and ``df_chems``.
    df_chems : pandas.DataFrame
        One row per chemical. Must contain ``col_id`` and ``smiles_col``.
    smiles_col : str, optional
        Name of the column holding the SMILES strings (default ``'smiles'``).

    Returns
    -------
    pandas.DataFrame
        Right merge of the descriptors onto ``df_chems``: every input row is
        kept, and rows for which ``descriptors_for_chemical`` returned
        ``None`` have missing values in the descriptor columns. When no row
        yields descriptors, a copy of ``df_chems`` is returned unchanged.
    '''

    # One single-row frame per chemical with descriptors. They are collected
    # in a list and concatenated once, instead of re-copying a growing
    # dataframe on every iteration.
    frames = []
    for smiles, chem_id in zip(df_chems[smiles_col], df_chems[col_id]):
        descriptors = descriptors_for_chemical(smiles)
        if descriptors is None:
            continue
        # index=[0] makes the construction valid whether the descriptor
        # values are one-element lists or scalars.
        frames.append(pd.DataFrame({**descriptors, col_id: chem_id},
                                   index=[0]))

    # Nothing to merge: an empty descriptor frame has no `col_id` column and
    # pd.merge would fail with a KeyError.
    if not frames:
        return df_chems.copy()

    df_descriptors = pd.concat(frames, ignore_index=True)

    # Merging descriptors and input parameters
    return pd.merge(df_descriptors,
                    df_chems,
                    how='right',
                    on=col_id)

In [5]:
# Load the dataset produced by the EDA step; the path is resolved relative
# to the current working directory.
# NOTE(review): the preview further down shows an 'Unnamed: 0' column, which
# looks like a saved index - confirm whether it should be kept.
eda_dataset_path = os.path.join(
    os.getcwd(), os.pardir, 'data', 'transformed', 'dataset_after_eda.csv'
)
df = pd.read_csv(eda_dataset_path)

In [6]:
# Keep only the records that have a SMILES value
has_smiles = df['smiles'].notna()
df = df[has_smiles]

In [7]:
# Show the first rows of the filtered dataset as a quick sanity check
df.head()

Unnamed: 0,source_reduction_general_category,description_code,2_digit_naics,smiles,cas_number
0,Good Operating Practices,"greater than or equal 5%, but less than to 15%","Mining, Quarrying, and Oil and Gas Extraction",C=O,50-00-0
1,Good Operating Practices,"greater than or equal 5%, but less than to 15%","Mining, Quarrying, and Oil and Gas Extraction",C(C(CO[N+](=O)[O-])O[N+](=O)[O-])O[N+](=O)[O-],55-63-0
2,Good Operating Practices,"greater than or equal 5%, but less than to 15%","Mining, Quarrying, and Oil and Gas Extraction",C1=CC=C(C=C1)C2(C(=O)NC(=O)N2)C3=CC=CC=C3,57-41-0
3,Good Operating Practices,"greater than or equal 5%, but less than to 15%","Mining, Quarrying, and Oil and Gas Extraction",C1=CC=C(C=C1)N,62-53-3
4,Good Operating Practices,"greater than or equal 5%, but less than to 15%","Mining, Quarrying, and Oil and Gas Extraction",COP(=O)(OC)OC=C(Cl)Cl,62-73-7


In [9]:
# Build the table of unique (SMILES, CAS number) pairs, then remove the
# SMILES column from the main dataset; it comes back with the descriptors
# when both tables are merged on the CAS number.
df_chem = (
    df[['smiles', 'cas_number']]
    .drop_duplicates(keep='first')
    .reset_index(drop=True)
)
df.drop(columns=['smiles'], inplace=True)

In [10]:
# Compute the descriptors for each unique chemical, keyed by CAS number
df_chem = information_for_set_of_chems(col_id='cas_number', df_chems=df_chem)

In [11]:
# Attach the per-chemical descriptors to every record through the CAS number
df = df.merge(df_chem, how='inner', on='cas_number')

In [13]:
# The chemical identifiers were only needed to join the descriptors
df.drop(columns=['smiles', 'cas_number'], inplace=True)

In [None]:
# Write the enriched dataset as a zip-compressed CSV without the index
enriched_dataset_path = os.path.join(
    os.getcwd(), os.pardir, 'data', 'transformed',
    'dataset_after_enrichment.csv.zip'
)
df.to_csv(enriched_dataset_path, compression='zip', index=False)