In [1]:
import rdkit # used to import various descriptors
from rdkit import Chem
from rdkit.Chem import Descriptors 

import pandas as pd # for making data frames
import numpy as np # for creation of arrays
import csv # outputs csv files

In [2]:
def check_group(chain, group):
    '''

    check_group:
    ------------

    Looks for a substring inside a string and reports where it starts and whether it was found.

    Args:
    -----

    chain: string to search in
    group: substring to search for

    Returns:
    --------

    A tuple (pos, group_exists):

    pos: index at which the first occurrence of group starts, or np.nan when group is absent
    group_exists: True when group occurs in chain, False otherwise

    '''

    try:
        start = chain.index(group)     # raises ValueError when the substring is absent
    except ValueError:
        return (np.nan, False)         # NaN is the "not found" sentinel for the position

    return (start, True)

In [3]:
# Reads in the critical micelle concentration (CMC) table; values are separated by a comma.
# The file is opened in a `with` block so the handle is closed as soon as pandas has
# parsed it (a bare open() passed to read_csv is never closed).
with open("Submit_Data/output_smiles.csv") as smiles_csv:
    cmc_data = pd.read_csv(smiles_csv, delimiter=",")

# Each column of interest is pulled out into its own plain Python list, one entry per row.
func_group = cmc_data["Surfactant"].tolist()        # head group info
chain = cmc_data["Chain"].tolist()                  # chain label
smiles = cmc_data["Smiles"].tolist()                # smiles string of the whole molecule
smiles_head = cmc_data["Head Smiles"].tolist()      # smiles string of head group
smiles_tail = cmc_data["Tail Smiles"].tolist()      # smiles string of tail group
carbon_length = cmc_data["Carbon Length"].tolist()  # carbon chain length
ethoxy_length = cmc_data["Ethoxy Length"].tolist()  # ethoxylate group length
counter_ion = cmc_data["Counter Ion"].tolist()      # counter ion
cmc_vals = cmc_data["CMC / mM"].tolist()            # CMC data

ph_exists = []  # created empty here; nothing in this cell fills it

# natural log of every CMC value, kept in the same row order as cmc_vals
ln_cmc = [np.log(cmc_i) for cmc_i in cmc_vals]

In [4]:
# Builds the tail group label for every chain entry (e.g. C8) by cutting the chain
# string off where the head-group marker starts.
tail_group = []

for chain_label in chain:
    e_start, has_e = check_group(chain_label, "E")     # ethoxylate chain considered the head group
    py_start, has_py = check_group(chain_label, "Py")

    # "E" takes priority over "Py"; with neither marker the label is kept whole
    if has_e:
        cut_at = e_start
    elif has_py:
        cut_at = py_start
    else:
        cut_at = None                                  # slicing with [:None] keeps the full string

    tail_group.append(chain_label[:cut_at])

In [5]:
# Converts each head and tail SMILES string into an RDKit mol object, one list per group.
mol_list_head = [Chem.MolFromSmiles(smiles_str) for smiles_str in smiles_head]
mol_list_tail = [Chem.MolFromSmiles(smiles_str) for smiles_str in smiles_tail]

# Chem.MolFromSmiles reports a SMILES it cannot parse by returning None instead of raising.
# Stop here and name the offending rows, rather than letting a None reach the descriptor
# calculations in later cells where the resulting error is much harder to trace.
for column_name, smiles_strings, mol_objects in (("Head Smiles", smiles_head, mol_list_head),
                                                 ("Tail Smiles", smiles_tail, mol_list_tail)):
    unparsed = [(row_index, smiles_str)
                for row_index, (smiles_str, mol) in enumerate(zip(smiles_strings, mol_objects))
                if mol is None]
    if unparsed:
        raise ValueError("RDKit could not parse {} entries in '{}' (row index, SMILES): {}"
                         .format(len(unparsed), column_name, unparsed))

In [6]:
# Applies RDKit's ExactMolWt descriptor to every head and tail mol object; results keep row order.
# NOTE(review): ExactMolWt is presumably the exact (monoisotopic) mass rather than the average
# molecular weight - confirm this is the quantity wanted for the "MW" columns.
mol_weight_head = list(map(Descriptors.ExactMolWt, mol_list_head))
mol_weight_tail = list(map(Descriptors.ExactMolWt, mol_list_tail))

In [7]:
# Applies RDKit's NumValenceElectrons descriptor to every head and tail mol object;
# each result list has one entry per mol, in the same order as the mol lists.
val_elec_head = list(map(Descriptors.NumValenceElectrons, mol_list_head))
val_elec_tail = list(map(Descriptors.NumValenceElectrons, mol_list_tail))

In [8]:
# Calculates logP for every head and tail mol object and puts the values into new lists.
# MolLogP is reached through the explicitly imported Descriptors module: this notebook never
# imports rdkit.Chem.Crippen itself, so spelling it rdkit.Chem.Crippen.MolLogP only works
# when some other RDKit module happens to have loaded that submodule already.
logp_head = [Descriptors.MolLogP(elem) for elem in mol_list_head]
logp_tail = [Descriptors.MolLogP(elem) for elem in mol_list_tail]

In [9]:
# Counts the aromatic rings in every head and tail mol object and puts the counts into new lists.
# NumAromaticRings is reached through the explicitly imported Descriptors module: this notebook
# never imports rdkit.Chem.Lipinski itself, so spelling it rdkit.Chem.Lipinski.NumAromaticRings
# only works when some other RDKit module happens to have loaded that submodule already.
aromatic_rings_head = [Descriptors.NumAromaticRings(elem) for elem in mol_list_head]
aromatic_rings_tail = [Descriptors.NumAromaticRings(elem) for elem in mol_list_tail]

In [10]:
# Column names for the output csv file, in the same order as the values of each row below.
header = ["surfactant", "chain", "tail_group", "smiles", "head_smiles", "tail_smiles", "carbon_len", "ethoxy_len", 
          "aromatic_rings_tail", "counter_ion", "CMC", "MW_head", "MW_tail", "NVE_head", "NVE_tail", 
          "logP_head", "logP_tail"]

# Creates a new csv file holding the original data plus the calculated descriptors.
# newline="" is what the csv module asks for when opening its output file; without it
# every row is followed by a blank line on platforms that translate "\n" to "\r\n".
with open("Submit_Data/output_descript_sep.csv", "w", newline="") as out_file:
    writer = csv.writer(out_file)
    writer.writerow(header) # adds the headers as the first row in the file

    # The loop variable is deliberately NOT called func_group: reusing that name would
    # overwrite the func_group list with its last element, so re-running this cell (or
    # any later use of func_group) would see a single string instead of the list.
    for ind, surfactant in enumerate(func_group):
        current_row = [surfactant, chain[ind], tail_group[ind], smiles[ind], smiles_head[ind], smiles_tail[ind],
                       carbon_length[ind], ethoxy_length[ind], aromatic_rings_tail[ind], counter_ion[ind],
                       cmc_vals[ind], mol_weight_head[ind], mol_weight_tail[ind], val_elec_head[ind],
                       val_elec_tail[ind], logp_head[ind], logp_tail[ind]]

        writer.writerow(current_row) # writes the new row to the csv file