In [1]:
import pubchempy as pcp
import pandas as pd
import numpy as np

In [2]:
# Load a DES melting point dataset to use as an example.
# The path is relative, so 'melting_point.csv' must be in the notebook's working directory.
data = pd.read_csv('melting_point.csv')

In [3]:
data

Unnamed: 0,HBA,HBD,HBA Tmp (¡C),HBD Tmp (¡C),HBA_Ratio,HBD_Ratio,Mole Fraction of HBA,Mole Fraction of HBD,DES Tmp (¡C)
0,choline chloride,urea,302.0,134.00,1.0,2.0,0.333333,0.666667,12
1,choline chloride,thiourea,302.0,175.00,1.0,3.0,0.250000,0.750000,<10
2,choline chloride,trifluoroacetamide,302.0,72.00,1.0,2.0,0.333333,0.666667,-44
3,choline chloride,propanoic acid,302.0,-21.00,1.0,2.0,0.333333,0.666667,<-80
4,choline chloride,chloroacetic acid,302.0,63.00,1.0,2.0,0.333333,0.666667,<-80
5,choline chloride,trichloroacetic acid,302.0,58.00,1.0,2.0,0.333333,0.666667,<-80
6,choline chloride,phenylacetic acid,302.0,77.00,1.0,2.0,0.333333,0.666667,25
7,choline chloride,phenylpropanoic acid,302.0,48.00,1.0,2.0,0.333333,0.666667,20
8,choline chloride,malonic acid,302.0,135.00,1.0,1.0,0.500000,0.500000,10
9,choline chloride,glutaric acid,302.0,98.00,1.0,1.0,0.500000,0.500000,-16


### The goal here is to write some code/functions using the pubchempy library to retrieve the PubChem compound IDs (CIDs) for the chemicals in a dataset, which then allows us to retrieve several important properties from the PubChem database. Let's start by finding the CIDs for the HBA column.

Documentation for the pubchempy API can be found here: https://pubchempy.readthedocs.io/en/latest/

In [4]:
# Look up the PubChem CIDs for every chemical name in the HBA column.
# pcp.get_cids needs to be told how to interpret the identifier; here we search by chemical 'name'.
# Each lookup returns a list of CIDs (it may be empty or hold more than one CID),
# so cid_results is a list of lists with one entry per row of `data`.
cid_results = [
    pcp.get_cids(hba_name, 'name', list_return='flat')
    for hba_name in data['HBA']
]

In [5]:
cid_results

[[6209],
 [6209],
 [6209],
 [6209],
 [6209],
 [6209],
 [6209],
 [6209],
 [6209],
 [6209],
 [6209],
 [6209],
 [6209],
 [6209],
 [6209],
 [6209],
 [6209],
 [6209],
 [6209],
 [6209],
 [6209],
 [6209],
 [6209],
 [6209],
 [6209],
 [6209],
 [6209],
 [6209],
 [6209],
 [6209],
 [6209],
 [6209],
 [6209],
 [6209],
 [6209],
 [6209],
 [6209],
 [6209],
 [6209],
 [187],
 [187],
 [74724],
 [22134097],
 [13646546],
 [13836],
 [87940],
 [3014549],
 [3014549],
 [3014549],
 [3014549],
 [3014549],
 [3014549],
 [3014549],
 [6060],
 [6060],
 [6060],
 [6060],
 [6060],
 [6060],
 [6060],
 [6060],
 [6060],
 [6379],
 [6379],
 [6379],
 [11198, 521681],
 [11198, 521681],
 [11198, 521681],
 [68974],
 [68974],
 [517011],
 [517011],
 [517011],
 [82326],
 [5946],
 [5946],
 [5946],
 [5946],
 [5946],
 [5946],
 [5946],
 [6285],
 [6285],
 [6285],
 [6285],
 [12203661],
 [79880],
 [79880],
 [79880],
 [79880],
 [79880],
 [79880],
 [79880],
 [79880],
 [74745],
 [74745],
 [74745],
 [74745],
 [74745],
 [74745],
 [12199058],
 [1

Notice that the result is returned as a list of lists. Also, notice that for one chemical two CIDs were returned. Pubchempy is supposed to retrieve only a single CID per name, but sometimes synonyms for a chemical name aren't fully accounted for and the name ends up matching multiple PubChem entries. Assuming these multiple CIDs refer to the same chemical, we will discard all but the first CID retrieved. We can also add code that skips this step if only a single CID was returned, or fills the entry with "no cid found" if PubChem could not find a CID, which happens occasionally. Usually this is because the name searched for is an unlisted synonym.

In [6]:
# Reduce every lookup result to a single value: the first CID that was returned,
# or the placeholder "no cid found" when pubchempy returned an empty list.
HBA_cid = [
    hits[0] if len(hits) >= 1 else "no cid found"
    for hits in cid_results
]
        
    
        

In [7]:
HBA_cid

[6209,
 6209,
 6209,
 6209,
 6209,
 6209,
 6209,
 6209,
 6209,
 6209,
 6209,
 6209,
 6209,
 6209,
 6209,
 6209,
 6209,
 6209,
 6209,
 6209,
 6209,
 6209,
 6209,
 6209,
 6209,
 6209,
 6209,
 6209,
 6209,
 6209,
 6209,
 6209,
 6209,
 6209,
 6209,
 6209,
 6209,
 6209,
 6209,
 187,
 187,
 74724,
 22134097,
 13646546,
 13836,
 87940,
 3014549,
 3014549,
 3014549,
 3014549,
 3014549,
 3014549,
 3014549,
 6060,
 6060,
 6060,
 6060,
 6060,
 6060,
 6060,
 6060,
 6060,
 6379,
 6379,
 6379,
 11198,
 11198,
 11198,
 68974,
 68974,
 517011,
 517011,
 517011,
 82326,
 5946,
 5946,
 5946,
 5946,
 5946,
 5946,
 5946,
 6285,
 6285,
 6285,
 6285,
 12203661,
 79880,
 79880,
 79880,
 79880,
 79880,
 79880,
 79880,
 79880,
 74745,
 74745,
 74745,
 74745,
 74745,
 74745,
 12199058,
 16388,
 14249,
 14250,
 70681,
 70681,
 70681,
 70681,
 70681,
 70681,
 70681,
 70681,
 70681,
 70681,
 70681,
 70681,
 70681,
 70681,
 70681,
 70681,
 70681,
 70681,
 70681,
 74236,
 74236,
 74236,
 74236,
 74236,
 74236,
 7423

Now that we have all the CIDs, we can add them to the dataframe.

In [8]:
# Store the CIDs as a new column. This modifies `data` in place, and HBA_cid
# must contain exactly one entry per row of `data` for the assignment to succeed.
data['HBA_cid'] = HBA_cid

In [9]:
data

Unnamed: 0,HBA,HBD,HBA Tmp (¡C),HBD Tmp (¡C),HBA_Ratio,HBD_Ratio,Mole Fraction of HBA,Mole Fraction of HBD,DES Tmp (¡C),HBA_cid
0,choline chloride,urea,302.0,134.00,1.0,2.0,0.333333,0.666667,12,6209
1,choline chloride,thiourea,302.0,175.00,1.0,3.0,0.250000,0.750000,<10,6209
2,choline chloride,trifluoroacetamide,302.0,72.00,1.0,2.0,0.333333,0.666667,-44,6209
3,choline chloride,propanoic acid,302.0,-21.00,1.0,2.0,0.333333,0.666667,<-80,6209
4,choline chloride,chloroacetic acid,302.0,63.00,1.0,2.0,0.333333,0.666667,<-80,6209
5,choline chloride,trichloroacetic acid,302.0,58.00,1.0,2.0,0.333333,0.666667,<-80,6209
6,choline chloride,phenylacetic acid,302.0,77.00,1.0,2.0,0.333333,0.666667,25,6209
7,choline chloride,phenylpropanoic acid,302.0,48.00,1.0,2.0,0.333333,0.666667,20,6209
8,choline chloride,malonic acid,302.0,135.00,1.0,1.0,0.500000,0.500000,10,6209
9,choline chloride,glutaric acid,302.0,98.00,1.0,1.0,0.500000,0.500000,-16,6209


Now we will turn this into a function and test it by applying it to the HBD column of the dataset.

In [10]:
def get_cid(dataframe, source_column, new_column):
    """Retrieve the PubChem CID for each chemical name in a dataframe column.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table containing the chemical names. It is modified in place:
        ``new_column`` is added to (or overwritten in) this dataframe.
    source_column : str
        Name of the column holding the chemical names to search for.
    new_column : str
        Name of the column that will receive the CIDs.

    Returns
    -------
    pandas.DataFrame
        The same ``dataframe`` object that was passed in, now containing
        ``new_column``. Each entry is the first CID returned by pubchempy for
        the name, or the string "no cid found" when the name is missing or
        the search returned nothing.
    """

    no_match = "no cid found"
    resolved = {}    # name -> CID (or placeholder); each distinct name is searched only once
    final_cid = []   # one entry per row, in row order

    for name in dataframe[source_column]:

        # a missing name cannot be searched for, so it gets the placeholder directly
        if pd.isna(name):
            final_cid.append(no_match)
            continue

        if name not in resolved:
            # pubchempy needs to know how to interpret the identifier; here we search by chemical 'name'
            matches = pcp.get_cids(name, 'name', list_return='flat')
            # several CIDs may come back for one name; keep only the first
            resolved[name] = matches[0] if len(matches) >= 1 else no_match

        final_cid.append(resolved[name])

    dataframe[new_column] = final_cid

    return dataframe


In [11]:
# get_cid adds the 'HBD_cid' column to `data` in place and returns that same
# dataframe, so the returned value is simply displayed here without reassigning it.
get_cid(data, 'HBD', 'HBD_cid')

Unnamed: 0,HBA,HBD,HBA Tmp (¡C),HBD Tmp (¡C),HBA_Ratio,HBD_Ratio,Mole Fraction of HBA,Mole Fraction of HBD,DES Tmp (¡C),HBA_cid,HBD_cid
0,choline chloride,urea,302.0,134.00,1.0,2.0,0.333333,0.666667,12,6209,1176
1,choline chloride,thiourea,302.0,175.00,1.0,3.0,0.250000,0.750000,<10,6209,2723790
2,choline chloride,trifluoroacetamide,302.0,72.00,1.0,2.0,0.333333,0.666667,-44,6209,67717
3,choline chloride,propanoic acid,302.0,-21.00,1.0,2.0,0.333333,0.666667,<-80,6209,1032
4,choline chloride,chloroacetic acid,302.0,63.00,1.0,2.0,0.333333,0.666667,<-80,6209,300
5,choline chloride,trichloroacetic acid,302.0,58.00,1.0,2.0,0.333333,0.666667,<-80,6209,6421
6,choline chloride,phenylacetic acid,302.0,77.00,1.0,2.0,0.333333,0.666667,25,6209,999
7,choline chloride,phenylpropanoic acid,302.0,48.00,1.0,2.0,0.333333,0.666667,20,6209,107
8,choline chloride,malonic acid,302.0,135.00,1.0,1.0,0.500000,0.500000,10,6209,867
9,choline chloride,glutaric acid,302.0,98.00,1.0,1.0,0.500000,0.500000,-16,6209,743


### Now that we have the PubChem CIDs for all these chemicals, we can use them to find other valuable information in the PubChem database, most of which is detailed in other notebooks. Pubchempy also has a limited set of properties you can obtain relatively easily with a single command; for a full list of those properties, see the pubchempy API documentation.

Let's use pubchempy to get the molecular weights and molecular formulas of the HBA and HBD now that we have the CIDs.

In [57]:
def get_properties(dataframe, properties_list, source_column, name_prefix):
    """Look up PubChem properties by CID and return them alongside the input data.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table holding a column of PubChem CIDs. It is not modified.
    properties_list : list of str
        Property names to request, passed unchanged to ``pcp.get_properties``
        (e.g. ``['molecular_weight', 'molecular_formula']``).
    source_column : str
        Name of the column in ``dataframe`` that contains the CIDs.
    name_prefix : str
        Prefix added to the name of every new property column.

    Returns
    -------
    pandas.DataFrame
        A new dataframe: the columns of ``dataframe`` followed by one prefixed
        column per property returned by PubChem, on the same index as the
        input. Rows whose CID cannot be read as an integer (for example the
        "no cid found" placeholder) get missing values in the new columns.
    """

    fetched = {}   # CID -> {property name: value}; each distinct CID is requested only once
    records = []   # one property dict per input row, in row order

    for raw_cid in dataframe[source_column]:

        try:
            cid = int(raw_cid)
        except (TypeError, ValueError, OverflowError):
            # not a usable CID (placeholder text, None, NaN, ...): leave this row's properties empty
            records.append({})
            continue

        if cid not in fetched:
            # with as_dataframe=True the properties come back as a dataframe
            result = pcp.get_properties(properties_list, cid, as_dataframe=True)
            fetched[cid] = result.iloc[0].to_dict() if len(result) else {}

        records.append(fetched[cid])

    # build the property table once, directly on the input's index, so the
    # concatenation below lines rows up correctly even for a non-default index
    properties = pd.DataFrame(records, index=dataframe.index)

    # the CID is already present in source_column, so it is not repeated in the output
    properties = properties.drop(columns=['CID'], errors='ignore')

    properties = properties.add_prefix(name_prefix)

    return pd.concat([dataframe, properties], axis=1)

In [61]:
# Properties to request from PubChem for each CID.
properties_list = ['molecular_weight', 'molecular_formula']

# get_properties returns a dataframe with the new 'HBA_'-prefixed columns added.
# Because the result is bound back to `data`, re-running this cell adds the columns again.
data = get_properties(data, properties_list, 'HBA_cid', 'HBA_')


In [62]:
data

Unnamed: 0,HBA,HBD,HBA Tmp (¡C),HBD Tmp (¡C),HBA_Ratio,HBD_Ratio,Mole Fraction of HBA,Mole Fraction of HBD,DES Tmp (¡C),HBA_cid,HBD_cid,HBA_MolecularFormula,HBA_MolecularWeight
0,choline chloride,urea,302.0,134.00,1.0,2.0,0.333333,0.666667,12,6209,1176,C5H14ClNO,139.62
1,choline chloride,thiourea,302.0,175.00,1.0,3.0,0.250000,0.750000,<10,6209,2723790,C5H14ClNO,139.62
2,choline chloride,trifluoroacetamide,302.0,72.00,1.0,2.0,0.333333,0.666667,-44,6209,67717,C5H14ClNO,139.62
3,choline chloride,propanoic acid,302.0,-21.00,1.0,2.0,0.333333,0.666667,<-80,6209,1032,C5H14ClNO,139.62
4,choline chloride,chloroacetic acid,302.0,63.00,1.0,2.0,0.333333,0.666667,<-80,6209,300,C5H14ClNO,139.62
5,choline chloride,trichloroacetic acid,302.0,58.00,1.0,2.0,0.333333,0.666667,<-80,6209,6421,C5H14ClNO,139.62
6,choline chloride,phenylacetic acid,302.0,77.00,1.0,2.0,0.333333,0.666667,25,6209,999,C5H14ClNO,139.62
7,choline chloride,phenylpropanoic acid,302.0,48.00,1.0,2.0,0.333333,0.666667,20,6209,107,C5H14ClNO,139.62
8,choline chloride,malonic acid,302.0,135.00,1.0,1.0,0.500000,0.500000,10,6209,867,C5H14ClNO,139.62
9,choline chloride,glutaric acid,302.0,98.00,1.0,1.0,0.500000,0.500000,-16,6209,743,C5H14ClNO,139.62


We can repeat this for the HBD

In [63]:
# Same properties as requested for the HBA.
properties_list = ['molecular_weight', 'molecular_formula']

# get_properties returns a dataframe with the new 'HBD_'-prefixed columns added.
# Because the result is bound back to `data`, re-running this cell adds the columns again.
data = get_properties(data, properties_list, 'HBD_cid', 'HBD_')

In [64]:
data

Unnamed: 0,HBA,HBD,HBA Tmp (¡C),HBD Tmp (¡C),HBA_Ratio,HBD_Ratio,Mole Fraction of HBA,Mole Fraction of HBD,DES Tmp (¡C),HBA_cid,HBD_cid,HBA_MolecularFormula,HBA_MolecularWeight,HBD_MolecularFormula,HBD_MolecularWeight
0,choline chloride,urea,302.0,134.00,1.0,2.0,0.333333,0.666667,12,6209,1176,C5H14ClNO,139.62,CH4N2O,60.056
1,choline chloride,thiourea,302.0,175.00,1.0,3.0,0.250000,0.750000,<10,6209,2723790,C5H14ClNO,139.62,CH4N2S,76.120
2,choline chloride,trifluoroacetamide,302.0,72.00,1.0,2.0,0.333333,0.666667,-44,6209,67717,C5H14ClNO,139.62,C2H2F3NO,113.040
3,choline chloride,propanoic acid,302.0,-21.00,1.0,2.0,0.333333,0.666667,<-80,6209,1032,C5H14ClNO,139.62,C3H6O2,74.080
4,choline chloride,chloroacetic acid,302.0,63.00,1.0,2.0,0.333333,0.666667,<-80,6209,300,C5H14ClNO,139.62,C2H3ClO2,94.500
5,choline chloride,trichloroacetic acid,302.0,58.00,1.0,2.0,0.333333,0.666667,<-80,6209,6421,C5H14ClNO,139.62,C2HCl3O2,163.380
6,choline chloride,phenylacetic acid,302.0,77.00,1.0,2.0,0.333333,0.666667,25,6209,999,C5H14ClNO,139.62,C8H8O2,136.150
7,choline chloride,phenylpropanoic acid,302.0,48.00,1.0,2.0,0.333333,0.666667,20,6209,107,C5H14ClNO,139.62,C9H10O2,150.170
8,choline chloride,malonic acid,302.0,135.00,1.0,1.0,0.500000,0.500000,10,6209,867,C5H14ClNO,139.62,C3H4O4,104.060
9,choline chloride,glutaric acid,302.0,98.00,1.0,1.0,0.500000,0.500000,-16,6209,743,C5H14ClNO,139.62,C5H8O4,132.110
