In [1]:
pip install pubchempy

Note: you may need to restart the kernel to use updated packages.


#### 1.Importing Required Libraries

In [2]:
import pubchempy as pcp
import pandas as pd
import numpy as np

#### 2. Example Implementations:

In [None]:
# Example 1
c = pcp.Compound.from_cid(5090)
print ('IUPAC Name: ', c.iupac_name)
print ('Molecular Formula: ', c.molecular_formula)
print ('xLogP: ',c.xlogp)
print ('Molecular Weight: ', c.molecular_weight)
print ('Isomeric SMILES: ', c.isomeric_smiles)

In [2]:
# Example 2
List_of_Chemicals = ['benzene', 'toluene', '2-nonenal']

for chemical_name in List_of_Chemicals:

    cid=pcp.get_cids(chemical_name)
    prop = pcp.get_properties('CanonicalSMILES', cid)
    print (chemical_name + ' ' + str(prop))

IUPAC Name:  3-(4-methylsulfonylphenyl)-4-phenyl-2H-furan-5-one
Molecular Formula:  C17H14O4S
xLogP:  2.3
Molecular Weight:  314.4
Isomeric SMILES:  CS(=O)(=O)C1=CC=C(C=C1)C2=C(C(=O)OC2)C3=CC=CC=C3


#### 3. Exploratory Data Analysis

In [5]:
df = pd.read_csv('viridis_solu.csv')
new = df.head()
print(new)

   pubchem_cid          chemical_name  exp_double_value
0          790           hypoxanthine             700.0
1         2474            Bupivacaine              91.7
2         4062            mepivacaine            7000.0
3         4421         nalidixic acid             100.0
4         4649  4-Aminosalicylic acid            1690.0


In [6]:
len(df)

3801

In [7]:
filtered_df = df[df.notnull().all(axis=1)]
print(filtered_df)

      pubchem_cid          chemical_name  exp_double_value
0             790           hypoxanthine           700.000
1            2474            Bupivacaine            91.700
2            4062            mepivacaine          7000.000
3            4421         nalidixic acid           100.000
4            4649  4-Aminosalicylic acid          1690.000
...           ...                    ...               ...
3796      2758725           Dithiobiuret          2700.000
3797      5248709               AC1NRVFX        584000.000
3798      5284557            ethisterone             0.687
3799      6448437            Butenachlor            29.000
3800     12395290          AGN-PC-0NIW76             0.044

[3661 rows x 3 columns]


In [8]:
filtered_df_cut =filtered_df[:225]
print(filtered_df_cut)

     pubchem_cid          chemical_name  exp_double_value
0            790           hypoxanthine             700.0
1           2474            Bupivacaine              91.7
2           4062            mepivacaine            7000.0
3           4421         nalidixic acid             100.0
4           4649  4-Aminosalicylic acid            1690.0
..           ...                    ...               ...
226          867           malonic acid          763000.0
227          875       DL-Tartaric acid          215000.0
228          878           methanethiol           15400.0
229          887               methanol         1000000.0
230          904            Acetanilide            6390.0

[225 rows x 3 columns]


**For the purpose of the current project, I have cut the dataset into smaller subset in order to avoid network crashing.**

#### 4. Getting Properties of the compounds 

In [9]:
# Getting Smiles and exact mol aweight of dataset
i = 0
j = 0
new_data = []
for chemical_name_1 in filtered_df_cut['chemical_name']:
    cid=pcp.get_cids(chemical_name_1)
    if cid == []:
        filtered_df_cut.drop(i) 
        j = j + 1
    else:
        #print( "The value of i and chemical name is :", i ,",",cid, ',', chemical_name_1)
        #prop = pcp.get_properties('CanonicalSMILES', cid)
        prop_1 = pcp.get_properties(['MolecularWeight','XlogP','CanonicalSMILES'], cid, 'cid')
        new_data.append(prop_1)
    
        i = i + 1
print("The values of i and j are", i, j)

The values of i and j are 217 8


In [10]:
len(new_data[0][0].keys())

4

In [11]:
rows = []
columns = new_data[0][0].keys()
for i in range(len(new_data)):
    rows.append(new_data[i][0].values())
props_df = pd.DataFrame(data=rows, columns=columns) 
props_df.head()

Unnamed: 0,CID,MolecularWeight,CanonicalSMILES,XLogP
0,135398638,136.11,C1=NC2=C(N1)C(=O)NC=N2,-0.5
1,2474,288.4,CCCCN1CCCCC1C(=O)NC2=C(C=CC=C2C)C,3.4
2,4062,246.35,CC1=C(C(=CC=C1)C)NC(=O)C2CCCCN2C,1.9
3,4421,232.23,CCN1C=C(C(=O)C2=C1N=C(C=C2)C)C(=O)O,1.4
4,4649,153.14,C1=CC(=C(C=C1N)O)C(=O)O,1.3


In [12]:
print(len(filtered_df_cut))

225


In [13]:
filtered_df_cut.head()

Unnamed: 0,pubchem_cid,chemical_name,exp_double_value
0,790,hypoxanthine,700.0
1,2474,Bupivacaine,91.7
2,4062,mepivacaine,7000.0
3,4421,nalidixic acid,100.0
4,4649,4-Aminosalicylic acid,1690.0


In [14]:
# Renaming columns

filtered_df_cut.rename(columns={'pubchem_cid': 'CID',  'exp_double_value': 'measured_sol'}, inplace=True)
print(filtered_df_cut)

      CID          chemical_name  measured_sol
0     790           hypoxanthine         700.0
1    2474            Bupivacaine          91.7
2    4062            mepivacaine        7000.0
3    4421         nalidixic acid         100.0
4    4649  4-Aminosalicylic acid        1690.0
..    ...                    ...           ...
226   867           malonic acid      763000.0
227   875       DL-Tartaric acid      215000.0
228   878           methanethiol       15400.0
229   887               methanol     1000000.0
230   904            Acetanilide        6390.0

[225 rows x 3 columns]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df_cut.rename(columns={'pubchem_cid': 'CID',  'exp_double_value': 'measured_sol'}, inplace=True)


In [15]:
# Calling merge() function 
int_df = pd.merge(filtered_df_cut, props_df, how ='right', on =['CID'])
print(int_df)

           CID          chemical_name  measured_sol MolecularWeight  \
0    135398638                    NaN           NaN          136.11   
1         2474            Bupivacaine          91.7           288.4   
2         4062            mepivacaine        7000.0          246.35   
3         4421         nalidixic acid         100.0          232.23   
4         4649  4-Aminosalicylic acid        1690.0          153.14   
..         ...                    ...           ...             ...   
212        867           malonic acid      763000.0          104.06   
213        875       DL-Tartaric acid      215000.0          150.09   
214        878           methanethiol       15400.0           48.11   
215        887               methanol     1000000.0          32.042   
216        904            Acetanilide        6390.0          135.16   

                         CanonicalSMILES  XLogP  
0                 C1=NC2=C(N1)C(=O)NC=N2   -0.5  
1      CCCCN1CCCCC1C(=O)NC2=C(C=CC=C2C)C    3.4

In [16]:
df_filtered = int_df.dropna()
df_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Index: 196 entries, 1 to 216
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CID              196 non-null    int64  
 1   chemical_name    196 non-null    object 
 2   measured_sol     196 non-null    float64
 3   MolecularWeight  196 non-null    object 
 4   CanonicalSMILES  196 non-null    object 
 5   XLogP            196 non-null    float64
dtypes: float64(2), int64(1), object(3)
memory usage: 10.7+ KB


#### 5.  Getting the log Solubility values in terms of mol/L

In [17]:
print( df_filtered.measured_sol)
df_filtered.MolecularWeight = df_filtered.MolecularWeight.astype(float)
print( df_filtered.MolecularWeight)

1           91.7
2         7000.0
3          100.0
4         1690.0
5           12.0
         ...    
212     763000.0
213     215000.0
214      15400.0
215    1000000.0
216       6390.0
Name: measured_sol, Length: 196, dtype: float64
1      288.400
2      246.350
3      232.230
4      153.140
5      290.400
        ...   
212    104.060
213    150.090
214     48.110
215     32.042
216    135.160
Name: MolecularWeight, Length: 196, dtype: float64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered.MolecularWeight = df_filtered.MolecularWeight.astype(float)


In [18]:
import numpy as np
import math

# Printing the log base e (natural log) -- Example
print ("Natural logarithm of 14 is : ", end="")
print (math.log(14))

x = ((df_filtered.measured_sol.astype(float)) / (df_filtered.MolecularWeight.astype(float)) / 1000)
print(x)
df_filtered = df_filtered.assign(log_sol = (np.log10(x)))
df_filtered.head()

Natural logarithm of 14 is : 2.6390573296152584
1       0.000318
2       0.028415
3       0.000431
4       0.011036
5       0.000041
         ...    
212     7.332308
213     1.432474
214     0.320100
215    31.209038
216     0.047277
Length: 196, dtype: float64


Unnamed: 0,CID,chemical_name,measured_sol,MolecularWeight,CanonicalSMILES,XLogP,log_sol
1,2474,Bupivacaine,91.7,288.4,CCCCN1CCCCC1C(=O)NC2=C(C=CC=C2C)C,3.4,-3.497626
2,4062,mepivacaine,7000.0,246.35,CC1=C(C(=CC=C1)C)NC(=O)C2CCCCN2C,1.9,-1.546455
3,4421,nalidixic acid,100.0,232.23,CCN1C=C(C(=O)C2=C1N=C(C=C2)C)C(=O)O,1.4,-3.365918
4,4649,4-Aminosalicylic acid,1690.0,153.14,C1=CC(=C(C=C1N)O)C(=O)O,1.3,-1.957202
5,5879,androsterone,12.0,290.4,CC12CCC(CC1CCC3C2CCC4(C3CCC4=O)C)O,3.7,-4.383815


In [19]:
# Writing data to .csv files
data_smiles = pd.DataFrame(zip(df_filtered.CID, df_filtered.CanonicalSMILES, df_filtered.log_sol))
data_smiles.columns


RangeIndex(start=0, stop=3, step=1)

#### 6. Writing the Details in the form of data file

In [20]:
def select_columns(data_frame, column_names):
    new_frame = data_frame.loc[:, column_names]
    return new_frame

selected_columns = ['CID','CanonicalSMILES', 'log_sol']
new_smiles = select_columns(df_filtered, selected_columns)
print(new_smiles)

      CID                      CanonicalSMILES   log_sol
1    2474    CCCCN1CCCCC1C(=O)NC2=C(C=CC=C2C)C -3.497626
2    4062     CC1=C(C(=CC=C1)C)NC(=O)C2CCCCN2C -1.546455
3    4421  CCN1C=C(C(=O)C2=C1N=C(C=C2)C)C(=O)O -3.365918
4    4649              C1=CC(=C(C=C1N)O)C(=O)O -1.957202
5    5879   CC12CCC(CC1CCC3C2CCC4(C3CCC4=O)C)O -4.383815
..    ...                                  ...       ...
212   867                      C(C(=O)O)C(=O)O  0.865241
213   875               C(C(C(=O)O)O)(C(=O)O)O  0.156087
214   878                                   CS -0.494715
215   887                                   CO  1.494280
216   904                   CC(=O)NC1=CC=CC=C1 -1.325347

[196 rows x 3 columns]


In [21]:
df = pd.DataFrame(new_smiles)
df.to_csv('smiles_logSol_viridis.csv', header=True, index=False)

#### 7. Glossary PubChemPy 

In [19]:
pcp.get_compounds('C1=CC2=C(C3=C(C=CC=N3)C=C2)N=C1', 'smiles')

[Compound(1318)]

In [20]:
c = pcp.Compound.from_cid(962)
c.to_dict(properties=['atoms', 'bonds', 'inchi'])

{'atoms': [{'aid': 1, 'number': 8, 'element': 'O', 'x': 2.5369, 'y': -0.155},
  {'aid': 2, 'number': 1, 'element': 'H', 'x': 3.0739, 'y': 0.155},
  {'aid': 3, 'number': 1, 'element': 'H', 'x': 2, 'y': 0.155}],
 'bonds': [{'aid1': 1, 'aid2': 2, 'order': 1},
  {'aid1': 1, 'aid2': 3, 'order': 1}],
 'inchi': 'InChI=1S/H2O/h1H2'}

In [21]:
p = pcp.get_properties('IsomericSMILES', 'CC', 'smiles', searchtype='superstructure')

In [27]:
prop = pcp.get_properties(['MolecularFormula', 'MolecularWeight', 'CanonicalSMILES', 'IsomericSMILES',
'InChI', 'InChIKey', 'IUPACName'],'CN1N=C(C=C1C(F)(F)F)C1=CC=C(S1)C1=CC=NC(SCC(=O)NC2=CC=C(Cl)C=C2)=N1', 'smiles')
print(type(prop))
print(prop)

In [None]:
data = []

for chemical_name in df['chemical_name']:
    props = pcp.get_properties(['MolecularFormula', 'MolecularWeight','InChI', 'InChIKey', 'IUPACName', 
                                'XLogP', 'ExactMass', 'MonoisotopicMass', 'TPSA', 'Complexity', 'Charge', 
                                'HBondDonorCount', 'HBondAcceptorCount', 'RotatableBondCount', 
                                'HeavyAtomCount', 'IsotopeAtomCount', 'AtomStereoCount', 
                                'DefinedAtomStereoCount', 'UndefinedAtomStereoCount', 'BondStereoCount', 
                                'DefinedBondStereoCount', 'UndefinedBondStereoCount', 'CovalentUnitCount', 
                                'Volume3D', 'XStericQuadrupole3D', 'YStericQuadrupole3D', 
                                'ZStericQuadrupole3D', 'FeatureCount3D', 'FeatureAcceptorCount3D', 
                                'FeatureDonorCount3D', 'FeatureAnionCount3D', 'FeatureCationCount3D', 
                                'FeatureRingCount3D', 'FeatureHydrophobeCount3D', 'ConformerModelRMSD3D', 
                                'EffectiveRotorCount3D', 'ConformerCount3D'], cid, 'cid')
    data.append(props)
#data

In [None]:
cid,
cid_name,
XlogP,
Complexity,
MolecularWeight,
HBondDonorCount, 
HBondAcceptorCount,
FeatureRingCount3D, 
RotatableBondCount,
Volume3D,
Measured,
Smiles