In [1]:
import os

import numpy as np
import pandas as pd
from chemspipy import ChemSpider
from chemspipy.errors import ChemSpiPyNotFoundError
from tqdm import tqdm

In [2]:
# Seed NumPy's legacy global RNG so the np.random.choice draw of compound ids
# further down yields the same sample on every fresh run.
np.random.seed(828)

In [3]:
# Build the ChemSpider client from an API key supplied via the environment,
# so the secret is never stored in (or committed with) the notebook.
# Set CHEMSPIDER_API_KEY before starting the kernel; a missing variable raises
# KeyError here rather than failing later on the first request.
# NOTE: the key that used to be hardcoded in this cell should be treated as
# leaked and rotated.
cs = ChemSpider(os.environ['CHEMSPIDER_API_KEY'])

In [4]:
# comp = cs.get_compound(2157)

In [5]:
# Names of the compound attributes copied into each exported record.
# The order here is the order in which keys are inserted into each record dict.
# 'image' is intentionally left out (it was disabled in the original list —
# presumably because it holds raw image data; confirm before re-enabling).
DEFAULT_ATTRS = [
    'average_mass', 'common_name', 'csid', 'external_references',
    'image_url',
    'inchi', 'inchikey',
    'mol_2d', 'mol_3d',
    'molecular_formula', 'molecular_weight',
    'monoisotopic_mass', 'nominal_mass',
    'record_id', 'smiles',
    'stdinchi', 'stdinchikey',
]

In [6]:
def extract_attrs_as_dict(compound, attrs=DEFAULT_ATTRS):
    """Collect the named attributes of ``compound`` into a plain dict.

    Args:
        compound: Object to read from; each name in ``attrs`` is looked up
            on it with ``getattr``.
        attrs: Iterable of attribute names. Defaults to ``DEFAULT_ATTRS``.

    Returns:
        A dict mapping each attribute name to the value read from
        ``compound``, in the order the names appear in ``attrs``.

    Any exception raised while reading an attribute propagates to the caller.
    """
    return {name: getattr(compound, name) for name in attrs}

def is_valid_compound(compound):
    """Report whether ``compound`` refers to a record that can be looked up.

    This is a workaround probe: lacking a way to know which compound ids
    exist, it simply touches ``compound.external_references`` and treats a
    ``ChemSpiPyNotFoundError`` as "no such record".

    Returns:
        ``True`` if the attribute access succeeds, ``False`` if it raises
        ``ChemSpiPyNotFoundError``. Any other exception propagates.
    """
    try:
        # The value is discarded; only whether the access raises matters.
        compound.external_references
    except ChemSpiPyNotFoundError:
        return False
    return True

In [7]:
# Smoke test with a single known id (2157; the output shown below reports its
# common_name as 'Aspirin') before sampling random ids.
compound = cs.get_compound(2157)

In [8]:
# Extract the default attribute set from the sample compound into a dict.
compound_info = extract_attrs_as_dict(compound)

In [9]:
# Display the extracted dict to eyeball the fields; note that
# 'external_references' is a long list, so this output is large.
compound_info

{'average_mass': 180.1574,
 'common_name': 'Aspirin',
 'csid': 2157,
 'external_references': [{'source': '1717 CheMall',
   'sourceUrl': 'http://www.1717chem.com',
   'externalId': 'OR045489',
   'externalUrl': 'http://3bsccorp.com/organic-chemicals/OR045489'},
  {'source': '1717 CheMall',
   'sourceUrl': 'http://www.1717chem.com',
   'externalId': 'OR108792',
   'externalUrl': 'http://3bsccorp.com/organic-chemicals/OR108792'},
  {'source': '1717 CheMall',
   'sourceUrl': 'http://www.1717chem.com',
   'externalId': 'OR197645',
   'externalUrl': 'http://3bsccorp.com/organic-chemicals/OR197645'},
  {'source': '1717 CheMall',
   'sourceUrl': 'http://www.1717chem.com',
   'externalId': 'OR275198',
   'externalUrl': 'http://3bsccorp.com/organic-chemicals/OR275198'},
  {'source': '1717 CheMall',
   'sourceUrl': 'http://www.1717chem.com',
   'externalId': 'OR382102',
   'externalUrl': 'http://3bsccorp.com/organic-chemicals/OR382102'},
  {'source': 'A&J Pharmtech',
   'sourceUrl': 'http://www.

In [None]:
# Sampling configuration.
# MAX_NUM: exclusive upper bound of the integer range candidate ids are drawn from.
MAX_NUM = 60 * 10 ** 6
# n_samples: how many distinct candidate ids to draw.
n_samples = 100

In [None]:
# Draw n_samples distinct integers from [0, MAX_NUM) to try as compound ids.
# Not every drawn id is expected to exist; is_valid_compound is the existence check.
# NOTE(review): the legacy np.random.choice with replace=False is believed to
# permute the entire 0..MAX_NUM-1 range internally, which is memory-heavy at
# this size. np.random.default_rng(seed).choice(...) should avoid that, but it
# would change which ids are drawn for the current seed — confirm before switching.
random_compound_ids = np.random.choice(MAX_NUM, size=n_samples, replace=False)

In [None]:
# Accumulates one attribute dict per sampled id that resolves to a real record.
result = []

for cid in tqdm(random_compound_ids):
    compound = cs.get_compound(cid)

    # Ids that fail the existence probe are silently dropped from the sample.
    is_valid = is_valid_compound(compound)
    if is_valid:
        compound_info = extract_attrs_as_dict(compound)
        result.append(compound_info)

In [None]:
# Build the table in one step from the list of per-compound dicts:
# one row per valid compound, one column per extracted attribute name.
result_df = pd.DataFrame(result)

In [None]:
# Write the sampled records to CSV without the DataFrame index column.
# NOTE(review): the "82" in the filename is hardcoded — presumably the number of
# valid records obtained in the original run; confirm it still matches
# len(result_df) if the seed, n_samples, or MAX_NUM change.
result_df.to_csv('./chemspider_82_sample_entries.csv', index=False)