# NIST Scrape 
Starting from this work:

1) https://pubs.acs.org/doi/pdf/10.1021/acs.analchem.1c00867

2) https://github.com/Ohio-State-Allen-Lab/FTIRMachineLearning

3) https://www.linkedin.com/pulse/unofficial-nist-webbook-api-how-get-thermochemistry-data-contreras/?trk=pulse-article_more-articles_related-content-card

4) https://github.com/oscarcontrerasnavas/nist-webbook-API

My work is stored here:

https://github.com/kevinpatrickkent/FTIRMachineLearning/

In [1]:
# Kevin Patrick Kent
# updated from original code at https://github.com/Ohio-State-Allen-Lab/FTIRMachineLearning
# RUN IN COMMAND LINE: 1 ) python -m pip install git+https://github.com/nzhagen/jcamp

import os
import requests, urllib
import argparse
import logging
import pandas as pd 
import jcamp #Planning to use this later to import all of the .jdx files
#from model.utils import set_logger
# Was not able to run this on the first runthrough.  Will try this again later after investigation but continuing for now.

In [2]:
# Base URL of the NIST Chemistry WebBook CGI script; the scraping functions
# in this notebook send their GET requests (with query parameters) to it.
nist_url = "https://webbook.nist.gov/cgi/cbook.cgi"

# Get Dataframe of all Molecules in NIST Webbook

Used webbook API from here: https://github.com/oscarcontrerasnavas/nist-webbook-API

In [3]:
# Ask the API how many substances it holds, then fetch all of them at once.
# (There were 37495 molecules on the WebBook when this was written.)

# Perform an initial small query to see how many items are present in the database
url = 'https://nist-api.fly.dev/substances?page=1&per_page=10'
response = requests.get(url)
# Surface HTTP errors here instead of as a confusing JSON/KeyError below.
response.raise_for_status()
payload = response.json()
tot = payload['totalItems']  # Total number of items, used to size the next request

# Request everything in a single page; the extra 1000 is presumably a safety
# margin in case entries are added between the two requests.
url = 'https://nist-api.fly.dev/substances?page=1&per_page=' + str(tot + 1000)
response = requests.get(url)
response.raise_for_status()
payload = response.json()
df = pd.DataFrame(payload['items'])
df.head()

Unnamed: 0,name,cas,formula,molecular_weight,image
0,methane,74828,CH4,16.0425,https://webbook.nist.gov/cgi/cbook.cgi?Struct=...
1,biotin,58855,C10H16N2O3S,244.311,https://webbook.nist.gov/cgi/cbook.cgi?Struct=...
2,"butanoic acid, 3-hydroxy-3-methyl-",625081,C5H10O3,118.1311,https://webbook.nist.gov/cgi/cbook.cgi?Struct=...
3,lead telluride,1314916,PbTe,334.8,https://webbook.nist.gov/cgi/cbook.cgi?Struct=...
4,ditungsten zirconium octaoxide,16853740,O8W2Zr,586.9,https://webbook.nist.gov/cgi/cbook.cgi?Struct=...


# Define Function for Scraping the IR Spectra from Webbook

Used the function from here: https://github.com/Ohio-State-Allen-Lab/FTIRMachineLearning

In [4]:
def scrape_data(cas_ls, params, data_dir):
    """Collect spectra from the NIST database and store them in jdx format.

    One GET request is sent to ``nist_url`` per CAS id. Ids for which NIST
    answers with its "Spectrum not found" placeholder, or with a non-OK HTTP
    status, are skipped; every other response body is written verbatim to
    ``<data_dir>/<type>/<cas_id>.jdx``.

    Args:
        cas_ls: iterable of CAS ids (strings or integers) to download data for.
        params: (dict) queries to be added to the url. Must contain 'Type'
            (e.g. 'IR'). The dict is copied, so the caller's object is not
            modified.
        data_dir: (string) path to store the data.

    Returns:
        None

    Raises:
        requests.exceptions.RequestException: if a request fails at the
            network level or exceeds the timeout.
    """
    # Spectra of each type get their own sub-directory, e.g. data/ir/
    spectra_type = params['Type'].lower()
    spectra_path = os.path.join(data_dir, spectra_type, '')
    os.makedirs(spectra_path, exist_ok=True)

    # Work on a copy so the per-id 'JCAMP' entry never leaks into the
    # caller's dict.
    query = dict(params)
    timeout_s = 60  # avoid hanging forever on a stalled connection

    num_created = 0
    for cas_id in cas_ls:
        # Ids may arrive as integers (e.g. from a DataFrame column).
        cas_id = str(cas_id)
        query['JCAMP'] = 'C' + cas_id
        response = requests.get(nist_url, params=query, timeout=timeout_s)

        if not response.ok:
            # Do not save an HTTP error page as if it were a spectrum.
            logging.warning('Skipping id %s: HTTP status %s',
                            cas_id, response.status_code)
            continue

        # NIST answers with this exact placeholder when no spectrum exists.
        if response.text == '##TITLE=Spectrum not found.\n##END=\n':
            continue

        num_created += 1
        logging.info('Creating %s spectra for id: %s. Total spectra created %s',
                     spectra_type, cas_id, num_created)
        with open(os.path.join(spectra_path, cas_id + '.jdx'), 'wb') as data:
            data.write(response.content)

In [None]:

#Create data directory to store logs and spectra
data_dir = './data'
os.makedirs(data_dir, exist_ok=True)

# The root logger drops INFO messages unless it is configured, so without
# this the progress messages below would be lost. Logging to
# <data_dir>/scrape.log mirrors the currently disabled
# set_logger(data_dir, 'scrape.log') call.
logging.basicConfig(filename=os.path.join(data_dir, 'scrape.log'),
                    level=logging.INFO)

logging.info('Scrape IR spectra')
# Command-line flag check disabled while this runs in a notebook
#if args.scrap_IR:
cas_ids = df.cas
# 'JCAMP' is filled in per CAS id by scrape_data
params = {'JCAMP': '', 'Type': 'IR', 'Index': 0}
scrape_data(cas_ids, params, data_dir)

# Not using these cells right now; marked for deletion

In [None]:
def scrape_inchi(cas_ls, params, data_dir):
    """Collect InChI keys from the NIST database and store them in txt format.

    Results are appended to ``<data_dir>/inchi.txt`` as tab-separated
    ``cas_id``/``inchi`` rows. The header row is written only when the file
    is new or empty, so repeated calls do not duplicate it.

    Args:
        cas_ls: iterable of CAS ids (strings or integers) to download data for.
        params: (dict) queries to be added to the url. The dict is copied,
            so the caller's object is not modified.
        data_dir: (string) existing directory in which inchi.txt is stored.

    Returns:
        None

    Raises:
        requests.exceptions.RequestException: if a request fails at the
            network level or exceeds the timeout.
    """
    # Create file path for storing inchi keys
    inchi_path = os.path.join(data_dir, 'inchi.txt')

    # The file is opened in append mode, so only emit the header once.
    needs_header = (not os.path.exists(inchi_path)
                    or os.path.getsize(inchi_path) == 0)

    # Work on a copy so the per-id 'GetInChI' entry never leaks into the
    # caller's dict.
    query = dict(params)
    timeout_s = 60  # avoid hanging forever on a stalled connection

    num_created = 0
    with open(inchi_path, 'a', encoding='utf-8') as file:
        if needs_header:
            file.write('{}\t{}\n'.format('cas_id', 'inchi'))

        for cas_id in cas_ls:
            # Ids may arrive as integers (e.g. from a DataFrame column).
            cas_id = str(cas_id)
            query['GetInChI'] = 'C' + cas_id
            response = requests.get(nist_url, params=query, timeout=timeout_s)

            if not response.ok:
                # Do not record an HTTP error page as if it were a key.
                logging.warning('Skipping id %s: HTTP status %s',
                                cas_id, response.status_code)
                continue

            num_created += 1
            logging.info('Creating InChi key for id: %s. Total keys created %s',
                         cas_id, num_created)
            file.write('{}\t{}\n'.format(cas_id,
                                         response.content.decode("utf-8")))

In [None]:
# Code to use if you're running this from the command line.
# I'm trying to run this in a Jupyter notebook, so it is commented out.

#parser = argparse.ArgumentParser()
#parser.add_argument('--save_dir', default= './data',\
#     help = "Directory path to store scrapped data")
#parser.add_argument('--cas_list', default= 'species.txt',\
#    help = "File containing CAS number and formula of molecules")
#parser.add_argument('--scrape_IR', default= True,\
#    help = "Whether to download IR or not")
#parser.add_argument('--scrape_InChi', default= True,\
#    help = "Whether to download InChi or not")

In [None]:
# Command line stuff
#args = parser.parse_args()

# File expected to contain the CAS ids to process
cas_list = 'species.txt'

#Check if file containing CAS ids exist
# Raise explicitly rather than assert: assert statements are stripped when
# Python runs with -O, which would silently skip this check.
if not os.path.isfile(cas_list):
    raise FileNotFoundError("No file named {} exists".format(cas_list))

# Removing this until I figure out how to bring in models.util
#set_logger(data_dir, 'scrape.log')