In [1]:
import pandas as pd
import requests
import json
import pubchempy as pcp

### Using the url below, we can access the GHS classifictaion data for a compound on pubchem

In [3]:
safety_url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/6209/JSON?heading=GHS+Classification" 

request = requests.get(safety_url)
request_json = request.json()

if 'Fault' in request_json:                 #first key in dict will be Fault if no GHS heading in json data
    print('No GHS data available')

else:
    print('GHS data is available')


GHS data is available


### The data is basially a giant dictionary and there is a specific block that contains the GHS information we want. Parsing through the dictionary to get to the desired block of text, each sentance is looped through and added to a list. 

In [4]:
GHS_information_list = [] #this list will contain the block of GHS information

#Some chemicals are classified as non hazardous in the GHS data. The length of the "Information" section is 1 for these. 
if len(request_json['Record']['Section'][0]['Section'][0]['Section'][0]['Information']) == 1:
    print('Not classified as a hazardous substance')
    

#Otherwise, the hazard codes are can be found within this parsing of the dictionary below.    
else:
    for i in range(len(request_json['Record']['Section'][0]['Section'][0]['Section'][0]['Information'][2]['Value']['StringWithMarkup'])):
        temp_list = [] #temporary list each sentance gets added to before appending to GHS list
        temp_list.append(request_json['Record']['Section'][0]['Section'][0]['Section'][0]['Information'][2]['Value']['StringWithMarkup'][i]['String'])
        GHS_information_list.append(temp_list)
 


In [5]:
GHS_information_list

[['Aggregated GHS information provided by 495 companies from  14  notifications to the ECHA C&L Inventory. Each notification may be associated with multiple companies.'],
 ['Reported as not meeting GHS hazard criteria by 206 of 495 companies. For more detailed information, please visit  ECHA C&L website'],
 ['Of the 12 notification(s) provided by 289 of 495 companies with hazard statement code(s):'],
 ['Information may vary between notifications depending on impurities, additives, and other factors. The percentage value in parenthesis indicates the notified classification ratio from companies that provide hazard codes. Only hazard codes with percentage values above 10% are shown.']]

### While this is all important information, we want to specifically extract the statements that contain the GHS hazard codes.

In [6]:
hazard_description_list = [] #list that will contain the hazrad codes and their descriptions. 
for item in GHS_information_list:
    temp_haz = [idx for idx in item if idx[0] == 'H'] #list comprehension, keeps lists that start with H, i.e. the hazard code
    hazard_description_list.append(temp_haz)
    #There will be empty lists so this step removes them
    for item in hazard_description_list:
        if len(item) == 0:
            hazard_description_list.remove(item)
        


In [7]:
hazard_description_list



### Now that we have the hazard code and description, we can go in and extract the hazard code to a list by splitting the string and retrieving the 1st substring, which is the hazard code. 

In [8]:
hazard_code_list = []
for item in hazard_description_list:
    string = item[0]
    hazard = string.split(' ', 1)[0]
    hazard_code_list.append(hazard)
    

In [9]:
hazard_code_list

['H315', 'H319', 'H335']

### Building this as a function

In [38]:
def check_GHS_data(request_json):
    
    """This function checks to see if GHS safety information data is available
    in the pubchem data file for a chemical"""
    
    
    if 'Fault' in request_json:                 #first key in dict will be Fault if no GHS heading in json data
        return 'No GHS data available'
     
    else:
        return 'GHS data available'
       

In [39]:
def hazard_classification(request_json):
    
    """This function checks if the subsatnce is classified as hazardous or non hazardous if GHS data was found"""
    
    GHS_status = check_GHS_data(request_json)
    
    # cas if no data was found in ghs retrieval function
    if GHS_status == 'No GHS data available':
        return GHS_status
    
    #otherwise, continue to parse through the json file to determine if the substance is hazardous or not
    else: 
        if len(request_json['Record']['Section'][0]['Section'][0]['Section'][0]['Information']) == 1:
            return 'Not classified as a hazardous substance'
            
        else:
            return 'Hazardous substance'

In [132]:
def get_hazard_codes(cid):
    
    """This is the main wrapper function for retrieving GHS hazard codes"""
    
    safety_url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/%s/JSON?heading=GHS+Classification" % str(cid)

    request = requests.get(safety_url)
    request_json = request.json()

    
    hazard_status = hazard_classification(request_json)
    
    if hazard_status == 'No GHS data available':
        return hazard_status
    
    elif hazard_status == 'Not classified as a hazardous substance':
        return hazard_status
    
    elif hazard_status == 'Hazardous substance':
        
        GHS_information_list = [] #list that contains GHS information in which hazard codes are located.
        
        for i in range(len(request_json['Record']['Section'][0]['Section'][0]['Section'][0]['Information'][2]['Value']['StringWithMarkup'])):
            temp_list = [] #temporary list each sentance gets added to before appending to GHS list
            temp_list.append(request_json['Record']['Section'][0]['Section'][0]['Section'][0]['Information'][2]['Value']['StringWithMarkup'][i]['String'])
            GHS_information_list.append(temp_list)
            
        #this portion checks for lists with empty string '' that will break the code if not removed
        for item in GHS_information_list:
            if '' not in item: 
                pass
        
            elif '' in item:
                index = GHS_information_list.index(item)
                GHS_information_list[index].remove('')
        
            
        hazard_description_list = [] #list that will contain the hazard codes and their descriptions. 
        
        for item in GHS_information_list:
            temp_haz = [idx for idx in item if idx[0] == 'H'] #list comprehension, keeps lists that start with H, i.e. the hazard code
            hazard_description_list.append(temp_haz)
            #There will be empty lists so this step removes them
            for item in hazard_description_list:
                if len(item) == 0:
                    hazard_description_list.remove(item)
                    
        hazard_code_list = [] #list that contains all of the hazard codesfor the chemcial
        
        for item in hazard_description_list:
            string = item[0]
            hazard = string.split(' ', 1)[0]
            hazard_code_list.append(hazard)
            
        return hazard_code_list
            
    
        
    
        

In [133]:
data = pd.read_csv('/Users/Jaime/Downloads/HBA_list.csv')
data

Unnamed: 0,HBA,HBA_cid
0,choline chloride,6209
1,choline acetate,187
2,choline bromide,74724
3,choline fluoride,22134097
4,choline nitrate,13646546
5,(2-chloroethyl)trimethylammonium chloride,13836
6,ethyl(2-hydroxyethyl)dimethylammonium chloride,87940
7,benzyl(2-hydroxyethyl)dimethylammonium chloride,3014549
8,acetylcholine chloride,6060
9,tetramethylammonium chloride,6379


In [141]:
hazard_codes = []
for i, row in data.iterrows():
    cid = row['HBA_cid']
    values = get_hazard_codes(cid)
    hazard_codes.append(values)
data['GHS_info'] = hazard_codes



In [142]:
hazard_codes

[['H315', 'H319', 'H335'],
 'No GHS data available',
 ['H315', 'H319', 'H335'],
 'No GHS data available',
 'No GHS data available',
 ['H302:', 'H312:'],
 'No GHS data available',
 ['H315', 'H319', 'H335'],
 ['H315', 'H319', 'H335'],
 ['H300', 'H301', 'H311', 'H315', 'H319', 'H335', 'H370', 'H410', 'H411'],
 ['H315', 'H319', 'H335'],
 ['H315', 'H319'],
 'No GHS data available',
 ['H315', 'H319', 'H335'],
 ['H302', 'H315', 'H319', 'H335'],
 ['H302', 'H315', 'H319', 'H335'],
 ['H315', 'H319'],
 ['H315', 'H319', 'H335'],
 'Not classified as a hazardous substance',
 ['H315', 'H319'],
 ['H315', 'H319', 'H335'],
 ['H301', 'H302', 'H315', 'H317', 'H319', 'H335', 'H400', 'H410'],
 ['H302', 'H314', 'H315', 'H318', 'H335', 'H373', 'H400'],
 ['H315', 'H319', 'H335'],
 ['H302', 'H315', 'H319', 'H335', 'H411', 'H412'],
 ['H315', 'H319', 'H335'],
 ['H315', 'H319', 'H335'],
 ['H301', 'H315', 'H318', 'H319', 'H400', 'H410'],
 ['H315', 'H319', 'H335'],
 ['H315', 'H319', 'H335'],
 ['H301', 'H302', 'H311'

In [143]:
data

Unnamed: 0,HBA,HBA_cid,GHS_info
0,choline chloride,6209,"[H315, H319, H335]"
1,choline acetate,187,No GHS data available
2,choline bromide,74724,"[H315, H319, H335]"
3,choline fluoride,22134097,No GHS data available
4,choline nitrate,13646546,No GHS data available
5,(2-chloroethyl)trimethylammonium chloride,13836,"[H302:, H312:]"
6,ethyl(2-hydroxyethyl)dimethylammonium chloride,87940,No GHS data available
7,benzyl(2-hydroxyethyl)dimethylammonium chloride,3014549,"[H315, H319, H335]"
8,acetylcholine chloride,6060,"[H315, H319, H335]"
9,tetramethylammonium chloride,6379,"[H300, H301, H311, H315, H319, H335, H370, H41..."
