In [1]:
import pandas as pd
import requests
import json
import pubchempy as pcp

### Using the url below, we can access the GHS classifictaion data for a compound on pubchem

In [14]:
#testing with retrieving ghs classifications for choline chloride (cid = 6209)
safety_url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/6209/JSON?heading=GHS+Classification"

request = requests.get(safety_url)
request_json = request.json()

In [15]:
request_json

{'Record': {'RecordType': 'CID',
  'RecordNumber': 6209,
  'RecordTitle': 'Choline chloride',
  'Section': [{'TOCHeading': 'Safety and Hazards',
    'Description': 'Safety and hazards information, properties, management techniques, reactivities and incompatibilities, first aid treatments, and more. For toxicity and related information, please visit Toxicity section.',
    'Section': [{'TOCHeading': 'Hazards Identification',
      'Description': 'Hazards Identification includes all hazards regarding the chemical; required label elements',
      'Section': [{'TOCHeading': 'GHS Classification',
        'Description': 'GHS (Globally Harmonized System of Classification and Labelling of Chemicals) is a United Nations system to identify hazardous chemicals and to inform users about these hazards. GHS has been adopted by many countries around the world and is now also used as the basis for international and national transport regulations for dangerous goods. The GHS hazard statements, class ca

### The data is basially a giant dictionary and there is a specific block that contains the GHS information we want. Parsing through the dictionary to get to the desired block of text, each sentance is looped through and added to a list. 

In [16]:
GHS_information_list = [] #this list will contain the block of GHS information
for i in range(len(request_json['Record']['Section'][0]['Section'][0]['Section'][0]['Information'][2]['Value']['StringWithMarkup'])):
    temp_list = [] #temporary list each sentance gets added to before appending to GHS list
    temp_list.append(request_json['Record']['Section'][0]['Section'][0]['Section'][0]['Information'][2]['Value']['StringWithMarkup'][i]['String'])
    GHS_information_list.append(temp_list)

In [17]:
GHS_information_list

[['Aggregated GHS information provided by 484 companies from  14  notifications to the ECHA C&L Inventory. Each notification may be associated with multiple companies.'],
 ['Reported as not meeting GHS hazard criteria by 194 of 484 companies. For more detailed information, please visit  ECHA C&L website'],
 ['Of the 12 notification(s) provided by 290 of 484 companies with hazard statement code(s):'],
 ['Information may vary between notifications depending on impurities, additives, and other factors. The percentage value in parenthesis indicates the notified classification ratio from companies that provide hazard codes. Only hazard codes with percentage values above 10% are shown.']]

### While this is all important information, we want to specifically extract the statements that contain the GHS hazard codes.

In [6]:
hazard_description_list = [] #list that will contain the hazrad codes and their descriptions. 
for item in GHS_information_list:
    temp_haz = [idx for idx in item if idx[0] == 'H'] #list comprehension, keeps lists that start with H, i.e. the hazard code
    hazard_description_list.append(temp_haz)
    #There will be empty lists so this step removes them
    for item in hazard_description_list:
        if len(item) == 0:
            hazard_description_list.remove(item)
        


In [7]:
hazard_description_list



### Now that we have the hazard code and description, we can go in and extract the hazard code to a list by splitting the string and retrieving the 1st substring, which is the hazard code. 

In [8]:
hazard_code_list = []
for item in hazard_description_list:
    string = item[0]
    hazard = string.split(' ', 1)[0]
    hazard_code_list.append(hazard)
    

In [9]:
hazard_code_list

['H315', 'H319', 'H335']