In [1]:
import pandas as pd
import requests
import json
import pubchempy as pcp
from safety import get_hazard_codes
# Table of chemicals to score. NOTE: this is an absolute, machine-specific path and
# has to be edited before the notebook can run anywhere else.
data = pd.read_csv('/Users/Jaime/Downloads/HBA_list.csv') #change file path if necessary
# Lookup table pairing each GHS hazard code ('Code') with its 'Hazard Score';
# read from the current working directory.
scoring_table = pd.read_csv('hazard_score.csv')

### Recap: in the previous notebooks, we took a list of chemicals and determined whether GHS data was available for each one and, if so, whether the chemical was hazardous or nonhazardous. For hazardous chemicals, the specific hazard codes were returned. We also took a table of all possible GHS hazard codes and used a paper (Verslycke et al., "The Chemistry Scoring Index") that ranks these hazards by how detrimental they are to assign a score to each code.

### Below, the `get_hazard_codes` function, which is defined in the safety.py file, is demonstrated.

In [2]:
# Look up the GHS information for every PubChem CID in the table (one call per
# chemical) and store the results in a new 'GHS_info' column.
hazard_codes = [get_hazard_codes(cid) for cid in data['HBA_cid']]
data['GHS_info'] = hazard_codes


In [3]:
data

Unnamed: 0,HBA,HBA_cid,GHS_info
0,choline chloride,6209,"[H315, H319, H335]"
1,choline acetate,187,No GHS data available
2,choline bromide,74724,"[H315, H319, H335]"
3,choline fluoride,22134097,No GHS data available
4,choline nitrate,13646546,No GHS data available
5,(2-chloroethyl)trimethylammonium chloride,13836,"[H302:, H312:]"
6,ethyl(2-hydroxyethyl)dimethylammonium chloride,87940,No GHS data available
7,benzyl(2-hydroxyethyl)dimethylammonium chloride,3014549,"[H315, H319, H335]"
8,acetylcholine chloride,6060,"[H315, H319, H335]"
9,tetramethylammonium chloride,6379,"[H300, H301, H311, H315, H319, H335, H370, H41..."


### And here is what the scoring table looks like

In [4]:
scoring_table

Unnamed: 0,Code,Hazard Score
0,H200,100
1,H201,100
2,H202,100
3,H203,100
4,H204,100
5,H205,100
6,H206,100
7,H207,100
8,H207,100
9,H208,100


### Now the objective is to assign a final hazard score to each chemical in the data table as the sum of the scores of all its hazard codes. Ideally, this is split into a health score and an environmental score, since the numeric part of the hazard codes for these starts with a 3 and a 4, respectively. We will also rank health scores higher than environmental scores in the final chemical ranking, alongside all the other properties (melting point, cost, etc.).


### There are a few ways to do this, but it may be beneficial to do the work outside the dataframe and then append the results back to it. The reason is that the hazard codes are stored as lists inside the dataframe; extracting that column as a list makes the hazard codes for each chemical easy to iterate through as a list of lists. The hazard scoring table can either be kept as a dataframe or converted into a dictionary instead.

In [5]:
#Pull the GHS_info column out of the data table as a plain Python list for testing our functions.
GHS_list = data['GHS_info'].tolist()

### Writing a quick function to check whether the hazard score should get the max or min value, or needs to be calculated.

In [190]:
def hazard_check(item):
    """Classify a GHS entry into one of three scoring actions.

    Returns 'Minimum Penalty' when the entry is the "not hazardous" sentinel
    string, 'Maximum Penalty' when it is the "no data" sentinel string, and
    'Calculate Score' for anything else (e.g. a list of hazard codes).
    """
    # (sentinel text, action) pairs, checked in order.
    sentinel_actions = (
        ('Not classified as a hazardous substance', 'Minimum Penalty'),  # later scored 0, the best case
        ('No GHS data available', 'Maximum Penalty'),                    # later scored 100, the max for a single hazard
    )

    for sentinel, action in sentinel_actions:
        if item == sentinel:
            return action

    # Not a sentinel: the hazard codes still have to be scored.
    return 'Calculate Score'


In [191]:
# Run the hazard check on every GHS entry to see which scoring action each one gets.
check_list = [hazard_check(entry) for entry in GHS_list]



In [192]:
check_list

['Calculate Score',
 'Maximum Penalty',
 'Calculate Score',
 'Maximum Penalty',
 'Maximum Penalty',
 'Calculate Score',
 'Maximum Penalty',
 'Calculate Score',
 'Calculate Score',
 'Calculate Score',
 'Calculate Score',
 'Calculate Score',
 'Maximum Penalty',
 'Calculate Score',
 'Calculate Score',
 'Calculate Score',
 'Calculate Score',
 'Calculate Score',
 'Minimum Penalty',
 'Calculate Score',
 'Calculate Score',
 'Calculate Score',
 'Calculate Score',
 'Calculate Score',
 'Calculate Score',
 'Calculate Score',
 'Calculate Score',
 'Calculate Score',
 'Calculate Score',
 'Calculate Score',
 'Calculate Score',
 'Calculate Score',
 'Calculate Score',
 'Calculate Score',
 'Calculate Score',
 'Calculate Score',
 'Calculate Score',
 'Calculate Score',
 'Calculate Score']

### Writing the portion of code that will grab the hazard scores from the hazard codes and add them up to one single score

In [193]:
#test inputs for replacing hazard codes with scores from the table

mock_list = ['H315:', 'H319', 'H335'] #sample list; the first code carries a trailing colon, as some extracted codes do
scoring_table_dict = scoring_table.to_dict() #dictionary to pull hazard scores from; indexed below as [column name][row index]

In [194]:
mock_score_list = []  # every individual score found for the sample codes
mock_final_list = []  # will hold the single summed score

mock_code_column = scoring_table_dict['Code']
mock_score_column = scoring_table_dict['Hazard Score']

for raw_code in mock_list:
    # Some codes keep a trailing colon from the JSON extraction; drop that one character.
    cleaned_code = raw_code[:-1] if raw_code.endswith(':') else raw_code

    # Collect the score of every table row whose code matches.
    mock_score_list.extend(
        mock_score_column[row]
        for row, table_code in mock_code_column.items()
        if table_code == cleaned_code
    )

# Sum the individual scores to get the final score.
mock_final_list.append(sum(mock_score_list))

In [195]:
mock_final_list

[70]

### Converting this into a function

In [196]:
def calculate_score(hazard_list, dictionary):
    """Look up the hazard score of every code in ``hazard_list``.

    Parameters
    ----------
    hazard_list : list of str
        Hazard codes such as ``'H315'``. A single trailing colon (left over
        from the JSON extraction) is removed before the lookup.
    dictionary : dict
        Scoring table in ``DataFrame.to_dict()`` layout: ``dictionary['Code']``
        maps row index -> hazard code and ``dictionary['Hazard Score']`` maps
        the same row index -> score.

    Returns
    -------
    list
        One score per matching table row, in the order the codes were given.
        Codes missing from the table contribute nothing; a code listed on
        several table rows contributes once per row.
    """
    # Use the table that was passed in, not a notebook-level global, so the
    # function works with any scoring table and outside this notebook.
    code_column = dictionary['Code']
    score_column = dictionary['Hazard Score']

    score_list = []

    for code in hazard_list:
        # Some codes end with a colon from the JSON extraction; drop it if present.
        if code.endswith(':'):
            code = code[:-1]

        for row, table_code in code_column.items():
            if code == table_code:
                score_list.append(score_column[row])

    return score_list
    
    

In [197]:
#quick sanity check of calculate_score on the sample codes
scores = calculate_score(mock_list, scoring_table_dict)
mock_final_list = [sum(scores)]  # single summed score for the sample
print(mock_final_list)

[70]


### Writing the main wrapper code

In [199]:
#Prototype of the wrapper: give every chemical a health score and an environmental score,
#using fixed values when it is nonhazardous or has no GHS info.

health_list = []  # final health scores, one per chemical, to append to the dataframe
env_list = []     # final environmental scores, one per chemical, to append to the dataframe

for item in GHS_list:
    checks = hazard_check(item)

    if checks in ('Minimum Penalty', 'Maximum Penalty'):
        # Fixed score for both categories: 0 for nonhazardous, 100 when no data exists.
        value = 0 if checks == 'Minimum Penalty' else 100
        health_list.append(value)
        env_list.append(value)

    elif checks == 'Calculate Score':
        # Split the codes by the digit after the leading letter:
        # '3' marks a health hazard, '4' an environmental hazard.
        temp_health_list = [hazard for hazard in item if hazard[1] == '3']
        temp_env_list = [hazard for hazard in item if hazard[1] == '4']

        health_scores = calculate_score(temp_health_list, scoring_table_dict)
        env_scores = calculate_score(temp_env_list, scoring_table_dict)

        # One summed score per category for this chemical.
        health_list.append(sum(health_scores))
        env_list.append(sum(env_scores))
           
           

In [200]:
health_list

[70,
 100,
 70,
 100,
 100,
 100,
 100,
 70,
 70,
 370,
 70,
 20,
 100,
 70,
 120,
 120,
 20,
 70,
 0,
 20,
 70,
 220,
 210,
 70,
 120,
 70,
 70,
 120,
 70,
 70,
 320,
 145,
 70,
 320,
 495,
 320,
 70,
 570,
 70]

In [201]:
env_list

[0,
 100,
 0,
 100,
 100,
 0,
 100,
 0,
 0,
 175,
 0,
 0,
 100,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 200,
 100,
 0,
 125,
 0,
 0,
 200,
 0,
 0,
 50,
 50,
 0,
 75,
 75,
 0,
 0,
 0,
 0]

### Writing as final wrapper functions

In [203]:
def get_hazard_scores(item, health_list, env_list, scoring_table_dict):
    """Score one chemical's GHS entry and append the result to the running lists.

    Parameters
    ----------
    item : str or list of str
        GHS entry for a single chemical: either one of the sentinel strings
        recognised by ``hazard_check`` or a list of hazard codes.
    health_list : list
        Running list of health scores; exactly one value is appended in place.
    env_list : list
        Running list of environmental scores; exactly one value is appended in place.
    scoring_table_dict : dict
        Scoring table passed through to ``calculate_score``.

    Returns
    -------
    None
        The results are delivered by mutating ``health_list`` and ``env_list``.
    """
    # The entry must be passed to the check; calling it with no argument raises TypeError.
    checks = hazard_check(item)

    if checks == 'Minimum Penalty':  # nonhazardous: score of zero for both categories
        value = 0
        health_list.append(value)
        env_list.append(value)

    elif checks == 'Maximum Penalty':  # no GHS data: score of one hundred for both categories
        value = 100
        health_list.append(value)
        env_list.append(value)

    elif checks == 'Calculate Score':  # split the codes into health and environmental hazards
        temp_health_list = []  # health hazard codes for this chemical
        temp_env_list = []     # environmental hazard codes for this chemical

        for hazard in item:
            if hazard[1] == '3':    # first digit 3 -> health code
                temp_health_list.append(hazard)
            elif hazard[1] == '4':  # first digit 4 -> environmental code
                temp_env_list.append(hazard)

        health_scores = calculate_score(temp_health_list, scoring_table_dict)
        env_scores = calculate_score(temp_env_list, scoring_table_dict)

        # Sum the individual scores so each chemical contributes one value per list.
        health_list.append(sum(health_scores))
        env_list.append(sum(env_scores))

### We test this new get_hazard_scores function together with the get_hazard_codes function in a final example notebook