# Cleaning and Preparing Melting Point Data from Williams.sdf

Version: 0.1.0

0.1.0 - Added column to flag whether or not value is an average

Note: All values are in celcius.  
https://figshare.com/articles/dataset/Melting_Point_and_Pyrolysis_Point_Data_for_Tens_of_Thousands_of_Chemicals/2007426
## Importing Libraries

In [3]:
import pandas as pd
import numpy as np
from rdkit import Chem
import re
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')

from MyFunctions.smilesToInChI import *

## Reading in data and ignoring non utf-8 characters

In [4]:
f = open("../Data/SourceData/Williams/Williams.csv", encoding="utf-8", errors="ignore")
data = pd.read_csv(f, low_memory=False)
f.close()
print(data.head())

                                       Molecule Name  ...       Value
0                     1-n-butyl-5-nitro-isoquinoline  ...  69 to 69.5
1                      3-chloro-5-amino-isoquinoline  ...  176 to 177
2             1-chloro-3-methyl-5-nitro-isoquinoline  ...         112
3                       2-dodecylthio-p-benzoquinone  ...  131 to 132
4  11,18-dihydroxy-pregna-1,4-diene-3,20-dione 18...  ...  162 to 164

[5 rows x 10 columns]


## Removing redundant columns

In [5]:
try: 
    del data["Patent"]
    del data["Paragraph"]
    del data["OriginalText"]
    del data["SuspicousValue"]
except:
    pass
print(data.head(), "\n\n")

def multiValueChecker(data, column):
    """ 
    Check if a column has multiple values. Returns True if it has only one value, False otherwise.
    """
    print("Column values:", data[column].unique())
    if len(data["QuantityType"].unique()) == 1:
        return True
    else: 
        return False

multiValueChecker(data, "QuantityType")
multiValueChecker(data, "SuspiciousValue")
multiValueChecker(data, "FromLiterature") # Some values are from literature and some are not

                                       Molecule Name  ...       Value
0                     1-n-butyl-5-nitro-isoquinoline  ...  69 to 69.5
1                      3-chloro-5-amino-isoquinoline  ...  176 to 177
2             1-chloro-3-methyl-5-nitro-isoquinoline  ...         112
3                       2-dodecylthio-p-benzoquinone  ...  131 to 132
4  11,18-dihydroxy-pregna-1,4-diene-3,20-dione 18...  ...  162 to 164

[5 rows x 7 columns] 


Column values: ['MeltingPoint']
Column values: [False]
Column values: [False  True]


True

Need to check if "Value" contains non numerical characters (other than `.`) -> perhaps use regex. Separate into lower and higher values in separate columns. Need edge case to detect for `>` signs. Convert InChI keys or smiles into InChI to allow joining of dataframes.

In [6]:
values = data["Value"].values
lowerVals = []; upperVals = []; avgVals = []

for val in values:
    val = val.replace("~", "").replace(" ", "").replace(">", "").replace("<", "") # Remove special characters
    if re.search(r"to", val): #Using regex to find ranges and splitting them
        try:
            splitVal = re.split(r"to", val, 1)
            lowerVals.append(splitVal[0])
            upperVals.append(splitVal[1])
        except:
            print(val)
    # elif re.search(r">", val): Would conserve more information but makes calculating averages not work
    #     val = val.replace(">", "").replace(" ", "")
    #     lowerVals.append(val); upperVals.append(np.nan)
    # elif re.search(r"<", val):
    #     val = val.replace("<", "").replace(" ", "")
    #     lowerVals.append(np.nan); upperVals.append(val)
    else:
        lowerVals.append(val); upperVals.append(val)

print("Lower values:", lowerVals) #Checking value list lengths
print(len(lowerVals), len(upperVals))
print("Upper values:", upperVals)

Lower values: ['69', '176', '112', '131', '162', '148.5', '260.5', '194', '131', '54', '126', '179', '162', '224', '136', '310', '270', '250', '128', '69', '250', '135', '276', '271', '230', '270', '231', '131', '130', '115', '191', '88', '132', '134', '225', '158', '128', '206', '157', '242', '17', '67', '57', '86', '159', '43', '115', '73', '275', '169', '275', '168.5', '194', '194', '75', '74', '115.5', '90.5', '167', '190', '42', '40', '100', '113', '193', '264', '209', '186', '201', '142', '117', '288', '176', '151', '184', '233', '229', '205', '221', '72', '183', '214', '55', '142', '119', '224', '228', '133', '108', '124', '80', '82', '151', '131', '101.5', '30', '90', '175', '111', '197', '130.5', '240', '92.5', '94', '60', '85', '155', '147.5', '59.5', '137', '134', '270', '216', '128', '158', '205', '145', '84', '102', '116', '136', '215', '276', '147', '164', '132', '118', '220', '155', '205', '133', '115', '89', '169', '110', '74', '61.5', '152', '112', '178', '211', '233',

In [7]:
cleanedData = data.copy() #Creating a copy of the data to add the new columns to
cleanedData["LowerValue"] = lowerVals
cleanedData["UpperValue"] = upperVals

avValues = [np.mean([float(lower), float(upper)]) for lower, upper in zip(lowerVals, upperVals)] #Calculating average values
cleanedData["AverageValue"] = avValues

In [8]:
print(cleanedData.head())

cleanedData["InChI"] = smilesToInChI(cleanedData["SMILES"])
# cleanedData.dropna(subset=["InChI"], inplace=True)

print(cleanedData.head())

                                       Molecule Name  ...  AverageValue
0                     1-n-butyl-5-nitro-isoquinoline  ...         69.25
1                      3-chloro-5-amino-isoquinoline  ...        176.50
2             1-chloro-3-methyl-5-nitro-isoquinoline  ...        112.00
3                       2-dodecylthio-p-benzoquinone  ...        131.50
4  11,18-dihydroxy-pregna-1,4-diene-3,20-dione 18...  ...        163.00

[5 rows x 10 columns]


Converting SMILES to InChI: 100%|██████████| 228174/228174 [01:29<00:00, 2546.57it/s]

4 occurred
                                       Molecule Name  ...                                              InChI
0                     1-n-butyl-5-nitro-isoquinoline  ...  InChI=1S/C13H14N2O2/c1-2-3-6-12-10-5-4-7-13(15...
1                      3-chloro-5-amino-isoquinoline  ...  InChI=1S/C9H7ClN2/c10-9-4-7-6(5-12-9)2-1-3-8(7...
2             1-chloro-3-methyl-5-nitro-isoquinoline  ...  InChI=1S/C10H7ClN2O2/c1-6-5-8-7(10(11)12-6)3-2...
3                       2-dodecylthio-p-benzoquinone  ...  InChI=1S/C18H28OS/c1-2-3-4-5-6-7-8-9-10-11-12-...
4  11,18-dihydroxy-pregna-1,4-diene-3,20-dione 18...  ...  InChI=1S/C21H27NO6/c1-12(23)16-5-6-17-15-4-3-1...

[5 rows x 11 columns]





In [9]:
def isAverage(values):
    """ 
    Checks if a value is numeric or not. Returns True if it is, False otherwise.
    """
    boolList = []
    for value in values:
        try:
            float(value)
            boolList.append(True)
        except:
            boolList.append(False)
    return boolList

vals = cleanedData["Value"].values
cleanedData["IsAverage"] = isAverage(vals)

In [10]:
cleanedData.shape

cleanedData.to_csv("../Data/Processed/0.1.0-Williams.csv", index=False)