# Code For Extracting the CMC Data from NIST

Version: 0.2.2

## Importing Libraries and Functions

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import sys
import math as m

sys.path.append('../')

from MyFunctions.smilesToInChI import *

## Importing Data

In [17]:
def _readSourceCsv(path):
    """Read one source CSV into a DataFrame.

    The file is opened as UTF-8 with undecodable bytes ignored, and pandas
    reads it in a single pass (low_memory=False). The ``with`` statement
    closes the file, so no explicit ``close()`` call is needed afterwards.
    """
    with open(path, encoding="utf-8", errors="ignore") as f:
        return pd.read_csv(f, low_memory=False)


df1 = _readSourceCsv("../Data/SourceData/CMC-sourceData_v1.csv")
df2 = _readSourceCsv("../Data/SourceData/CMC-sourceData_v2.csv")

In [18]:
# Sorted unique names found in the first source table.
names = np.unique(df1["Name"].tolist())

# One SMILES string per name: the first one listed for that name in df2.
smilesByName = (
    df2.dropna(subset=["Name"])
    .drop_duplicates(subset="Name", keep="first")
    .set_index("Name")["SMILES"]
)

# Attach the SMILES string as the first column of every df1 row. A name that
# does not occur in df2 gets a missing SMILES instead of raising IndexError.
combined = df1.copy()
combined.insert(0, "SMILES", combined["Name"].map(smilesByName))

# Group rows by name (original row order kept within each name), matching the
# order produced by iterating over the sorted unique names.
combined = combined.sort_values(by="Name", kind="stable")

unmatchedNames = combined.loc[combined["SMILES"].isnull(), "Name"].nunique()
if unmatchedNames:
    print(unmatchedNames, "names have no SMILES string in the second source table")

## Removing Missing Data

In [19]:
def checkNumeric(data, col):
    """Drop the rows whose value in ``col`` is a purely alphabetic string.

    Prints the number of values inspected, then returns the remaining rows.

    Parameters:
        data: DataFrame to filter.
        col: Name of the column to inspect.

    Returns:
        The rows of ``data`` whose ``col`` entry is not a string made only of
        letters. Non-string entries (real numbers, missing values) are never
        treated as alphabetic, so they are kept.
    """
    # Test element by element so the check also works when the column holds
    # numbers or missing values rather than only strings.
    isAlphabetic = data[col].map(
        lambda value: isinstance(value, str) and value.isalpha()
    ).astype(bool)
    print(len(isAlphabetic), f"Values found in {col} column")
    return data[~isAlphabetic]

In [20]:
# Keep only measurements made without additives that also have a SMILES string.
additiveFree = combined["Additives"].isnull()
hasSmiles = combined["SMILES"].notnull()
df = combined.loc[additiveFree & hasSmiles].copy()

# Both a temperature and a CMC value must be present.
df = df.dropna(subset=["Temperature", "CMC"])

# Removing non numeric temperature values
df = checkNumeric(df, "Temperature")

print(df.shape)

1695 Values found in Temperature column
(1445, 20)


## Separating CMC Powers and Units

In [21]:
def _splitCmc(cmc):
    """Split one raw CMC string into ``(value, power, unit)``.

    A string containing "x" is expected to look like "<value>x10<power> <unit>";
    any other string is expected to look like "<value> <unit>", in which case
    the power is NaN.

    Raises:
        IndexError: if an expected part (the "x10" tail or the unit) is missing.
        ValueError: if the power of ten is not an integer.
    """
    if "x" in cmc:  # Seeing if there is a power of 10 in the CMC value
        cmcSplit = cmc.split("x10")
        powerAndUnit = cmcSplit[1].split(" ")
        return cmcSplit[0], int(powerAndUnit[0]), powerAndUnit[1]
    parts = cmc.split(" ")
    return parts[0], np.nan, parts[1]


cmcList = df["CMC"].to_list()

valueList = []; powerList = []; unitList = []; errors = []

for cmc in cmcList:  # Iterating through the CMC values
    cmc = cmc.replace("X", "x")
    try:
        value, power, unit = _splitCmc(cmc)
    except (IndexError, ValueError):
        # Malformed entry: record it and leave all three parts missing so the
        # row is dropped below.
        print(f"CMC value is: {cmc}. No units found")
        errors.append(cmc)
        value = np.nan; power = np.nan; unit = np.nan

    valueList.append(value)
    powerList.append(power)
    unitList.append(unit)

df["CMC_Value"] = valueList; df["CMC_Power"] = powerList; df["CMC_Unit"] = unitList  # Adding to dataframe
df.dropna(subset=["CMC_Value"], inplace=True)
df.reset_index(drop=True, inplace=True)

# Keep a record of every CMC string that could not be parsed. The `with`
# statement closes the file on exit.
with open("../Data/TempFiles/CMC_Errors.txt", "w", encoding="utf-8") as f:
    for error in errors:
        f.write(f"{error}\n")

print(len(errors), "Errors found in CMC column")

CMC value is: 1.50 x10 1 P 1.702x10 0 S. No units found
CMC value is: 1.1 x10 W. No units found
CMC value is: 1.5 x10 o W. No units found
CMC value is: 1.82 x10 M. No units found
CMC value is: 1.37 x10 P 1.555x10 0 S. No units found
CMC value is: 1.30 x10 1 P 1.475x10 0 S. No units found
CMC value is: 1.23 x10 1 P 1.396x10 0 S. No units found
CMC value is: 1.75 x10 0 M. No units found
CMC value is: 1.20 x10 1 P 1.362x10 S. No units found
CMC value is: 1.58 x10 M. No units found
CMC value is: 1.62 x10 0 M. No units found
CMC value is: 1.20 x10 0 D 3.902x10-2 M. No units found
CMC value is: 1.1 x10 0 P 3.57 x10-2 S. No units found
CMC value is: 6.8 x10-2. No units found
CMC value is: 6.5 x10-2. No units found
CMC value is: 8.7 x10 5 M. No units found
CMC value is: W. No units found
CMC value is: 4.30 xio-3 M. No units found
CMC value is: 6.0 x10-4. No units found
CMC value is: 1.1 x10 0 P 5.87 x10-2 S. No units found
CMC value is: 1.6 x10-5. No units found
CMC value is: 61008. No units f

## Recombining CMC Values

In [22]:
cmcPowers = df["CMC_Power"].values
cmcRawVals = df["CMC_Value"].values

# A missing power means the value had no "x10" part, i.e. it is multiplied by
# 10**0. Using exponent 0 (not 1) keeps such values unscaled; exponent 1 would
# make them ten times too large.
cleanedPowers = [0 if m.isnan(power) else m.trunc(power) for power in cmcPowers]

cmcVals = []

for rawVal, power in zip(cmcRawVals, cleanedPowers):
    # Build a float literal such as "1.5e-3"; stray spaces are removed first.
    val = f"{rawVal}e{power}".replace(" ", "")
    try:
        cmcVals.append(float(val))
    except ValueError:
        # Not a valid number (e.g. garbled characters): report it and store NaN.
        print(val)
        cmcVals.append(np.nan)

df["CMC"] = cmcVals

1ite-4
1:(0)e-4
)41e-4
I91e-4
:1.8e4


In [23]:
# Show how often each evaluation code occurs.
# "Li" must be a mistake in the extraction, so rows carrying it are discarded.
evaluationCounts = df["Evaluation"].value_counts()
print(evaluationCounts)

df = df.loc[df["Evaluation"].ne("Li")]

Evaluation
L    1099
3     184
1      63
2      22
D      17
P       5
T       1
I       1
Name: count, dtype: int64


In [24]:
# Unit counts before filtering.
print(df["CMC_Unit"].value_counts())

# Unit strings that are not accepted as units; rows carrying any of them are removed.
anomolousUnits = ["0", "o", "1", "4", "3"]
hasAnomalousUnit = df["CMC_Unit"].isin(anomolousUnits)
df = df[~hasAnomalousUnit]

# Unit counts after filtering.
print(df["CMC_Unit"].value_counts())

CMC_Unit
M    1098
W     147
D     117
P      16
N      15
Name: count, dtype: int64
CMC_Unit
M    1098
W     147
D     117
P      16
N      15
Name: count, dtype: int64


## Adding InChI

In [25]:
# Pass the SMILES column of the remaining rows to smilesToInChI (imported from
# MyFunctions.smilesToInChI) and store the result in a new "InChI" column.
# NOTE(review): presumably the helper returns one InChI per input SMILES, in
# the same order — confirm against its definition.
smiles = df["SMILES"].values
df["InChI"] = smilesToInChI(smiles)

Converting SMILES to InChI: 100%|██████████| 1393/1393 [00:00<00:00, 2973.87it/s]

0 occurred





## Unit Conversion

In [26]:
# df = df[df["Mol.Wgt."] != 0]

# unitList = df["CMC_Unit"].to_list()

# for i in range(len(unitList)):
#     if unitList[i] == "D" or unitList[i] == "P":
#         oldCMC = df.iloc[i]["CMC"]
#         molWgt = df.iloc[i]["Mol.Wgt."]
#         newCMC = (10*oldCMC)/(molWgt)
#         df.iloc[i, df.columns.get_loc("CMC")] = newCMC

# Order the rows by compound number, renumber the index from zero, and write
# the cleaned dataset to disk without the index column.
df = df.sort_values(by="Compound No.").reset_index(drop=True)
df.to_csv("../Data/0.5.0-CMC.csv", index=False)