In [42]:
import pandas as pd
from tqdm import tqdm
import numpy as np
from scipy import stats
import sys
sys.path.append('../')
from MachineLearning.MyFuncs.checkDirs import *
from MiscScripts.DataCleaning import *
from sklearn.preprocessing import RobustScaler, MinMaxScaler

import warnings
warnings.filterwarnings("ignore")

In [43]:
def removeSingleValCols(data, cols):
    """ 
    Function that iterates through columns and deletes columns with only one unique value.
    
    Parameters:
    data: pandas DataFrame
    cols: list of strings
    
    Returns:
    data: pandas DataFrame
    """
    for col in cols:
        values = data[col].values.astype(float).tolist()
        if len(set(values)) == 1:
            data.drop(col, axis=1, inplace=True)
    return data

In [44]:
def getOnlyTarget(df, targetA, targetB):
    """
    Function that returns a DataFrame with only rows that have a value for targetA and no value for targetB, and targetA moved to position 0.

    Parameters:
    df: pandas DataFrame
    targetA: string
    targetB: string

    Returns:
    df: pandas DataFrame
    """

    df = df[df[targetA].notnull()]
    df = df[df[targetB].isnull()]

    popped = df.pop(targetA)
    df.insert(0, targetA, popped)
    return df

In [45]:
def cleanData(df, idx, tolPercent):
    print("\n\n", df.columns[0])

    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df = df[pd.to_numeric(df[df.columns[0]], errors='coerce').notnull()]

    tolVal = len(df) * tolPercent
    data = df.iloc[:, idx:]
    print("Initial data shape:", data.shape)
    data, dropped = dropNaN_cols(data, data.columns, tolVal)

    data = pd.DataFrame(data)
    data.dropna(inplace=True)
    print("Shape after removing NaN columns:", data.shape)

    data = removeSingleValCols(data, data.columns)
    df = pd.concat([df.iloc[:, :idx], data], axis=1)
    print("Shape after removing single value columns:", data.shape)
    print("Final dataset shape:", df.shape)
    return df
    

In [46]:
def moveToZero(df, col):
    """
    Function that moves a column to the first position in a DataFrame.

    Parameters:
    df: pandas DataFrame
    col: string

    Returns:
    df: pandas DataFrame
    """
    popped = df.pop(col)
    df.insert(0, col, popped)
    return df

In [47]:
og = pd.read_csv("../Data/Combined/0.5.0-Master+RDKit.csv")
og.dropna(subset = "InChI", inplace=True)
print(og.shape)

(21291, 219)


In [48]:
henryConstant = og["HenryConstant"].values.tolist()
print(og["HenryConstant"].notnull().sum())

logH = []
for x in henryConstant:
    if x < 1e15:
        logH.append(np.log(x))
    else:
        logH.append(np.nan)

og["HenryConstant"] = logH
print(og.shape)

og.replace([np.inf, -np.inf], np.nan, inplace=True)
og.rename(columns={"HenryConstant": "logHenry"}, inplace=True)

print(og.shape)
print(og["logHenry"].notnull().sum())

12167
(21291, 219)
(21291, 219)
12069


In [49]:
targets = ["logS", "logHenry"]
hasBoth = og[og["logHenry"].notnull() & og["logS"].notnull()] #Temperature is at index 7. Do not remove temperature.

for targ in targets:
    hasBoth = moveToZero(hasBoth, targ)

onlyLogS = getOnlyTarget(og, *targets)
onlyHenry = getOnlyTarget(og, *reversed(targets))

hasLogS = og.dropna(subset=["logS"])
hasLogS = moveToZero(hasLogS, "logS")

hasHenry = og.dropna(subset=["logHenry"])
hasHenry = moveToZero(hasHenry, "logHenry")

In [50]:
vers = "0.7.1"
onlyHenry.to_csv(f"../Data/Datasets/{vers}-onlyLogH.csv", index=False)
onlyLogS.to_csv(f"../Data/Datasets/{vers}-onlyLogS.csv", index=False)
hasBoth.to_csv(f"../Data/Datasets/{vers}-logHlogS.csv", index=False)

hasLogS.to_csv(f"../Data/Datasets/{vers}-hasLogS.csv", index=False)
hasHenry.to_csv(f"../Data/Datasets/{vers}-hasLogH.csv", index=False)

# Scaling

scale trainTest set, use col names from trainTest set to scale validation set

compare col names between logS and logH

hasLogX datasets need to have validation code applied

In [51]:
def myScaler(dset, tol):
    target = dset.columns[0]
    print("\n", target)
    print("Initial shape:", dset.shape)
    dset.reset_index(drop=True, inplace=True) #Must reset index or it will not work
    data = dset.iloc[:, 7:] #Temperature is at index 7
    data = pd.DataFrame(MinMaxScaler().fit_transform(data), columns = data.columns)

    dataMinusTemp = dropLowDistinction(data.iloc[:, 1:], tol)[0] #Do not drop temperature!
    print("Shape after dropping low distinction:", data.shape)

    data = pd.concat([data.iloc[:, :1], dataMinusTemp], axis=1) #Adding temperature back in
    cols = data.columns
    
    data = RobustScaler().fit_transform(data)
    data = pd.DataFrame(data, columns = cols)
    dset = pd.concat([dset.iloc[:, :7], data], axis=1)

    dset = removeOutliers(dset, [target], 3)
    dset.dropna(subset = dset.columns[7:], inplace=True)
    dset.dropna(subset = [target], inplace=True)
    print("Shape after removing outliers:", dset.shape)

    return dset

In [52]:
def moveNonTarget(df, col):
    popped = df.pop(col)
    df.insert(8, col, popped)
    return df

In [53]:
onlyLogS.iloc[:, 7:]

Unnamed: 0,Temperature,logHenry,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,HeavyAtomMolWt,...,fr_sulfide,fr_sulfonamd,fr_sulfone,fr_term_acetylene,fr_tetrazole,fr_thiazole,fr_thiocyan,fr_thiophene,fr_unbrch_alkane,fr_urea
0,25.0,,10.779590,10.779590,0.000000,-0.259307,0.099444,15.771429,639.597,583.149,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
1,25.0,,10.779590,10.779590,0.000000,-0.259307,0.168923,16.275862,555.435,511.083,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0
2,25.0,,10.325282,10.325282,0.000000,-0.892639,0.348400,15.724138,519.355,470.971,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
3,25.0,,10.779590,10.779590,0.000000,-0.259307,0.053778,17.268293,743.727,683.247,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0
4,25.0,,12.770074,12.770074,0.690077,-1.936914,0.453223,109.142857,490.639,490.639,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21203,25.0,,0.000000,0.000000,0.000000,0.000000,0.372045,0.000000,95.940,95.940,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21204,25.0,,0.000000,0.000000,0.000000,0.000000,0.403981,0.000000,127.938,127.938,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21247,25.0,,0.000000,0.000000,0.000000,0.000000,0.495378,0.000000,223.199,223.199,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21263,25.0,,8.505000,8.505000,2.030000,-2.030000,0.441904,8.000000,159.598,159.598,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [54]:
datasets = [onlyLogS, onlyHenry]
tol = 0.01
trainColDict = {}
vers = "0.2.2"

for dset in datasets:
    dset = cleanData(dset, 8, tol)

    del dset["SMR_VSA3"]
    
    target = dset.columns[0]
    nonTarget = "logHenry" if target == "logS" else "logS"
    print("Dataset for prediction:", target)

    dset = myScaler(dset, tol) #Losing temperature here :(
    print(dset.columns[0:10])

    trainColDict[f"{target}-train"] = dset.columns[6:] #Used for comparison later

    dset.to_csv(f"../Data/Datasets/PredictionDatasets/{vers}-{target}-MRobust-TrainSet.csv", index=False)
    dset.to_csv(f"../Data/Datasets/PredictionDatasets/{vers}-{nonTarget}-MRobust-PredictionSet.csv", index=False)



 logS
Initial data shape: (9140, 211)
Shape after removing NaN columns: (9140, 198)
Shape after removing single value columns: (9140, 157)
Final dataset shape: (9140, 165)
Dataset for prediction: logS

 logS
Initial shape: (9140, 164)
Shape after dropping low distinction: (9140, 157)
Shape after removing outliers: (9117, 79)
Index(['logS', 'Compound', 'SMILES', 'InChI', 'logS-dataSource',
       'HenryConstant-dataSource', 'nIsomers', 'Temperature',
       'MaxAbsEStateIndex', 'MaxEStateIndex'],
      dtype='object')


 logHenry
Initial data shape: (9506, 211)
Shape after removing NaN columns: (9488, 198)
Shape after removing single value columns: (9488, 155)
Final dataset shape: (9506, 163)
Dataset for prediction: logHenry

 logHenry
Initial shape: (9506, 162)
Shape after dropping low distinction: (9506, 155)
Shape after removing outliers: (9459, 79)
Index(['logHenry', 'Compound', 'SMILES', 'InChI', 'logS-dataSource',
       'HenryConstant-dataSource', 'nIsomers', 'Temperature',
   

In [55]:
def compareCols(myDict):
    dictKeys = list(myDict.keys())
    valueList = []

    for key in dictKeys:
        valueList.append(myDict[key])

    print(f"Present in {dictKeys[0]} but not {dictKeys[1]}:", list(set(valueList[0]) - set(valueList[1])))
    print(f"Present in {dictKeys[1]} but not {dictKeys[0]}:", list(set(valueList[1]) - set(valueList[0])))

compareCols(trainColDict)

Present in logS-train but not logHenry-train: []
Present in logHenry-train but not logS-train: []


In [56]:
onlyHenryClean = cleanData(onlyHenry, 8, tol)
hBCols = onlyHenryClean.columns[7:]
trainCols = trainColDict["logS-train"]
toDelete = list(set(hBCols) - set(trainCols))

print(list(set(hBCols) - set(trainCols)))



 logHenry
Initial data shape: (9506, 211)
Shape after removing NaN columns: (9488, 198)
Shape after removing single value columns: (9488, 155)
Final dataset shape: (9506, 163)
['fr_term_acetylene', 'HeavyAtomCount', 'fr_oxime', 'PEOE_VSA9', 'NumSaturatedHeterocycles', 'fr_hdrzone', 'SMR_VSA3', 'fr_aldehyde', 'fr_HOCCN', 'fr_N_O', 'SMR_VSA9', 'NumAliphaticRings', 'fr_unbrch_alkane', 'fr_epoxide', 'fr_thiocyan', 'fr_halogen', 'fr_dihydropyridine', 'fr_nitrile', 'NOCount', 'NumSaturatedRings', 'fr_alkyl_halide', 'fr_imide', 'fr_Al_COO', 'fr_ketone_Topliss', 'SlogP_VSA8', 'fr_C_O', 'fr_SH', 'fr_COO2', 'NumHeteroatoms', 'fr_priamide', 'fr_methoxy', 'fr_NH2', 'fr_Al_OH', 'fr_ketone', 'fr_Ndealkylation1', 'fr_Ndealkylation2', 'fr_nitro', 'fr_piperzine', 'fr_sulfonamd', 'fr_phos_ester', 'fr_quatN', 'SMR_VSA2', 'fr_COO', 'fr_sulfide', 'EState_VSA11', 'fr_ether', 'fr_lactam', 'fr_amidine', 'fr_sulfone', 'SlogP_VSA1', 'fr_lactone', 'fr_urea', 'fr_isocyan', 'PEOE_VSA8', 'fr_C_S', 'RingCount', 'f

In [62]:
targets = ["logS", "logHenry"]
valColDict = {}
hasBothClean = cleanData(hasBoth, 10, tol)

for targ in targets:
    hB = hasBothClean.copy()

    oTarg = "logS" if targ == "logHenry" else "logHenry"
    del hB[oTarg]

    hB = myScaler(hB, 0.01)

    for col in toDelete:
        try:
            hB.drop(col, axis=1, inplace=True)
        except:
            pass

    valColDict[f"{targ}-val"] = hB.columns[7:]
    hB.to_csv(f"../Data/Datasets/PredictionDatasets/{vers}-{targ}-MRobust-ValidationSet.csv", index=False)

compareCols(valColDict)



 logHenry
Initial data shape: (2563, 209)
Shape after removing NaN columns: (2563, 197)
Shape after removing single value columns: (2563, 152)
Final dataset shape: (2563, 162)

 logS
Initial shape: (2563, 161)
Shape after dropping low distinction: (2563, 154)
Shape after removing outliers: (2557, 85)

 logHenry
Initial shape: (2563, 161)
Shape after dropping low distinction: (2563, 154)
Shape after removing outliers: (2539, 85)
Present in logS-val but not logHenry-val: []
Present in logHenry-val but not logS-val: []


In [63]:
combiDict = {"Validation" : valColDict[list(valColDict.keys())[0]],
                "Train" : trainColDict[list(trainColDict.keys())[0]]}
compareCols(combiDict)

Present in Validation but not Train: []
Present in Train but not Validation: ['nIsomers']


In [65]:
dset.iloc[:, 7:]

Unnamed: 0,Temperature,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,HeavyAtomMolWt,ExactMolWt,...,VSA_EState3,VSA_EState4,VSA_EState5,VSA_EState6,VSA_EState7,VSA_EState8,VSA_EState9,FractionCSP3,MolLogP,MolMR
0,0.000000,-1.741887,-1.742184,-0.566551,0.067701,-0.483849,-1.025424,2.003397,2.206442,2.000373,...,-0.193775,0.0,0.0,0.0,-0.273012,-0.487025,0.0,-0.977778,-2.587975,-1.479492
1,2.060012,-1.741887,-1.742184,-0.566551,0.067701,-0.483849,-1.025424,2.003397,2.206442,2.000373,...,-0.193775,0.0,0.0,0.0,-0.273012,-0.487025,0.0,-0.977778,-2.587975,-1.479492
2,0.000000,-1.109244,-1.109433,-0.566551,0.067701,0.800409,-0.918886,1.359621,1.378877,1.369417,...,-0.193775,0.0,0.0,0.0,-0.273012,1.654659,0.0,-0.533333,0.114792,0.010049
3,1.060572,-1.109244,-1.109433,-0.566551,0.067701,0.680301,-0.876271,0.996355,1.066424,1.012338,...,-0.193775,0.0,0.0,0.0,-0.273012,1.654659,0.0,-0.311111,-0.057459,-0.670953
4,1.302861,-1.063283,-1.063464,-0.566551,0.067701,0.795515,-0.705811,1.274192,1.311454,1.289968,...,-0.193775,0.0,0.0,0.0,0.051661,1.868828,0.0,-0.088889,0.199855,-0.332541
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9501,0.575995,-1.741887,-1.742184,-0.566551,0.067701,-0.159920,-1.025424,-0.265606,-0.130512,-0.256843,...,-0.193775,0.0,0.0,0.0,-0.273012,-0.487025,0.0,-0.977778,-0.610963,-1.479492
9502,0.606281,-1.741887,-1.742184,-0.566551,0.067701,-0.159920,-1.025424,-0.265606,-0.130512,-0.256843,...,-0.193775,0.0,0.0,0.0,-0.273012,-0.487025,0.0,-0.977778,-0.610963,-1.479492
9503,0.636567,-1.741887,-1.742184,-0.566551,0.067701,-0.159920,-1.025424,-0.265606,-0.130512,-0.256843,...,-0.193775,0.0,0.0,0.0,-0.273012,-0.487025,0.0,-0.977778,-0.610963,-1.479492
9504,0.666853,-1.741887,-1.742184,-0.566551,0.067701,-0.159920,-1.025424,-0.265606,-0.130512,-0.256843,...,-0.193775,0.0,0.0,0.0,-0.273012,-0.487025,0.0,-0.977778,-0.610963,-1.479492


In [66]:
hB.iloc[:, 7:]


Unnamed: 0,Temperature,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex,MinEStateIndex,qed,SPS,MolWt,HeavyAtomMolWt,ExactMolWt,...,VSA_EState3,VSA_EState4,VSA_EState5,VSA_EState6,VSA_EState7,VSA_EState8,VSA_EState9,FractionCSP3,MolLogP,MolMR
0,0.0,-1.043604,-1.043604,-0.543345,-0.076480,-0.021889,-1.309091,0.979785,0.964900,0.994047,...,-0.139466,0.000000,0.000000,0.0,-0.607632,3.359310,0.0,0.194444,-1.335257,-0.229974
1,0.0,-0.993276,-0.993276,-0.543345,-0.076480,-0.045985,-0.051515,1.678071,1.532609,1.704092,...,0.248278,2.549694,0.000000,0.0,3.019701,-0.332956,0.0,0.486111,1.648360,1.595513
2,0.0,-0.972306,-0.972306,-0.543345,-0.076480,0.154343,-1.017647,1.093566,0.939673,1.114159,...,-0.139466,0.000000,0.000000,0.0,0.448998,5.301168,0.0,0.486111,1.514299,1.031652
3,0.0,-1.697866,-1.697866,-0.543345,-0.076480,0.397733,-1.445455,0.569587,0.573351,0.583140,...,-0.139466,0.000000,0.000000,0.0,-0.607632,-0.332956,0.0,-0.388889,-0.403047,-0.425581
4,0.0,-0.884437,-0.884437,-0.543345,-0.076480,-2.039175,-0.112121,5.724152,5.305764,5.795271,...,-0.139466,6.053312,0.000000,0.0,6.572781,11.877367,0.0,-0.038889,5.779128,6.249603
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2558,0.0,-0.062211,-0.062211,-0.145056,0.123709,-0.828717,-1.081818,-0.896195,-0.817300,-0.902145,...,-0.139466,0.000000,0.000000,0.0,-0.578639,-0.332956,0.0,-0.388889,-1.223797,-1.120032
2559,0.0,0.012349,0.012349,1.182574,0.791006,-0.780986,-1.081818,-0.790737,-0.711517,-0.796243,...,-0.014490,0.000000,0.000000,0.0,-0.607632,-0.332956,0.0,-0.388889,-0.881958,-0.902317
2560,0.0,-0.901008,-0.901008,2.510203,1.458303,-0.655685,-1.081818,-0.685278,-0.605735,-0.690341,...,-0.139466,0.000000,0.000000,0.0,-0.607632,-0.332956,0.0,-0.388889,-0.567807,-0.720715
2561,0.0,-1.697866,-1.697866,-0.543345,-0.076480,-0.641066,-1.445455,-1.073259,-1.014816,-1.080868,...,-0.139466,0.000000,0.000000,0.0,-0.607632,-0.332956,0.0,-0.388889,-0.918452,-1.119634


In [67]:
dset.head()
dset.iloc[:,3] #Inchi is at index 3

0                       InChI=1S/2BrH.Hg/h2*1H;/q;;+2/p-2
1                       InChI=1S/2BrH.Hg/h2*1H;/q;;+2/p-2
2       InChI=1S/2C2H5.2CH3.Pb/c2*1-2;;;/h2*1H2,2H3;2*...
3                   InChI=1S/2C2H5.Hg/c2*1-2;/h2*1H2,2H3;
4               InChI=1S/2C3H7.Hg/c2*1-3-2;/h2*1,3H2,2H3;
                              ...                        
9501                                          InChI=1S/Xe
9502                                          InChI=1S/Xe
9503                                          InChI=1S/Xe
9504                                          InChI=1S/Xe
9505                                     InChI=1S/Xe/i1+2
Name: InChI, Length: 9459, dtype: object