# Program to generate unscaled datasets

Version: 0.2.2

Change log:  
0.0.1 - Fixed a bug where NaNs were dropped across the entire dataset instead of only the feature columns  
0.2.1 - Removes constant columns  
0.2.2 - Removed VarianceThreshold and coded function manually instead  
0.3.2 - Removes very large Henry's law constant (hL) values  
0.4.0 - New datasets


Dataset versions:

0.6.1 - Removed outlier removal

## Importing Libraries

In [24]:
import pandas as pd
from tqdm import tqdm
import numpy as np
from scipy import stats
import sys
sys.path.append('../')
from MachineLearning.MyFuncs.checkDirs import *
from MiscScripts.DataCleaning import *

## Creating Functions

In [25]:
def removeSingleValCols(data, cols):
    """ 
    Drop every column listed in `cols` that holds exactly one unique value.

    Values are compared as floats whenever the column can be cast (so 1 and 1.0
    count as the same value). Columns that cannot be cast to float are compared
    on their raw values instead of raising. NaN counts as a value, so a column
    that is entirely NaN is treated as constant and dropped as well.

    Parameters:
    data: pandas DataFrame (modified in place)
    cols: iterable of column names to inspect

    Returns:
    data: the same pandas DataFrame object, with the constant columns removed
    """
    constantCols = []
    for col in cols:
        try:
            values = data[col].astype(float)
        except (TypeError, ValueError):
            # Non-numeric column (e.g. strings): fall back to the raw values.
            values = data[col]
        if values.nunique(dropna=False) == 1:
            constantCols.append(col)

    # A single drop call instead of one in-place drop per constant column.
    data.drop(columns=constantCols, inplace=True)
    return data

In [26]:
# Version tag and the list of target columns.
version = "0.7.1"
sets = ["logHenry", "logS", "BoilingPoint_C", "MeltingPoint_C"]

# Load the combined master table that every later cell works from.
og = pd.read_csv(filepath_or_buffer="../Data/Combined/0.5.0-Master+RDKit.csv")

In [27]:
# Replace the "/" in the boiling/melting point column names with "_".
og = og.rename(
    columns={
        "BoilingPoint/C": "BoilingPoint_C",
        "MeltingPoint/C": "MeltingPoint_C",
    }
)

toMove = ["Temperature", "HenryConstant", "logS"]#, "BoilingPoint_C", "MeltingPoint_C"]
toDelete = ["Tb, K", "Tm, K", "MolWt"]

# og = og.drop(toDelete, axis=1)

# Re-insert each column at position 14. Because every insert lands on the same
# position, the columns end up in the reverse order of `toMove`.
for col in toMove:
    og.insert(14, col, og.pop(col))

  og.insert(14, col, popped)
  og.insert(14, col, popped)
  og.insert(14, col, popped)


In [28]:
# Preview the first nine columns of `og` (the row index is shown alongside them).
og.iloc[:, :9]

Unnamed: 0,Compound,SMILES,InChI,logS-dataSource,HenryConstant-dataSource,nIsomers,MaxAbsEStateIndex,MaxEStateIndex,MinAbsEStateIndex
0,2-ethylhexyl 2-{[dibutyl({2-[(2-ethylhexyl)oxy...,CCCCC(CC)COC(=O)CS[Sn](CCCC)(CCCC)SCC(=O)OCC(C...,InChI=1/2C10H20O2S.2C4H9.Sn/c2*1-3-5-6-9(4-2)7...,"Lowe et al, 2023",,1.0,10.779590,10.779590,0.000000
1,2-ethylhexyl 2-{[({2-[(2-ethylhexyl)oxy]-2-oxo...,CCCCC(CC)COC(=O)CS[Sn](C)(C)SCC(=O)OCC(CC)CCCC,InChI=1/2C10H20O2S.2CH3.Sn/c2*1-3-5-6-9(4-2)7-...,"Lowe et al, 2023",,1.0,10.779590,10.779590,0.000000
2,Dibutyltin bis(2-ethylhexanoate),CCCCC(CC)C(=O)O[Sn](CCCC)(CCCC)OC(=O)C(CC)CCCC,InChI=1/2C8H16O2.2C4H9.Sn/c2*1-3-5-6-7(4-2)8(9...,"Lowe et al, 2023",,1.0,10.325282,10.325282,0.000000
3,2-ethylhexyl 2-{[bis({2-[(2-ethylhexyl)oxy]-2-...,CCCCC(CC)COC(=O)CS[Sn](C)(SCC(=O)OCC(CC)CCCC)S...,InChI=1/3C10H20O2S.CH3.Sn/c3*1-3-5-6-9(4-2)7-1...,"Lowe et al, 2023",,1.0,10.779590,10.779590,0.000000
4,Kepone,ClC12C(=O)C3(Cl)C4(Cl)C1(Cl)C1(Cl)C2(Cl)C3(Cl)...,"InChI=1/C10Cl10O/c11-2-1(21)3(12)6(15)4(2,13)8...","Lowe et al, 2023",,1.0,12.770074,12.770074,0.690077
...,...,...,...,...,...,...,...,...,...
21286,xenon,[Xe],InChI=1S/Xe,,https://doi.org/10.5194/acp-23-10901-2023,1.0,0.000000,0.000000,0.000000
21287,xenon,[Xe],InChI=1S/Xe,,https://doi.org/10.5194/acp-23-10901-2023,1.0,0.000000,0.000000,0.000000
21288,xenon,[Xe],InChI=1S/Xe,,https://doi.org/10.5194/acp-23-10901-2023,1.0,0.000000,0.000000,0.000000
21289,xenon,[Xe],InChI=1S/Xe,,https://doi.org/10.5194/acp-23-10901-2023,1.0,0.000000,0.000000,0.000000


In [29]:
# Replace the raw "HenryConstant" column with a log-transformed "logHenry" column.
#
# The explicit column check keeps the cell safe to re-run: once "HenryConstant"
# has been converted there is nothing left to do. It replaces a bare
# `except: pass`, which also hid genuine failures (for example non-numeric
# values) and left later cells to fail on a missing "logHenry" column.
if "HenryConstant" in og.columns and "logHenry" not in og.columns:
    hConst = og["HenryConstant"].astype(float)

    # Values above 1e+10 are blanked out; NaNs pass through unchanged.
    hConstCleaned = hConst.mask(hConst > 1e+10)

    # np.log maps 0 to -inf and negative values to NaN (with a RuntimeWarning).
    # NOTE(review): np.log is the natural logarithm - confirm this is the intended base.
    og.insert(11, "logHenry", np.log(hConstCleaned))
    del og["HenryConstant"]

  og.insert(11, "logHenry", np.log(og["HenryConstant"]))


In [30]:
# for dset in sets:
#     df = og.copy()
#     df.drop_duplicates(inplace=True)

#     df = df[pd.to_numeric(df[dset], errors='coerce').notnull()] #Filters out non numeric values and replaces with NaN
#     df.dropna(subset=[dset], inplace=True) 
#     print(f"\n Variable: {dset}\n", df.shape)

#     popped = df.pop(dset) #Moving target column
#     df.insert(0, dset, popped)

#     tol = df.shape[0] * 0.01
#     cols = df.iloc[:, 8:].columns #Filtering through rest of columns
#     df, dropped = dropNaN_cols(df, cols, tol)

#     df = df[df[dset] != -np.inf] #Removing +-inf values
#     df = df[df[dset] != np.inf]

#     cols = df.iloc[:, 8:].columns
#     for col in cols:
#         df.dropna(subset=[col], inplace=True)

#     print("After dropping NaNs:", df.shape)
    
#     # df = removeOutliers(df, [dset], 3)
#     # print("After dropping outliers:", df.shape)
    
#     cols = df.iloc[:, 8:].columns
#     df = removeSingleValCols(df, cols)
#     print("after dropping single values columns:", df.shape)

#     checkPath(f"../Data/Datasets/")
#     df.reset_index(drop=True, inplace=True)
#     # df.to_csv(f"../Data/Datasets/{version}-{dset}.csv", index=False)

## Final datasets for predicting logS and logH

In [31]:
# Build the train/validation CSVs for each of the two targets.
sets = ["logS", "logHenry"]
vers = "0.2.0"
outDir = "../Data/Datasets/PredictionDatasets"

df = og.copy()
for dset in sets:
    # Move each target column to the front; the last one moved ends up first.
    df.insert(0, dset, df.pop(dset))

checkPath(outDir)

for i, dset in enumerate(sets):
    print("\n\n", dset)

    # Keep only the rows whose target parses as a number; rows with a missing
    # or non-numeric target are dropped.
    tdf = df[pd.to_numeric(df[dset], errors='coerce').notnull()]
    print(tdf.shape)

    # dropNaN_cols comes from MiscScripts.DataCleaning; presumably it drops the
    # columns with more than `tol` NaNs (1% of the rows) - verify against its source.
    tol = tdf.shape[0] * 0.01
    cols = tdf.columns[8:] #Filtering through rest of columns
    tdf, dropped = dropNaN_cols(tdf, cols, tol)

    # Remove rows whose target is +-inf.
    tdf = tdf[(tdf[dset] != -np.inf) & (tdf[dset] != np.inf)]

    # Drop every row with a NaN in any remaining column from position 8 onwards,
    # in a single call that returns a new frame.
    tdf = tdf.dropna(subset=list(tdf.columns[8:]))
    print("After dropping NaNs:", tdf.shape)

    # NOTE(review): this step starts at column 10 while the NaN handling starts
    # at column 8 - confirm the different offsets are intentional.
    tdf = removeSingleValCols(tdf, tdf.columns[10:])
    print("after dropping single values columns:", tdf.shape)

    # The "other" target; this indexing relies on `sets` holding exactly two entries.
    oTarg = sets[1 - i]

    # Rows where the other target is also known form the validation set.
    val = tdf[tdf[oTarg].notnull()]
    print(val.shape)

    # Rows where the other target is missing form the training set.
    train = tdf[tdf[oTarg].isnull()]
    print(train.shape)

    val.to_csv(f"{outDir}/{vers}-{dset}-ValidationSet.csv", index=False)
    train.to_csv(f"{outDir}/{vers}-{dset}-TrainSet.csv", index=False)

  df.insert(0, dset, popped)
  df.insert(0, dset, popped)




 logS
(11706, 219)
After dropping NaNs: (11703, 207)
after dropping single values columns: (11703, 166)
(2524, 166)
(9179, 166)


 logHenry
(11826, 219)
After dropping NaNs: (11808, 207)
after dropping single values columns: (11808, 165)
(2524, 165)
(9284, 165)
