# Dataset Scaler

Version: 0.2.0

In [11]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, normalize
from scipy import stats
from MiscScripts.DataCleaning import *

In [12]:
tol = 0.001
vers = "0.2.0"
datasets = ["logHenry", "logS"]

datasetDict = {"logHenry" : "hasLogH",
               "logS" : "hasLogS"}
trainTest = ["Train", "Validation"]

In [13]:
for dset in datasets:
    print("\n\n", dset)
    master = pd.DataFrame()

    for tt in trainTest:
        df = pd.read_csv(f"../Data/Datasets/PredictionDatasets/0.2.0-{dset}-{tt}Set.csv")
        print(tt, df.shape)
        splitIndex = df.shape[0]
        master = pd.concat([master, df], axis=0)

    print("Split index:", splitIndex)
    print(master.shape)
    master.reset_index(drop=True, inplace=True)
    oTarg = datasets[1-datasets.index(dset)]
    oTargData = master[oTarg].values
    
    master.insert(10, f"{oTarg}-Copy", oTargData)

    m = prepForScaling(master, dset, [])
    print(master.columns.tolist()[6:10])
    print("Initial shape:", m.shape)
    scaled = pd.DataFrame(MinMaxScaler().fit_transform(m.iloc[:, 8:]))

    scaled, dropped = dropLowDistinction(pd.DataFrame(scaled), .005)
    scaled = RobustScaler().fit_transform(scaled)
    m = pd.concat([m.iloc[:, :8], pd.DataFrame(scaled)], axis=1)
    print("After dropping low distinction:", m.shape)

    trainIndex = m.shape[0] - splitIndex
    train = m.iloc[:trainIndex, :]; val = m.iloc[trainIndex:, :]
    print(train.shape, val.shape)

    train = removeOutliers(train, [dset], 3)

    train.to_csv(f"../Data/Datasets/ScaledPredictionDatasets/{vers}-{dset}-MRobust-TrainSet.csv", index=False)
    val.to_csv(f"../Data/Datasets/ScaledPredictionDatasets/{vers}-{dset}-MRobust-ValidationSet.csv", index=False)

    train.to_csv(f"../Data/Datasets/ScaledPredictionDatasets/{vers}-{oTarg}-MRobust-PredictionSet.csv", index=False)



 logHenry
Train (9284, 165)
Validation (2524, 165)
Split index: 2524
(11808, 165)

 Variable: logHenry
 (11808, 166)
After dropping NaNs: (11808, 165)
After dropping outliers: (11806, 165)
After dropping NaNs again: (11806, 165)
['HenryConstant-dataSource', 'nIsomers', 'MaxAbsEStateIndex', 'MaxEStateIndex']
Initial shape: (11806, 165)
After dropping low distinction: (11808, 80)
(9284, 80) (2524, 80)


 logS
Train (9179, 166)
Validation (2524, 166)
Split index: 2524
(11703, 166)

 Variable: logS
 (11703, 167)
After dropping NaNs: (11703, 166)
After dropping outliers: (11670, 166)
After dropping NaNs again: (11670, 166)
['HenryConstant-dataSource', 'nIsomers', 'MaxAbsEStateIndex', 'MaxEStateIndex']
Initial shape: (11670, 166)
After dropping low distinction: (11703, 83)
(9179, 83) (2524, 83)


## logHlogS datasets

In [14]:
# dset = "logHlogS"
# for tt in trainTest:
#     df = pd.read_csv(f"../Data/Datasets/TrainTest/{vers}-{dset}-{tt}Set.csv")
#     popped = df.pop("logS")
#     popped = MinMaxScaler().fit_transform(popped.values.reshape(-1, 1))
#     df.insert(7, "logS", popped)

#     df = prepForScaling(df, "logHenry", [])
#     scaled = ScalingMethods.MRobustScaler(df.iloc[:, 8:], tol)
#     df = pd.concat([df.iloc[:, :8], pd.DataFrame(scaled)], axis=1)

#     print(df.columns.tolist()[6:10])
#     df.to_csv(f"../Data/Datasets/ScaledTrainTest/{vers}-{dset}-MRobust-{tt}Set.csv", index=False)

In [15]:
# dset = "logSlogH"

# for tt in trainTest:
#     df = pd.read_csv(f"../Data/Datasets/TrainTest/{vers}-{dset}-{tt}Set.csv")
#     popped = df.pop("logHenry")
#     popped = MinMaxScaler().fit_transform(popped.values.reshape(-1, 1))
#     df.insert(7, "logHenry", popped)

#     df = prepForScaling(df, "logS", [])
#     scaled = ScalingMethods.MRobustScaler(df.iloc[:, 8:], tol)
#     df = pd.concat([df.iloc[:, :8], pd.DataFrame(scaled)], axis=1)

#     print(df.columns.tolist()[6:10])
#     df.to_csv(f"../Data/Datasets/ScaledTrainTest/{vers}-{dset}-MRobust-{tt}Set.csv", index=False)