In [25]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler, normalize
from scipy import stats
from MiscScripts.DataCleaning import *
import warnings
warnings.filterwarnings("ignore")

In [26]:
def MRobustScaler(data, tol):
    print("Initial shape:", data.shape)
    data = MinMaxScaler().fit_transform(data)
    data, d = dropLowDistinction(pd.DataFrame(data), tol)
    data = RobustScaler().fit_transform(data)
    return data

In [27]:
tol = 0.01
vers = "0.7.0"
datasets = ["logHenry", "logS"]
trainTest = ["Train", "Validation"]

In [28]:
def removeSingleValCols(data, cols):
    """ 
    Function that iterates through columns and deletes columns with only one unique value.
    
    Parameters:
    data: pandas DataFrame
    cols: list of strings
    
    Returns:
    data: pandas DataFrame
    """
    for col in cols:
        values = data[col].values.astype(float).tolist()
        if len(set(values)) == 1:
            data.drop(col, axis=1, inplace=True)
    return data

def cleanData(df, idx, tolPercent):
    print("\n\n", df.columns[0])

    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df = df[pd.to_numeric(df[df.columns[0]], errors='coerce').notnull()]

    tolVal = len(df) * tolPercent
    data = df.iloc[:, idx:]
    print("Initial data shape:", data.shape)
    data, dropped = dropNaN_cols(data, data.columns, tolVal)

    data = pd.DataFrame(data)
    data.dropna(inplace=True)
    print("Shape after removing NaN columns:", data.shape)

    data = removeSingleValCols(data, data.columns)
    df = pd.concat([df.iloc[:, :idx], data], axis=1)
    print("Shape after removing single value columns:", data.shape)
    print("Final dataset shape:", df.shape)
    return df
    

In [30]:
# hasLogSClean = cleanData(hasLogS, 8, 0.01)
# hasHenryClean = cleanData(hasHenry, 8, 0.01)

datasets = ["logS", "logHenry"]
vers = "0.7.0"
colDict = {}

for tt in trainTest:
    for dset in datasets:
        print("\n\n", tt)
        df = pd.read_csv(f"../Data/Datasets/TrainTest/{vers}-{dset}-{tt}Set.csv")
        df = cleanData(df, 8, 0.01)
        
        print("\n", dset)
        print("Initial shape:", df.shape)
        df.reset_index(drop=True, inplace=True) #Must reset index or it will not work
        data = df.iloc[:, 7:] #Temperature is at index 7
        data = pd.DataFrame(MinMaxScaler().fit_transform(data), columns = data.columns)

        data = dropLowDistinction(data.iloc[:, 1:], tol)[0] #Do not drop temperature!
        print("Shape after dropping low distinction:", data.shape)

        cols = data.columns
        data = RobustScaler().fit_transform(data)
        data = pd.DataFrame(data, columns = cols)
        df = pd.concat([df.iloc[:, :7], data], axis=1)

        df = removeOutliers(df, [dset], 3)
        print("Shape after removing outliers:", df.shape)

        df.dropna(subset = df.columns[7:], inplace=True)
        print("Shape after removing NaN:", df.shape)

        colDict[dset] = df.columns[7:] #Need to swap around targets so logS trains on logH and vice versa

        df.to_csv(f"../Data/Datasets/ScaledTrainTest/{vers}-{dset}-MRobust-{tt}Set.csv", index=False)



 Train


 logS
Initial data shape: (9944, 211)
Shape after removing NaN columns: (9944, 198)
Shape after removing single value columns: (9944, 156)
Final dataset shape: (9944, 164)

 logS
Initial shape: (9944, 164)
Shape after dropping low distinction: (9944, 72)
Shape after removing outliers: (9916, 79)
Shape after removing NaN: (9916, 79)


 Train


 logHenry
Initial data shape: (10301, 211)
Shape after removing NaN columns: (10297, 198)
Shape after removing single value columns: (10297, 156)
Final dataset shape: (10301, 164)

 logHenry
Initial shape: (10301, 164)
Shape after dropping low distinction: (10301, 71)
Shape after removing outliers: (10201, 78)
Shape after removing NaN: (10197, 78)


 Validation


 logS
Initial data shape: (1759, 211)
Shape after removing NaN columns: (1759, 198)
Shape after removing single value columns: (1759, 155)
Final dataset shape: (1759, 163)

 logS
Initial shape: (1759, 163)
Shape after dropping low distinction: (1759, 81)
Shape after removing ou