Similar to ExperimentalScaler, but with larger datasets.

In [9]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import PowerTransformer, RobustScaler, normalize, MinMaxScaler
from MiscScripts import DataCleaning as dc

In [10]:
def logTarget(targVals):
    """Shift the target values and log-transform them.

    The values are offset by ``abs(min(targVals))`` and then by 1 before
    ``np.log1p`` is applied, so the argument handed to ``log1p`` is at least 1
    even when the input contains negative numbers. The offset is applied
    unconditionally, i.e. also when the minimum is already positive.

    Parameters
    ----------
    targVals : array-like of numbers
        Target values; may contain negatives.

    Returns
    -------
    pandas.Series
        The transformed values, with any +/-inf replaced by NaN.
    """
    lowest = np.min(targVals)
    # Two separate additions, in the same order as before, so the floating
    # point result is bit-identical.
    shifted = targVals + abs(lowest)
    shifted = shifted + 1

    transformed = pd.Series(np.log1p(shifted))
    return transformed.replace([np.inf, -np.inf], np.nan)

In [11]:
vers = "0.2.1"          # version tag used as a prefix for the CSVs written further down
target = "MeltingPoint_C"
SAMPLE_SIZE = 10000     # approximate number of rows to keep for the scaling experiments
SEED = 42               # fixed seed so the subsample (and every saved CSV) is reproducible

# NOTE(review): the recorded output of this cell echoes the read_csv line, which
# looks like a pandas warning raised while parsing (possibly a DtypeWarning for
# mixed-type columns) - confirm, and pass dtype=... / low_memory=False if so.
df = pd.read_csv("../Data/Datasets/0.6.1-MeltingPoint_C.csv")

# Rows without a target are unusable; then keep one row per InChI string and
# drop any remaining exact duplicate rows.
df = df.dropna(subset = [target])
df = df.drop_duplicates(subset = "InChI")
df = df.drop_duplicates()
print(f"Shape of data: {df.shape}")

shape = df.shape[0]
# Clamp to 1.0: DataFrame.sample raises ValueError for frac > 1 (without
# replace=True), which would happen whenever fewer than SAMPLE_SIZE rows remain.
frac = min(1.0, np.around(SAMPLE_SIZE/shape, decimals = 3))
print(f"Fraction of data to be used: {frac}")

# Because frac is rounded to 3 decimals the sample is only approximately
# SAMPLE_SIZE rows. random_state makes Restart & Run All pick the same rows.
df = df.sample(frac = frac, random_state = SEED)
print(f"Shape of data: {df.shape}")

  df = pd.read_csv("../Data/Datasets/0.6.1-MeltingPoint_C.csv")


Shape of data: (250072, 167)
Fraction of data to be used: 0.04
Shape of data: (10003, 167)


In [12]:
# Take the target column out of the frame as a plain list of floats.
targVals = df.pop(target).values.astype(float).tolist()

# Min-max scale the target as a single-column 2-D array, the shape
# fit_transform expects. (logTarget(targVals) is the alternative transform;
# inserting targVals directly would leave the target unscaled.)
target_column = np.array(targVals).reshape(-1, 1)
scaledTargs = MinMaxScaler().fit_transform(target_column)

# Put the scaled target back as the first column and drop rows where it is NaN.
df.insert(0, target, scaledTargs)
df.dropna(subset = [target], inplace = True)
print(f"Shape of data: {df.shape}")

Shape of data: (10003, 167)


In [13]:
df.iloc[:, 7:10] #Peek at columns 7-9 to check the split point: the scaling cells below keep columns :8 as-is and scale columns 8 onwards

Unnamed: 0,Compound,MaxAbsEStateIndex,MaxEStateIndex
224295,,13.280833,13.280833
28043,,4.432370,4.432370
175658,,13.071016,13.071016
198272,,6.360950,6.360950
199844,,12.727835,12.727835
...,...,...,...
146640,,12.616607,12.616607
208160,,12.278250,12.278250
142729,,12.225943,12.225943
177001,,10.736385,10.736385


In [14]:
def MMPT(data, tol):
    """Min-max scale, filter columns, then power-transform.

    Steps, in order:
      1. print the incoming shape;
      2. fit a fresh ``MinMaxScaler`` on ``data`` and transform it;
      3. pass the result (wrapped in a DataFrame) to ``dc.dropLowDistinction``
         together with ``tol``;
      4. fit a fresh ``PowerTransformer`` (default settings) on what is left.

    Parameters
    ----------
    data : DataFrame or 2-D array-like of numeric features.
    tol : forwarded unchanged to ``dc.dropLowDistinction`` - presumably the
        threshold below which a column is dropped; verify against that helper.

    Returns
    -------
    pandas.DataFrame
        Built directly from the transformer output with no explicit labels.
        NOTE(review): with scikit-learn's default array output this means
        fresh 0..n-1 row and column labels, so the original feature names
        are not carried over - confirm this is intended.
    """
    print("Initial shape: ", data.shape)

    minmaxed = pd.DataFrame(MinMaxScaler().fit_transform(data))
    # The second return value of dropLowDistinction is not used here.
    kept, _unused = dc.dropLowDistinction(minmaxed, tol)

    return pd.DataFrame(PowerTransformer().fit_transform(kept))

def MMLog(data, tol):
    """Min-max scale, filter columns, then apply ``log1p``.

    Steps, in order:
      1. print the incoming shape;
      2. fit a fresh ``MinMaxScaler`` on ``data`` and transform it;
      3. pass the result (wrapped in a DataFrame) to ``dc.dropLowDistinction``
         together with ``tol``;
      4. apply ``np.log1p`` element-wise to what is left.

    Parameters
    ----------
    data : DataFrame or 2-D array-like of numeric features.
    tol : forwarded unchanged to ``dc.dropLowDistinction`` - presumably the
        threshold below which a column is dropped; verify against that helper.

    Returns
    -------
    pandas.DataFrame
        The log-transformed values. The min-max step discards the input's
        column names, so columns carry whatever labels dropLowDistinction
        leaves on its output.
    """
    print("Initial shape: ", data.shape)

    minmaxed = MinMaxScaler().fit_transform(data)
    # The second return value of dropLowDistinction is not used here.
    kept, _unused = dc.dropLowDistinction(pd.DataFrame(minmaxed), tol)

    logged = np.log1p(kept)
    return pd.DataFrame(logged)

In [15]:
# Clean the frame with the project helper, then renumber the rows 0..n-1.
# pd.concat below aligns on the row index, and MMPT returns a freshly built
# frame, so the index left over from sampling has to be reset first.
df = dc.prepForScaling(df, target, []).reset_index(drop = True)

# Columns 0-7 are carried over untouched; everything from column 8 on is scaled.
untouched_cols = df.iloc[:, :8]
cols_to_scale = df.iloc[:, 8:]

scaled = MMPT(cols_to_scale, 0.01)
df_mmpt = pd.concat([untouched_cols, scaled], axis = 1)
print(f"Shape of data: {df_mmpt.shape}")

mmpt_path = f"../Data/Datasets/BPMP_Scaling/{vers}_MinMaxMM_PowerTransformer().csv"
df_mmpt.to_csv(mmpt_path, index = False)


 Variable: MeltingPoint_C
 (10003, 167)
After dropping NaNs: (10003, 167)
After dropping outliers: (9968, 167)
After dropping NaNs again: (9968, 167)
Initial shape:  (9968, 159)
Shape of data: (9968, 80)


In [16]:
# Same split as the PowerTransformer run: columns 0-7 pass through unchanged,
# columns 8 onwards go through MMLog with the same tolerance.
untouched_cols = df.iloc[:, :8]
cols_to_scale = df.iloc[:, 8:]

scaled = MMLog(cols_to_scale, 0.01)
df_mmlog = pd.concat([untouched_cols, scaled], axis = 1)
print(f"Shape of data: {df_mmlog.shape}")

mmlog_path = f"../Data/Datasets/BPMP_Scaling/{vers}_MinMaxMM_Log1p().csv"
df_mmlog.to_csv(mmlog_path, index = False)

Initial shape:  (9968, 159)
Shape of data: (9968, 80)
