In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn

from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin

from tqdm.auto import tqdm
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [4]:
def _read_indexed_csv(path):
    """Read a zip-compressed CSV, using its ``Id`` column as the row index."""
    return pd.read_csv(path, index_col="Id", compression='zip')


def _read_features(path):
    """Load a feature table and discard its ``smiles`` column."""
    return _read_indexed_csv(path).drop(columns="smiles")


def _read_labels(path):
    """Load a label table as a NumPy array with its last axis squeezed away."""
    return _read_indexed_csv(path).to_numpy().squeeze(-1)


# Pretraining split: features and labels.
x_pretrain = _read_features("./pretrain_features.csv.zip")
y_pretrain = _read_labels("./pretrain_labels.csv.zip")
# Training split: features and labels.
x_train = _read_features("./train_features.csv.zip")
y_train = _read_labels("./train_labels.csv.zip")
# Test split: features only.
x_test = _read_features("./test_features.csv.zip")


In [5]:
# Print the index range, column count, dtypes and memory usage of the raw training features.
x_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100 entries, 50000 to 50099
Columns: 1000 entries, feature_0000 to feature_0999
dtypes: float64(1000)
memory usage: 782.0 KB


In [7]:
# Per-column summary statistics (count, mean, std, min, quartiles, max) of the raw training features.
x_train.describe()

Unnamed: 0,feature_0000,feature_0001,feature_0002,feature_0003,feature_0004,feature_0005,feature_0006,feature_0007,feature_0008,feature_0009,...,feature_0990,feature_0991,feature_0992,feature_0993,feature_0994,feature_0995,feature_0996,feature_0997,feature_0998,feature_0999
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,...,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,0.0,0.02,0.02,0.29,0.31,0.0,0.03,0.08,0.14,0.01,...,0.01,0.0,0.01,0.0,0.0,0.06,0.01,0.06,0.0,0.0
std,0.0,0.140705,0.140705,0.456048,0.464823,0.0,0.171447,0.27266,0.348735,0.1,...,0.1,0.0,0.1,0.0,0.0,0.238683,0.1,0.238683,0.0,0.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,0.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,...,1.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0


In [8]:
# Standardise with statistics learned from the pretraining features, then
# apply that same fitted scaler to the training features.
std = StandardScaler()

# The scaler returns plain arrays, so wrap them back into DataFrames.
# Passing index= keeps the rows keyed by their original "Id" index instead of
# silently replacing it with a fresh 0..n-1 range.
std_x_p = pd.DataFrame(
    std.fit_transform(x_pretrain),
    index=x_pretrain.index,
    columns=x_pretrain.columns,
)
std_x_t = pd.DataFrame(
    std.transform(x_train),
    index=x_train.index,
    columns=x_train.columns,
)

In [10]:
# Summary statistics of the pretraining features after scaling with the scaler fitted on x_pretrain.
std_x_p.describe()

Unnamed: 0,feature_0000,feature_0001,feature_0002,feature_0003,feature_0004,feature_0005,feature_0006,feature_0007,feature_0008,feature_0009,...,feature_0990,feature_0991,feature_0992,feature_0993,feature_0994,feature_0995,feature_0996,feature_0997,feature_0998,feature_0999
count,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,...,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0
mean,-1.861622e-17,-2.7640110000000003e-17,-3.3111290000000005e-17,-4.156675e-17,8.739676e-17,-3.517187e-17,2.7142730000000003e-17,-6.536993e-18,2.913225e-17,2.557954e-18,...,6.536993e-18,1.811884e-17,5.0830450000000004e-17,-1.666223e-17,5.329071e-18,3.591794e-17,-3.964828e-17,2.373213e-17,-1.0658139999999999e-19,4.476419e-18
std,1.00001,1.00001,1.00001,1.00001,1.00001,1.00001,1.00001,1.00001,1.00001,1.00001,...,1.00001,1.00001,1.00001,1.00001,1.00001,1.00001,1.00001,1.00001,1.00001,1.00001
min,-0.07795321,-0.09258979,-0.1775482,-0.6874134,-0.6491404,-0.0771703,-0.149634,-0.2536296,-0.3509148,-0.07584792,...,-0.09667755,-0.07368391,-0.1127819,-0.1143179,-0.03466182,-0.3149077,-0.07103047,-0.1521987,-0.01341762,-0.04738174
25%,-0.07795321,-0.09258979,-0.1775482,-0.6874134,-0.6491404,-0.0771703,-0.149634,-0.2536296,-0.3509148,-0.07584792,...,-0.09667755,-0.07368391,-0.1127819,-0.1143179,-0.03466182,-0.3149077,-0.07103047,-0.1521987,-0.01341762,-0.04738174
50%,-0.07795321,-0.09258979,-0.1775482,-0.6874134,-0.6491404,-0.0771703,-0.149634,-0.2536296,-0.3509148,-0.07584792,...,-0.09667755,-0.07368391,-0.1127819,-0.1143179,-0.03466182,-0.3149077,-0.07103047,-0.1521987,-0.01341762,-0.04738174
75%,-0.07795321,-0.09258979,-0.1775482,1.454729,1.540499,-0.0771703,-0.149634,-0.2536296,-0.3509148,-0.07584792,...,-0.09667755,-0.07368391,-0.1127819,-0.1143179,-0.03466182,-0.3149077,-0.07103047,-0.1521987,-0.01341762,-0.04738174
max,12.82821,10.80033,5.632274,1.454729,1.540499,12.95835,6.682971,3.942757,2.849695,13.18428,...,10.34366,13.57148,8.86667,8.747536,28.85019,3.175534,14.07847,6.570359,74.52889,21.10518


In [11]:
# Summary statistics of the training features after scaling with the scaler fitted on x_pretrain.
std_x_t.describe()

Unnamed: 0,feature_0000,feature_0001,feature_0002,feature_0003,feature_0004,feature_0005,feature_0006,feature_0007,feature_0008,feature_0009,...,feature_0990,feature_0991,feature_0992,feature_0993,feature_0994,feature_0995,feature_0996,feature_0997,feature_0998,feature_0999
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,...,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,-0.07795321,0.125269,-0.061352,-0.066192,0.029648,-0.07717,0.055344,0.082081,0.09717,0.056753,...,0.007726,-0.07368391,-0.022987,-0.1143179,-0.03466182,-0.105481,0.070464,0.251155,-0.01341762,-0.047382
std,1.3947700000000002e-17,1.532691,0.817473,0.97692,1.017795,0.0,1.171427,1.144187,1.116165,1.326013,...,1.044034,1.3947700000000002e-17,0.897945,1.3947700000000002e-17,6.973851e-18,0.83311,1.41495,1.604562,1.743463e-18,0.0
min,-0.07795321,-0.09259,-0.177548,-0.687413,-0.64914,-0.07717,-0.149634,-0.25363,-0.350915,-0.075848,...,-0.096678,-0.07368391,-0.112782,-0.1143179,-0.03466182,-0.314908,-0.07103,-0.152199,-0.01341762,-0.047382
25%,-0.07795321,-0.09259,-0.177548,-0.687413,-0.64914,-0.07717,-0.149634,-0.25363,-0.350915,-0.075848,...,-0.096678,-0.07368391,-0.112782,-0.1143179,-0.03466182,-0.314908,-0.07103,-0.152199,-0.01341762,-0.047382
50%,-0.07795321,-0.09259,-0.177548,-0.687413,-0.64914,-0.07717,-0.149634,-0.25363,-0.350915,-0.075848,...,-0.096678,-0.07368391,-0.112782,-0.1143179,-0.03466182,-0.314908,-0.07103,-0.152199,-0.01341762,-0.047382
75%,-0.07795321,-0.09259,-0.177548,1.454729,1.540499,-0.07717,-0.149634,-0.25363,-0.350915,-0.075848,...,-0.096678,-0.07368391,-0.112782,-0.1143179,-0.03466182,-0.314908,-0.07103,-0.152199,-0.01341762,-0.047382
max,-0.07795321,10.800327,5.632274,1.454729,1.540499,-0.07717,6.682971,3.942757,2.849695,13.184278,...,10.343663,-0.07368391,8.86667,-0.1143179,-0.03466182,3.175534,14.078465,6.570359,-0.01341762,-0.047382


In [12]:
# Reverse setup: learn the scaling statistics from the training features and
# apply them to the pretraining features.
# NOTE: this rebinds `std`, `std_x_t` and `std_x_p`, replacing the values that
# were produced with the scaler fitted on x_pretrain.
std = StandardScaler()

# Columns that are constant in x_train (e.g. feature_0000) have zero variance;
# StandardScaler uses a scale factor of 1 for them, so those columns are passed
# through unscaled in both frames.
# Passing index= keeps the rows keyed by their original "Id" index instead of
# silently replacing it with a fresh 0..n-1 range.
std_x_t = pd.DataFrame(
    std.fit_transform(x_train),
    index=x_train.index,
    columns=x_train.columns,
)
std_x_p = pd.DataFrame(
    std.transform(x_pretrain),
    index=x_pretrain.index,
    columns=x_pretrain.columns,
)

In [13]:
# Summary statistics of the training features after scaling with the scaler fitted on x_train.
std_x_t.describe()

Unnamed: 0,feature_0000,feature_0001,feature_0002,feature_0003,feature_0004,feature_0005,feature_0006,feature_0007,feature_0008,feature_0009,...,feature_0990,feature_0991,feature_0992,feature_0993,feature_0994,feature_0995,feature_0996,feature_0997,feature_0998,feature_0999
count,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,...,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0,100.0
mean,0.0,-4.82947e-17,4.996004e-18,3.774758e-17,-6.883383e-17,0.0,4.440892e-18,-5.884182e-17,0.0,-2.248202e-17,...,-2.248202e-17,0.0,-3.9135360000000003e-17,0.0,0.0,6.661338000000001e-17,-4.024558e-17,2.775558e-17,0.0,0.0
std,0.0,1.005038,1.005038,1.005038,1.005038,0.0,1.005038,1.005038,1.005038,1.005038,...,1.005038,0.0,1.005038,0.0,0.0,1.005038,1.005038,1.005038,0.0,0.0
min,0.0,-0.1428571,-0.1428571,-0.6391015,-0.6702801,0.0,-0.1758631,-0.2948839,-0.403473,-0.1005038,...,-0.1005038,0.0,-0.1005038,0.0,0.0,-0.2526456,-0.1005038,-0.2526456,0.0,0.0
25%,0.0,-0.1428571,-0.1428571,-0.6391015,-0.6702801,0.0,-0.1758631,-0.2948839,-0.403473,-0.1005038,...,-0.1005038,0.0,-0.1005038,0.0,0.0,-0.2526456,-0.1005038,-0.2526456,0.0,0.0
50%,0.0,-0.1428571,-0.1428571,-0.6391015,-0.6702801,0.0,-0.1758631,-0.2948839,-0.403473,-0.1005038,...,-0.1005038,0.0,-0.1005038,0.0,0.0,-0.2526456,-0.1005038,-0.2526456,0.0,0.0
75%,0.0,-0.1428571,-0.1428571,1.564697,1.491914,0.0,-0.1758631,-0.2948839,-0.403473,-0.1005038,...,-0.1005038,0.0,-0.1005038,0.0,0.0,-0.2526456,-0.1005038,-0.2526456,0.0,0.0
max,0.0,7.0,7.0,1.564697,1.491914,0.0,5.686241,3.391165,2.478479,9.949874,...,9.949874,0.0,9.949874,0.0,0.0,3.958114,9.949874,3.958114,0.0,0.0


In [14]:
# Summary statistics of the pretraining features after scaling with the scaler fitted on x_train.
std_x_p.describe()

Unnamed: 0,feature_0000,feature_0001,feature_0002,feature_0003,feature_0004,feature_0005,feature_0006,feature_0007,feature_0008,feature_0009,...,feature_0990,feature_0991,feature_0992,feature_0993,feature_0994,feature_0995,feature_0996,feature_0997,feature_0998,feature_0999
count,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,...,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0
mean,0.00604,-0.082143,0.075429,0.068097,-0.029276,0.00592,-0.047483,-0.072099,-0.087496,-0.043016,...,-0.007437,0.0054,0.025729,0.0129,0.0012,0.127249,-0.050051,-0.157314,0.00018,0.00224
std,0.077483,0.655741,1.229457,1.028793,0.987476,0.076714,0.857969,0.878395,0.900448,0.757947,...,0.962658,0.073287,1.119275,0.112844,0.034621,1.206381,0.710306,0.626369,0.013415,0.047276
min,0.0,-0.142857,-0.142857,-0.639101,-0.67028,0.0,-0.175863,-0.294884,-0.403473,-0.100504,...,-0.100504,0.0,-0.100504,0.0,0.0,-0.252646,-0.100504,-0.252646,0.0,0.0
25%,0.0,-0.142857,-0.142857,-0.639101,-0.67028,0.0,-0.175863,-0.294884,-0.403473,-0.100504,...,-0.100504,0.0,-0.100504,0.0,0.0,-0.252646,-0.100504,-0.252646,0.0,0.0
50%,0.0,-0.142857,-0.142857,-0.639101,-0.67028,0.0,-0.175863,-0.294884,-0.403473,-0.100504,...,-0.100504,0.0,-0.100504,0.0,0.0,-0.252646,-0.100504,-0.252646,0.0,0.0
75%,0.0,-0.142857,-0.142857,1.564697,1.491914,0.0,-0.175863,-0.294884,-0.403473,-0.100504,...,-0.100504,0.0,-0.100504,0.0,0.0,-0.252646,-0.100504,-0.252646,0.0,0.0
max,1.0,7.0,7.0,1.564697,1.491914,1.0,5.686241,3.391165,2.478479,9.949874,...,9.949874,1.0,9.949874,1.0,1.0,3.958114,9.949874,3.958114,1.0,1.0
