In [22]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn

from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin

from tqdm.auto import tqdm
from sklearn.preprocessing import StandardScaler

In [23]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [24]:
def load_data():
    """
    This function loads the data from the csv files and returns it as numpy arrays.

    input: None
    
    output: x_pretrain: np.ndarray, the features of the pretraining set
            y_pretrain: np.ndarray, the labels of the pretraining set
            x_train: np.ndarray, the features of the training set
            y_train: np.ndarray, the labels of the training set
            x_test: np.ndarray, the features of the test set
    """
    x_pretrain = pd.read_csv("./pretrain_features.csv.zip", index_col="Id", compression='zip').drop("smiles", axis=1).to_numpy()
    y_pretrain = pd.read_csv("./pretrain_labels.csv.zip", index_col="Id", compression='zip').to_numpy().squeeze(-1)
    x_train = pd.read_csv("./train_features.csv.zip", index_col="Id", compression='zip').drop("smiles", axis=1).to_numpy()
    y_train = pd.read_csv("./train_labels.csv.zip", index_col="Id", compression='zip').to_numpy().squeeze(-1)
    x_test = pd.read_csv("./test_features.csv.zip", index_col="Id", compression='zip').drop("smiles", axis=1)
    scaler = StandardScaler()
    x_pretrain = scaler.fit_transform(x_pretrain)
    x_train = scaler.transform(x_train)
    x_test_transed = scaler.transform(x_test)
    x_test[x_test.columns] = x_test_transed

    return x_pretrain, y_pretrain, x_train, y_train, x_test

In [25]:
class Net(nn.Module):
    """
    The model class, which defines our feature extractor used in pretraining.
    """
    def __init__(self):
        """
        The constructor of the model.
        """
        super().__init__()
        # TODO: Define the architecture of the model. It should be able to be trained on pretraing data 
        # and then used to extract features from the training and test data.
        self.fc1 = nn.Linear(1000, 1000)
        self.fc2 = nn.Linear(1000, 1000)
        self.fc3 = nn.Linear(1000, 1000)
        self.fc4 = nn.Linear(1000, 1)
        
        self.nomal1 = nn.BatchNorm1d(1000)
        self.nomal2 = nn.BatchNorm1d(1000)
        self.nomal3 = nn.BatchNorm1d(1000)
        
        self.dropout1 = nn.Dropout(0.4)
        self.dropout2 = nn.Dropout(0.5)
        self.dropout3 = nn.Dropout(0.6)

        nn.init.xavier_normal_(self.fc1.weight)
        nn.init.xavier_normal_(self.fc2.weight)
        nn.init.xavier_normal_(self.fc3.weight)
        nn.init.xavier_normal_(self.fc4.weight)

    def forward(self, x):
        """
        The forward pass of the model.

        input: x: torch.Tensor, the input to the model

        output: x: torch.Tensor, the output of the model
        """
        # TODO: Implement the forward pass of the model, in accordance with the architecture 
        # defined in the constructor.
        x = nn.LeakyReLU(0.01)(self.fc1(x))
        x = self.dropout1(x)
        x = self.nomal1(x)
        x = nn.LeakyReLU(0.01)(self.fc2(x))
        x = self.dropout2(x)
        x = self.nomal2(x)
        x = nn.LeakyReLU(0.01)(self.fc3(x))
        x = self.dropout3(x)
        x = self.nomal3(x)
        x = self.fc4(x)
        return x
    
    def make_feature(self, x):
        x = nn.LeakyReLU(0.01)(self.fc1(x))
        x = self.dropout1(x)
        x = self.nomal1(x)
        x = nn.LeakyReLU(0.01)(self.fc2(x))
        x = self.dropout2(x)
        x = self.nomal2(x)
        x = self.fc3(x)
        x = self.nomal3(x)
        return x

In [26]:
def make_feature_extractor(x, y, batch_size=256, eval_size=1000):
    """
    This function trains the feature extractor on the pretraining data and returns a function which
    can be used to extract features from the training and test data.

    input: x: np.ndarray, the features of the pretraining set
              y: np.ndarray, the labels of the pretraining set
                batch_size: int, the batch size used for training
                eval_size: int, the size of the validation set
            
    output: make_features: function, a function which can be used to extract features from the training and test data
    """
    # Pretraining data loading
    in_features = x.shape[-1]
    x_tr, x_val, y_tr, y_val = train_test_split(x, y, test_size=eval_size, random_state=0, shuffle=True)
    x_tr, x_val = torch.tensor(x_tr, dtype=torch.float), torch.tensor(x_val, dtype=torch.float)
    y_tr, y_val = torch.tensor(y_tr, dtype=torch.float), torch.tensor(y_val, dtype=torch.float)
    train_dataset = torch.utils.data.TensorDataset(x_tr, y_tr)
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_dataset = torch.utils.data.TensorDataset(x_val, y_val)
    val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=True)

    # model declaration
    model = Net()
    model.to(device)
    model.train()
    
    # TODO: Implement the training loop. The model should be trained on the pretraining data. Use validation set 
    # to monitor the loss.
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=0.0001)
    # optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.4, weight_decay=0.0001)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.3, patience=5, verbose=True)
    criterion = nn.MSELoss()
    epochs = 200
    for epoch in tqdm(range(epochs)):
        loss_tr = 0
        loss_val = 0
        for [x, y] in train_loader:
            optimizer.zero_grad()
            x = x.to(device)
            y = y.to(device)
            predictions = model(x).squeeze(-1)
            loss = criterion(predictions, y)
            loss.backward()
            optimizer.step()
            loss_tr += loss.item() * len(x)
        loss_tr /= len(train_loader.dataset)
        for [x, y] in val_loader:
            x = x.to(device)
            y = y.to(device)
            predictions = model(x).squeeze(-1)
            loss = criterion(predictions, y)
            loss_val += loss.item() * len(x)
        loss_val /= len(val_loader.dataset)
        scheduler.step(loss_val)
        print(f"Epoch {epoch+1}: train loss: {loss_tr}, val loss: {loss_val}")
        if(optimizer.param_groups[0]['lr'] < 1e-6):
            print(f"Early stop at epoch {epoch+1}")
            break

    def make_features(x):
        """
        This function extracts features from the training and test data, used in the actual pipeline 
        after the pretraining.

        input: x: np.ndarray, the features of the training or test set

        output: features: np.ndarray, the features extracted from the training or test set, propagated
        further in the pipeline
        """
        model.eval()
        # TODO: Implement the feature extraction, a part of a pretrained model used later in the pipeline.
        x = torch.tensor(x, dtype=torch.float)
        x = x.to(device)
        x = model.make_feature(x)
        return x

    return make_features

In [27]:
def make_pretraining_class(feature_extractors):
    """
    The wrapper function which makes pretraining API compatible with sklearn pipeline
    
    input: feature_extractors: dict, a dictionary of feature extractors

    output: PretrainedFeatures: class, a class which implements sklearn API
    """

    class PretrainedFeatures(BaseEstimator, TransformerMixin):
        """
        The wrapper class for Pretraining pipeline.
        """
        def __init__(self, *, feature_extractor=None, mode=None):
            self.feature_extractor = feature_extractor
            self.mode = mode

        def fit(self, X=None, y=None):
            return self

        def transform(self, X):
            assert self.feature_extractor is not None
            X_new = feature_extractors[self.feature_extractor](X)
            return X_new
        
    return PretrainedFeatures


In [28]:
def get_regression_model(X, y):

    class Model(nn.Module):
        """
        The model class, which defines our feature extractor used in pretraining.
        """
        def __init__(self):
            """
            The constructor of the model.
            """
            super().__init__()
            # TODO: Define the architecture of the model. It should be able to be trained on pretraing data 
            # and then used to extract features from the training and test data.
            self.fc1 = nn.Linear(1000, 100)
            self.fc2 = nn.Linear(100, 1)
            
            self.nomal1 = nn.BatchNorm1d(100)

            # nn.init.kaiming_uniform_(self.fc3.weight, nonlinearity='relu')
            nn.init.xavier_normal_(self.fc1.weight)
            nn.init.xavier_normal_(self.fc2.weight)

        def forward(self, x):
            """
            The forward pass of the model.

            input: x: torch.Tensor, the input to the model

            output: x: torch.Tensor, the output of the model
            """
            # TODO: Implement the forward pass of the model, in accordance with the architecture 
            # defined in the constructor.
            x = self.fc1(x)
            x = nn.LeakyReLU(0.01)(x)
            x = nn.Dropout(0.6)(x)
            x = self.nomal1(x)
            x = self.fc2(x)
            return x
    
    """
    This function returns the regression model used in the pipeline.

    input: None

    output: model: sklearn compatible model, the regression model
    """
    # TODO: Implement the regression model. It should be able to be trained on the features extracted
    # by the feature extractor.
    model = Model()
    model.to(device)
    model.train()

    x_tr, x_val, y_tr, y_val = train_test_split(X, y, test_size=10, random_state=0, shuffle=True)
    x_tr, x_val = torch.tensor(x_tr, dtype=torch.float), torch.tensor(x_val, dtype=torch.float)
    y_tr, y_val = torch.tensor(y_tr, dtype=torch.float), torch.tensor(y_val, dtype=torch.float)
    train_dataset = torch.utils.data.TensorDataset(x_tr, y_tr)
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=len(train_dataset), shuffle=True)
    val_dataset = torch.utils.data.TensorDataset(x_val, y_val)
    val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=len(val_dataset), shuffle=True)
    

    # optimizer = torch.optim.Adam(model.parameters(), lr=0.0001, weight_decay=0.0001)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.4, weight_decay=0.0001)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.3, patience=5, verbose=True)
    criterion = nn.MSELoss()
    epochs = 10000
    for epoch in tqdm(range(epochs)):
        loss_tr = 0
        loss_val = 0
        for [x, y] in train_loader:
            optimizer.zero_grad()
            x = x.to(device)
            y = y.to(device)
            predictions = model(x).squeeze(-1)
            loss = criterion(predictions, y)
            loss.backward()
            optimizer.step()
            loss_tr = loss.item()
        for [x, y] in val_loader:
            x = x.to(device)
            y = y.to(device)
            predictions = model(x).squeeze(-1)
            loss = criterion(predictions, y)
            loss_val = loss.item()
        print(f"Epoch {epoch+1}: train loss: {loss_tr}, val loss: {loss_val}")
        if(optimizer.param_groups[0]['lr'] < 1e-8):
            print(f"Early stop at epoch {epoch+1}")
            break

    return model

In [29]:
result_file = "results-upgrade.csv"

In [30]:
# Load data
x_pretrain, y_pretrain, x_train, y_train, x_test = load_data()
print("Data loaded!")
# Utilize pretraining data by creating feature extractor which extracts lumo energy 
# features from available initial features
feature_extractor =  make_feature_extractor(x_pretrain, y_pretrain)
PretrainedFeatureClass = make_pretraining_class({"pretrain": feature_extractor})
pretrainedfeatures = PretrainedFeatureClass(feature_extractor="pretrain")



Data loaded!


  0%|          | 0/200 [00:00<?, ?it/s]

Epoch 1: train loss: 3.480101949302518, val loss: 0.928158323764801
Epoch 2: train loss: 0.6245866422555885, val loss: 0.42885990715026856
Epoch 3: train loss: 0.3180882890224457, val loss: 0.3188017817735672
Epoch 4: train loss: 0.19367666848460022, val loss: 0.16699293076992036
Epoch 5: train loss: 0.12112748398342911, val loss: 0.1035601778626442
Epoch 6: train loss: 0.07294600957996991, val loss: 0.055067546904087064
Epoch 7: train loss: 0.0466218267594065, val loss: 0.03991509318351746
Epoch 8: train loss: 0.03237390867727143, val loss: 0.03167793908715248
Epoch 9: train loss: 0.02492805113871487, val loss: 0.0236653855741024
Epoch 10: train loss: 0.020639013133486924, val loss: 0.018734809696674348
Epoch 11: train loss: 0.01838031056979481, val loss: 0.02046343372762203
Epoch 12: train loss: 0.016749416017258652, val loss: 0.017000468522310255
Epoch 13: train loss: 0.015639829860962167, val loss: 0.01476034316420555
Epoch 14: train loss: 0.014392104922964865, val loss: 0.01474660

In [31]:
x_train_featured = pretrainedfeatures.transform(x_train).detach().cpu().numpy()
# scaler = StandardScaler()
# x_train_featured = scaler.fit_transform(x_train_featured)
x_test_featured = pretrainedfeatures.transform(x_test.to_numpy())#.detach().cpu().numpy()
# x_test_featured = scaler.transform(x_test_featured)
# x_test_featured = torch.tensor(x_test_featured, dtype=torch.float).to(device)
# regression model
regression_model = get_regression_model(x_train_featured, y_train)

y_pred = np.zeros(x_test.shape[0])
# TODO: Implement the pipeline. It should contain feature extraction and regression. You can optionally
# use other sklearn tools, such as StandardScaler, FunctionTransformer, etc.
y_pred = regression_model(x_test_featured).squeeze(-1).detach().cpu().numpy()

assert y_pred.shape == (x_test.shape[0],)
y_pred = pd.DataFrame({"y": y_pred}, index=x_test.index)
y_pred.to_csv(result_file, index_label="Id")
print(f"Predictions saved to {result_file}, all done!")

  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch 1: train loss: 6.150294303894043, val loss: 5.138032913208008
Epoch 2: train loss: 4.456811428070068, val loss: 3.914276123046875
Epoch 3: train loss: 3.407897472381592, val loss: 2.6705219745635986
Epoch 4: train loss: 3.0515992641448975, val loss: 2.6292550563812256
Epoch 5: train loss: 3.0407309532165527, val loss: 2.6841471195220947
Epoch 6: train loss: 2.4987878799438477, val loss: 2.041243076324463
Epoch 7: train loss: 2.0317957401275635, val loss: 2.7088139057159424
Epoch 8: train loss: 2.040835380554199, val loss: 2.1694347858428955
Epoch 9: train loss: 2.129324436187744, val loss: 2.4382879734039307
Epoch 10: train loss: 1.7219696044921875, val loss: 1.3037515878677368
Epoch 11: train loss: 1.5998972654342651, val loss: 0.9132499098777771
Epoch 12: train loss: 1.737145185470581, val loss: 1.3306528329849243
Epoch 13: train loss: 1.5977648496627808, val loss: 2.138814687728882
Epoch 14: train loss: 1.1486908197402954, val loss: 1.628670334815979
Epoch 15: train loss: 1.09