In [46]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn

from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin

from tqdm.auto import tqdm
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [47]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [48]:
pretrain_features = {
    "learning_rate": 0.1,
    "batch_size": 256,
    "epochs": 1000,
    "eval_size": 4*256,
    "momentum": 0.005,
    "weight_decay": 0.0001,
}

In [49]:
def load_data():
    """
    This function loads the data from the csv files and returns it as numpy arrays.

    input: None
    
    output: x_pretrain: np.ndarray, the features of the pretraining set
            y_pretrain: np.ndarray, the labels of the pretraining set
            x_train: np.ndarray, the features of the training set
            y_train: np.ndarray, the labels of the training set
            x_test: np.ndarray, the features of the test set
    """
    x_pretrain = pd.read_csv("./pretrain_features.csv.zip", index_col="Id", compression='zip').drop("smiles", axis=1).to_numpy()
    y_pretrain = pd.read_csv("./pretrain_labels.csv.zip", index_col="Id", compression='zip').to_numpy().squeeze(-1)
    x_train = pd.read_csv("./train_features.csv.zip", index_col="Id", compression='zip').drop("smiles", axis=1).to_numpy()
    y_train = pd.read_csv("./train_labels.csv.zip", index_col="Id", compression='zip').to_numpy().squeeze(-1)
    x_test = pd.read_csv("./test_features.csv.zip", index_col="Id", compression='zip').drop("smiles", axis=1)
    scaler = StandardScaler()
    x_pretrain = scaler.fit_transform(x_pretrain)
    x_train = scaler.transform(x_train)
    x_test_transed = scaler.transform(x_test)
    x_test[x_test.columns] = x_test_transed

    return x_pretrain, y_pretrain, x_train, y_train, x_test

In [50]:
x_pretrain, y_pretrain, x_train, y_train, x_test = load_data()



In [51]:
class AE(nn.Module):
    """
    The model class, which defines our feature extractor used in pretraining.
    """
    def __init__(self):
        """
        The constructor of the model.
        """
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Linear(1000, 1000),
            nn.LeakyReLU(0.01),
            nn.BatchNorm1d(1000),
            nn.Linear(1000, 500),
            nn.LeakyReLU(0.01),
            nn.BatchNorm1d(500),
            nn.Linear(500, 500),
            nn.BatchNorm1d(500))

        self.decoder = nn.Sequential(
            nn.Linear(500, 500),
            nn.LeakyReLU(0.01),
            nn.BatchNorm1d(500),
            nn.Linear(500, 1000),
            nn.LeakyReLU(0.01),
            nn.BatchNorm1d(1000),
            nn.Linear(1000, 1000)
            )
            
        for m in self.modules():
            if isinstance(m, nn.Linear):    
                nn.init.xavier_uniform_(m.weight)
        
        
    def forward(self, x):
        """
        The forward pass of the model.

        input: x: torch.Tensor, the input to the model

        output: x: torch.Tensor, the output of the model
        """
        x = self.encoder(x)
        x = self.decoder(x)
        return x
    
    def encode(self, x):
        return self.encoder(x)

In [52]:
eval_size = pretrain_features["eval_size"]
batch_size = pretrain_features["batch_size"]
ae_model = AE()
ae_model.train()
ae_model.to(device)

def train_autoencoder():
    x_tr, x_val, y_tr, y_val = train_test_split(x_pretrain, y_pretrain, test_size=eval_size, random_state=0, shuffle=True)
    x_tr, x_val = torch.tensor(x_tr, dtype=torch.float), torch.tensor(x_val, dtype=torch.float)
    y_tr, y_val = torch.tensor(y_tr, dtype=torch.float), torch.tensor(y_val, dtype=torch.float)
    train_dataset = torch.utils.data.TensorDataset(x_tr, y_tr)
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_dataset = torch.utils.data.TensorDataset(x_val, y_val)
    val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=True)

    # optimizer = torch.optim.Adam(model.parameters(), lr=pretrain_features['learning_rate'], weight_decay=pretrain_features['weight_decay'])
    optimizer = torch.optim.SGD(ae_model.parameters(), lr=pretrain_features['learning_rate'], momentum=pretrain_features['momentum'], weight_decay=pretrain_features['weight_decay'])
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.3, patience=5, verbose=True)
    criterion = nn.MSELoss()
    epochs = pretrain_features['epochs']
    for epoch in tqdm(range(epochs)):
        loss_tr = 0
        loss_val = 0
        for [x, _] in train_loader:
            optimizer.zero_grad()
            x = x.to(device)
            predictions = ae_model(x)
            loss = criterion(predictions, x)
            loss.backward()
            optimizer.step()
            loss_tr += loss.item() * len(x)
        loss_tr /= len(train_loader.dataset)
        for [x, _] in val_loader:
            x = x.to(device)
            predictions = ae_model(x)
            loss = criterion(predictions, x)
            loss_val += loss.item() * len(x)
        loss_val /= len(val_loader.dataset)
        scheduler.step(loss_val)
        print(f"Epoch {epoch+1}: train loss: {loss_tr}, val loss: {loss_val}")
        if(optimizer.param_groups[0]['lr'] < 1e-6):
            print(f"Early stop at epoch {epoch+1}")
            break

train_autoencoder()

  0%|          | 0/1000 [00:00<?, ?it/s]

Epoch 1: train loss: 1.455342502852586, val loss: 1.2164575457572937
Epoch 2: train loss: 1.0247395555593732, val loss: 1.0697510242462158
Epoch 3: train loss: 0.9500808850179034, val loss: 1.0341199189424515
Epoch 4: train loss: 0.9242395267343256, val loss: 1.0125617235898972
Epoch 5: train loss: 0.9070757083431719, val loss: 0.999319851398468
Epoch 6: train loss: 0.8919417210281071, val loss: 0.9863397181034088
Epoch 7: train loss: 0.8771767250582581, val loss: 0.9808003157377243
Epoch 8: train loss: 0.8635093486803648, val loss: 0.959432914853096
Epoch 9: train loss: 0.8498581347805112, val loss: 0.9475092887878418
Epoch 10: train loss: 0.8367392537797133, val loss: 0.934653103351593
Epoch 11: train loss: 0.8240863383108399, val loss: 0.9230950772762299
Epoch 12: train loss: 0.8120601441259674, val loss: 0.9116584062576294
Epoch 13: train loss: 0.8006893337766005, val loss: 0.9005149304866791
Epoch 14: train loss: 0.7889910020379606, val loss: 0.8898219466209412
Epoch 15: train los

In [53]:
featured_x_train = ae_model.encoder(torch.tensor(x_train, dtype=torch.float).to(device))

In [54]:
def get_regression_model(X, y):

    class Model(nn.Module):
            """
            The model class, which defines our feature extractor used in pretraining.
            """
            def __init__(self):
                """
                The constructor of the model.
                """
                super().__init__()
                # TODO: Define the architecture of the model. It should be able to be trained on pretraing data 
                # and then used to extract features from the training and test data.
                self.seq = nn.Sequential(
                    nn.Linear(500, 100),
                    nn.LeakyReLU(0.01),
                    nn.BatchNorm1d(100),
                    nn.Linear(100, 1)
                )

                for m in self.modules():
                    if isinstance(m, nn.Linear):    
                        nn.init.xavier_uniform_(m.weight)

            def forward(self, x):
                """
                The forward pass of the model.

                input: x: torch.Tensor, the input to the model

                output: x: torch.Tensor, the output of the model
                """
                # TODO: Implement the forward pass of the model, in accordance with the architecture 
                # defined in the constructor.
                x = self.seq(x)
                return x
    
    """
    This function returns the regression model used in the pipeline.

    input: None

    output: model: sklearn compatible model, the regression model
    """
    # TODO: Implement the regression model. It should be able to be trained on the features extracted
    # by the feature extractor.
    model = Model()
    model.to(device)
    model.train()

    x_tr, x_val, y_tr, y_val = train_test_split(X, y, test_size=10, random_state=0, shuffle=True)
    x_tr, x_val = torch.tensor(x_tr, dtype=torch.float), torch.tensor(x_val, dtype=torch.float)
    y_tr, y_val = torch.tensor(y_tr, dtype=torch.float), torch.tensor(y_val, dtype=torch.float)
    train_dataset = torch.utils.data.TensorDataset(x_tr, y_tr)
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=len(train_dataset), shuffle=True)
    val_dataset = torch.utils.data.TensorDataset(x_val, y_val)
    val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=len(val_dataset), shuffle=True)
    

    optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=0.0001)
    # optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.4, weight_decay=0.0001)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.3, patience=5, verbose=True)
    criterion = nn.MSELoss()
    epochs = 10000
    for epoch in tqdm(range(epochs)):
        loss_tr = 0
        loss_val = 0
        for [x, y] in train_loader:
            optimizer.zero_grad()
            x = x.to(device)
            y = y.to(device)
            predictions = model(x).squeeze(-1)
            loss = criterion(predictions, y)
            loss.backward()
            optimizer.step()
            loss_tr = loss.item()
        for [x, y] in val_loader:
            x = x.to(device)
            y = y.to(device)
            predictions = model(x).squeeze(-1)
            loss = criterion(predictions, y)
            loss_val = loss.item()
        print(f"Epoch {epoch+1}: train loss: {loss_tr}, val loss: {loss_val}")
        if(optimizer.param_groups[0]['lr'] < 1e-8):
            print(f"Early stop at epoch {epoch+1}")
            break


    return model

one_model = get_regression_model(featured_x_train, y_train)

  x_tr, x_val = torch.tensor(x_tr, dtype=torch.float), torch.tensor(x_val, dtype=torch.float)


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch 1: train loss: 5.800898551940918, val loss: 4.748525619506836
Epoch 2: train loss: 12.902047157287598, val loss: 4.234756946563721
Epoch 3: train loss: 5.999337673187256, val loss: 3.890300750732422
Epoch 4: train loss: 2.9360525608062744, val loss: 3.5686683654785156
Epoch 5: train loss: 2.4128479957580566, val loss: 3.164647340774536
Epoch 6: train loss: 2.3314661979675293, val loss: 2.8555142879486084
Epoch 7: train loss: 2.156174659729004, val loss: 2.6738851070404053
Epoch 8: train loss: 1.8882337808609009, val loss: 2.508727550506592
Epoch 9: train loss: 1.5964884757995605, val loss: 2.3289260864257812
Epoch 10: train loss: 1.3251292705535889, val loss: 2.132870674133301
Epoch 11: train loss: 1.0800857543945312, val loss: 1.9432125091552734
Epoch 12: train loss: 0.8710078001022339, val loss: 1.7651007175445557
Epoch 13: train loss: 0.6975438594818115, val loss: 1.5909343957901
Epoch 14: train loss: 0.5547029972076416, val loss: 1.4414156675338745
Epoch 15: train loss: 0.435

In [55]:
result_file = "results-ae-1000-500-100-1.csv"

In [56]:
y_pred = np.zeros(x_test.shape[0])
y_pred = one_model(ae_model.encoder(torch.tensor(x_test.to_numpy(), dtype=torch.float).to(device))).squeeze(-1).detach().cpu().numpy()

assert y_pred.shape == (x_test.shape[0],)
y_pred = pd.DataFrame({"y": y_pred}, index=x_test.index)
y_pred.to_csv(result_file, index_label="Id")
print(f"Predictions saved to {result_file}, all done!")

Predictions saved to results-ae-1000-500-100-1.csv, all done!
