In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn

from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin

from tqdm.auto import tqdm
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [3]:
def load_data():
    """
    This function loads the data from the csv files and returns it as numpy arrays.

    input: None
    
    output: x_pretrain: np.ndarray, the features of the pretraining set
            y_pretrain: np.ndarray, the labels of the pretraining set
            x_train: np.ndarray, the features of the training set
            y_train: np.ndarray, the labels of the training set
            x_test: np.ndarray, the features of the test set
    """
    x_pretrain = pd.read_csv("./pretrain_features.csv.zip", index_col="Id", compression='zip').drop("smiles", axis=1).to_numpy()
    y_pretrain = pd.read_csv("./pretrain_labels.csv.zip", index_col="Id", compression='zip').to_numpy().squeeze(-1)
    x_train = pd.read_csv("./train_features.csv.zip", index_col="Id", compression='zip').drop("smiles", axis=1).to_numpy()
    y_train = pd.read_csv("./train_labels.csv.zip", index_col="Id", compression='zip').to_numpy().squeeze(-1)
    x_test = pd.read_csv("./test_features.csv.zip", index_col="Id", compression='zip').drop("smiles", axis=1)
#     scaler = StandardScaler()
#     x_pretrain = scaler.fit_transform(x_pretrain)
#     x_train = scaler.transform(x_train)
#     x_test_transed = scaler.transform(x_test)
#     x_test[x_test.columns] = x_test_transed

    return x_pretrain, y_pretrain, x_train, y_train, x_test

In [4]:
x_pretrain, y_pretrain, x_train, y_train, x_test = load_data()

In [5]:
class AE(nn.Module):
    """
    The model class, which defines our feature extractor used in pretraining.
    """
    def __init__(self):
        """
        The constructor of the model.
        """
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Linear(1000, 1000),
            nn.BatchNorm1d(1000),
            nn.LeakyReLU(0.01),
            nn.Dropout(0.6),
            nn.Linear(1000, 1000),
            nn.BatchNorm1d(1000),
            nn.LeakyReLU(0.01)
            )

        self.decoder = nn.Sequential(
            nn.Linear(1000, 1000),
            nn.BatchNorm1d(1000),
            nn.LeakyReLU(0.01),
            nn.Dropout(0.6),
            nn.Linear(1000, 1000),
            nn.BatchNorm1d(1000),
            nn.LeakyReLU(0.01)
            )
            
        for m in self.modules():
            if isinstance(m, nn.Linear):    
                nn.init.xavier_uniform_(m.weight)
        
        
    def forward(self, x):
        """
        The forward pass of the model.

        input: x: torch.Tensor, the input to the model

        output: x: torch.Tensor, the output of the model
        """
        x = self.encoder(x)
        x = self.decoder(x)
        return x
    
    def encode(self, x):
        return self.encoder(x)

In [6]:
eval_size = 1000
batch_size = 256
learning_rate = 0.01
ae_model = AE()
ae_model.train()
ae_model.to(device)

def train_autoencoder():
    x_tr, x_val, y_tr, y_val = train_test_split(x_pretrain, y_pretrain, test_size=eval_size, random_state=0, shuffle=True)
    x_tr, x_val = torch.tensor(x_tr, dtype=torch.float), torch.tensor(x_val, dtype=torch.float)
    y_tr, y_val = torch.tensor(y_tr, dtype=torch.float), torch.tensor(y_val, dtype=torch.float)
    train_dataset = torch.utils.data.TensorDataset(x_tr, y_tr)
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_dataset = torch.utils.data.TensorDataset(x_val, y_val)
    val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=True)

    optimizer = torch.optim.Adam(ae_model.parameters(), lr=learning_rate)
    # optimizer = torch.optim.SGD(ae_model.parameters(), lr=learning_rate, momentum=0.9, weight_decay=0.001)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.3, patience=5, verbose=True)
    criterion = nn.MSELoss()
    epochs = 1000
    for epoch in tqdm(range(epochs)):
        loss_tr = 0
        loss_val = 0
        for [x, _] in train_loader:
            optimizer.zero_grad()
            x = x.to(device)
            predictions = ae_model(x)
            loss = criterion(predictions, x)
            loss.backward()
            optimizer.step()
            loss_tr += loss.item() * len(x)
        loss_tr /= len(train_loader.dataset)
        for [x, _] in val_loader:
            x = x.to(device)
            predictions = ae_model(x)
            loss = criterion(predictions, x)
            loss_val += loss.item() * len(x)
        loss_val /= len(val_loader.dataset)
        scheduler.step(loss_val)
        
        # if(epoch % 10 == 0):
        print(f"Epoch {epoch+1}: train loss: {loss_tr}, val loss: {loss_val}")
        if(optimizer.param_groups[0]['lr'] < 1e-6):
            print(f"Early stop at epoch {epoch+1}")
            break

train_autoencoder()

  0%|          | 0/1000 [00:00<?, ?it/s]

Epoch 1: train loss: 0.04049326893413553, val loss: 0.023283341720700262
Epoch 2: train loss: 0.020222239522605528, val loss: 0.018128526717424393
Epoch 3: train loss: 0.017175616049949004, val loss: 0.015975482270121574
Epoch 4: train loss: 0.014432156973499425, val loss: 0.013527899652719498
Epoch 5: train loss: 0.012760671442108495, val loss: 0.012341900207102298
Epoch 6: train loss: 0.011686629722921216, val loss: 0.011293500296771525
Epoch 7: train loss: 0.010882926372241001, val loss: 0.010569950111210346
Epoch 8: train loss: 0.010122431041938919, val loss: 0.009909167237579822
Epoch 9: train loss: 0.009508812655447698, val loss: 0.009305797331035138
Epoch 10: train loss: 0.008998560582496682, val loss: 0.008889973744750023
Epoch 11: train loss: 0.008699497373888688, val loss: 0.008684891782701015
Epoch 12: train loss: 0.008271224187953131, val loss: 0.008154441125690937
Epoch 13: train loss: 0.007907005592572446, val loss: 0.007867228966206312
Epoch 14: train loss: 0.00762311314

In [7]:
featured_x_train = ae_model.encode(torch.tensor(x_train, dtype=torch.float).to(device))
featured_x_pretrain = ae_model.encode(torch.tensor(x_pretrain, dtype=torch.float).to(device))
print(featured_x_train.shape, featured_x_pretrain.shape)

torch.Size([100, 1000]) torch.Size([50000, 1000])


In [8]:
class Model(nn.Module):
        """
        The model class, which defines our feature extractor used in pretraining.
        """
        def __init__(self):
            """
            The constructor of the model.
            """
            super().__init__()
            # TODO: Define the architecture of the model. It should be able to be trained on pretraing data 
            # and then used to extract features from the training and test data.
            self.seq = nn.Sequential(
                nn.Linear(1000, 100),
                nn.BatchNorm1d(100),
                nn.LeakyReLU(0.01),
                nn.Dropout(0.6),
                nn.Linear(100, 1)
            )

            for m in self.modules():
                if isinstance(m, nn.Linear):    
                    nn.init.xavier_uniform_(m.weight)

        def forward(self, x):
            """
            The forward pass of the model.

            input: x: torch.Tensor, the input to the model

            output: x: torch.Tensor, the output of the model
            """
            # TODO: Implement the forward pass of the model, in accordance with the architecture 
            # defined in the constructor.
            x = self.seq(x)
            return x
        
        def encode(self, x):
            # not use lest layer
            x = self.seq[:-4](x)
            return x

In [9]:
def get_regression_model(X, y):
    
    """
    This function returns the regression model used in the pipeline.

    input: None

    output: model: sklearn compatible model, the regression model
    """
    # TODO: Implement the regression model. It should be able to be trained on the features extracted
    # by the feature extractor.
    model = Model()
    model.to(device)
    model.train()

    x_tr, x_val, y_tr, y_val = train_test_split(X, y, test_size=1000, random_state=0, shuffle=True)
    x_tr, x_val = torch.tensor(x_tr, dtype=torch.float), torch.tensor(x_val, dtype=torch.float)
    y_tr, y_val = torch.tensor(y_tr, dtype=torch.float), torch.tensor(y_val, dtype=torch.float)
    train_dataset = torch.utils.data.TensorDataset(x_tr, y_tr)
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=256, shuffle=True)
    val_dataset = torch.utils.data.TensorDataset(x_val, y_val)
    val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=len(val_dataset), shuffle=True)
    

    # optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.4, weight_decay=0.0001)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.3, patience=5, verbose=True)
    criterion = nn.MSELoss()
    epochs = 10000
    for epoch in tqdm(range(epochs)):
        loss_tr = 0
        loss_val = 0
        for [x, y] in train_loader:
            optimizer.zero_grad()
            x = x.to(device)
            y = y.to(device)
            predictions = model(x).squeeze()
            loss = criterion(predictions, y)
            loss.backward()
            optimizer.step()
            loss_tr += loss.item() * len(y)
        loss_tr /= len(train_loader.dataset)
        for [x, y] in val_loader:
            x = x.to(device)
            y = y.to(device)
            predictions = model(x).squeeze()
            loss = criterion(predictions, y)
            loss_val += loss.item() * len(y)
        loss_val /= len(val_loader.dataset)
        scheduler.step(loss_val)
            
        scheduler.step(loss_val)
        if(epoch % 10 == 0):
            print(f"Epoch {epoch+1}: train loss: {loss_tr}, val loss: {loss_val}")
        if(optimizer.param_groups[0]['lr'] < 1e-8):
            print(f"Early stop at epoch {epoch+1}  train loss: {loss_tr}, val loss: {loss_val}")
            break
    return model


one_model = get_regression_model(featured_x_pretrain, y_pretrain)

  x_tr, x_val = torch.tensor(x_tr, dtype=torch.float), torch.tensor(x_val, dtype=torch.float)


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch 1: train loss: 0.7016918918453917, val loss: 0.2668460011482239
Epoch 11: train loss: 0.052476061578916045, val loss: 0.04919317737221718
Epoch 00037: reducing learning rate of group 0 to 3.0000e-03.
Epoch 21: train loss: 0.046659563320023675, val loss: 0.04504149407148361
Epoch 00055: reducing learning rate of group 0 to 9.0000e-04.
Epoch 00061: reducing learning rate of group 0 to 2.7000e-04.
Epoch 31: train loss: 0.04546674976604326, val loss: 0.04361594095826149
Epoch 00075: reducing learning rate of group 0 to 8.1000e-05.
Epoch 41: train loss: 0.04546062464130168, val loss: 0.04165210947394371
Epoch 00087: reducing learning rate of group 0 to 2.4300e-05.
Epoch 00093: reducing learning rate of group 0 to 7.2900e-06.
Epoch 00099: reducing learning rate of group 0 to 2.1870e-06.
Epoch 51: train loss: 0.04578845234671418, val loss: 0.04667235165834427
Epoch 00105: reducing learning rate of group 0 to 6.5610e-07.
Epoch 00111: reducing learning rate of group 0 to 1.9683e-07.
Epoch

In [10]:
featured_x_train = ae_model.encode(torch.tensor(x_train, dtype=torch.float).to(device))
# featured_x_train = one_model.encode(featured_x_train).cpu().detach().numpy()
print(featured_x_train.shape)

torch.Size([100, 1000])


In [11]:
def finetune(old_model, X, y, tune_layers=1):
    model = Model()
    model.to(device)
    model.train()
    model.load_state_dict(old_model.state_dict())
    
    # frozen layers and not frozen last n layers
    for i, param in enumerate(model.parameters()):
        if i < len(list(model.parameters())) - tune_layers:
            param.requires_grad = False
        else:
            param.requires_grad = True
   
    x_tr = torch.tensor(X, dtype=torch.float)
    y_tr = torch.tensor(y, dtype=torch.float)
    train_dataset = torch.utils.data.TensorDataset(x_tr, y_tr)
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=2, shuffle=True)

    model.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=0.0001)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.3, patience=5, verbose=True)
    criterion = nn.MSELoss()
    epochs = 10000
    for epoch in tqdm(range(epochs)):
        loss_tr = 0
        for [x, y] in train_loader:
            optimizer.zero_grad()
            x = x.to(device)
            y = y.to(device)
            predictions = model(x).squeeze()
            loss = criterion(predictions, y)
            loss.backward()
            optimizer.step()
            loss_tr += loss.item() * len(y)
        loss_tr /= len(train_loader.dataset)
        scheduler.step(loss_tr)
        if(epoch + 1) % 10 == 0:
            print(f"Epoch {epoch+1}: train loss: {loss_tr}")
        if(optimizer.param_groups[0]['lr'] < 1e-9):
            print(f"Early stop at epoch {epoch+1}, loss: {loss_tr}")
            break

    return model

finetune_model = finetune(one_model, featured_x_train, y_train, tune_layers=1)

  x_tr = torch.tensor(X, dtype=torch.float)


  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch 10: train loss: 2.7356514620780943
Epoch 20: train loss: 0.18647895528003575
Epoch 30: train loss: 0.13487567819654941
Epoch 00034: reducing learning rate of group 0 to 3.0000e-03.
Epoch 00040: reducing learning rate of group 0 to 9.0000e-04.
Epoch 40: train loss: 0.13999808656517415
Epoch 50: train loss: 0.1256117253098637
Epoch 00051: reducing learning rate of group 0 to 2.7000e-04.
Epoch 00057: reducing learning rate of group 0 to 8.1000e-05.
Epoch 60: train loss: 0.09152740573859773
Epoch 00066: reducing learning rate of group 0 to 2.4300e-05.
Epoch 70: train loss: 0.1299008280877024
Epoch 00072: reducing learning rate of group 0 to 7.2900e-06.
Epoch 00078: reducing learning rate of group 0 to 2.1870e-06.
Epoch 80: train loss: 0.11183915830217302
Epoch 00084: reducing learning rate of group 0 to 6.5610e-07.
Epoch 00090: reducing learning rate of group 0 to 1.9683e-07.
Epoch 90: train loss: 0.1262991946018883
Epoch 00096: reducing learning rate of group 0 to 5.9049e-08.
Epoch 

In [12]:
result_file = "results-ae-1000-100-fintune-1.csv"

In [15]:
y_pred = np.zeros(x_test.shape[0])
featured_x_test = ae_model.encode(torch.tensor(x_test.to_numpy(), dtype=torch.float).to(device))
print(featured_x_test.shape)
y_pred = finetune_model(featured_x_test).squeeze(-1).cpu().detach().numpy()

assert y_pred.shape == (x_test.shape[0],)
y_pred = pd.DataFrame({"y": y_pred}, index=x_test.index)
y_pred.to_csv(result_file, index_label="Id")
print(f"Predictions saved to {result_file}, all done!")

torch.Size([10000, 1000])
Predictions saved to results-ae-1000-100-fintune-1.csv, all done!
