In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn

from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin

from tqdm.auto import tqdm
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
def load_data():
    """
    This function loads the data from the csv files and returns it as numpy arrays.

    input: None
    
    output: x_pretrain: np.ndarray, the features of the pretraining set
            y_pretrain: np.ndarray, the labels of the pretraining set
            x_train: np.ndarray, the features of the training set
            y_train: np.ndarray, the labels of the training set
            x_test: np.ndarray, the features of the test set
    """
    x_pretrain = pd.read_csv("./pretrain_features.csv.zip", index_col="Id", compression='zip').drop("smiles", axis=1).to_numpy()
    y_pretrain = pd.read_csv("./pretrain_labels.csv.zip", index_col="Id", compression='zip').to_numpy().squeeze(-1)
    x_train = pd.read_csv("./train_features.csv.zip", index_col="Id", compression='zip').drop("smiles", axis=1).to_numpy()
    y_train = pd.read_csv("./train_labels.csv.zip", index_col="Id", compression='zip').to_numpy().squeeze(-1)
    x_test = pd.read_csv("./test_features.csv.zip", index_col="Id", compression='zip').drop("smiles", axis=1)
    scaler = StandardScaler()
    x_pretrain = scaler.fit_transform(x_pretrain)
    x_train = scaler.transform(x_train)
    x_test_transed = scaler.transform(x_test)
    x_test[x_test.columns] = x_test_transed

    return x_pretrain, y_pretrain, x_train, y_train, x_test

In [4]:
class Net(nn.Module):
    """
    The model class, which defines our feature extractor used in pretraining.
    """
    def __init__(self):
        """
        The constructor of the model.
        """
        super().__init__()
        # TODO: Define the architecture of the model. It should be able to be trained on pretraing data 
        # and then used to extract features from the training and test data.
        self.fc = nn.ModuleList()
        self.fc.append(nn.Linear(1000, 1000))
        self.fc.append(nn.Linear(1000, 512))
        self.fc.append(nn.Linear(512, 512))
        self.fc.append(nn.Linear(512, 512))
        self.fc.append(nn.Linear(512, 256))
        self.fc.append(nn.Linear(256, 256))
        self.fc.append(nn.Linear(256, 128))
        self.fc.append(nn.Linear(128, 64))
        self.fc.append(nn.Linear(64, 1))

        self.dropout = nn.ModuleList()
        self.dropout.append(nn.Dropout(0.2))
        self.dropout.append(nn.Dropout(0.2))
        self.dropout.append(nn.Dropout(0.3))
        self.dropout.append(nn.Dropout(0.3))
        self.dropout.append(nn.Dropout(0.4))
        self.dropout.append(nn.Dropout(0.4))
        self.dropout.append(nn.Dropout(0.5))

        for fc in self.fc:
            nn.init.xavier_normal_(fc.weight)

    def forward(self, x):
        """
        The forward pass of the model.

        input: x: torch.Tensor, the input to the model

        output: x: torch.Tensor, the output of the model
        """
        # TODO: Implement the forward pass of the model, in accordance with the architecture 
        # defined in the constructor.
        x = torch.relu(self.fc[0](x))
        x = self.dropout[0](x)
        x = torch.relu(self.fc[1](x))
        x = self.dropout[1](x)
        x = torch.relu(self.fc[2](x))
        x = self.dropout[2](x)
        x = torch.relu(self.fc[3](x))
        x = self.dropout[3](x)
        x = torch.relu(self.fc[4](x))
        x = self.dropout[4](x)
        x = torch.relu(self.fc[5](x))
        x = self.dropout[5](x)
        x = torch.relu(self.fc[6](x))
        x = self.dropout[6](x)
        x = torch.relu(self.fc[7](x))
        x = self.fc[8](x)
        return x

In [5]:
def make_feature_extractor(x, y, batch_size=256, eval_size=1000):
    """
    This function trains the feature extractor on the pretraining data and returns a function which
    can be used to extract features from the training and test data.

    input: x: np.ndarray, the features of the pretraining set
              y: np.ndarray, the labels of the pretraining set
                batch_size: int, the batch size used for training
                eval_size: int, the size of the validation set
            
    output: make_features: function, a function which can be used to extract features from the training and test data
    """
    # Pretraining data loading
    in_features = x.shape[-1]
    x_tr, x_val, y_tr, y_val = train_test_split(x, y, test_size=eval_size, random_state=0, shuffle=True)
    x_tr, x_val = torch.tensor(x_tr, dtype=torch.float), torch.tensor(x_val, dtype=torch.float)
    y_tr, y_val = torch.tensor(y_tr, dtype=torch.float), torch.tensor(y_val, dtype=torch.float)
    train_dataset = torch.utils.data.TensorDataset(x_tr, y_tr)
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_dataset = torch.utils.data.TensorDataset(x_val, y_val)
    val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=True)

    # model declaration
    model = Net()
    model.to(device)
    model.train()
    
    # TODO: Implement the training loop. The model should be trained on the pretraining data. Use validation set 
    # to monitor the loss.
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=0.0001)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.3, patience=5, verbose=True)
    criterion = nn.MSELoss()
    epochs = 500
    for epoch in tqdm(range(epochs)):
        loss_tr = 0
        loss_val = 0
        for [x, y] in train_loader:
            optimizer.zero_grad()
            x = x.to(device)
            y = y.to(device)
            predictions = model(x).squeeze(-1)
            loss = criterion(predictions, y)
            loss.backward()
            optimizer.step()
            loss_tr += loss.item() * len(x)
        loss_tr /= len(train_loader.dataset)
        for [x, y] in val_loader:
            x = x.to(device)
            y = y.to(device)
            predictions = model(x).squeeze(-1)
            loss = criterion(predictions, y)
            loss_val += loss.item() * len(x)
        loss_val /= len(val_loader.dataset)
        scheduler.step(loss_val)
        print(f"Epoch {epoch+1}: train loss: {loss_tr}, val loss: {loss_val}")
        if(optimizer.param_groups[0]['lr'] < 1e-6):
            print(f"Early stop at epoch {epoch+1}")
            break

    return model

In [6]:
def finetune(model, X, y, tune_layers=2):
    # frozen layers and not frozen last n layers
    num_layers = len(model.fc)
    for i in range(num_layers - tune_layers):
        for param in model.fc[i].parameters():
            param.requires_grad = False
    for i in range(num_layers - tune_layers, num_layers):
        for param in model.fc[i].parameters():
            param.requires_grad = True
        nn.init.xavier_normal_(model.fc[i].weight)

    x = torch.tensor(X, dtype=torch.float)
    x = x.to(device)
    y = torch.tensor(y, dtype=torch.float)
    y = y.to(device)

    model.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.0001, weight_decay=0.0001)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.3, patience=5, verbose=True)
    criterion = nn.MSELoss()
    epochs = 10000
    for epoch in tqdm(range(epochs)):
        optimizer.zero_grad()
        predictions = model(x).squeeze(-1)
        loss = criterion(predictions, y)
        loss.backward()
        optimizer.step()
        scheduler.step(loss)
        if(epoch + 1) % 10 == 0:
            print(f"Epoch {epoch+1}: train loss: {loss}")
        if(optimizer.param_groups[0]['lr'] < 1e-7):
            print(f"Early stop at epoch {epoch+1}, loss: {loss}")
            break

    return model

In [7]:
result_file = "results-ft-2-128.csv"

In [8]:
# Load data
x_pretrain, y_pretrain, x_train, y_train, x_test = load_data()
print("Data loaded!")
# Utilize pretraining data by creating feature extractor which extracts lumo energy 
# features from available initial features
feature_extractor =  make_feature_extractor(x_pretrain, y_pretrain)

regression_model = finetune(feature_extractor, x_train, y_train)

y_pred = np.zeros(x_test.shape[0])
# TODO: Implement the pipeline. It should contain feature extraction and regression. You can optionally
# use other sklearn tools, such as StandardScaler, FunctionTransformer, etc.
x_test_tensor = torch.tensor(x_test.to_numpy(), dtype=torch.float).to(device)
y_pred = regression_model(x_test_tensor).squeeze(-1).detach().cpu().numpy()

assert y_pred.shape == (x_test.shape[0],)
y_pred = pd.DataFrame({"y": y_pred}, index=x_test.index)
y_pred.to_csv(result_file, index_label="Id")
print(f"Predictions saved to {result_file}, all done!")



Data loaded!


  0%|          | 0/500 [00:00<?, ?it/s]

Epoch 1: train loss: 0.7509917573150323, val loss: 0.24481031668186187
Epoch 2: train loss: 0.1856592956075863, val loss: 0.1351518316268921
Epoch 3: train loss: 0.11355318463578516, val loss: 0.08567215746641159
Epoch 4: train loss: 0.07297525085721697, val loss: 0.058291280835866925
Epoch 5: train loss: 0.04919610972307166, val loss: 0.039477434039115904
Epoch 6: train loss: 0.03377175368946426, val loss: 0.027582309380173683
Epoch 7: train loss: 0.023817637523826287, val loss: 0.022008428007364272
Epoch 8: train loss: 0.0184417150285171, val loss: 0.01608578810095787
Epoch 9: train loss: 0.015224994518775114, val loss: 0.015083551578223706
Epoch 10: train loss: 0.0133196210344227, val loss: 0.013720905534923077
Epoch 11: train loss: 0.011722442549254213, val loss: 0.012380805291235447
Epoch 12: train loss: 0.010464945681393147, val loss: 0.010750147394835948
Epoch 13: train loss: 0.009986051261425018, val loss: 0.010044511444866656
Epoch 14: train loss: 0.009011310397742354, val los

  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch 10: train loss: 4.663094997406006
Epoch 20: train loss: 4.425008296966553
Epoch 30: train loss: 4.299380779266357
Epoch 40: train loss: 4.115776062011719
Epoch 50: train loss: 3.8876583576202393
Epoch 60: train loss: 3.7343132495880127
Epoch 00070: reducing learning rate of group 0 to 3.0000e-05.
Epoch 70: train loss: 3.6495964527130127
Epoch 00080: reducing learning rate of group 0 to 9.0000e-06.
Epoch 80: train loss: 3.627293109893799
Epoch 90: train loss: 3.4836437702178955
Epoch 00096: reducing learning rate of group 0 to 2.7000e-06.
Epoch 100: train loss: 3.5559816360473633
Epoch 00102: reducing learning rate of group 0 to 8.1000e-07.
Epoch 110: train loss: 3.5351343154907227
Epoch 00113: reducing learning rate of group 0 to 2.4300e-07.
Epoch 120: train loss: 3.5370030403137207
Epoch 00121: reducing learning rate of group 0 to 7.2900e-08.
Early stop at epoch 121, loss: 3.5777339935302734
Predictions saved to results-ft-2-128.csv, all done!
