In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn

from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin

from tqdm.auto import tqdm
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
pretrain_features = {
    "learning_rate": 0.1,
    "batch_size": 256,
    "epochs": 1000,
    "eval_size": 4*256,
    "momentum": 0.005,
    "weight_decay": 0.0001,
}

In [4]:
def load_data():
    """
    This function loads the data from the csv files and returns it as numpy arrays.

    input: None
    
    output: x_pretrain: np.ndarray, the features of the pretraining set
            y_pretrain: np.ndarray, the labels of the pretraining set
            x_train: np.ndarray, the features of the training set
            y_train: np.ndarray, the labels of the training set
            x_test: np.ndarray, the features of the test set
    """
    x_pretrain = pd.read_csv("./pretrain_features.csv.zip", index_col="Id", compression='zip').drop("smiles", axis=1).to_numpy()
    y_pretrain = pd.read_csv("./pretrain_labels.csv.zip", index_col="Id", compression='zip').to_numpy().squeeze(-1)
    x_train = pd.read_csv("./train_features.csv.zip", index_col="Id", compression='zip').drop("smiles", axis=1).to_numpy()
    y_train = pd.read_csv("./train_labels.csv.zip", index_col="Id", compression='zip').to_numpy().squeeze(-1)
    x_test = pd.read_csv("./test_features.csv.zip", index_col="Id", compression='zip').drop("smiles", axis=1)
    
#     scaler = StandardScaler()
#     x_pretrain = scaler.fit_transform(x_pretrain)
#     x_train = scaler.transform(x_train)
#     x_test_transed = scaler.transform(x_test)
#     x_test[x_test.columns] = x_test_transed

    return x_pretrain, y_pretrain, x_train, y_train, x_test

In [5]:
x_pretrain, y_pretrain, x_train, y_train, x_test = load_data()

In [6]:
class AE(nn.Module):
    """
    The model class, which defines our feature extractor used in pretraining.
    """
    def __init__(self):
        """
        The constructor of the model.
        """
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Linear(1000, 512),
            nn.ReLU(),
            nn.BatchNorm1d(512),
            nn.Linear(512, 256))

        self.decoder = nn.Sequential(
            nn.Linear(256, 512),
            nn.ReLU(),
            nn.BatchNorm1d(512),
            nn.Linear(512, 1000),
            nn.Sigmoid()
            )
            
        for m in self.modules():
            if isinstance(m, nn.Linear):    
                nn.init.xavier_uniform_(m.weight)
        
        
    def forward(self, x):
        """
        The forward pass of the model.

        input: x: torch.Tensor, the input to the model

        output: x: torch.Tensor, the output of the model
        """
        x = self.encoder(x)
        x = self.decoder(x)
        return x
    
    def encode(self, x):
        return self.encoder(x)

In [7]:
eval_size = pretrain_features["eval_size"]
batch_size = pretrain_features["batch_size"]
ae_model = AE()
ae_model.train()
ae_model.to(device)

def train_autoencoder():
    x_tr, x_val, y_tr, y_val = train_test_split(x_pretrain, y_pretrain, test_size=eval_size, random_state=0, shuffle=True)
    x_tr, x_val = torch.tensor(x_tr, dtype=torch.float), torch.tensor(x_val, dtype=torch.float)
    y_tr, y_val = torch.tensor(y_tr, dtype=torch.float), torch.tensor(y_val, dtype=torch.float)
    train_dataset = torch.utils.data.TensorDataset(x_tr, y_tr)
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_dataset = torch.utils.data.TensorDataset(x_val, y_val)
    val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=True)

    # optimizer = torch.optim.Adam(model.parameters(), lr=pretrain_features['learning_rate'], weight_decay=pretrain_features['weight_decay'])
    optimizer = torch.optim.SGD(ae_model.parameters(), lr=pretrain_features['learning_rate'], momentum=pretrain_features['momentum'], weight_decay=pretrain_features['weight_decay'])
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.3, patience=5, verbose=True)
    criterion = nn.MSELoss()
    epochs = pretrain_features['epochs']
    for epoch in tqdm(range(epochs)):
        loss_tr = 0
        loss_val = 0
        for [x, _] in train_loader:
            optimizer.zero_grad()
            x = x.to(device)
            predictions = ae_model(x)
            loss = criterion(predictions, x)
            loss.backward()
            optimizer.step()
            loss_tr += loss.item() * len(x)
        loss_tr /= len(train_loader.dataset)
        for [x, _] in val_loader:
            x = x.to(device)
            predictions = ae_model(x)
            loss = criterion(predictions, x)
            loss_val += loss.item() * len(x)
        loss_val /= len(val_loader.dataset)
        scheduler.step(loss_val)
        print(f"Epoch {epoch+1}: train loss: {loss_tr}, val loss: {loss_val}")
        if(optimizer.param_groups[0]['lr'] < 1e-6):
            print(f"Early stop at epoch {epoch+1}")
            break

train_autoencoder()

  0%|          | 0/1000 [00:00<?, ?it/s]

Epoch 1: train loss: 0.2802698064263215, val loss: 0.2782503440976143
Epoch 2: train loss: 0.2764578491634343, val loss: 0.27451617270708084
Epoch 3: train loss: 0.27272464971221455, val loss: 0.2708417698740959
Epoch 4: train loss: 0.26901175914656605, val loss: 0.2671213075518608
Epoch 5: train loss: 0.2652498954205449, val loss: 0.26334628462791443
Epoch 6: train loss: 0.26138282014980146, val loss: 0.2594311237335205
Epoch 7: train loss: 0.257339344562872, val loss: 0.2552710548043251
Epoch 8: train loss: 0.2530451943889158, val loss: 0.25085053592920303
Epoch 9: train loss: 0.24840109469414692, val loss: 0.24599934741854668
Epoch 10: train loss: 0.2433054954037484, val loss: 0.24066056311130524
Epoch 11: train loss: 0.23763303788258958, val loss: 0.23463580384850502
Epoch 12: train loss: 0.23124398124136203, val loss: 0.22783591598272324
Epoch 13: train loss: 0.22397513688097434, val loss: 0.2200760468840599
Epoch 14: train loss: 0.21568109935812216, val loss: 0.21121688187122345


In [8]:
class Net(nn.Module):
    """
    The model class, which defines our feature extractor used in pretraining.
    """
    def __init__(self):
        """
        The constructor of the model.
        """
        super().__init__()
        # TODO: Define the architecture of the model. It should be able to be trained on pretraing data 
        # and then used to extract features from the training and test data.
        self.fc1 = nn.Linear(256, 64)
        self.fc2 = nn.Linear(64, 16)
        self.fc3 = nn.Linear(16, 1)

        self.dropout1 = nn.Dropout(0.2)
        self.dropout2 = nn.Dropout(0.3)

        nn.init.xavier_normal_(self.fc1.weight)
        nn.init.xavier_normal_(self.fc2.weight)
        nn.init.xavier_normal_(self.fc3.weight)


    def forward(self, x):
        """
        The forward pass of the model.

        input: x: torch.Tensor, the input to the model

        output: x: torch.Tensor, the output of the model
        """
        # TODO: Implement the forward pass of the model, in accordance with the architecture 
        # defined in the constructor.
        x = torch.relu(self.fc1(x))
        x = self.dropout1(x)
        x = torch.relu(self.fc2(x))
        x = self.dropout2(x)
        x = self.fc3(x)
        return x
    
    def encode(self, x):
        x = torch.relu(self.fc1(x))
        x = self.dropout1(x)
        x = torch.relu(self.fc2(x))
        return x

In [9]:
pretrained_x = ae_model.encoder(torch.tensor(x_pretrain, dtype=torch.float).to(device)).detach().cpu().numpy()

In [10]:
# model declaration
nn_model = Net()
nn_model.to(device)
nn_model.train()
    
def train_nn():
    x_tr, x_val, y_tr, y_val = train_test_split(pretrained_x, y_pretrain, test_size=eval_size, random_state=0, shuffle=True)
    x_tr, x_val = torch.tensor(x_tr, dtype=torch.float), torch.tensor(x_val, dtype=torch.float)
    y_tr, y_val = torch.tensor(y_tr, dtype=torch.float), torch.tensor(y_val, dtype=torch.float)
    train_dataset = torch.utils.data.TensorDataset(x_tr, y_tr)
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_dataset = torch.utils.data.TensorDataset(x_val, y_val)
    val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=True)

    # TODO: Implement the training loop. The model should be trained on the pretraining data. Use validation set 
    # to monitor the loss.
    optimizer = torch.optim.Adam(nn_model.parameters(), lr=0.001, weight_decay=0.0001)    
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.3, patience=5, verbose=True)
    criterion = nn.MSELoss()
    epochs = 500
    for epoch in tqdm(range(epochs)):
        loss_tr = 0
        loss_val = 0
        for [x, y] in train_loader:
            optimizer.zero_grad()
            x = x.to(device)
            y = y.to(device)
            predictions = nn_model(x).squeeze(-1)
            loss = criterion(predictions, y)
            loss.backward()
            optimizer.step()
            loss_tr += loss.item() * len(x)
        loss_tr /= len(train_loader.dataset)
        for [x, y] in val_loader:
            x = x.to(device)
            y = y.to(device)
            predictions = nn_model(x).squeeze(-1)
            loss = criterion(predictions, y)
            loss_val += loss.item() * len(x)
        loss_val /= len(val_loader.dataset)
        scheduler.step(loss_val)
        print(f"Epoch {epoch+1}: train loss: {loss_tr}, val loss: {loss_val}")
        if(optimizer.param_groups[0]['lr'] < 1e-6):
            print(f"Early stop at epoch {epoch+1}")
            break

train_nn()

  0%|          | 0/500 [00:00<?, ?it/s]

Epoch 1: train loss: 1.2176157743546823, val loss: 0.5885423868894577
Epoch 2: train loss: 0.5968789197930476, val loss: 0.5417300313711166
Epoch 3: train loss: 0.5317910633341076, val loss: 0.49697350710630417
Epoch 4: train loss: 0.4923697005919638, val loss: 0.5065941885113716
Epoch 5: train loss: 0.4437599905621417, val loss: 0.42411258071660995
Epoch 6: train loss: 0.40792531959958345, val loss: 0.3807792440056801
Epoch 7: train loss: 0.3712322944054141, val loss: 0.3741005063056946
Epoch 8: train loss: 0.3357777037347154, val loss: 0.30825547873973846
Epoch 9: train loss: 0.30394409571119246, val loss: 0.3008662611246109
Epoch 10: train loss: 0.2803601227829714, val loss: 0.2647411674261093
Epoch 11: train loss: 0.25048334226231295, val loss: 0.21686721965670586
Epoch 12: train loss: 0.22707930822576505, val loss: 0.22091049700975418
Epoch 13: train loss: 0.20517777673698104, val loss: 0.19073177501559258
Epoch 14: train loss: 0.1870312693022856, val loss: 0.16045453399419785
Epo

In [11]:
featured_x_train = ae_model.encode(torch.tensor(x_train, dtype=torch.float).to(device))
featured_x_train = nn_model.encode(featured_x_train).detach().cpu().numpy()
# scaler = StandardScaler()
# featured_x_train = scaler.fit_transform(featured_x_train)

In [12]:
class onelayer(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(16, 1)
        nn.init.xavier_normal_(self.fc1.weight)
    
    def forward(self, x):
        x = self.fc1(x)
        return x

In [13]:
def get_regression_model(X, y):

    class Model(nn.Module):
        """
        The model class, which defines our feature extractor used in pretraining.
        """
        def __init__(self):
            """
            The constructor of the model.
            """
            super().__init__()
            # TODO: Define the architecture of the model. It should be able to be trained on pretraing data 
            # and then used to extract features from the training and test data.
            self.fc3 = nn.Linear(16, 1)

            # nn.init.kaiming_uniform_(self.fc3.weight, nonlinearity='relu')
            nn.init.xavier_normal_(self.fc3.weight)

        def forward(self, x):
            """
            The forward pass of the model.

            input: x: torch.Tensor, the input to the model

            output: x: torch.Tensor, the output of the model
            """
            # TODO: Implement the forward pass of the model, in accordance with the architecture 
            # defined in the constructor.
            x = self.fc3(x)
            return x
    
    """
    This function returns the regression model used in the pipeline.

    input: None

    output: model: sklearn compatible model, the regression model
    """
    # TODO: Implement the regression model. It should be able to be trained on the features extracted
    # by the feature extractor.
    model = Model()
    model.to(device)
    model.train()

    x = torch.tensor(X, dtype=torch.float)
    # x = X.clone().detach()
    x = x.to(device)
    y = torch.tensor(y, dtype=torch.float)
    y = y.to(device)

    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.3, patience=5, verbose=True)
    criterion = nn.MSELoss()
    epochs = 10000
    for epoch in tqdm(range(epochs)):
        optimizer.zero_grad()
        predictions = model(x).squeeze(-1)
        loss = criterion(predictions, y)
        loss.backward()
        optimizer.step()
        scheduler.step(loss)
        if(epoch + 1) % 100 == 0:
            print(f"Epoch {epoch+1}: train loss: {loss}")
        if(optimizer.param_groups[0]['lr'] < 1e-7):
            print(f"Early stop at epoch {epoch+1}, loss: {loss}")
            break

    return model

one_model = get_regression_model(featured_x_train, y_train)

  0%|          | 0/10000 [00:00<?, ?it/s]

Epoch 100: train loss: 3.647225856781006
Epoch 200: train loss: 2.8522441387176514
Epoch 300: train loss: 2.265383005142212
Epoch 400: train loss: 1.838626503944397
Epoch 500: train loss: 1.528767704963684
Epoch 600: train loss: 1.2997362613677979
Epoch 700: train loss: 1.1240910291671753
Epoch 800: train loss: 0.9830479025840759
Epoch 900: train loss: 0.865081787109375
Epoch 1000: train loss: 0.7637797594070435
Epoch 1100: train loss: 0.6757708787918091
Epoch 1200: train loss: 0.5992141366004944
Epoch 1300: train loss: 0.5329111218452454
Epoch 1400: train loss: 0.4758740961551666
Epoch 1500: train loss: 0.4271615743637085
Epoch 1600: train loss: 0.38583385944366455
Epoch 1700: train loss: 0.35095855593681335
Epoch 1800: train loss: 0.32162806391716003
Epoch 1900: train loss: 0.296980619430542
Epoch 2000: train loss: 0.27621716260910034
Epoch 2100: train loss: 0.25861501693725586
Epoch 2200: train loss: 0.24353738129138947
Epoch 2300: train loss: 0.23043735325336456
Epoch 2400: train l

In [14]:
result_file = "results-ae-lr.csv"

In [15]:
from sklearn.linear_model import LinearRegression

featured_x_train = ae_model.encode(torch.tensor(x_train, dtype=torch.float).to(device)).detach().cpu().numpy()
lr = LinearRegression()
lr.fit(featured_x_train, y_train)
print(lr.score(featured_x_train, y_train))

0.9999999999982968


In [16]:
y_pred = np.zeros(x_test.shape[0])
# featured_x_test = nn_model.encode(ae_model.encoder(torch.tensor(x_test.to_numpy(), dtype=torch.float).to(device)))
# featured_x_test = scaler.transform(featured_x_test.detach().cpu().numpy())
# featured_x_test = torch.tensor(featured_x_test, dtype=torch.float).to(device)
# y_pred = one_model(featured_x_test).squeeze(-1).detach().cpu().numpy()
featured_x_test = ae_model.encode(torch.tensor(x_test.to_numpy(), dtype=torch.float).to(device))
y_pred = lr.predict(featured_x_test.detach().cpu().numpy())

assert y_pred.shape == (x_test.shape[0],)
y_pred = pd.DataFrame({"y": y_pred}, index=x_test.index)
y_pred.to_csv(result_file, index_label="Id")
print(f"Predictions saved to {result_file}, all done!")

Predictions saved to results-ae-lr.csv, all done!
