In [1]:
import os
import torch
from torch import optim, nn, utils, Tensor

# from torchvision.transforms import ToTensor
import numpy as np
import pandas as pd
import pytorch_lightning as pl
from sklearn.preprocessing import StandardScaler
from typing import Optional
from sklearn.impute import SimpleImputer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class Genentech404Dataset(utils.data.Dataset):
    def __init__(
        self,
        csv_file: str,
        is_test: bool = False,
        transform=StandardScaler(),
        imputer=SimpleImputer(strategy="median"),
    ) -> None:
        self.df = pd.read_csv(csv_file).set_index(["RID_HASH"])
        # self.df = self.df.astype(
        #     {
        #         "VISCODE": int,
        #         "AGE": float,
        #         "PTGENDER_num": pd.api.types.CategoricalDtype([0, 1]),
        #         "PTEDUCAT": int,
        #         "DX_num": pd.api.types.CategoricalDtype([0, 1, 2]),
        #         "APOE4": float,
        #         "CDRSB": float,
        #         "MMSE": float,
        #         "ADAS13": float,
        #         "Ventricles": float,
        #         "Hippocampus": float,
        #         "WholeBrain": float,
        #         "Entorhinal": float,
        #         "Fusiform": float,
        #         "MidTemp": float,
        #     }
        # )
        self.mask_categorical = np.array(
            [
                False,
                False,
                True,
                False,
                True,
                False,
                False,
                False,
                False,
                False,
                False,
                False,
                False,
                False,
                False,
            ]
        )
        self.transform = transform
        self.imputer = imputer
        if is_test:
            self.df.loc[:, ~self.mask_categorical] = self.transform.transform(
                self.df.loc[:, ~self.mask_categorical]
            )
            self.df.loc[:, :] = self.imputer.transform(self.df.values)
        else:
            self.df.loc[:, ~self.mask_categorical] = self.transform.fit_transform(
                self.df.loc[:, ~self.mask_categorical]
            )
            self.imputer.fit(self.df.values)

    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        return self.df.iloc[idx, :].values.astype(np.float32)


train_dataset = Genentech404Dataset("genentech-404-challenge/dev_set.csv")
train_loader = utils.data.DataLoader(train_dataset, batch_size=500)
mask_categorical = train_dataset.mask_categorical

test_a_dataset = Genentech404Dataset(
    "genentech-404-challenge/test_A_inferred.csv",
    is_test=True,
    transform=train_dataset.transform,
    imputer=train_dataset.imputer,
)
test_a_loader = utils.data.DataLoader(test_a_dataset, batch_size=500)
test_b_dataset = Genentech404Dataset(
    "genentech-404-challenge/test_B_inferred.csv",
    is_test=True,
    transform=train_dataset.transform,
    imputer=train_dataset.imputer,
)
test_b_loader = utils.data.DataLoader(test_b_dataset, batch_size=500)


In [3]:
# class LitAutoEncoder(pl.LightningModule):
#     def __init__(self, encoder, decoder):
#         super().__init__()
#         self.encoder = encoder
#         self.decoder = decoder

#     def training_step(self, batch, batch_idx):
#         # training_step defines the train loop.
#         # it is independent of forward
#         x = batch
#         x = x.view(x.size(0), -1)

#         z = self.encoder(x)
#         x_hat = self.decoder(z)
#         loss = nn.functional.mse_loss(x_hat, x)
#         # Logging to TensorBoard by default
#         self.log("train_loss", loss)
#         return loss

#     def configure_optimizers(self):
#         optimizer = optim.Adam(self.parameters(), lr=1e-3)
#         return optimizer


# encoder = nn.Sequential(nn.Linear(15 * 1, 5), nn.ReLU(), nn.Linear(5, 2))
# decoder = nn.Sequential(nn.Linear(2, 5), nn.ReLU(), nn.Linear(5, 15 * 1))
# autoencoder = LitAutoEncoder(encoder, decoder)
# trainer = pl.Trainer(limit_train_batches=100, max_epochs=3)
# trainer.fit(model=autoencoder, train_dataloaders=train_loader)


In [22]:
class AimNetSelfAttention(pl.LightningModule):
    def __init__(
        self,
        encoder_continuous,
        encoder_categorical,
        decoder_continuous,
        decoder_categorical,
        pooling,
        alpha: float = 1.0,
    ):
        super().__init__()
        self.encoder_continuous = encoder_continuous
        self.encoder_categorical = encoder_categorical
        self.decoder_continuous = decoder_continuous
        self.decoder_categorical = decoder_categorical
        self.pooling = pooling
        self.alpha = alpha

    def forward(self, batch):
        x = batch
        x = x.view(x.size(0), x.size(1), 1)
        x_categorical = x[:, mask_categorical, :]
        x_continuous = x[:, ~mask_categorical, :]
        z_categorical = self.encoder_categorical(x_categorical)
        z_continuous = self.encoder_continuous(x_continuous)

        z = torch.cat((z_categorical, z_continuous), 1)
        z, _ = self.pooling(z, z, z)
        z_categorical = z[:, mask_categorical, :]
        z_continuous = z[:, ~mask_categorical, :]

        x_continuous_hat = self.decoder_continuous(z_continuous)
        x_categorical_hat = self.decoder_categorical(z_categorical, x_categorical)
        batch[:, mask_categorical] = torch.stack([torch.argmax(ele, dim=1) for ele in x_categorical_hat], dim=1).to(torch.float)
        batch[:, ~mask_categorical] = x_continuous_hat.view(x_continuous_hat.size(0), x_continuous_hat.size(1))
        return batch, x_continuous_hat, x_categorical_hat

    def training_step(self, batch, batch_idx):
        x = batch
        x = x.view(x.size(0), x.size(1), 1)
        x_categorical = x[:, mask_categorical, :]
        x_continuous = x[:, ~mask_categorical, :]
        # z_categorical = self.encoder_categorical(x_categorical)
        # z_continuous = self.encoder_continuous(x_continuous)

        # z = torch.cat((z_categorical, z_continuous), 1)
        # z, _ = self.pooling(z, z, z)
        # z_categorical = z[:, mask_categorical, :]
        # z_continuous = z[:, ~mask_categorical, :]


        # x_continuous_hat = self.decoder_continuous(z_continuous)
        # x_categorical_hat = self.decoder_categorical(z_categorical, x_categorical)
        loss_ce = 0
        pred, x_continuous_hat, x_categorical_hat = self.forward(batch)

        for i, _ in enumerate(self.decoder_categorical.transforms):
            loss_ce += nn.functional.cross_entropy(x_categorical_hat[i], x_categorical[:, i, :].squeeze().to(torch.long))

        loss_mse = nn.functional.mse_loss(
            x_continuous_hat, x_continuous, reduction="mean"
        )
        loss = self.alpha * loss_ce + loss_mse

        self.log("train_loss", {"loss": loss, "loss_ce": loss_ce, "loss_mse": loss_mse})
        return loss

    def configure_optimizers(self):
        optimizer = optim.AdamW(self.parameters(), lr=1e-3)
        return optimizer


k = 20
p_dropout = 0.15
encoder_continuous = nn.Sequential(
    nn.Linear(1, k), nn.ReLU(), nn.Dropout1d(p_dropout), nn.Linear(k, k)
)


class CategoricalEmbedding(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.embed = nn.ModuleList([nn.Embedding(2, k), nn.Embedding(3, k)])

    def forward(self, x):
        return torch.concat(
            [embed(x[:, i, :].to(torch.int)) for i, embed in enumerate(self.embed)],
            dim=1,
        )


categorical_embedding = CategoricalEmbedding()
encoder_categorical = nn.Sequential(
    categorical_embedding,
    nn.Dropout1d(p_dropout),
)

pooling = nn.MultiheadAttention(k, 1)
decoder_continuous = nn.Sequential(nn.ReLU(), nn.Linear(k, 1))
# decoder_continuous = nn.Linear(k, 1)


class DecoderCategorical(nn.Module):
    def __init__(self, embedding, dim) -> None:
        super().__init__()
        self.embedding = embedding
        self.transforms = nn.ModuleList()
        for embed in self.embedding.embed:
            self.transforms.append(
                nn.Sequential(nn.Linear(k, embed.num_embeddings), nn.ReLU(), nn.Softmax(dim))
            )
            

    def forward(self, context_embedding, categorical_values):
        categorical_embeddings = self.embedding(categorical_values)
        out = []
        for i, module in enumerate(self.transforms):
            # import pdb
            # pdb.set_trace()
            out.append(module(categorical_embeddings[:, i, :]))
        return out

decoder_categorical = DecoderCategorical(categorical_embedding, 1)

In [23]:
aimnet = AimNetSelfAttention(
    encoder_continuous,
    encoder_categorical,
    decoder_continuous,
    decoder_categorical,
    pooling,
    0.2,
)
trainer = pl.Trainer(limit_train_batches=100, max_epochs=150, log_every_n_steps=25) #,fast_dev_run=True)
trainer.fit(model=aimnet, train_dataloaders=train_loader)


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name                | Type               | Params
-----------------------------------------------------------
0 | encoder_continuous  | Sequential         | 460   
1 | encoder_categorical | Sequential         | 100   
2 | decoder_continuous  | Sequential         | 21    
3 | decoder_categorical | DecoderCategorical | 205   
4 | pooling             | MultiheadAttention | 1.7 K 
-----------------------------------------------------------
2.4 K     Trainable params
0         Non-trainable params
2.4 K     Total params
0.009     Total estimated model params size (MB)


Epoch 149: 100%|██████████| 9/9 [00:00<00:00, 11.09it/s, loss=0.647, v_num=44]

`Trainer.fit` stopped: `max_epochs=150` reached.


Epoch 149: 100%|██████████| 9/9 [00:00<00:00, 10.88it/s, loss=0.647, v_num=44]


In [24]:
finetuner = pl.Trainer(
    limit_train_batches=100, max_epochs=10, log_every_n_steps=25
)  # ,fast_dev_run=True)
finetuner.fit(
    model=aimnet,
    train_dataloaders=utils.data.DataLoader(
        utils.data.ConcatDataset([test_a_dataset, test_b_dataset]),
        batch_size=500,
    ), 
)


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name                | Type               | Params
-----------------------------------------------------------
0 | encoder_continuous  | Sequential         | 460   
1 | encoder_categorical | Sequential         | 100   
2 | decoder_continuous  | Sequential         | 21    
3 | decoder_categorical | DecoderCategorical | 205   
4 | pooling             | MultiheadAttention | 1.7 K 
-----------------------------------------------------------
2.4 K     Trainable params
0         Non-trainable params
2.4 K     Total params
0.009     Total estimated model params size (MB)


Epoch 9: 100%|██████████| 6/6 [00:00<00:00,  6.19it/s, loss=0.571, v_num=45]

`Trainer.fit` stopped: `max_epochs=10` reached.


Epoch 9: 100%|██████████| 6/6 [00:00<00:00,  6.14it/s, loss=0.571, v_num=45]


In [7]:
results = finetuner.predict(model=aimnet, dataloaders=test_a_loader)
predicts = torch.concat([result[0] for result in results], dim=0).numpy()
predicts[:, ~mask_categorical] = train_dataset.transform.inverse_transform(predicts[:, ~mask_categorical])

df_test_a = pd.read_csv("genentech-404-challenge/test_A.csv").set_index(["RID_HASH"])
df_test_a_inferred = pd.read_csv("genentech-404-challenge/test_A_inferred.csv").set_index(["RID_HASH"])
df_test_a_pred = pd.DataFrame(predicts, columns=df_test_a_inferred.columns, index=df_test_a_inferred.index)

mask = (~df_test_a_inferred.isna()) & (df_test_a.isna())
df_test_a_pred[mask] = df_test_a_inferred[mask]
df_test_a_pred.VISCODE = df_test_a.VISCODE

  rank_zero_warn(


Predicting DataLoader 0: 100%|██████████| 3/3 [00:00<00:00, 15.61it/s]


In [8]:
results = finetuner.predict(model=aimnet, dataloaders=test_b_loader)
predicts = torch.concat([result[0] for result in results], dim=0).numpy()
predicts[:, ~mask_categorical] = train_dataset.transform.inverse_transform(predicts[:, ~mask_categorical])

df_test_b = pd.read_csv("genentech-404-challenge/test_B.csv").set_index(["RID_HASH"])
df_test_b_inferred = pd.read_csv("genentech-404-challenge/test_B_inferred.csv").set_index(["RID_HASH"])
df_test_b_pred = pd.DataFrame(predicts, columns=df_test_b_inferred.columns, index=df_test_b_inferred.index)

mask = (~df_test_b_inferred.isna()) & (df_test_b.isna())
df_test_b_pred[mask] = df_test_b_inferred[mask]
df_test_b_pred.VISCODE = df_test_b.VISCODE

  rank_zero_warn(


Predicting DataLoader 0: 100%|██████████| 3/3 [00:00<00:00,  8.25it/s]


In [9]:
def get_submission_df(
    ref_df: pd.DataFrame, df_pred_test_a: pd.DataFrame, df_pred_test_b: pd.DataFrame
):
    df_submit = []
    for df, test_type in zip([df_pred_test_a, df_pred_test_b], ["test_A", "test_B"]):
        df_new = pd.melt(
            df,
            id_vars=["RID_HASH", "VISCODE"],
            value_vars=[
                "AGE",
                "PTGENDER_num",
                "PTEDUCAT",
                "DX_num",
                "APOE4",
                "CDRSB",
                "MMSE",
                "ADAS13",
                "Ventricles",
                "Hippocampus",
                "WholeBrain",
                "Entorhinal",
                "Fusiform",
                "MidTemp",
            ],
        ).rename(columns={"value": "Predicted"})
        df_new["Id"] = df_new.apply(
            lambda x: f"{x['RID_HASH']}_{x['VISCODE']}_{x['variable']}_{test_type}",
            axis=1,
        )
        df_submit.append(df_new[["Id", "Predicted"]])
    df_submit = pd.concat(df_submit).set_index("Id")
    return df_submit.loc[ref_df["Id"], :]

In [10]:
df_sample_sub = pd.read_csv("genentech-404-challenge/sample_submission.csv")
df_submission = get_submission_df(
    df_sample_sub,
    df_test_a_pred.reset_index(),
    df_test_b_pred.reset_index(),
)
df_submission.to_csv("attention_dropout_11092022.csv")