In [1]:
import os
import torch
from torch import optim, nn, utils, Tensor

# from torchvision.transforms import ToTensor
import numpy as np
import pandas as pd
import pytorch_lightning as pl
from sklearn.preprocessing import StandardScaler
from typing import Optional
from sklearn.impute import SimpleImputer
from sklearn.utils import resample
from sklearn.model_selection import GroupShuffleSplit

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df_dev_set = pd.read_csv("genentech-404-challenge/dev_set.csv").set_index("RID_HASH")
df_dev_1 = pd.read_csv("genentech-404-challenge/dev_1.csv").set_index("RID_HASH")
df_dev_2 = pd.read_csv("genentech-404-challenge/dev_2.csv").set_index("RID_HASH")
df_dev_3 = pd.read_csv("genentech-404-challenge/dev_3.csv").set_index("RID_HASH")
df_test_a = pd.read_csv("genentech-404-challenge/test_A.csv").set_index("RID_HASH")
df_test_b = pd.read_csv("genentech-404-challenge/test_B.csv").set_index("RID_HASH")
df_test_a_inferred = pd.read_csv("genentech-404-challenge/test_A_inferred.csv").set_index("RID_HASH")
df_test_b_inferred = pd.read_csv("genentech-404-challenge/test_B_inferred.csv").set_index("RID_HASH")

mask_categorical = np.array(
    [
        False,
        False,
        True,
        False,
        True,
        False,
        False,
        False,
        False,
        False,
        False,
        False,
        False,
        False,
        False,
    ]
)
scaler = StandardScaler().fit(df_dev_set.loc[:, ~mask_categorical])
std = df_dev_set.std().values

df_dev123 = pd.concat([df_dev_1, df_dev_2, df_dev_3])
gss = GroupShuffleSplit(n_splits=1, train_size=.8, random_state=0)
for train_idx, val_idx in gss.split(range(df_dev123.shape[0]), groups=df_dev123.index.values):
    print(train_idx.shape, val_idx.shape)
df_train = df_dev123.iloc[train_idx,:]
df_train_target = pd.concat([df_dev_set, df_dev_set, df_dev_set]).iloc[train_idx, :]
df_val = df_dev123.iloc[val_idx,:]
df_val_target = pd.concat([df_dev_set, df_dev_set, df_dev_set]).iloc[val_idx, :]

(9906,) (2397,)


In [3]:
class Genentech404Dataset(utils.data.Dataset):
    def __init__(
        self,
        df_input: pd.DataFrame,
        mask_categorical: np.ndarray,
        df_target: Optional[pd.DataFrame] = None,
        transform=StandardScaler(),
        self_training: bool = False,
    ) -> None:
        continous_columns = df_input.columns[~mask_categorical]
        self.df_input = df_input.reset_index()
        self.df_target = (
            df_target.reset_index() if df_target is not None else self.df_input.copy()
        )
        self.self_training = self_training  # this is for whether to apply dropout
        self.mask_categorical = mask_categorical
        self.transform = transform

        self.df_input.loc[:, continous_columns] = self.transform.transform(
            self.df_input.loc[:, continous_columns]
        )
        self.df_input = self.df_input.fillna(
            0
        )  # fill missing values to mimic dropout in training

        if df_target is not None:
            self.df_target.loc[:, continous_columns] = self.transform.transform(
                self.df_target.loc[:, continous_columns]
            )
        else:
            self.df_target = self.df_input.copy()  # this is just for test case
        self.df_input.set_index("RID_HASH", inplace=True)
        self.df_target.set_index("RID_HASH", inplace=True)

    def __len__(self):
        return self.df_input.shape[0]

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        return (
            self.df_input.iloc[idx, :].values.astype(np.float32),
            self.df_target.iloc[idx, :].values.astype(np.float32),
            self.self_training,
        )


self_train_dataset = Genentech404Dataset(
    df_dev_set, mask_categorical, df_dev_set, scaler, self_training=True
)
self_train_loader = utils.data.DataLoader(self_train_dataset, batch_size=500)

supervised_train_dataset = Genentech404Dataset(
    df_train, mask_categorical, df_train_target, scaler, self_training=False
)
supervised_train_loader = utils.data.DataLoader(self_train_dataset, batch_size=500)
train_loader = utils.data.DataLoader(
        utils.data.ConcatDataset([supervised_train_dataset, self_train_dataset]),
        batch_size=500,
        shuffle=True,
)

val_dataset = Genentech404Dataset(
    df_val, mask_categorical, df_val_target, scaler, self_training=False
)
val_loader = utils.data.DataLoader(val_dataset, batch_size=500)


test_a_dataset = Genentech404Dataset(
    df_test_a_inferred,
    mask_categorical,
    None,
    scaler,
    self_training=False,
)
test_a_loader = utils.data.DataLoader(test_a_dataset, batch_size=500)
test_b_dataset = Genentech404Dataset(
    df_test_b_inferred,
    mask_categorical,
    None,
    scaler,
    self_training=False,
)
test_b_loader = utils.data.DataLoader(test_b_dataset, batch_size=500)


In [4]:
class LitAutoEncoder(pl.LightningModule):
    def __init__(self, encoder, decoder, std, p_dropout=0.1):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.p_dropout = p_dropout
        self.std = std

    def forward(self, batch):
        x, y, self_train = batch
        x = x.view(x.size(0), -1)
        x = torch.stack(
            [
                nn.functional.dropout(
                    x[i, :], p=self.p_dropout, training=self_train[i].item()
                )
                for i in range(self_train.size(0))
            ]
        )
        z = self.encoder(x)
        x_hat = self.decoder(z)
        return y, x_hat, z

    def training_step(self, batch, batch_idx):
        y, x_hat, z = self.forward(batch)
        loss = nn.functional.mse_loss(x_hat, y)
        self.log("train_loss", loss)
        nmae_loss = (nn.functional.l1_loss(x_hat, y, reduction='none').detach().numpy()/self.std).mean()
        self.log("train_nmae", nmae_loss)
        return loss

    def validation_step(self, batch, batch_idx):
        y, x_hat, z = self.forward(batch)
        loss = nn.functional.mse_loss(x_hat, y)
        self.log("val_loss", loss)
        nmae_loss = (nn.functional.l1_loss(x_hat, y, reduction='none').detach().numpy()/self.std).mean()
        self.log("val_nmae", nmae_loss)
        return loss        
        

    def configure_optimizers(self):
        optimizer = optim.AdamW(self.parameters(), lr=1e-3)
        return optimizer


encoder = nn.Sequential(
    nn.Linear(15, 10),
    nn.Dropout(0.5),
    nn.SELU(),
    nn.Linear(10, 5),
    nn.SELU(),
    nn.Linear(5, 2),
)
decoder = nn.Sequential(
    nn.Linear(2, 5), nn.SELU(), nn.Linear(5, 10), nn.SELU(), nn.Linear(10, 15)
)
# encoder = nn.Sequential(nn.Dropout1d(p_dropout), nn.Linear(15 * 1, 5), nn.ReLU(), nn.Linear(5, 2))
# decoder = nn.Sequential(nn.Linear(2, 5), nn.ReLU(), nn.Linear(5, 15 * 1))
autoencoder = LitAutoEncoder(encoder, decoder, std, 0.1)
trainer = pl.Trainer(
    limit_train_batches=100, max_epochs=250, log_every_n_steps=25,  check_val_every_n_epoch=1, 
    callbacks=[pl.callbacks.early_stopping.EarlyStopping(monitor="val_nmae", mode="min", patience=10, verbose=False)]
)  # ,fast_dev_run=True)
trainer.fit(model=autoencoder, train_dataloaders=train_loader, val_dataloaders=val_loader)


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs

  | Name    | Type       | Params
---------------------------------------
0 | encoder | Sequential | 227   
1 | decoder | Sequential | 240   
---------------------------------------
467       Trainable params
0         Non-trainable params
467       Total params
0.002     Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(


                                                                           

  rank_zero_warn(


Epoch 62: 100%|██████████| 34/34 [00:04<00:00,  6.91it/s, loss=0.579, v_num=58]


In [5]:
finetuner = pl.Trainer(
    limit_train_batches=100, max_epochs=5, log_every_n_steps=25
)  # ,fast_dev_run=True)
finetuner.fit(
    model=autoencoder,
    train_dataloaders=utils.data.DataLoader(
        utils.data.ConcatDataset([test_a_dataset, test_b_dataset]),
        batch_size=500,
    ), 
)


GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(

  | Name    | Type       | Params
---------------------------------------
0 | encoder | Sequential | 227   
1 | decoder | Sequential | 240   
---------------------------------------
467       Trainable params
0         Non-trainable params
467       Total params
0.002     Total estimated model params size (MB)
  rank_zero_warn(


Epoch 4: 100%|██████████| 6/6 [00:00<00:00,  9.11it/s, loss=0.415, v_num=59]

`Trainer.fit` stopped: `max_epochs=5` reached.


Epoch 4: 100%|██████████| 6/6 [00:00<00:00,  8.93it/s, loss=0.415, v_num=59]


In [6]:
results = finetuner.predict(dataloaders=test_a_loader)
predicts = torch.concat([result[1] for result in results], dim=0).numpy()
predicts[:, ~mask_categorical] = train_dataset.transform.inverse_transform(predicts[:, ~mask_categorical])

df_test_a = pd.read_csv("genentech-404-challenge/test_A.csv").set_index(["RID_HASH"])
df_test_a_inferred = pd.read_csv("genentech-404-challenge/test_A_inferred.csv").set_index(["RID_HASH"])
df_test_a_pred = pd.DataFrame(predicts, columns=df_test_a_inferred.columns, index=df_test_a_inferred.index)

mask = (~df_test_a_inferred.isna()) & (df_test_a.isna())
df_test_a_pred[mask] = df_test_a_inferred[mask]
df_test_a_pred.VISCODE = df_test_a.VISCODE

  rank_zero_warn(
Restoring states from the checkpoint path at /Users/jj/Documents/Projects/kaggle/genentech-404/lightning_logs/version_59/checkpoints/epoch=4-step=30.ckpt
Loaded model weights from checkpoint at /Users/jj/Documents/Projects/kaggle/genentech-404/lightning_logs/version_59/checkpoints/epoch=4-step=30.ckpt
  rank_zero_warn(


Predicting DataLoader 0: 100%|██████████| 3/3 [00:00<00:00,  3.54it/s]


NameError: name 'train_dataset' is not defined

In [None]:
results = finetuner.predict(dataloaders=test_b_loader)
predicts = torch.concat([result[0] for result in results], dim=0).numpy()
predicts[:, ~mask_categorical] = train_dataset.transform.inverse_transform(predicts[:, ~mask_categorical])

df_test_b = pd.read_csv("genentech-404-challenge/test_B.csv").set_index(["RID_HASH"])
df_test_b_inferred = pd.read_csv("genentech-404-challenge/test_B_inferred.csv").set_index(["RID_HASH"])
df_test_b_pred = pd.DataFrame(predicts, columns=df_test_b_inferred.columns, index=df_test_b_inferred.index)

mask = (~df_test_b_inferred.isna()) & (df_test_b.isna())
df_test_b_pred[mask] = df_test_b_inferred[mask]
df_test_b_pred.VISCODE = df_test_b.VISCODE

In [None]:
def get_submission_df(
    ref_df: pd.DataFrame, df_pred_test_a: pd.DataFrame, df_pred_test_b: pd.DataFrame
):
    df_submit = []
    for df, test_type in zip([df_pred_test_a, df_pred_test_b], ["test_A", "test_B"]):
        df_new = pd.melt(
            df,
            id_vars=["RID_HASH", "VISCODE"],
            value_vars=[
                "AGE",
                "PTGENDER_num",
                "PTEDUCAT",
                "DX_num",
                "APOE4",
                "CDRSB",
                "MMSE",
                "ADAS13",
                "Ventricles",
                "Hippocampus",
                "WholeBrain",
                "Entorhinal",
                "Fusiform",
                "MidTemp",
            ],
        ).rename(columns={"value": "Predicted"})
        df_new["Id"] = df_new.apply(
            lambda x: f"{x['RID_HASH']}_{x['VISCODE']}_{x['variable']}_{test_type}",
            axis=1,
        )
        df_submit.append(df_new[["Id", "Predicted"]])
    df_submit = pd.concat(df_submit).set_index("Id")
    return df_submit.loc[ref_df["Id"], :]

In [None]:
df_sample_sub = pd.read_csv("genentech-404-challenge/sample_submission.csv")
df_submission = get_submission_df(
    df_sample_sub,
    df_test_a_pred.reset_index(),
    df_test_b_pred.reset_index(),
)
df_submission
# df_submission.to_csv("autoencoder_dropout_11102022.csv")