In [1]:
import autoencoder
import utils
import mrrmse

import pandas as pd
import torch

from sklearn.model_selection import KFold, train_test_split
import numpy as np
import tqdm

from hyperopt import hp
from hyperopt.pyll import scope
import ray
from ray import train, tune
from ray.tune.schedulers import ASHAScheduler
from ray.tune.search.hyperopt import HyperOptSearch

## Prepare data:
#### Read joined data (pre + post treatment)

In [2]:
lincs_joined_df = pd.read_parquet("data/lincs_pretreatment.parquet")
kaggle_joined_df = pd.read_parquet("data/kaggle_pretreatment.parquet")
test_joined_df = pd.read_parquet("data/test_pretreatment.parquet")
print(f"lincs_joined_df = {lincs_joined_df.shape}\nkaggle_joined_df = {kaggle_joined_df.shape}\ntest_joined_df = {test_joined_df.shape}")

lincs_joined_df = (107404, 1842)
kaggle_joined_df = (602, 1841)
test_joined_df = (255, 921)


#### Kaggle provided data

In [3]:
de_train = pd.read_parquet('data/de_train.parquet')
id_map = pd.read_csv('data/id_map.csv',index_col='id')
print(f"de_train = {de_train.shape}\nid_map = {id_map.shape}")

de_train = (614, 18216)
id_map = (255, 2)


#### Define features of interest and sort data accordingly.

In [4]:
features = ['cell_type', 'sm_name']
multiindex_features = [("label",'cell_type'),("label",'sm_name')]

transcriptome_cols = de_train.columns[5:]
landmark_cols = kaggle_joined_df["post_treatment"].columns
print(f"transcriptome_cols = {transcriptome_cols.shape}\nlandmark_cols = {landmark_cols.shape}")

transcriptome_cols = (18211,)
landmark_cols = (918,)


In [5]:
unique_sm_name = pd.concat([lincs_joined_df[("label","sm_name")],kaggle_joined_df[("label","sm_name")]]).drop_duplicates().reset_index(drop=True)
unique_cell_type = pd.concat([lincs_joined_df[("label","cell_type")],kaggle_joined_df[("label","cell_type")]]).drop_duplicates().reset_index(drop=True)
print(f"Number of unique molecules = {len(unique_sm_name)}.\nNumber of unique cell types = {len(unique_cell_type)}.")

Number of unique molecules = 1896.
Number of unique cell types = 36.


In [6]:
# We only need to sort these two dataframes because they represent the same underlying dataset.
de_train = de_train.query("~control").sort_values(features)
kaggle_joined_df = kaggle_joined_df.sort_values(multiindex_features)
# Sanity check that these dfs align.
genes_align = (kaggle_joined_df["post_treatment"] == de_train[landmark_cols]).all(axis=None)
labels_align = (kaggle_joined_df["label"][features] == de_train[features]).all(axis=None)
genes_align and labels_align

True

#### CV splits

In [7]:
eval_cells_only_df = kaggle_joined_df[kaggle_joined_df["label"]["cell_type"].isin(["B cells", "Myeloid cells"])][multiindex_features]
len(eval_cells_only_df)

30

In [8]:
fold_to_eval_df = {}
skf = KFold(n_splits=3, random_state=42, shuffle=True)
for i,(t,v) in enumerate(skf.split(eval_cells_only_df)):
    fold_to_eval_df[i] = eval_cells_only_df.iloc[v]

for i, df in fold_to_eval_df.items():
    print(f"fold = {i} of shape {df.shape}")

fold = 0 of shape (10, 2)
fold = 1 of shape (10, 2)
fold = 2 of shape (10, 2)


In [9]:
def make_mask(fold):
    val = fold_to_eval_df[fold]
    return kaggle_joined_df[("label","sm_name")].isin(val[("label","sm_name")]) & kaggle_joined_df[("label","cell_type")].isin(val[("label","cell_type")])

print("Using fold 0 as validation set:")
print(f"Train data = {pd.concat([kaggle_joined_df[~make_mask(0)],lincs_joined_df]).shape}")
print(f"Validation data = {kaggle_joined_df[make_mask(0)].shape}")

Using fold 0 as validation set:
Train data = (107994, 1843)
Validation data = (12, 1841)


In [10]:
class Translator(torch.nn.Module):
    def __init__(self,config):
        super(Translator,self).__init__()
        # This will eventually be changed to a GNN
        self.smiles_embed = torch.nn.Embedding(len(unique_sm_name), config["sm_emb_size"])

        # This needs to be able to handle out of dictionary
        self.cell_embed = torch.nn.Embedding(len(unique_cell_type), config["cell_emb_size"])

        self.config = config
        input_dim = config["sm_emb_size"] + config["cell_emb_size"] + config["latent_dim"]
        self.translation = utils.make_sequential(input_dim,config["hidden_dim"],config["latent_dim"],config["dropout"])

    def forward(self,inp,z):
        sm_emb = self.smiles_embed(inp["sm_name"])
        ct_emb = self.cell_embed(inp["cell_type"])
        x = torch.cat((sm_emb, ct_emb, z), dim=1)
        return self.translation(x)

In [11]:
class RNVAE(torch.nn.Module):
    cell_type_map = {v: k for k,v in unique_cell_type.to_dict().items()}
    sm_name_map = {v: k for k,v in unique_sm_name.to_dict().items()}
    
    def __init__(self,config):
        super(RNVAE,self).__init__()
        self.vae = autoencoder.AutoEncoder(target_dim=len(landmark_cols),config=config)
        self.translator = Translator(config)

    @classmethod
    def make_input(cls, df):
        ct = torch.tensor(df[("label","cell_type")].map(cls.cell_type_map).to_numpy())
        sm = torch.tensor(df[("label","sm_name")].map(cls.sm_name_map).to_numpy())
        pre = torch.tensor(df["pre_treatment"].to_numpy())
        post = torch.tensor(df["post_treatment"].to_numpy())
        
        return [{"cell_type":ct[i],
                "sm_name":sm[i],
                "pre_treatment":pre[i],
                "post_treatment":post[i]} for i in range(len(df))]

    @classmethod
    def make_test(cls,df):
        ct = torch.tensor(df[("label","cell_type")].map(cls.cell_type_map).to_numpy())
        sm = torch.tensor(df[("label","sm_name")].map(cls.sm_name_map).to_numpy())
        pre = torch.tensor(df["pre_treatment"].to_numpy())
        
        return [{"cell_type":ct[i],
                "sm_name":sm[i],
                "pre_treatment":pre[i]} for i in range(len(df))]
    
    def forward(self,inp):
        latent = self.vae.latent(inp["pre_treatment"])
        z_prime = self.translator(inp,latent["z"])
        x_hat = self.vae.decode(z_prime)
        return {"x_hat":x_hat, "mu": latent["mu"], "log_var":latent["log_var"]}

    def loss_function(self,fwd,inp):
        return self.vae.loss_function(fwd,inp["post_treatment"])

In [12]:
class Imputer(torch.nn.Module):
    def __init__(self,config,rnvae):
        super(Imputer,self).__init__()
        self.impute_loss_weight = config["impute_loss_weight"]
        self.imp = utils.make_sequential(len(landmark_cols),config["hidden_dim"],len(transcriptome_cols),config["dropout"])
        self.rnvae = rnvae

    @classmethod
    def make_input(cls, mask):
        kg_df = kaggle_joined_df[mask]
        trn_df = de_train[mask]
        rninp = RNVAE.make_input(kg_df)
        trm = trn_df[transcriptome_cols]
        for i,inp in enumerate(rninp):
            inp["transcriptome"] = torch.tensor(trm.iloc[i].to_numpy(), dtype=torch.float)
        return rninp

    def forward(self,inp):
        fwd = self.rnvae(inp)
        trm = self.imp(fwd["x_hat"])
        fwd["transcriptome"] = trm
        return fwd

    def loss_function(self,fwd,inp):
        trm_loss = torch.nn.functional.mse_loss(fwd["transcriptome"], inp["transcriptome"])
        lossdict = self.rnvae.loss_function(fwd,inp)
        lossdict["loss"] += self.impute_loss_weight*trm_loss
        lossdict["Transcriptome_Loss"] = trm_loss.detach()
        return lossdict

In [13]:
rnvaed = RNVAE.make_input(lincs_joined_df)
rnvae_loader = torch.utils.data.DataLoader(rnvaed, batch_size=128)

In [14]:
train_loaders = []
eval_loaders = []
for fold in fold_to_eval_df:
    traind = Imputer.make_input(~make_mask(fold))
    train_loaders.append(torch.utils.data.DataLoader(traind, batch_size=128))
    
    evald = Imputer.make_input(make_mask(fold))
    eval_loaders.append(torch.utils.data.DataLoader(evald, batch_size=len(evald)))

In [15]:
all_mask = make_mask(0) | True
all_train = Imputer.make_input(all_mask)
all_loader = torch.utils.data.DataLoader(all_train, batch_size=32)

submit_data = RNVAE.make_test(test_joined_df)
submit_loader = torch.utils.data.DataLoader(submit_data, batch_size=len(submit_data))

In [16]:
metric = "mse"

def do_epoch(models):
    rnvae = models["rnvae"]
    imputer = models["imputer"]
    
    rnvaeopt = models["rnvae_opt"]
    imputeopt = models["impute_opt"]

    for batch in models["rnvae_loader"]:
        rnvaeopt.zero_grad()
        fwd = rnvae(batch)
        loss = rnvae.loss_function(fwd,batch)["loss"]
        if torch.isnan(loss):
            return loss.detach()
        loss.backward()
        rnvaeopt.step()

    for batch in models["train_loader"]:
        imputeopt.zero_grad()
        fwd = imputer(batch)
        loss = imputer.loss_function(fwd,batch)["loss"]
        if torch.isnan(loss):
            return loss.detach()
        loss.backward()
        imputeopt.step()

    with torch.no_grad():
        eval = next(iter(models["eval_loader"]))
        fwd = imputer(batch)
        # The eval loss we wish to optimize is how well the model
        # predicts the full transcriptome.
        return imputer.loss_function(fwd,batch)["Transcriptome_Loss"]

def make_models(config, input_data, fold):
    rnvae = RNVAE(config)
    imputer = Imputer(config,rnvae)
    return {
        "rnvae": rnvae,
        "imputer": imputer,
        "rnvae_opt": torch.optim.Adam(rnvae.parameters(), lr=config["lr_rnvae"]),
        "impute_opt": torch.optim.Adam(imputer.parameters(), lr=config["lr_imputer"]),
        "rnvae_loader": input_data["rnvae_loader"], # There is just one rnvae_loader shared across all folds
        "train_loader": input_data["train_loaders"][fold],
        "eval_loader": input_data["eval_loaders"][fold]
    }

def train_model(config, input_data):
    all_models = []
    for fold in input_data["fold_to_eval_df"]:
        all_models.append(make_models(config, input_data, fold))

    for i in range(input_data["epochs"]):
        losses = []
        for fold in input_data["fold_to_eval_df"]:
            losses.append(do_epoch(all_models[fold]))
        
        if np.any(np.isnan(losses)):
            train.report({metric: np.nan, "done": True})
        else:
            train.report({metric: np.mean(losses)})


In [17]:
num_samples = 1
epochs = 100

input_data = {
    "rnvae_loader": rnvae_loader,
    "train_loaders": train_loaders,
    "eval_loaders": eval_loaders,
    "fold_to_eval_df": fold_to_eval_df,
    "epochs": epochs,
}

space = {
    "lr_rnvae": hp.loguniform("lr_rnvae", -10, -1),
    "lr_imputer": hp.loguniform("lr_imputer", -10, -1),
    "dropout": hp.uniform("dropout", 0, 1),
    "sm_emb_size": scope.int(hp.qloguniform("sm_emb_size", 1, 3, 1)),
    "cell_emb_size": scope.int(hp.qloguniform("cell_emb_size", 1, 3, 1)),
    "latent_dim": scope.int(hp.qloguniform("latent_dim", 1, 7, 1)),
    "hidden_dim": scope.int(hp.qloguniform("hidden_dim", 1, 7, 1)),
    "kld_weight": hp.loguniform("kld_weight", -2, 2),
    "impute_loss_weight": hp.loguniform("impute_loss_weight", -2, 2),
}

mode = "min"
hyperopt_search = HyperOptSearch(space, metric=metric, mode=mode)
scheduler = ASHAScheduler(metric=metric, grace_period=5, mode=mode, max_t=epochs)
tuner = tune.Tuner(
    tune.with_parameters(train_model, input_data=input_data),
    tune_config=tune.TuneConfig(
        num_samples=num_samples,
        search_alg=hyperopt_search,
        scheduler=scheduler
    ),
    run_config=train.RunConfig(
        failure_config=train.FailureConfig(fail_fast=False))
)
print("Starting raytune")
results = tuner.fit()
print("DONE")
best_result = results.get_best_result(metric, mode=mode)
print(best_result.path)
print("CONFIG:", best_result.config)
print("METRICS:", best_result.metrics)

0,1
Current time:,2023-10-16 20:12:48
Running for:,00:02:25.65
Memory:,4.8/8.0 GiB

Trial name,# failures,error file
train_model_a0a18116,1,"/Users/laurasisson/ray_results/train_model_2023-10-16_20-08-24/train_model_a0a18116_1_cell_emb_size=9,dropout=0.1328,hidden_dim=3,impute_loss_weight=2.8849,kld_weight=1.2408,latent_dim=10,lr_im_2023-10-16_20-10-23/error.txt"

Trial name,status,loc,cell_emb_size,dropout,hidden_dim,impute_loss_weight,kld_weight,latent_dim,lr_imputer,lr_rnvae,sm_emb_size
train_model_a0a18116,ERROR,127.0.0.1:2356,9,0.13276,3,2.8849,1.24084,10,0.0135559,0.0208805,8


[2m[33m(raylet)[0m [2023-10-16 20:11:17,827 E 2210 21502] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-10-16_20-08-24_899383_2187 is over 95% full, available space: 11412480000; capacity: 245107195904. Object creation will fail if spilling is required.
[2m[33m(raylet)[0m [2023-10-16 20:11:27,922 E 2210 21502] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-10-16_20-08-24_899383_2187 is over 95% full, available space: 9386250240; capacity: 245107195904. Object creation will fail if spilling is required.
[2m[33m(raylet)[0m [2023-10-16 20:11:37,927 E 2210 21502] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-10-16_20-08-24_899383_2187 is over 95% full, available space: 7283499008; capacity: 245107195904. Object creation will fail if spilling is required.
[2m[33m(raylet)[0m [2023-10-16 20:11:48,026 E 2210 21502] (raylet) file_system_monitor.cc:111: /tmp/ray/session_2023-10-16_20-08-24_899383_2187 is over 95% full, available space: 7334846464

DONE
/Users/laurasisson/ray_results/train_model_2023-10-16_20-08-24/train_model_a0a18116_1_cell_emb_size=9,dropout=0.1328,hidden_dim=3,impute_loss_weight=2.8849,kld_weight=1.2408,latent_dim=10,lr_im_2023-10-16_20-10-23
CONFIG: {'cell_emb_size': 9, 'dropout': 0.13276016694006587, 'hidden_dim': 3, 'impute_loss_weight': 2.8848952010691225, 'kld_weight': 1.2408397715503854, 'latent_dim': 10, 'lr_imputer': 0.013555912518663813, 'lr_rnvae': 0.020880461237649188, 'sm_emb_size': 8}
METRICS: {'trial_id': 'a0a18116', 'date': '2023-10-16_20-10-29', 'timestamp': 1697501429, 'pid': 2356, 'hostname': 'Lauras-Air', 'node_ip': '127.0.0.1', 'config': {'cell_emb_size': 9, 'dropout': 0.13276016694006587, 'hidden_dim': 3, 'impute_loss_weight': 2.8848952010691225, 'kld_weight': 1.2408397715503854, 'latent_dim': 10, 'lr_imputer': 0.013555912518663813, 'lr_rnvae': 0.020880461237649188, 'sm_emb_size': 8}}


In [18]:
best_input_data = {
    "rnvae_loader": rnvae_loader,
    "train_loaders": [all_loader],
    "eval_loaders": [all_loader],
    "fold_to_eval_df": fold_to_eval_df,
}

best_models = make_models(best_result.config,best_input_data,0)
print(best_result.config)
# Because we trained the models on a cross-validation split, we want to train one final model
# across all data available.

loss = 0
for _ in tqdm.tqdm(range(best_result.metrics["training_iteration"])):
    loss = do_epoch(best_models)
print(loss)

with torch.no_grad():
    submitbatch = next(iter(submit_loader))
    # This is the most elegant line of python ever written.
    y_pred = best_models["imputer"](submitbatch)["transcriptome"]


submission = pd.DataFrame(y_pred, columns=transcriptome_cols, index=id_map.index)
display(submission)
submission.to_csv('submissions/rnvae.csv')

{'cell_emb_size': 9, 'dropout': 0.13276016694006587, 'hidden_dim': 3, 'impute_loss_weight': 2.8848952010691225, 'kld_weight': 1.2408397715503854, 'latent_dim': 10, 'lr_imputer': 0.013555912518663813, 'lr_rnvae': 0.020880461237649188, 'sm_emb_size': 8}


KeyError: 'training_iteration'