# Baseline 3
#### Training landmark and transcriptome models together

In [76]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
import torch

from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, FunctionTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Ridge
from sklearn.model_selection import StratifiedKFold

## Prepare data:
#### Read joined data (pre + post treatment)

In [26]:
# Read the three joined (pre + post treatment) tables in one pass; the paths are
# relative to the notebook's working directory.
lincs_joined_df, kaggle_joined_df, test_joined_df = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        "data/lincs_pretreatment.parquet",
        "data/kaggle_pretreatment.parquet",
        "data/test_pretreatment.parquet",
    )
)
print(f"lincs_joined_df = {lincs_joined_df.shape}\nkaggle_joined_df = {kaggle_joined_df.shape}\ntest_joined_df = {test_joined_df.shape}")

lincs_joined_df = (107404, 1842)
kaggle_joined_df = (602, 1841)
test_joined_df = (255, 921)


In [135]:
# Stack the Kaggle rows on top of the LINCS rows. pd.concat's default outer join keeps
# the union of both column sets, and the original row labels are kept as-is.
all_train_df = pd.concat(objs=(kaggle_joined_df, lincs_joined_df), axis=0)
print(f"all_train_df = {all_train_df.shape}")

all_train_df = (108006, 1843)


#### Kaggle provided data

In [8]:
# Kaggle-provided files: the training table, and the id map indexed by its 'id' column.
de_train = pd.read_parquet(path='data/de_train.parquet')
id_map = pd.read_csv(filepath_or_buffer='data/id_map.csv', index_col='id')

#### Define features of interest and sort data accordingly.

In [25]:
# Label columns fed to the models, and the matching ("label", <name>) keys used to
# address the same columns in the two-level column index of the joined frames.
features = ['cell_type', 'sm_name']
multiindex_features = [("label", feature_name) for feature_name in features]

# Every de_train column from position 5 onwards is treated as a transcriptome target.
transcriptome_cols = de_train.columns[5:]
# The landmark columns are the ones under the "post_treatment" group of the Kaggle frame.
landmark_cols = kaggle_joined_df["post_treatment"].columns
print(f"transcriptome_cols = {transcriptome_cols.shape}\nlandmark_cols = {landmark_cols.shape}")

transcriptome_cols = (18211,)
landmark_cols = (918,)


In [32]:
# de_train and kaggle_joined_df represent the same underlying dataset, so sorting just
# these two frames by the same label columns is enough to line their rows up.
de_train = (
    de_train
    .query("~control")
    .sort_values(by=features)
)
kaggle_joined_df = kaggle_joined_df.sort_values(by=multiindex_features)
# Sanity check that these dfs align: landmark values first, then the label columns.
# (An element-wise `==` between DataFrames also requires identical row/column labels.)
genes_align = (
    kaggle_joined_df["post_treatment"] == de_train[landmark_cols]
).all(axis=None)
labels_align = (
    kaggle_joined_df["label"][features] == de_train[features]
).all(axis=None)
genes_align and labels_align

True

## Prepare for training
#### Partition data

In [71]:
# Restrict to the rows of the two cell types that are used for evaluation.
eval_cells_only_df = (
    kaggle_joined_df
    .loc[kaggle_joined_df["label"]["cell_type"].isin(["B cells", "Myeloid cells"])]
    .reset_index(drop=True)
)
# Per compound: average every post-treatment column over that compound's rows, then
# average across columns to get a single score per compound.
mean_score = (
    eval_cells_only_df["post_treatment"]
    .groupby(eval_cells_only_df["label"]["sm_name"])
    .mean()
    .mean(axis=1)
)
# Bucket the scores with np.digitize; these buckets are the stratification classes.
classes = np.digitize(mean_score.values, bins=[0, 0.1, 0.5, 1])
cpds = mean_score.index.values

# Display-only table (score and class per compound); nothing below depends on it.
mean_score.to_frame(name="mean_score").assign(cv_class=classes)

Unnamed: 0_level_0,mean_score,cv_class
sm_name,Unnamed: 1_level_1,Unnamed: 2_level_1
Alvocidib,1.144652,4
CHIR-99021,0.145969,2
Crizotinib,-0.010615,0
Dactolisib,-0.077123,0
Foretinib,-0.007336,0
Idelalisib,0.013947,1
LDN 193189,0.070423,1
Linagliptin,0.033001,1
MLN 2238,2.045447,4
O-Demethylated Adapalene,-0.026293,0


#### CV splits

In [72]:
n_splits = 3
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# fold_arr[i] ends up holding the fold in which compound cpds[i] is the validation
# compound; -1 marks "not assigned yet".
fold_arr = np.full(len(cpds), -1)
for fold, (_, val_ind) in enumerate(skf.split(classes, classes)):
    fold_arr[val_ind] = fold

# Two views of the same assignment: compound -> fold, and fold -> array of compounds.
fold_map = dict(zip(cpds, fold_arr))
fold_to_cpds = {fold_id: cpds[fold_arr == fold_id] for fold_id in range(n_splits)}

fold_to_cpds



{0: array(['Alvocidib', 'Crizotinib', 'Foretinib', 'LDN 193189', 'R428'],
       dtype=object),
 1: array(['CHIR-99021', 'MLN 2238', 'Palbociclib', 'Penfluridol',
        'Porcn Inhibitor III'], dtype=object),
 2: array(['Dactolisib', 'Idelalisib', 'Linagliptin',
        'O-Demethylated Adapalene', 'Oprozomib (ONX 0912)'], dtype=object)}

In [105]:
# Quick check that a default FunctionTransformer passes data through unchanged in both
# directions, so it can stand in for "no dimensionality reduction".
identity_transformer = FunctionTransformer()
X = np.arange(4).reshape(2, 2)
(identity_transformer.inverse_transform(X), identity_transformer.transform(X))

(array([[0, 1],
        [2, 3]]),
 array([[0, 1],
        [2, 3]]))

#### Model helper functions

In [165]:
def make_transformer(n_components):
    """Create the transformer applied to a model's targets.

    Parameters
    ----------
    n_components : int
        Number of TruncatedSVD components. ``0`` means "no reduction" and yields a
        default ``FunctionTransformer`` (used as an identity transformer).

    Returns
    -------
    FunctionTransformer or TruncatedSVD
        An unfitted transformer.
    """
    if n_components != 0:
        return TruncatedSVD(n_components=n_components, random_state=1)
    # Identity transformer
    return FunctionTransformer()

# sklearn pipelines only accept preprocessors as intermediate steps, so a Ridge model
# cannot sit in the middle of one. The target transformer is therefore returned
# separately from the (encoder -> Ridge) pipeline instead of being chained into it.
def make_landmark_model(params):
    """Build the unfitted landmark-stage model pair.

    Parameters
    ----------
    params : dict
        Full hyper-parameter dict; only ``params["lmk"]`` is read, with keys
        ``"n_comp"`` (passed to ``make_transformer``), ``"alpha"`` (Ridge penalty) and
        ``"encoded"`` (iterable of column names to one-hot encode).

    Returns
    -------
    tuple
        ``(svd, model)``: the target transformer, and a pipeline that one-hot encodes
        the ``"encoded"`` columns and feeds them to an intercept-free Ridge.
    """
    lmk_params = params["lmk"]
    target_transformer = make_transformer(lmk_params["n_comp"])
    encoder = ColumnTransformer([('ohe', OneHotEncoder(), list(lmk_params["encoded"]))])
    regressor = Ridge(alpha=lmk_params["alpha"], fit_intercept=False)
    return target_transformer, make_pipeline(encoder, regressor)

def make_transcriptome_model(params):
    """Build the unfitted transcriptome-stage model pair.

    Parameters
    ----------
    params : dict
        Full hyper-parameter dict; only ``params["trm"]`` is read, with keys
        ``"n_comp"`` (passed to ``make_transformer``) and ``"alpha"`` (Ridge penalty).

    Returns
    -------
    tuple
        ``(svd, model)``: the target transformer and an intercept-free Ridge.
    """
    trm_params = params["trm"]
    return (
        make_transformer(trm_params["n_comp"]),
        Ridge(alpha=trm_params["alpha"], fit_intercept=False),
    )

# Fits both objects of the pair in place; the pair is handed back only for convenience.
def model_train(mp,x,y):
    """Fit a ``(svd, model)`` pair.

    The targets ``y`` are first passed through ``svd.fit_transform``; ``model`` is then
    fitted to map ``x`` onto those transformed targets.

    Parameters
    ----------
    mp : tuple
        ``(svd, model)`` as produced by the ``make_*_model`` helpers.
    x : model inputs, forwarded to ``model.fit``.
    y : raw targets, forwarded to ``svd.fit_transform``.

    Returns
    -------
    tuple
        The same ``(svd, model)`` objects, now fitted.
    """
    reducer, regressor = mp
    regressor.fit(x, reducer.fit_transform(y))
    return reducer, regressor

def model_predict(mp,x):
    """Predict with a fitted ``(svd, model)`` pair.

    ``model.predict(x)`` produces values in the transformed target space; they are
    mapped back to the original target space with ``svd.inverse_transform``.

    Parameters
    ----------
    mp : tuple
        Fitted ``(svd, model)`` pair.
    x : model inputs, forwarded to ``model.predict``.

    Returns
    -------
    Whatever ``svd.inverse_transform`` returns for the model's predictions.
    """
    reducer, regressor = mp
    return reducer.inverse_transform(regressor.predict(x))
    

#### Draw some example models

In [161]:
# Sample hyper-parameters: "lmk" configures the landmark stage, "trm" the transcriptome
# stage. They are used here only to display what each (transformer, model) pair looks like.
example_params = {"lmk":{"n_comp":150,"alpha":.1,"encoded":("sm_name",)},"trm":{"n_comp":150,"alpha":.1}}
print("Landmark")
# Pass example_params explicitly: the name `params` is not defined at notebook level, so
# using it here raises NameError on a fresh kernel (or silently picks up stale state).
display(make_landmark_model(example_params))
print("Transcriptome")
display(make_transcriptome_model(example_params))

Landmark


Transcriptome


#### Cross Validation function

In [None]:
def crossvalidate(params, print_each=False):
    """Cross-validate the two-stage (landmark -> transcriptome) model over the compound folds.

    For each fold in ``fold_to_cpds`` the held-out rows are those whose cell type is
    'Myeloid cells' or 'B cells' AND whose compound belongs to the fold. A landmark model
    is fitted on the remaining rows of ``all_train_df``; its predictions for the Kaggle
    rows are then the inputs of a transcriptome model fitted on the remaining Kaggle rows
    and scored on the held-out ones.

    Reads the notebook-level names ``all_train_df``, ``kaggle_joined_df``, ``de_train``,
    ``features``, ``transcriptome_cols`` and ``fold_to_cpds``.

    Parameters
    ----------
    params : dict
        Hyper-parameters with an "lmk" and a "trm" entry, as consumed by
        ``make_landmark_model`` and ``make_transcriptome_model``.
    print_each : bool, default False
        When True, also print the score of every individual fold.

    Returns
    -------
    float
        Mean over folds of the per-fold score, where a fold's score is the mean over
        held-out rows of the row-wise RMSE. The overall score is always printed.
    """
    def train_landmark(cpds):
        # Held-out rows: evaluation cell types combined with this fold's compounds.
        mask_va = all_train_df[("label",'cell_type')].isin(['Myeloid cells', 'B cells']) & all_train_df[("label",'sm_name')].isin(cpds)

        # Only the training rows are needed here. The held-out slice of this large frame
        # was previously materialized on every fold and never used, so it is not built.
        train = all_train_df[~mask_va]

        svd, model = model_train(make_landmark_model(params), train["label"][features], train["post_treatment"])
        return svd, model

    def train_transcriptome(landmark_mp,cpds):
        mask_va = kaggle_joined_df[("label",'cell_type')].isin(['Myeloid cells', 'B cells']) & kaggle_joined_df[("label",'sm_name')].isin(cpds)
        mask_tr = ~mask_va # 485 or 487 training rows

        # The masks are built from kaggle_joined_df but also applied to de_train below.
        # That relies on the two frames being row-aligned, which the earlier sanity-check
        # cell verifies.
        train_x = model_predict(landmark_mp, kaggle_joined_df[mask_tr]["label"][features])
        train_y = de_train[mask_tr][transcriptome_cols]

        val_x = model_predict(landmark_mp, kaggle_joined_df[mask_va]["label"][features])
        val_y = de_train[mask_va][transcriptome_cols]

        transcriptome_mp = model_train(make_transcriptome_model(params), train_x, train_y)
        pred_y = model_predict(transcriptome_mp, val_x)
        # Mean row-wise RMSE: RMSE across columns for each held-out row, averaged over rows.
        mrrmse = np.sqrt(np.square(val_y - pred_y).mean(axis=1)).mean()
        return mrrmse

    mrrmse_list = []
    for fold, cpds in fold_to_cpds.items():
        landmark_mp = train_landmark(cpds)
        mrrmse = train_transcriptome(landmark_mp,cpds)
        mrrmse_list.append(mrrmse)
        if print_each:
            print(f"# Fold {fold}: {mrrmse:5.3f}")

    mrrmse = np.array(mrrmse_list).mean()
    print(f"# Overall {mrrmse:5.3f} {params}")
    return mrrmse
                
# Run the cross-validation once on the example hyper-parameters, printing each fold's score.
crossvalidate(example_params, print_each=True)

# Fold 0: 2.012
# Fold 1: 2.666
