# Baseline 3
#### Training landmark and transcriptome models together

In [54]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
import torch

from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, FunctionTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Ridge
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import GradientBoostingClassifier

import itertools
import tqdm
import collections

## Prepare data:
#### Read joined data (pre + post treatment)

In [3]:
lincs_joined_df = pd.read_parquet("data/lincs_pretreatment.parquet")
kaggle_joined_df = pd.read_parquet("data/kaggle_pretreatment.parquet")
test_joined_df = pd.read_parquet("data/test_pretreatment.parquet")
print(f"lincs_joined_df = {lincs_joined_df.shape}\nkaggle_joined_df = {kaggle_joined_df.shape}\ntest_joined_df = {test_joined_df.shape}")

lincs_joined_df = (107404, 1842)
kaggle_joined_df = (602, 1841)
test_joined_df = (255, 921)


In [4]:
all_train_df = pd.concat([kaggle_joined_df,lincs_joined_df])
print(f"all_train_df = {all_train_df.shape}")

all_train_df = (108006, 1843)


#### Kaggle provided data

In [5]:
de_train = pd.read_parquet('data/de_train.parquet')
id_map = pd.read_csv('data/id_map.csv',index_col='id')

#### Define features of interest and sort data accordingly.

In [6]:
features = ['cell_type', 'sm_name']
multiindex_features = [("label",'cell_type'),("label",'sm_name')]

transcriptome_cols = de_train.columns[5:]
landmark_cols = kaggle_joined_df["post_treatment"].columns
print(f"transcriptome_cols = {transcriptome_cols.shape}\nlandmark_cols = {landmark_cols.shape}")

transcriptome_cols = (18211,)
landmark_cols = (918,)


In [7]:
# We only need to sort these two dataframes because they represent the same underlying dataset.
de_train = de_train.query("~control").sort_values(features)
kaggle_joined_df = kaggle_joined_df.sort_values(multiindex_features)
# Sanity check that these dfs align.
genes_align = (kaggle_joined_df["post_treatment"] == de_train[landmark_cols]).all(axis=None)
labels_align = (kaggle_joined_df["label"][features] == de_train[features]).all(axis=None)
genes_align and labels_align

True

## Prepare for training
#### Partition data

In [8]:
eval_cells_only_df = kaggle_joined_df[kaggle_joined_df["label"]["cell_type"].isin(["B cells", "Myeloid cells"])].reset_index(drop=True)
mean_score = pd.concat([eval_cells_only_df["label"]["sm_name"],eval_cells_only_df["post_treatment"]],axis=1).groupby("sm_name").agg("mean").mean(axis=1)
classes = np.digitize(mean_score.values, bins=[0,0.1, 0.5, 1])
cpds = mean_score.index.values

# Don't need this df it just looks cool
mean_score.to_frame(name="mean_score").assign(cv_class=classes)

Unnamed: 0_level_0,mean_score,cv_class
sm_name,Unnamed: 1_level_1,Unnamed: 2_level_1
Alvocidib,1.144652,4
CHIR-99021,0.145969,2
Crizotinib,-0.010615,0
Dactolisib,-0.077123,0
Foretinib,-0.007336,0
Idelalisib,0.013947,1
LDN 193189,0.070423,1
Linagliptin,0.033001,1
MLN 2238,2.045447,4
O-Demethylated Adapalene,-0.026293,0


#### CV splits

In [9]:
fold_arr = np.full(len(cpds), -1)
n_splits = 3
skf = StratifiedKFold(n_splits=n_splits, random_state=42, shuffle=True)
for fold, (_, val_ind) in enumerate(skf.split(classes,classes)):
    fold_arr[val_ind] = fold
fold_map = {c: f for c, f in zip(cpds, fold_arr)}
fold_to_cpds = {fold: cpds[fold_arr==fold] for fold in range(n_splits)}

fold_to_cpds



{0: array(['Alvocidib', 'Crizotinib', 'Foretinib', 'LDN 193189', 'R428'],
       dtype=object),
 1: array(['CHIR-99021', 'MLN 2238', 'Palbociclib', 'Penfluridol',
        'Porcn Inhibitor III'], dtype=object),
 2: array(['Dactolisib', 'Idelalisib', 'Linagliptin',
        'O-Demethylated Adapalene', 'Oprozomib (ONX 0912)'], dtype=object)}

In [10]:
identity_transformer = FunctionTransformer()
X = np.array([[0, 1], [2, 3]])
X
identity_transformer.inverse_transform(X), identity_transformer.transform(X)

(array([[0, 1],
        [2, 3]]),
 array([[0, 1],
        [2, 3]]))

#### Model helper functions

In [55]:
def make_transformer(n_components):
    if n_components == 0:
        # Identity transformer
        return FunctionTransformer()
    else:
        return TruncatedSVD(n_components=n_components, random_state=1)

# Unfortunately, sklearn doesn't allow us to use a Ridge model as an intermediate step
# because intermediate steps are supposed to be preprocessors
def make_landmark_model(params):
    params = params["lmk"]
    svd = make_transformer(params["n_comp"])
    model = make_pipeline(ColumnTransformer([('ohe', OneHotEncoder(), list(params["encoded"]))]),
                          Ridge(alpha=params["alpha"], fit_intercept=False))
    return svd, model

def make_landmark_ensemble(params):
    params = params["lmk"]
    svd = make_transformer(params["n_comp"])
    model = make_pipeline(ColumnTransformer([('ohe', OneHotEncoder(), list(params["encoded"]))]),
                          GradientBoostingClassifier(n_estimators=10))
    return svd, model
    

def make_transcriptome_model(params):
    params = params["trm"]
    svd = make_transformer(params["n_comp"])
    model = Ridge(alpha=params["alpha"], fit_intercept=False)
    return svd, model

def make_transcriptome_ensemble(params):
    params = params["lmk"]
    svd = make_transformer(params["n_comp"])
    model = GradientBoostingClassifier(n_estimators=10)
    return sv, model

# This changes the models inplace. Returns as a courtesy.
def model_train(mp,x,y):
    svd, model = mp
    z = svd.fit_transform(y)
    model.fit(x, z)
    return svd, model

def model_predict(mp,x):
    svd, model = mp
    d = model.predict(x)
    return svd.inverse_transform(d)
    

#### Draw some example models

In [12]:
example_params = {"lmk":{"n_comp":150,"alpha":.1,"encoded":("sm_name",)},"trm":{"n_comp":150,"alpha":.1}}
print("Landmark")
display(make_landmark_model(example_params))
print("Transcriptome")
display(make_transcriptome_model(example_params))

Landmark


(TruncatedSVD(n_components=150, random_state=1),
 Pipeline(steps=[('columntransformer',
                  ColumnTransformer(transformers=[('ohe', OneHotEncoder(),
                                                   ['sm_name'])])),
                 ('ridge', Ridge(alpha=0.1, fit_intercept=False))]))

Transcriptome


(TruncatedSVD(n_components=150, random_state=1),
 Ridge(alpha=0.1, fit_intercept=False))

#### Cross Validation function

In [13]:
def crossvalidate(params, print_each=False, print_result=False):
    def train_landmark(cpds):
        mask_va = all_train_df[("label",'cell_type')].isin(['Myeloid cells', 'B cells']) & all_train_df[("label",'sm_name')].isin(cpds)
        validate = all_train_df[mask_va]
    
        mask_tr = ~mask_va
        train = all_train_df[mask_tr]

        svd, model = model_train(make_landmark_model(params), train["label"][features], train["post_treatment"])
        return svd, model

    def train_transcriptome(landmark_mp,cpds):
        mask_va = kaggle_joined_df[("label",'cell_type')].isin(['Myeloid cells', 'B cells']) & kaggle_joined_df[("label",'sm_name')].isin(cpds)
        mask_tr = ~mask_va # 485 or 487 training rows

        train_x = model_predict(landmark_mp, kaggle_joined_df[mask_tr]["label"][features])
        train_y = de_train[mask_tr][transcriptome_cols]

        val_x = model_predict(landmark_mp, kaggle_joined_df[mask_va]["label"][features])
        val_y = de_train[mask_va][transcriptome_cols]

        transcriptome_mp = model_train(make_transcriptome_model(params), train_x, train_y)
        pred_y = model_predict(transcriptome_mp, val_x)
        mrrmse = np.sqrt(np.square(val_y - pred_y).mean(axis=1)).mean()
        return mrrmse
    
    mrrmse_list = []
    for fold, cpds in fold_to_cpds.items():
        landmark_mp = train_landmark(cpds)
        mrrmse = train_transcriptome(landmark_mp,cpds)
        mrrmse_list.append(mrrmse)
        if print_each:
            print(f"# Fold {fold}: {mrrmse:5.3f}")

    mrrmse = np.array(mrrmse_list).mean()
    if print_result:
        print(f"# Overall {mrrmse:5.3f} {params}")
    return mrrmse
                
crossvalidate(example_params, print_each=True)

# Fold 0: 2.012
# Fold 1: 2.666
# Fold 2: 2.172


2.283452586892314

In [14]:
import itertools

alpha_list = [0.1, 1, 5] #10
encoded = [('sm_name',)] #,('cell_type',),('sm_name','cell_type')
n_comps_list = [0,10,30,100]


all_trials = [{"lmk":{"n_comp":lmk_n_comp,"alpha":lmk_alpha,"encoded":encoded},"trm":{"n_comp":trm_n_comp,"alpha":trm_alpha}} for encoded, lmk_alpha, lmk_n_comp, trm_alpha, trm_n_comp in itertools.product(encoded,alpha_list,n_comps_list,alpha_list,n_comps_list)]
print(f"Generated {len(all_trials)} trials.\nExample = {next(iter(all_trials))}")

Generated 144 trials.
Example = {'lmk': {'n_comp': 0, 'alpha': 0.1, 'encoded': ('sm_name',)}, 'trm': {'n_comp': 0, 'alpha': 0.1}}


In [15]:
# best_score = float('inf')
# best_params = dict()
# scores = collections.Counter()
# for i,params in enumerate(tqdm.tqdm(all_trials,smoothing=0)):
#     score = crossvalidate(params)
#     if score < best_score:
#         best_score = score
#         best_params = params
#     scores[i] = -score
# for i, score in scores.most_common(5):
#     print(all_trials[i],score)
# print(f"Best score = {best_score}.\nBest params = {best_params}")

## Submission
We retrain the model on the full training data and create a submission file.

In [56]:
best_params = {'lmk': {'n_comp': 10, 'alpha': .1, 'encoded': ('sm_name',)}, 'trm': {'n_comp': 30, 'alpha': 0.1}}
lm_mp = model_train(make_landmark_ensemble(best_params), all_train_df["label"][features], all_train_df["post_treatment"])

train_x = model_predict(lm_mp, kaggle_joined_df["label"][features])
train_y = de_train[transcriptome_cols]

trm_mp = model_train(make_transcriptome_ensemble(best_params), train_x, train_y)

ValueError: y should be a 1d array, got an array of shape (108006, 10) instead.

Make submission

In [None]:
# Full pipeline. Nice
test_lm_pred = model_predict(lm_mp, test_joined_df["label"][features])
y_pred = model_predict(trm_mp, test_lm_pred)

submission = pd.DataFrame(y_pred, columns=transcriptome_cols, index=id_map.index)
display(submission)
submission.to_csv('submissions/pipeline.csv')

#### Ok actually lets do an ensemble.

In [48]:
def get_y_pred(params):
    lm_mp = model_train(make_landmark_model(best_params), all_train_df["label"][features], all_train_df["post_treatment"])
    
    train_x = model_predict(lm_mp, kaggle_joined_df["label"][features])
    train_y = de_train[transcriptome_cols]
    
    trm_mp = model_train(make_transcriptome_model(best_params), train_x, train_y)

    test_lm_pred = model_predict(lm_mp, test_joined_df["label"][features])
    y_pred = model_predict(trm_mp, test_lm_pred)

In [49]:
import random
all_y_preds = []
all_weights = []
for _ in tqdm.tqdm(range(15)):
    params = {'lmk': {'n_comp': random.randint(0,100), 'alpha': random.uniform(0, 5), 'encoded': ('sm_name',)}, 'trm': {'n_comp': random.randint(0,100), 'alpha': random.uniform(0, 5)}}
    print(params)
    all_y_preds.append(get_y_pred(params))
    all_weights.append(-1 * crossvalidate(params))

  0%|                                                    | 0/15 [00:00<?, ?it/s]

{'lmk': {'n_comp': 37, 'alpha': 0.6766698708413749, 'encoded': ('sm_name',)}, 'trm': {'n_comp': 75, 'alpha': 1.2466075280510602}}


  7%|██▉                                         | 1/15 [00:25<05:52, 25.16s/it]

{'lmk': {'n_comp': 9, 'alpha': 4.375727108057015, 'encoded': ('sm_name',)}, 'trm': {'n_comp': 74, 'alpha': 0.02350408230853529}}


 13%|█████▊                                      | 2/15 [00:43<04:34, 21.11s/it]

{'lmk': {'n_comp': 9, 'alpha': 1.2248363506063868, 'encoded': ('sm_name',)}, 'trm': {'n_comp': 18, 'alpha': 4.345966417786871}}


 20%|████████▊                                   | 3/15 [01:03<04:06, 20.52s/it]

{'lmk': {'n_comp': 53, 'alpha': 0.8204848408743526, 'encoded': ('sm_name',)}, 'trm': {'n_comp': 61, 'alpha': 3.7623945363907274}}


 27%|███████████▋                                | 4/15 [01:28<04:07, 22.49s/it]

{'lmk': {'n_comp': 64, 'alpha': 3.559461670375254, 'encoded': ('sm_name',)}, 'trm': {'n_comp': 10, 'alpha': 0.26792099957616944}}


 33%|██████████████▋                             | 5/15 [01:53<03:53, 23.40s/it]

{'lmk': {'n_comp': 24, 'alpha': 3.052612679782597, 'encoded': ('sm_name',)}, 'trm': {'n_comp': 53, 'alpha': 1.2918209592998753}}


 40%|█████████████████▌                          | 6/15 [02:10<03:11, 21.27s/it]

{'lmk': {'n_comp': 21, 'alpha': 4.888251769858893, 'encoded': ('sm_name',)}, 'trm': {'n_comp': 3, 'alpha': 2.520143732214115}}


 47%|████████████████████▌                       | 7/15 [02:25<02:33, 19.17s/it]

{'lmk': {'n_comp': 64, 'alpha': 0.3307471399571149, 'encoded': ('sm_name',)}, 'trm': {'n_comp': 92, 'alpha': 0.09841299488117627}}


 53%|███████████████████████▍                    | 8/15 [02:49<02:24, 20.65s/it]

{'lmk': {'n_comp': 0, 'alpha': 4.8590444696058, 'encoded': ('sm_name',)}, 'trm': {'n_comp': 69, 'alpha': 1.9014601181280277}}


 60%|██████████████████████████▍                 | 9/15 [03:45<03:09, 31.65s/it]

{'lmk': {'n_comp': 99, 'alpha': 2.6419737238378804, 'encoded': ('sm_name',)}, 'trm': {'n_comp': 42, 'alpha': 2.229393521574456}}


 67%|████████████████████████████▋              | 10/15 [04:20<02:43, 32.77s/it]

{'lmk': {'n_comp': 71, 'alpha': 3.452037317691561, 'encoded': ('sm_name',)}, 'trm': {'n_comp': 40, 'alpha': 3.101925125822185}}


 73%|███████████████████████████████▌           | 11/15 [04:48<02:05, 31.27s/it]

{'lmk': {'n_comp': 30, 'alpha': 3.266412312874235, 'encoded': ('sm_name',)}, 'trm': {'n_comp': 67, 'alpha': 4.893448924389875}}


 80%|██████████████████████████████████▍        | 12/15 [05:10<01:25, 28.48s/it]

{'lmk': {'n_comp': 98, 'alpha': 3.1245419915708657, 'encoded': ('sm_name',)}, 'trm': {'n_comp': 19, 'alpha': 1.8027170714496965}}


 87%|█████████████████████████████████████▎     | 13/15 [05:41<00:58, 29.28s/it]

{'lmk': {'n_comp': 39, 'alpha': 3.7129052726939165, 'encoded': ('sm_name',)}, 'trm': {'n_comp': 8, 'alpha': 0.6539264587280358}}


 93%|████████████████████████████████████████▏  | 14/15 [06:02<00:26, 26.84s/it]

{'lmk': {'n_comp': 8, 'alpha': 1.2747717672661618, 'encoded': ('sm_name',)}, 'trm': {'n_comp': 79, 'alpha': 0.5766482881899804}}


100%|███████████████████████████████████████████| 15/15 [06:21<00:00, 25.41s/it]


In [50]:
# Quick sanity check to make sure our submission is ordered correctly.
assert (test_joined_df["label"][features] == id_map[features]).all(axis=None)

# Full pipeline. Nice
all_preds = np.stack(all_y_preds,axis=0)
display(all_preds.shape)
display(all_weights)
y_pred = np.average(all_preds,axis=0,weights=all_weights)

submission = pd.DataFrame(y_pred, columns=transcriptome_cols, index=id_map.index)
display(submission)
submission.to_csv('submissions/pipeline.csv')

(15,)

[-2.2754360523325095,
 -2.251186682426177,
 -2.2476691931644672,
 -2.271947437061014,
 -2.2895347914265276,
 -2.2546052732720407,
 -2.2669610384332466,
 -2.276153892398829,
 -2.278998108895446,
 -2.2727875959736568,
 -2.2706932268775644,
 -2.268242801505886,
 -2.276486070050241,
 -2.2988028204769724,
 -2.2698669736068426]

TypeError: unsupported operand type(s) for *: 'NoneType' and 'float'

In [None]:
np.std(all_preds,axis=0).mean()