# Baseline 3
#### Training landmark and transcriptome models together

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
import torch

from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, FunctionTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Ridge
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import GradientBoostingClassifier

import itertools
import tqdm
import collections

In [57]:
weights = np.array([1,5,100])
weights = 1/weights
vals = np.array([[2,0,0],[0,2,0],[0,0,2]])
np.average(vals,axis=0,weights=weights)

array([1.65289256, 0.33057851, 0.01652893])

## Prepare data:
#### Read joined data (pre + post treatment)

In [2]:
lincs_joined_df = pd.read_parquet("data/lincs_pretreatment.parquet")
kaggle_joined_df = pd.read_parquet("data/kaggle_pretreatment.parquet")
test_joined_df = pd.read_parquet("data/test_pretreatment.parquet")
print(f"lincs_joined_df = {lincs_joined_df.shape}\nkaggle_joined_df = {kaggle_joined_df.shape}\ntest_joined_df = {test_joined_df.shape}")

lincs_joined_df = (107404, 1842)
kaggle_joined_df = (602, 1841)
test_joined_df = (255, 921)


In [3]:
all_train_df = pd.concat([kaggle_joined_df,lincs_joined_df])
print(f"all_train_df = {all_train_df.shape}")

all_train_df = (108006, 1843)


#### Kaggle provided data

In [4]:
de_train = pd.read_parquet('data/de_train.parquet')
id_map = pd.read_csv('data/id_map.csv',index_col='id')

#### Define features of interest and sort data accordingly.

In [6]:
features = ['cell_type', 'sm_name']
multiindex_features = [("label",'cell_type'),("label",'sm_name')]

transcriptome_cols = de_train.columns[5:]
landmark_cols = kaggle_joined_df["post_treatment"].columns
print(f"transcriptome_cols = {transcriptome_cols.shape}\nlandmark_cols = {landmark_cols.shape}")

transcriptome_cols = (18211,)
landmark_cols = (918,)


In [7]:
# We only need to sort these two dataframes because they represent the same underlying dataset.
de_train = de_train.query("~control").sort_values(features)
kaggle_joined_df = kaggle_joined_df.sort_values(multiindex_features)
# Sanity check that these dfs align.
genes_align = (kaggle_joined_df["post_treatment"] == de_train[landmark_cols]).all(axis=None)
labels_align = (kaggle_joined_df["label"][features] == de_train[features]).all(axis=None)
genes_align and labels_align

True

## Prepare for training
#### Partition data

In [8]:
eval_cells_only_df = kaggle_joined_df[kaggle_joined_df["label"]["cell_type"].isin(["B cells", "Myeloid cells"])].reset_index(drop=True)
mean_score = pd.concat([eval_cells_only_df["label"]["sm_name"],eval_cells_only_df["post_treatment"]],axis=1).groupby("sm_name").agg("mean").mean(axis=1)
classes = np.digitize(mean_score.values, bins=[0,0.1, 0.5, 1])
cpds = mean_score.index.values

# Don't need this df it just looks cool
mean_score.to_frame(name="mean_score").assign(cv_class=classes)

Unnamed: 0_level_0,mean_score,cv_class
sm_name,Unnamed: 1_level_1,Unnamed: 2_level_1
Alvocidib,1.144652,4
CHIR-99021,0.145969,2
Crizotinib,-0.010615,0
Dactolisib,-0.077123,0
Foretinib,-0.007336,0
Idelalisib,0.013947,1
LDN 193189,0.070423,1
Linagliptin,0.033001,1
MLN 2238,2.045447,4
O-Demethylated Adapalene,-0.026293,0


#### CV splits

In [9]:
fold_arr = np.full(len(cpds), -1)
n_splits = 3
skf = StratifiedKFold(n_splits=n_splits, random_state=42, shuffle=True)
for fold, (_, val_ind) in enumerate(skf.split(classes,classes)):
    fold_arr[val_ind] = fold
fold_map = {c: f for c, f in zip(cpds, fold_arr)}
fold_to_cpds = {fold: cpds[fold_arr==fold] for fold in range(n_splits)}

fold_to_cpds



{0: array(['Alvocidib', 'Crizotinib', 'Foretinib', 'LDN 193189', 'R428'],
       dtype=object),
 1: array(['CHIR-99021', 'MLN 2238', 'Palbociclib', 'Penfluridol',
        'Porcn Inhibitor III'], dtype=object),
 2: array(['Dactolisib', 'Idelalisib', 'Linagliptin',
        'O-Demethylated Adapalene', 'Oprozomib (ONX 0912)'], dtype=object)}

In [10]:
identity_transformer = FunctionTransformer()
X = np.array([[0, 1], [2, 3]])
X
identity_transformer.inverse_transform(X), identity_transformer.transform(X)

(array([[0, 1],
        [2, 3]]),
 array([[0, 1],
        [2, 3]]))

#### Model helper functions

In [30]:
def _make_transformer(n_components):
    if n_components == 0:
        # Identity transformer
        return FunctionTransformer()
    else:
        return TruncatedSVD(n_components=n_components, random_state=1)

def make_encoder(params):
    encoder = ColumnTransformer([('ohe', OneHotEncoder(), list(params["encoded"]))])
    encoder.fit(all_train_df["label"][features])
    return encoder

def make_model(params):
    svd = _make_transformer(params["n_comp"])
    model = Ridge(alpha=params["alpha"], fit_intercept=False)
    return svd, model

# This changes the models inplace. Returns as a courtesy.
def model_train(mp,x,y):
    svd, model = mp
    z = svd.fit_transform(y)
    model.fit(x, z)
    return svd, model

def model_predict(mp,x):
    svd, model = mp
    d = model.predict(x)
    return svd.inverse_transform(d)
    

#### Draw some example models

In [21]:
example_params = {"encoded":("sm_name",),"lmk":{"n_comp":150,"alpha":.1,},"trm":{"n_comp":150,"alpha":.1}}
print("Encoder")
display(make_encoder(example_params))
print("Landmark")
display(make_model(example_params["lmk"]))
print("Transcriptome")
display(make_model(example_params["trm"]))

Encoder


Landmark


(TruncatedSVD(n_components=150, random_state=1),
 Ridge(alpha=0.1, fit_intercept=False))

Transcriptome


(TruncatedSVD(n_components=150, random_state=1),
 Ridge(alpha=0.1, fit_intercept=False))

#### Cross Validation function

In [31]:
def crossvalidate(encoder, params, print_each=False, print_result=False):
    def train_landmark(cpds):
        mask_va = all_train_df[("label",'cell_type')].isin(['Myeloid cells', 'B cells']) & all_train_df[("label",'sm_name')].isin(cpds)
        validate = all_train_df[mask_va]
    
        mask_tr = ~mask_va
        train = all_train_df[mask_tr]

        
        svd, model = model_train(make_model(params["lmk"]), encoder.transform(train["label"][features]), train["post_treatment"])
        return svd, model

    def train_transcriptome(landmark_mp,cpds):
        mask_va = kaggle_joined_df[("label",'cell_type')].isin(['Myeloid cells', 'B cells']) & kaggle_joined_df[("label",'sm_name')].isin(cpds)
        mask_tr = ~mask_va # 485 or 487 training rows

        train_x = model_predict(landmark_mp, encoder.transform(kaggle_joined_df[mask_tr]["label"][features]))
        train_y = de_train[mask_tr][transcriptome_cols]

        val_x = model_predict(landmark_mp, encoder.transform(kaggle_joined_df[mask_va]["label"][features]))
        val_y = de_train[mask_va][transcriptome_cols]

        transcriptome_mp = model_train(make_model(params["trm"]), train_x, train_y)
        pred_y = model_predict(transcriptome_mp, val_x)
        mrrmse = np.sqrt(np.square(val_y - pred_y).mean(axis=1)).mean()
        return mrrmse
    
    mrrmse_list = []
    for fold, cpds in fold_to_cpds.items():
        landmark_mp = train_landmark(cpds)
        mrrmse = train_transcriptome(landmark_mp,cpds)
        mrrmse_list.append(mrrmse)
        if print_each:
            print(f"# Fold {fold}: {mrrmse:5.3f}")

    mrrmse = np.array(mrrmse_list).mean()
    if print_result:
        print(f"# Overall {mrrmse:5.3f} {params}")
    return mrrmse

encoder = make_encoder(example_params)
crossvalidate(encoder,example_params, print_each=True)

# Fold 0: 2.012
# Fold 1: 2.666
# Fold 2: 2.172


2.283452586892314

In [14]:
import itertools

alpha_list = [0.1, 1, 5] #10
encoded = [('sm_name',)] #,('cell_type',),('sm_name','cell_type')
n_comps_list = [0,10,30,100]


all_trials = [{"lmk":{"n_comp":lmk_n_comp,"alpha":lmk_alpha,"encoded":encoded},"trm":{"n_comp":trm_n_comp,"alpha":trm_alpha}} for encoded, lmk_alpha, lmk_n_comp, trm_alpha, trm_n_comp in itertools.product(encoded,alpha_list,n_comps_list,alpha_list,n_comps_list)]
print(f"Generated {len(all_trials)} trials.\nExample = {next(iter(all_trials))}")

Generated 144 trials.
Example = {'lmk': {'n_comp': 0, 'alpha': 0.1, 'encoded': ('sm_name',)}, 'trm': {'n_comp': 0, 'alpha': 0.1}}


In [15]:
# best_score = float('inf')
# best_params = dict()
# scores = collections.Counter()
# for i,params in enumerate(tqdm.tqdm(all_trials,smoothing=0)):
#     score = crossvalidate(params)
#     if score < best_score:
#         best_score = score
#         best_params = params
#     scores[i] = -score
# for i, score in scores.most_common(5):
#     print(all_trials[i],score)
# print(f"Best score = {best_score}.\nBest params = {best_params}")

#### Ok actually lets do an ensemble.

In [49]:
def get_y_pred(encoder,params):
    # all_train_sample = all_train_df.sample(20000)
    lm_mp = model_train(make_model(params["lmk"]), encoder.transform(all_train_df["label"][features]), all_train_df["post_treatment"])
    
    train_x = model_predict(lm_mp, encoder.transform(kaggle_joined_df["label"][features]))
    train_y = de_train[transcriptome_cols]
    
    trm_mp = model_train(make_model(params["trm"]), train_x, train_y)

    test_lm_pred = model_predict(lm_mp, encoder.transform(test_joined_df["label"][features]))
    y_pred = model_predict(trm_mp, test_lm_pred)
    return y_pred

In [89]:
import random
all_y_preds = []
all_weights = []

for _ in tqdm.tqdm(range(50)):
    params = {'encoded': ('sm_name',), 'lmk': {'n_comp': random.randint(0,100), 'alpha': random.uniform(0, 5)}, 'trm': {'n_comp': random.randint(0,100), 'alpha': random.uniform(0, 5)}}
    encoder = make_encoder(params)
    print(params)
    all_y_preds.append(get_y_pred(encoder,params))
    all_weights.append(crossvalidate(encoder,params))

  0%|                                                    | 0/50 [00:00<?, ?it/s]

{'encoded': ('sm_name',), 'lmk': {'n_comp': 12, 'alpha': 0.6372746381314492}, 'trm': {'n_comp': 33, 'alpha': 1.696347271286029}}


  2%|▉                                           | 1/50 [00:25<21:05, 25.83s/it]

{'encoded': ('sm_name',), 'lmk': {'n_comp': 35, 'alpha': 3.2028291233139394}, 'trm': {'n_comp': 99, 'alpha': 3.1294617970555345}}


  4%|█▊                                          | 2/50 [00:49<19:34, 24.48s/it]

{'encoded': ('sm_name',), 'lmk': {'n_comp': 13, 'alpha': 1.8771882343814768}, 'trm': {'n_comp': 91, 'alpha': 4.279743381933521}}


  6%|██▋                                         | 3/50 [01:08<17:08, 21.89s/it]

{'encoded': ('sm_name',), 'lmk': {'n_comp': 74, 'alpha': 0.014470604091947892}, 'trm': {'n_comp': 93, 'alpha': 1.5895069448165495}}


  8%|███▌                                        | 4/50 [01:44<21:14, 27.70s/it]

{'encoded': ('sm_name',), 'lmk': {'n_comp': 22, 'alpha': 1.5662531218454507}, 'trm': {'n_comp': 86, 'alpha': 4.325688532234437}}


 10%|████▍                                       | 5/50 [02:07<19:29, 26.00s/it]

{'encoded': ('sm_name',), 'lmk': {'n_comp': 6, 'alpha': 2.0640321754784052}, 'trm': {'n_comp': 100, 'alpha': 1.7796942158685631}}


 12%|█████▎                                      | 6/50 [02:24<16:51, 22.99s/it]

{'encoded': ('sm_name',), 'lmk': {'n_comp': 75, 'alpha': 3.2796082217519578}, 'trm': {'n_comp': 7, 'alpha': 2.0814384207643593}}


 14%|██████▏                                     | 7/50 [02:55<18:12, 25.40s/it]

{'encoded': ('sm_name',), 'lmk': {'n_comp': 85, 'alpha': 2.6964965290905334}, 'trm': {'n_comp': 75, 'alpha': 0.9713075734495441}}


 16%|███████                                     | 8/50 [03:26<19:09, 27.37s/it]

{'encoded': ('sm_name',), 'lmk': {'n_comp': 31, 'alpha': 3.2237200993340704}, 'trm': {'n_comp': 40, 'alpha': 0.29550715795357396}}


 18%|███████▉                                    | 9/50 [03:47<17:10, 25.14s/it]

{'encoded': ('sm_name',), 'lmk': {'n_comp': 82, 'alpha': 1.239434812365593}, 'trm': {'n_comp': 47, 'alpha': 2.768121101621031}}


 20%|████████▌                                  | 10/50 [04:19<18:11, 27.29s/it]

{'encoded': ('sm_name',), 'lmk': {'n_comp': 68, 'alpha': 0.6749171316400637}, 'trm': {'n_comp': 2, 'alpha': 2.7075431430977375}}


 22%|█████████▍                                 | 11/50 [04:45<17:37, 27.13s/it]

{'encoded': ('sm_name',), 'lmk': {'n_comp': 85, 'alpha': 1.2024249100188455}, 'trm': {'n_comp': 97, 'alpha': 1.4033943051385656}}


 24%|██████████▎                                | 12/50 [05:18<18:15, 28.82s/it]

{'encoded': ('sm_name',), 'lmk': {'n_comp': 2, 'alpha': 4.132558350126613}, 'trm': {'n_comp': 6, 'alpha': 0.06267605853555624}}


 26%|███████████▏                               | 13/50 [05:29<14:26, 23.42s/it]

{'encoded': ('sm_name',), 'lmk': {'n_comp': 8, 'alpha': 4.727010466066426}, 'trm': {'n_comp': 17, 'alpha': 1.9522893458508594}}


 28%|████████████                               | 14/50 [05:43<12:22, 20.61s/it]

{'encoded': ('sm_name',), 'lmk': {'n_comp': 57, 'alpha': 2.7036827916437325}, 'trm': {'n_comp': 91, 'alpha': 2.9951411198115903}}


 30%|████████████▉                              | 15/50 [06:09<12:55, 22.16s/it]

{'encoded': ('sm_name',), 'lmk': {'n_comp': 3, 'alpha': 3.0012183155465872}, 'trm': {'n_comp': 40, 'alpha': 0.6803170885787374}}


 32%|█████████████▊                             | 16/50 [06:21<10:49, 19.09s/it]

{'encoded': ('sm_name',), 'lmk': {'n_comp': 87, 'alpha': 2.388281428542018}, 'trm': {'n_comp': 11, 'alpha': 0.5880191834677295}}


 34%|██████████████▌                            | 17/50 [06:52<12:26, 22.63s/it]

{'encoded': ('sm_name',), 'lmk': {'n_comp': 63, 'alpha': 0.9337520823938589}, 'trm': {'n_comp': 28, 'alpha': 2.51054340653904}}


 36%|███████████████▍                           | 18/50 [07:19<12:44, 23.89s/it]

{'encoded': ('sm_name',), 'lmk': {'n_comp': 29, 'alpha': 0.3933871105799003}, 'trm': {'n_comp': 67, 'alpha': 2.0087968673769705}}


 38%|████████████████▎                          | 19/50 [07:41<12:09, 23.53s/it]

{'encoded': ('sm_name',), 'lmk': {'n_comp': 85, 'alpha': 0.9787286588149335}, 'trm': {'n_comp': 46, 'alpha': 2.2119100382891324}}


 40%|█████████████████▏                         | 20/50 [08:14<13:05, 26.18s/it]

{'encoded': ('sm_name',), 'lmk': {'n_comp': 5, 'alpha': 4.242103488843444}, 'trm': {'n_comp': 54, 'alpha': 2.9631996232825335}}


 42%|██████████████████                         | 21/50 [08:25<10:32, 21.82s/it]

{'encoded': ('sm_name',), 'lmk': {'n_comp': 89, 'alpha': 3.9809518503172763}, 'trm': {'n_comp': 61, 'alpha': 0.7524492063816579}}


 44%|██████████████████▉                        | 22/50 [08:59<11:46, 25.23s/it]

{'encoded': ('sm_name',), 'lmk': {'n_comp': 1, 'alpha': 0.37142655430590477}, 'trm': {'n_comp': 93, 'alpha': 4.204868341373201}}


 46%|███████████████████▊                       | 23/50 [09:12<09:48, 21.78s/it]

{'encoded': ('sm_name',), 'lmk': {'n_comp': 87, 'alpha': 4.84978205026259}, 'trm': {'n_comp': 92, 'alpha': 0.772930786390007}}


 48%|████████████████████▋                      | 24/50 [09:42<10:30, 24.23s/it]

{'encoded': ('sm_name',), 'lmk': {'n_comp': 56, 'alpha': 0.29052107551482853}, 'trm': {'n_comp': 97, 'alpha': 2.436202672434381}}


 50%|█████████████████████▌                     | 25/50 [10:09<10:26, 25.07s/it]

{'encoded': ('sm_name',), 'lmk': {'n_comp': 3, 'alpha': 1.7217776654211048}, 'trm': {'n_comp': 35, 'alpha': 4.313217045432849}}


 52%|██████████████████████▎                    | 26/50 [10:22<08:31, 21.32s/it]

{'encoded': ('sm_name',), 'lmk': {'n_comp': 28, 'alpha': 1.173971707719767}, 'trm': {'n_comp': 52, 'alpha': 0.1490638806090705}}


 54%|███████████████████████▏                   | 27/50 [10:42<07:59, 20.84s/it]

{'encoded': ('sm_name',), 'lmk': {'n_comp': 0, 'alpha': 4.927521853300007}, 'trm': {'n_comp': 1, 'alpha': 2.4233555434645755}}


 56%|████████████████████████                   | 28/50 [11:40<11:47, 32.18s/it]

{'encoded': ('sm_name',), 'lmk': {'n_comp': 29, 'alpha': 3.9248558835264884}, 'trm': {'n_comp': 24, 'alpha': 3.0264389397042586}}


 58%|████████████████████████▉                  | 29/50 [11:59<09:51, 28.19s/it]

{'encoded': ('sm_name',), 'lmk': {'n_comp': 27, 'alpha': 0.04329332634019245}, 'trm': {'n_comp': 3, 'alpha': 1.2617997715068014}}


 60%|█████████████████████████▊                 | 30/50 [12:19<08:33, 25.69s/it]

{'encoded': ('sm_name',), 'lmk': {'n_comp': 9, 'alpha': 3.7152231139700076}, 'trm': {'n_comp': 83, 'alpha': 3.0738201140076393}}


 62%|██████████████████████████▋                | 31/50 [12:36<07:18, 23.07s/it]

{'encoded': ('sm_name',), 'lmk': {'n_comp': 41, 'alpha': 2.891954182153821}, 'trm': {'n_comp': 20, 'alpha': 4.641465582027316}}


 64%|███████████████████████████▌               | 32/50 [12:59<06:52, 22.94s/it]

{'encoded': ('sm_name',), 'lmk': {'n_comp': 68, 'alpha': 2.270897347642897}, 'trm': {'n_comp': 28, 'alpha': 1.9136008903544894}}


 66%|████████████████████████████▍              | 33/50 [13:25<06:47, 23.99s/it]

{'encoded': ('sm_name',), 'lmk': {'n_comp': 59, 'alpha': 0.8361833508200861}, 'trm': {'n_comp': 45, 'alpha': 3.548460573809122}}


 68%|█████████████████████████████▏             | 34/50 [13:51<06:35, 24.70s/it]

{'encoded': ('sm_name',), 'lmk': {'n_comp': 33, 'alpha': 3.20502281151511}, 'trm': {'n_comp': 100, 'alpha': 1.8565795171196464}}


 70%|██████████████████████████████             | 35/50 [14:12<05:53, 23.57s/it]

{'encoded': ('sm_name',), 'lmk': {'n_comp': 29, 'alpha': 4.809465328796195}, 'trm': {'n_comp': 29, 'alpha': 0.04586821467580493}}


 72%|██████████████████████████████▉            | 36/50 [14:32<05:14, 22.44s/it]

{'encoded': ('sm_name',), 'lmk': {'n_comp': 98, 'alpha': 0.6260483829346591}, 'trm': {'n_comp': 45, 'alpha': 2.1985174183508756}}


 74%|███████████████████████████████▊           | 37/50 [15:08<05:43, 26.39s/it]

{'encoded': ('sm_name',), 'lmk': {'n_comp': 41, 'alpha': 2.198441839445971}, 'trm': {'n_comp': 8, 'alpha': 3.170018446056732}}


 76%|████████████████████████████████▋          | 38/50 [15:30<05:01, 25.11s/it]

{'encoded': ('sm_name',), 'lmk': {'n_comp': 89, 'alpha': 2.6493016757387537}, 'trm': {'n_comp': 75, 'alpha': 1.868666155831456}}


 78%|█████████████████████████████████▌         | 39/50 [16:02<04:59, 27.20s/it]

{'encoded': ('sm_name',), 'lmk': {'n_comp': 30, 'alpha': 1.5537290334843497}, 'trm': {'n_comp': 50, 'alpha': 4.0662554780757905}}


 80%|██████████████████████████████████▍        | 40/50 [16:22<04:11, 25.19s/it]

{'encoded': ('sm_name',), 'lmk': {'n_comp': 69, 'alpha': 1.7966456791499352}, 'trm': {'n_comp': 91, 'alpha': 0.8267306380132083}}


 82%|███████████████████████████████████▎       | 41/50 [16:51<03:54, 26.08s/it]

{'encoded': ('sm_name',), 'lmk': {'n_comp': 36, 'alpha': 1.0350983336423336}, 'trm': {'n_comp': 30, 'alpha': 1.0361877621404876}}


 84%|████████████████████████████████████       | 42/50 [17:11<03:16, 24.55s/it]

{'encoded': ('sm_name',), 'lmk': {'n_comp': 45, 'alpha': 3.791805354805406}, 'trm': {'n_comp': 85, 'alpha': 3.7801713458031343}}


 86%|████████████████████████████████████▉      | 43/50 [17:37<02:54, 24.89s/it]

{'encoded': ('sm_name',), 'lmk': {'n_comp': 66, 'alpha': 0.23981717821365656}, 'trm': {'n_comp': 5, 'alpha': 3.2201240046565887}}


 88%|█████████████████████████████████████▊     | 44/50 [18:05<02:34, 25.69s/it]

{'encoded': ('sm_name',), 'lmk': {'n_comp': 85, 'alpha': 4.79866914194869}, 'trm': {'n_comp': 24, 'alpha': 1.4201154192788557}}


 90%|██████████████████████████████████████▋    | 45/50 [18:33<02:12, 26.60s/it]

{'encoded': ('sm_name',), 'lmk': {'n_comp': 21, 'alpha': 3.752734905085724}, 'trm': {'n_comp': 22, 'alpha': 0.15356141686912073}}


 92%|███████████████████████████████████████▌   | 46/50 [18:50<01:34, 23.58s/it]

{'encoded': ('sm_name',), 'lmk': {'n_comp': 18, 'alpha': 0.795464993010459}, 'trm': {'n_comp': 10, 'alpha': 4.196993026482022}}


 94%|████████████████████████████████████████▍  | 47/50 [19:09<01:06, 22.08s/it]

{'encoded': ('sm_name',), 'lmk': {'n_comp': 34, 'alpha': 4.21006573054219}, 'trm': {'n_comp': 98, 'alpha': 0.14746716737962684}}


 96%|█████████████████████████████████████████▎ | 48/50 [19:29<00:43, 21.71s/it]

{'encoded': ('sm_name',), 'lmk': {'n_comp': 32, 'alpha': 3.4490458428574127}, 'trm': {'n_comp': 22, 'alpha': 2.0551292722179864}}


 98%|██████████████████████████████████████████▏| 49/50 [19:49<00:21, 21.09s/it]

{'encoded': ('sm_name',), 'lmk': {'n_comp': 88, 'alpha': 3.7741938385923697}, 'trm': {'n_comp': 94, 'alpha': 1.1095048306079158}}


100%|███████████████████████████████████████████| 50/50 [20:21<00:00, 24.42s/it]


In [91]:
from scipy.stats.mstats import gmean
# Quick sanity check to make sure our submission is ordered correctly.
assert (test_joined_df["label"][features] == id_map[features]).all(axis=None)

# Full pipeline. Nice
all_preds = np.stack(all_y_preds,axis=0)
display(all_preds.shape)
display(all_weights)
display(np.std(all_preds,axis=0).mean())
raw_weights = 1/np.array(all_weights)
weight=np.repeat(np.expand_dims(raw_weights,(1,2)),all_preds.shape[1],axis=1)
display(weight.shape)
display(all_preds.shape)
y_pred = gmean(all_preds,axis=0,weights=weight)

submission = pd.DataFrame(y_pred, columns=transcriptome_cols, index=id_map.index)
submission = submission.fillna(0)
display(submission)
submission.to_csv('submissions/pipeline.csv')

(50, 255, 18211)

[2.2693843988888998,
 2.2759466352191158,
 2.2657324682613553,
 2.280148687388696,
 2.256181874494933,
 2.3403368781001954,
 2.2949167942306774,
 2.274756300302466,
 2.2643054614013063,
 2.275764100029352,
 2.2608554943461567,
 2.276404845160673,
 2.4938850769105265,
 2.2922554680783658,
 2.2734240838122193,
 2.397068569323188,
 2.2936292297314016,
 2.2724150761438264,
 2.268105996532191,
 2.2736924667769554,
 2.432100513942507,
 2.2759719127135227,
 2.2973134503342725,
 2.2765752579876293,
 2.275107985546231,
 2.3697831655591446,
 2.2599733263148245,
 2.191902770117086,
 2.269406948286098,
 2.2765404790891557,
 2.249763402378471,
 2.2737191450928447,
 2.2728427629284633,
 2.2740700052706164,
 2.2639581133044326,
 2.270263677141051,
 2.2756938453373396,
 2.2962951278890222,
 2.275942373818899,
 2.267520978563596,
 2.276986453358964,
 2.2737366182119225,
 2.269328602530656,
 2.286097427025593,
 2.2700246658680476,
 2.26126009766376,
 2.27113256975421,
 2.2703767193284867,
 2.26613269119

0.185405983798328

(50, 255, 1)

(50, 255, 18211)

  log_a = np.log(a)


Unnamed: 0_level_0,A1BG,A1BG-AS1,A2M,A2M-AS1,A2MP1,A4GALT,AAAS,AACS,AAGAB,AAK1,...,ZUP1,ZW10,ZWILCH,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11B,ZYX,ZZEF1
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.427217,0.293254,0.499505,0.630907,0.952655,0.844293,0.0,0.482537,0.0,0.0,...,0.0,0.0,0.238293,0.187353,0.551313,0.425926,0.331273,0.240815,0.0,0.0
1,0.098870,0.000000,0.000000,0.099942,0.291293,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0
2,0.313714,0.205128,0.220327,0.000000,0.000000,0.994264,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0
4,0.000000,0.000000,0.000000,0.000000,0.000000,0.188105,0.0,0.115438,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.080451,0.000000,0.076282,0.091261,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
250,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0
251,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0
252,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0
253,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0
